AI-DecisionTree
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/AI/DecisionTree.pm view on Meta::CPAN
# Hypothesis description length (MML):
# describe tree: number of nodes + number of edges
# describe exceptions: num_exceptions * log2(total_num_instances) * log2(total_num_results)
my $r = keys %{ $self->{results} };
my $i = $self->{tree}{instances};
my $exception_cost = log($r) * log($i) / log(2)**2;
# Pruning can turn a branch into a leaf
my $maybe_prune = sub {
my ($self, $node) = @_;
return unless $node->{children}; # Can't prune leaves
my $nodes_below = $self->nodes_below($node);
my $tree_cost = 2 * $nodes_below - 1; # $edges_below == $nodes_below - 1
my $exceptions = $self->exceptions( $node );
my $simple_rule_exceptions = $node->{instances} - $node->{distribution}[1];
my $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost;
#warn "Score = $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost\n";
if ($score < 0) {
delete @{$node}{'children', 'split_on', 'exceptions', 'nodes_below'};
$node->{result} = $node->{distribution}[0];
# XXX I'm not cleaning up 'exceptions' or 'nodes_below' keys up the tree
}
};
$self->_traverse($maybe_prune);
}
sub exceptions {
my ($self, $node) = @_;
return $node->{exceptions} if exists $node->{exeptions};
my $count = 0;
if ( exists $node->{result} ) {
$count = $node->{instances} - $node->{distribution}[1];
} else {
foreach my $child ( values %{$node->{children}} ) {
$count += $self->exceptions($child);
}
}
return $node->{exceptions} = $count;
}
sub nodes_below {
my ($self, $node) = @_;
return $node->{nodes_below} if exists $node->{nodes_below};
my $count = 0;
$self->_traverse( sub {$count++}, $node );
return $node->{nodes_below} = $count - 1;
}
# This is *not* for external use, I may change it.
sub _traverse {
my ($self, $callback, $node, $parent, $node_name) = @_;
$node ||= $self->{tree};
ref($callback) ? $callback->($self, $node, $parent, $node_name) : $self->$callback($node, $parent, $node_name);
return unless $node->{children};
foreach my $child ( keys %{$node->{children}} ) {
$self->_traverse($callback, $node->{children}{$child}, $node, $child);
}
}
sub get_result {
my ($self, %args) = @_;
croak "Missing 'attributes' or 'callback' parameter" unless $args{attributes} or $args{callback};
$self->train unless $self->{tree};
my $tree = $self->{tree};
while (1) {
if (exists $tree->{result}) {
my $r = $tree->{result};
return $r unless wantarray;
my %dist = @{$tree->{distribution}};
my $confidence = $tree->{distribution}[1] / $tree->{instances};
# my $confidence = P(H|D) = [P(D|H)P(H)]/[P(D|H)P(H)+P(D|H')P(H')]
# = [P(D|H)P(H)]/P(D);
# my $confidence =
# $confidence *= $self->{prior_freqs}{$r} / $self->{total_instances};
return ($r, $confidence, \%dist);
}
my $instance_val = (exists $args{callback} ? $args{callback}->($tree->{split_on}) :
exists $args{attributes}{$tree->{split_on}} ? $args{attributes}{$tree->{split_on}} :
'<undef>');
## no critic (ProhibitExplicitReturnUndef)
$tree = $tree->{children}{ $instance_val }
or return undef;
}
}
sub as_graphviz {
my ($self, %args) = @_;
my $colors = delete $args{leaf_colors} || {};
require GraphViz;
my $g = GraphViz->new(%args);
my $id = 1;
my $add_edge = sub {
my ($self, $node, $parent, $node_name) = @_;
# We use stringified reference names for node names, as a convenient hack.
if ($node->{split_on}) {
$g->add_node( "$node",
label => $node->{split_on},
shape => 'ellipse',
);
} else {
my $i = 0;
my $distr = join ',', grep {$i++ % 2} @{$node->{distribution}};
my %fill = (exists $colors->{$node->{result}} ?
(fillcolor => $colors->{$node->{result}},
style => 'filled') :
()
);
$g->add_node( "$node",
label => "$node->{result} ($distr)",
shape => 'box',
%fill,
);
}
$g->add_edge( "$parent" => "$node",
label => $node_name,
) if $parent;
};
$self->_traverse( $add_edge );
return $g;
}
sub rule_tree {
my $self = shift;
my ($tree) = @_ ? @_ : $self->{tree};
# build tree:
# [ question, { results => [ question, { ... } ] } ]
return $tree->{result} if exists $tree->{result};
return [
$tree->{split_on}, {
map { $_ => $self->rule_tree($tree->{children}{$_}) } keys %{$tree->{children}},
}
];
}
sub rule_statements {
my $self = shift;
my ($stmt, $tree) = @_ ? @_ : ('', $self->{tree});
return("$stmt -> '$tree->{result}'") if exists $tree->{result};
my @out;
my $prefix = $stmt ? "$stmt and" : "if";
foreach my $val (keys %{$tree->{children}}) {
push @out, $self->rule_statements("$prefix $tree->{split_on}='$val'", $tree->{children}{$val});
}
return @out;
}
### Some instance accessor stuff:
sub _result {
my ($self, $instance) = @_;
my $int = $instance->result_int;
return $self->{results_reverse}[$int];
}
sub _delete_value {
my ($self, $instance, $attr) = @_;
my $val = $self->_value($instance, $attr);
return unless defined $val;
$instance->set_value($self->{attributes}{$attr}, 0);
return $val;
}
sub _value {
my ($self, $instance, $attr) = @_;
return unless exists $self->{attributes}{$attr};
my $val_int = $instance->value_int($self->{attributes}{$attr});
return $self->{attribute_values_reverse}{$attr}[$val_int];
}
view all matches for this distributionview release on metacpan - search on metacpan
( run in 1.248 second using v1.00-cache-2.02-grep-82fe00e-cpan-72ae3ad1e6da )