AI-DecisionTree
view release on metacpan or search on metacpan
lib/AI/DecisionTree.pm view on Meta::CPAN
return \%node;
}
sub best_attr {
my ($self, $instances) = @_;
# 0 is a perfect score, entropy(#instances) is the worst possible score
my ($best_score, $best_attr) = (@$instances * $self->entropy( map $_->result_int, @$instances ), undef);
my $all_attr = $self->{attributes};
foreach my $attr (keys %$all_attr) {
# %tallies is correlation between each attr value and result
# %total is number of instances with each attr value
my (%totals, %tallies);
my $num_undef = AI::DecisionTree::Instance::->tally($instances, \%tallies, \%totals, $all_attr->{$attr});
next unless keys %totals; # Make sure at least one instance defines this attribute
my $score = 0;
while (my ($opt, $vals) = each %tallies) {
$score += $totals{$opt} * $self->entropy2( $vals, $totals{$opt} );
}
($best_attr, $best_score) = ($attr, $score) if $score < $best_score;
}
return $best_attr;
}
sub entropy2 {
shift;
my ($counts, $total) = @_;
# Entropy is defined with log base 2 - we just divide by log(2) at the end to adjust.
my $sum = 0;
$sum += $_ * log($_) foreach values %$counts;
return +(log($total) - $sum/$total)/log(2);
}
sub entropy {
shift;
my %count;
$count{$_}++ foreach @_;
# Entropy is defined with log base 2 - we just divide by log(2) at the end to adjust.
my $sum = 0;
$sum += $_ * log($_) foreach values %count;
return +(log(@_) - $sum/@_)/log(2);
}
sub prune_tree {
my $self = shift;
# We use a minimum-description-length approach. We calculate the
# score of each node:
# n = number of nodes below
# r = number of results (categories) in the entire tree
# i = number of instances in the entire tree
# e = number of errors below this node
# Hypothesis description length (MML):
# describe tree: number of nodes + number of edges
# describe exceptions: num_exceptions * log2(total_num_instances) * log2(total_num_results)
my $r = keys %{ $self->{results} };
my $i = $self->{tree}{instances};
my $exception_cost = log($r) * log($i) / log(2)**2;
# Pruning can turn a branch into a leaf
my $maybe_prune = sub {
my ($self, $node) = @_;
return unless $node->{children}; # Can't prune leaves
my $nodes_below = $self->nodes_below($node);
my $tree_cost = 2 * $nodes_below - 1; # $edges_below == $nodes_below - 1
my $exceptions = $self->exceptions( $node );
my $simple_rule_exceptions = $node->{instances} - $node->{distribution}[1];
my $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost;
#warn "Score = $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost\n";
if ($score < 0) {
delete @{$node}{'children', 'split_on', 'exceptions', 'nodes_below'};
$node->{result} = $node->{distribution}[0];
# XXX I'm not cleaning up 'exceptions' or 'nodes_below' keys up the tree
}
};
$self->_traverse($maybe_prune);
}
sub exceptions {
my ($self, $node) = @_;
return $node->{exceptions} if exists $node->{exeptions};
my $count = 0;
if ( exists $node->{result} ) {
$count = $node->{instances} - $node->{distribution}[1];
} else {
foreach my $child ( values %{$node->{children}} ) {
$count += $self->exceptions($child);
}
}
return $node->{exceptions} = $count;
}
sub nodes_below {
my ($self, $node) = @_;
return $node->{nodes_below} if exists $node->{nodes_below};
my $count = 0;
$self->_traverse( sub {$count++}, $node );
return $node->{nodes_below} = $count - 1;
}
# This is *not* for external use, I may change it.
sub _traverse {
( run in 0.345 second using v1.01-cache-2.11-cpan-39bf76dae61 )