AI-DecisionTree distribution — source listing from MetaCPAN.
(View the release or search on MetaCPAN for the full distribution.)
File: Instance/t/02-leaktest.t
#!/usr/bin/perl
# Leak test: verify that AI::DecisionTree::Instance objects are destroyed
# as soon as they go out of scope (i.e. the XS layer holds no stray refs).
use Test;
BEGIN { plan tests => 4 }
use AI::DecisionTree::Instance;
ok(1);

my $x = 0;
{
    # Temporarily hook the class's DESTROY so we can observe destruction.
    local *{"AI::DecisionTree::Instance::DESTROY"} = sub { $x = 1 };
    {
        # Direct method call instead of indirect-object syntax ("new Class"),
        # which is fragile and deprecated in modern Perl.
        my $i = AI::DecisionTree::Instance->new([3,4], 4, "foo");
        ok $x, 0;    # still alive inside its own scope
    }
    ok $x, 1;        # destroyed the moment its scope ended
}
ok $x, 1;
lib/AI/DecisionTree.pm view on Meta::CPAN
package AI::DecisionTree;
{
$AI::DecisionTree::VERSION = '0.11';
}
use AI::DecisionTree::Instance;
use Carp;
use vars qw(@ISA);
sub new {
    my $class = shift;

    # Configuration defaults; caller-supplied key/value pairs override them.
    my %defaults = (
        noise_mode => 'fatal',
        prune      => 1,
        purge      => 1,
        verbose    => 0,
        max_depth  => 0,
    );

    # Bookkeeping fields always start fresh and cannot be overridden.
    my $self = {
        %defaults,
        @_,
        nodes     => 0,
        instances => [],
        name_gen  => 0,
    };
    return bless $self, $class;
}
# Simple read-only accessors for internal counters/settings.
sub nodes      { my $self = shift; return $self->{nodes} }
sub noise_mode { my $self = shift; return $self->{noise_mode} }
sub depth      { my $self = shift; return $self->{depth} }
sub add_instance {
    my ($self, %args) = @_;
    croak "Missing 'attributes' parameter" unless $args{attributes};
    croak "Missing 'result' parameter"     unless defined $args{result};

    # Auto-generate a unique name when the caller did not supply one.
    $args{name} = $self->{name_gen}++ unless exists $args{name};

    # Translate attribute/value strings into small integer codes via the
    # lookup tables, building a dense attribute vector.
    my @vec;
    foreach my $attr (keys %{ $args{attributes} }) {
        my $slot = _hlookup($self->{attributes}, $attr);
        $vec[$slot] = _hlookup($self->{attribute_values}{$attr},
                               $args{attributes}{$attr});
    }
    $_ ||= 0 for @vec;    # fill unseen slots with 0 ("no value")

    push @{ $self->{instances} },
        AI::DecisionTree::Instance->new(
            \@vec,
            _hlookup($self->{results}, $args{result}),
            $args{name},
        );
}
sub _hlookup {
    # Autovivify the hash if the caller passed undef (aliased via @_).
    $_[0] ||= {};
    my ($hash, $key) = @_;
    # Assign the next sequential id (1-based) the first time a key is seen.
    $hash->{$key} = 1 + keys %$hash unless exists $hash->{$key};
    return $hash->{$key};
}
sub _create_lookup_hashes {
    my $self = shift;

    # Invert a string-to-integer map into an array indexed by integer.
    # Index 0 is unused (ids are 1-based), hence the leading undef.
    my $invert = sub {
        my $map = shift;
        return [ undef, sort { $map->{$a} <=> $map->{$b} } keys %$map ];
    };

    $self->{results_reverse} = $invert->($self->{results});
    for my $attr (keys %{ $self->{attribute_values} }) {
        $self->{attribute_values_reverse}{$attr}
            = $invert->($self->{attribute_values}{$attr});
    }
}
sub train {
my ($self, %args) = @_;
# Refuse to train without data: either the instances were purged after a
# previous train() (a tree already exists), or none were ever added.
if (not @{ $self->{instances} }) {
croak "Training data has been purged, can't re-train" if $self->{tree};
croak "Must add training instances before calling train()";
}
# Build the integer<->string reverse-lookup tables used during expansion.
$self->_create_lookup_hashes;
# Dynamically scope depth tracking for the duration of this call.
# NOTE: postfix-if on "local" means max_depth is only localized when the
# caller supplies one; otherwise the object's stored setting is used as-is.
local $self->{curr_depth} = 0;
local $self->{max_depth} = $args{max_depth} if exists $args{max_depth};
$self->{depth} = 0;
# Recursively build the decision tree from all training instances.
$self->{tree} = $self->_expand_node( instances => $self->{instances} );
$self->{total_instances} = @{$self->{instances}};
# Optional post-processing: MDL pruning, then purging of training data.
$self->prune_tree if $self->{prune};
$self->do_purge if $self->purge;
return 1;
}
sub do_purge {
    my $self = shift;
    # Drop the training bookkeeping structures to reclaim memory;
    # the built tree itself is kept.
    delete $self->{$_}
        for qw(instances attribute_values attribute_values_reverse
               results results_reverse);
}
sub copy_instances {
    my ($self, %opt) = @_;
    croak "Missing 'from' parameter to copy_instances()" unless exists $opt{from};
    my $source = $opt{from};
    croak "'from' parameter is not a decision tree"
        unless UNIVERSAL::isa($source, __PACKAGE__);

    # Share (not deep-copy) the training structures with the source tree,
    # then rebuild this object's reverse-lookup tables from them.
    $self->{$_} = $source->{$_}
        for qw(instances attributes attribute_values results);
    $self->_create_lookup_hashes;
}
sub set_results {
    my ($self, $hashref) = @_;
    # Re-label every stored instance from the supplied name => result map.
    for my $instance (@{ $self->{instances} }) {
        my $name = $instance->name;
        croak "No result given for instance '$name'"
            unless exists $hashref->{$name};
        $instance->set_result( $self->{results}{ $hashref->{$name} } );
    }
}
# Read accessor for the raw list of training instances.
sub instances { my $self = shift; return $self->{instances} }

# Get/set whether training data is purged after train().
sub purge {
    my $self = shift;
    $self->{purge} = shift if @_;
    return $self->{purge};
}
# Each node contains:
# { split_on => $attr_name,
# children => { $attr_value1 => $node1,
# $attr_value2 => $node2, ... }
# }
# or
# { result => $result }
sub _expand_node {
my ($self, %args) = @_;
my $instances = $args{instances};
print STDERR '.' if $self->{verbose};
$self->{depth} = $self->{curr_depth} if $self->{curr_depth} > $self->{depth};
local $self->{curr_depth} = $self->{curr_depth} + 1;
$self->{nodes}++;
my %results;
$results{$self->_result($_)}++ foreach @$instances;
lib/AI/DecisionTree.pm view on Meta::CPAN
scalar @$instances, " instances into multiple buckets (@{[ keys %split ]})")
unless keys %split > 1;
foreach my $value (keys %split) {
$node{children}{$value} = $self->_expand_node( instances => $split{$value} );
}
return \%node;
}
sub best_attr {
my ($self, $instances) = @_;
# 0 is a perfect score, entropy(#instances) is the worst possible score
my ($best_score, $best_attr) = (@$instances * $self->entropy( map $_->result_int, @$instances ), undef);
my $all_attr = $self->{attributes};
foreach my $attr (keys %$all_attr) {
# %tallies is correlation between each attr value and result
# %total is number of instances with each attr value
lib/AI/DecisionTree.pm view on Meta::CPAN
while (my ($opt, $vals) = each %tallies) {
$score += $totals{$opt} * $self->entropy2( $vals, $totals{$opt} );
}
($best_attr, $best_score) = ($attr, $score) if $score < $best_score;
}
return $best_attr;
}
sub entropy2 {
    shift;    # invocant, unused
    my ($counts, $total) = @_;
    # Shannon entropy in bits: -sum(p_i * log2 p_i), algebraically
    # rearranged so only one division by log(2) is needed at the end.
    my $acc = 0;
    $acc += $_ * log($_) for values %$counts;
    return (log($total) - $acc / $total) / log(2);
}
sub entropy {
    shift;    # invocant, unused
    my @items = @_;
    # Tally occurrences, then compute base-2 Shannon entropy of the
    # resulting distribution (same rearrangement as entropy2).
    my %tally;
    $tally{$_}++ for @items;
    my $acc = 0;
    $acc += $_ * log($_) for values %tally;
    return (log(scalar @items) - $acc / @items) / log(2);
}
sub prune_tree {
my $self = shift;
# We use a minimum-description-length approach. We calculate the
# score of each node:
# n = number of nodes below
# r = number of results (categories) in the entire tree
# i = number of instances in the entire tree
# e = number of errors below this node
# Hypothesis description length (MML):
# describe tree: number of nodes + number of edges
# describe exceptions: num_exceptions * log2(total_num_instances) * log2(total_num_results)
my $r = keys %{ $self->{results} };
my $i = $self->{tree}{instances};
my $exception_cost = log($r) * log($i) / log(2)**2;
# Pruning can turn a branch into a leaf
my $maybe_prune = sub {
my ($self, $node) = @_;
return unless $node->{children}; # Can't prune leaves
my $nodes_below = $self->nodes_below($node);
my $tree_cost = 2 * $nodes_below - 1; # $edges_below == $nodes_below - 1
my $exceptions = $self->exceptions( $node );
my $simple_rule_exceptions = $node->{instances} - $node->{distribution}[1];
my $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost;
lib/AI/DecisionTree.pm view on Meta::CPAN
if ($score < 0) {
delete @{$node}{'children', 'split_on', 'exceptions', 'nodes_below'};
$node->{result} = $node->{distribution}[0];
# XXX I'm not cleaning up 'exceptions' or 'nodes_below' keys up the tree
}
};
$self->_traverse($maybe_prune);
}
sub exceptions {
    my ($self, $node) = @_;

    # Return the memoized count when present.  Fixes a typo in the original
    # ('exists $node->{exeptions}'), which meant the cache was written but
    # never read, so every call recomputed the entire subtree.
    return $node->{exceptions} if exists $node->{exceptions};

    my $count = 0;
    if (exists $node->{result}) {
        # Leaf: exceptions = instances not belonging to the majority class.
        $count = $node->{instances} - $node->{distribution}[1];
    } else {
        # Internal node: total exceptions across all children.
        $count += $self->exceptions($_) for values %{ $node->{children} };
    }
    return $node->{exceptions} = $count;
}
sub nodes_below {
    my ($self, $node) = @_;
    # Memoized count of nodes in this subtree, excluding $node itself.
    return $node->{nodes_below} if exists $node->{nodes_below};

    my $total = 0;
    $self->_traverse( sub { $total++ }, $node );    # visits $node + descendants
    return $node->{nodes_below} = $total - 1;       # subtract $node itself
}
# Internal depth-first walk over the tree; not part of the public interface
# and subject to change.  $callback is either a code ref (invoked as
# $cb->($self, $node, $parent, $name)) or a method name to call on $self.
sub _traverse {
    my ($self, $callback, $node, $parent, $node_name) = @_;
    $node ||= $self->{tree};

    if (ref $callback) {
        $callback->($self, $node, $parent, $node_name);
    } else {
        $self->$callback($node, $parent, $node_name);
    }

    my $kids = $node->{children} or return;
    $self->_traverse($callback, $kids->{$_}, $node, $_) for keys %$kids;
}
sub get_result {
my ($self, %args) = @_;
croak "Missing 'attributes' or 'callback' parameter" unless $args{attributes} or $args{callback};
$self->train unless $self->{tree};
my $tree = $self->{tree};
while (1) {
if (exists $tree->{result}) {
my $r = $tree->{result};
return $r unless wantarray;
lib/AI/DecisionTree.pm view on Meta::CPAN
my $instance_val = (exists $args{callback} ? $args{callback}->($tree->{split_on}) :
exists $args{attributes}{$tree->{split_on}} ? $args{attributes}{$tree->{split_on}} :
'<undef>');
## no critic (ProhibitExplicitReturnUndef)
$tree = $tree->{children}{ $instance_val }
or return undef;
}
}
sub as_graphviz {
my ($self, %args) = @_;
my $colors = delete $args{leaf_colors} || {};
require GraphViz;
my $g = GraphViz->new(%args);
my $id = 1;
my $add_edge = sub {
my ($self, $node, $parent, $node_name) = @_;
# We use stringified reference names for node names, as a convenient hack.
if ($node->{split_on}) {
$g->add_node( "$node",
label => $node->{split_on},
shape => 'ellipse',
);
} else {
my $i = 0;
lib/AI/DecisionTree.pm view on Meta::CPAN
}
$g->add_edge( "$parent" => "$node",
label => $node_name,
) if $parent;
};
$self->_traverse( $add_edge );
return $g;
}
sub rule_tree {
    my $self = shift;
    my $node = @_ ? shift : $self->{tree};

    # Structure produced:
    #   [ question, { answer => subtree-or-result, ... } ]
    # Leaves collapse to their bare result value.
    return $node->{result} if exists $node->{result};

    my %branches;
    $branches{$_} = $self->rule_tree($node->{children}{$_})
        for keys %{ $node->{children} };
    return [ $node->{split_on}, \%branches ];
}
sub rule_statements {
    my $self = shift;
    my ($condition, $node) = @_ ? @_ : ('', $self->{tree});

    # A leaf terminates the rule with its result.
    return ("$condition -> '$node->{result}'") if exists $node->{result};

    # Extend the condition with this node's test and recurse per branch.
    my $lead = $condition ? "$condition and" : "if";
    my @rules;
    foreach my $value (keys %{ $node->{children} }) {
        push @rules,
            $self->rule_statements("$lead $node->{split_on}='$value'",
                                   $node->{children}{$value});
    }
    return @rules;
}
### Some instance accessor stuff:
sub _result {
    my ($self, $instance) = @_;
    # Map the instance's integer result code back to its string label.
    return $self->{results_reverse}[ $instance->result_int ];
}
sub _delete_value {
    my ($self, $instance, $attr) = @_;
    my $old = $self->_value($instance, $attr);
    return unless defined $old;

    # Zero out the slot (code 0 means "no value") and report what was there.
    $instance->set_value($self->{attributes}{$attr}, 0);
    return $old;
}
sub _value {
    my ($self, $instance, $attr) = @_;
    # Unknown attribute: return nothing (undef / empty list by context).
    return unless exists $self->{attributes}{$attr};
    my $code = $instance->value_int($self->{attributes}{$attr});
    return $self->{attribute_values_reverse}{$attr}[$code];
}
1;
__END__
t/01-simple.t view on Meta::CPAN
{
# Test attribute callbacks
my %attributes = (
outlook => 'rain',
temperature => 'mild',
humidity => 'high',
wind => 'strong',
);
my $result = $dtree->get_result( callback => sub { $attributes{$_[0]} } );
ok $result, 'no';
}
#print map "$_\n", $dtree->rule_statements;
#use YAML; print Dump $dtree;
if (eval "use GraphViz; 1") {
my $graphviz = $dtree->as_graphviz;
ok $graphviz;
( run in 0.260 second using v1.01-cache-2.11-cpan-a5abf4f5562 )