AI-DecisionTree
view release on metacpan or search on metacpan
lib/AI/DecisionTree.pm view on Meta::CPAN
croak "Must add training instances before calling train()";
}
$self->_create_lookup_hashes;
local $self->{curr_depth} = 0;
local $self->{max_depth} = $args{max_depth} if exists $args{max_depth};
$self->{depth} = 0;
$self->{tree} = $self->_expand_node( instances => $self->{instances} );
$self->{total_instances} = @{$self->{instances}};
$self->prune_tree if $self->{prune};
$self->do_purge if $self->purge;
return 1;
}
sub do_purge {
# Discard the bulky training-data structures once the tree has been
# built; only the tree itself is needed for classification afterwards.
my $self = shift;
delete $self->{$_}
for qw(instances attribute_values attribute_values_reverse results results_reverse);
}
sub copy_instances {
# Share the training data of another AI::DecisionTree object with this
# one (shallow copy - both trees reference the same structures), then
# rebuild this tree's lookup hashes.  Croaks if 'from' is missing or is
# not a decision-tree object.
my ($self, %opt) = @_;
croak "Missing 'from' parameter to copy_instances()" unless exists $opt{from};
my $other = $opt{from};
# Use blessed() + a method-call isa() instead of UNIVERSAL::isa() as a
# function: the function form ignores overridden isa() methods and
# wrongly accepts plain class-name strings.  Scalar::Util is core.
require Scalar::Util;
croak "'from' parameter is not a decision tree"
unless Scalar::Util::blessed($other) && $other->isa(__PACKAGE__);
foreach (qw(instances attributes attribute_values results)) {
$self->{$_} = $other->{$_};
}
$self->_create_lookup_hashes;
}
sub set_results {
# Assign an outcome to every stored training instance.  $hashref maps
# instance name => result string; croaks if any instance has no entry.
my ($self, $hashref) = @_;
for my $inst ( @{ $self->{instances} } ) {
my $key = $inst->name;
croak "No result given for instance '$key'"
unless exists $hashref->{$key};
$inst->set_result( $self->{results}{ $hashref->{$key} } );
}
}
# Read-only accessor: reference to the array of training instances.
sub instances {
my $self = shift;
return $self->{instances};
}
sub purge {
# Combined accessor/mutator for the 'purge' flag (whether training data
# is discarded after train()).  With an argument, sets the flag first.
my $self = shift;
( $self->{purge} ) = @_ if @_;
return $self->{purge};
}
# Each node contains:
# { split_on => $attr_name,
# children => { $attr_value1 => $node1,
# $attr_value2 => $node2, ... }
# }
# or
# { result => $result }
# Recursively build one node of the decision tree (ID3-style) from a set
# of training instances.  Returns a hashref: either a leaf
# { result => ... } or an internal node { split_on => ..., children => {...} },
# each also carrying 'distribution' (result => count pairs, most frequent
# first) and 'instances' (count).  Side effects on $self: updates {depth},
# {nodes}, and accumulates {prior_freqs}.
sub _expand_node {
my ($self, %args) = @_;
my $instances = $args{instances};
# Progress dot per node when verbose mode is on.
print STDERR '.' if $self->{verbose};
# Track the deepest level reached so far across the whole build.
$self->{depth} = $self->{curr_depth} if $self->{curr_depth} > $self->{depth};
# 'local' restores curr_depth automatically when this call (and its
# recursive children) return, so each branch sees its own depth.
local $self->{curr_depth} = $self->{curr_depth} + 1;
$self->{nodes}++;
# Tally how many instances fall into each result category.
my %results;
$results{$self->_result($_)}++ foreach @$instances;
# Flattened (result, count, result, count, ...) list, most frequent first.
my @results = map {$_,$results{$_}} sort {$results{$b} <=> $results{$a}} keys %results;
my %node = ( distribution => \@results, instances => scalar @$instances );
# Accumulate global result frequencies (used elsewhere, e.g. for priors).
foreach (keys %results) {
$self->{prior_freqs}{$_} += $results{$_};
}
if (keys(%results) == 1) {
# All these instances have the same result - make this node a leaf
$node{result} = $self->_result($instances->[0]);
return \%node;
}
# Multiple values are present - find the best predictor attribute and split on it
my $best_attr = $self->best_attr($instances);
# best_attr returns undef when no attribute can separate the data
# (noisy/contradictory instances); in 'fatal' mode that aborts the build.
croak "Inconsistent data, can't build tree with noise_mode='fatal'"
if $self->{noise_mode} eq 'fatal' and !defined $best_attr;
# No usable split, or the depth limit has been reached: make a leaf
# labelled with the majority result instead of recursing further.
if ( !defined($best_attr)
or $self->{max_depth} && $self->{curr_depth} > $self->{max_depth} ) {
# Pick the most frequent result for this leaf
$node{result} = (sort {$results{$b} <=> $results{$a}} keys %results)[0];
return \%node;
}
$node{split_on} = $best_attr;
# Partition the instances by their value of $best_attr; instances with
# no value for the attribute go into the reserved '<undef>' bucket.
my %split;
foreach my $i (@$instances) {
my $v = $self->_value($i, $best_attr);
push @{$split{ defined($v) ? $v : '<undef>' }}, $i;
}
# Sanity check: a "best" attribute that puts everything in one bucket
# would recurse forever, so fail loudly instead.
die ("Something's wrong: attribute '$best_attr' didn't split ",
scalar @$instances, " instances into multiple buckets (@{[ keys %split ]})")
unless keys %split > 1;
# Recurse: one child subtree per observed attribute value.
foreach my $value (keys %split) {
$node{children}{$value} = $self->_expand_node( instances => $split{$value} );
}
return \%node;
}
sub best_attr {
my ($self, $instances) = @_;
# 0 is a perfect score, entropy(#instances) is the worst possible score
my ($best_score, $best_attr) = (@$instances * $self->entropy( map $_->result_int, @$instances ), undef);
my $all_attr = $self->{attributes};
foreach my $attr (keys %$all_attr) {
( run in 0.513 second using v1.01-cache-2.11-cpan-39bf76dae61 )