AI-DecisionTree

 view release on metacpan or  search on metacpan

Instance/t/02-leaktest.t  view on Meta::CPAN

#!/usr/bin/perl

use Test;
BEGIN { plan tests => 4 }

use AI::DecisionTree::Instance;
ok(1);

my $x = 0;
{
  local *{"AI::DecisionTree::Instance::DESTROY"} = sub { $x = 1 };
  {
    my $i = new AI::DecisionTree::Instance([3,4], 4, "foo");
    ok $x, 0;
  }
  ok $x, 1;
}
ok $x, 1;

lib/AI/DecisionTree.pm  view on Meta::CPAN

package AI::DecisionTree;
{
  $AI::DecisionTree::VERSION = '0.11';
}

use AI::DecisionTree::Instance;
use Carp;
use vars qw(@ISA);


sub new {
  my $package = shift;
  return bless {
		noise_mode => 'fatal',
		prune => 1,
		purge => 1,
		verbose => 0,
		max_depth => 0,
		@_,
		nodes => 0,
		instances => [],
		name_gen => 0,
	       }, $package;
}

sub nodes      { $_[0]->{nodes} }
sub noise_mode { $_[0]->{noise_mode} }
sub depth      { $_[0]->{depth} }

sub add_instance {
  my ($self, %args) = @_;
  croak "Missing 'attributes' parameter" unless $args{attributes};
  croak "Missing 'result' parameter" unless defined $args{result};
  $args{name} = $self->{name_gen}++ unless exists $args{name};
  
  my @attributes;
  while (my ($k, $v) = each %{$args{attributes}}) {
    $attributes[ _hlookup($self->{attributes}, $k) ] = _hlookup($self->{attribute_values}{$k}, $v);
  }
  $_ ||= 0 foreach @attributes;
  
  push @{$self->{instances}}, AI::DecisionTree::Instance->new(\@attributes, _hlookup($self->{results}, $args{result}), $args{name});
}

sub _hlookup {
  $_[0] ||= {}; # Autovivify as a hash
  my ($hash, $key) = @_;
  unless (exists $hash->{$key}) {
    $hash->{$key} = 1 + keys %$hash;
  }
  return $hash->{$key};
}

sub _create_lookup_hashes {
  my $self = shift;
  my $h = $self->{results};
  $self->{results_reverse} = [ undef, sort {$h->{$a} <=> $h->{$b}} keys %$h ];
  
  foreach my $attr (keys %{$self->{attribute_values}}) {
    my $h = $self->{attribute_values}{$attr};
    $self->{attribute_values_reverse}{$attr} = [ undef, sort {$h->{$a} <=> $h->{$b}} keys %$h ];
  }
}

sub train {
  my ($self, %args) = @_;
  if (not @{ $self->{instances} }) {
    croak "Training data has been purged, can't re-train" if $self->{tree};
    croak "Must add training instances before calling train()";
  }
  
  $self->_create_lookup_hashes;
  local $self->{curr_depth} = 0;
  local $self->{max_depth} = $args{max_depth} if exists $args{max_depth};
  $self->{depth} = 0;
  $self->{tree} = $self->_expand_node( instances => $self->{instances} );
  $self->{total_instances} = @{$self->{instances}};
  
  $self->prune_tree if $self->{prune};
  $self->do_purge if $self->purge;
  return 1;
}

sub do_purge {
  my $self = shift;
  delete @{$self}{qw(instances attribute_values attribute_values_reverse results results_reverse)};
}

sub copy_instances {
  my ($self, %opt) = @_;
  croak "Missing 'from' parameter to copy_instances()" unless exists $opt{from};
  my $other = $opt{from};
  croak "'from' parameter is not a decision tree" unless UNIVERSAL::isa($other, __PACKAGE__);

  foreach (qw(instances attributes attribute_values results)) {
    $self->{$_} = $other->{$_};
  }
  $self->_create_lookup_hashes;
}

sub set_results {
  my ($self, $hashref) = @_;
  foreach my $instance (@{$self->{instances}}) {
    my $name = $instance->name;
    croak "No result given for instance '$name'" unless exists $hashref->{$name};
    $instance->set_result( $self->{results}{ $hashref->{$name} } );
  }
}

sub instances { $_[0]->{instances} }

sub purge {
  my $self = shift;
  $self->{purge} = shift if @_;
  return $self->{purge};
}

# Each node contains:
#  { split_on => $attr_name,
#    children => { $attr_value1 => $node1,
#                  $attr_value2 => $node2, ... }
#  }
# or
#  { result => $result }

sub _expand_node {
  my ($self, %args) = @_;
  my $instances = $args{instances};
  print STDERR '.' if $self->{verbose};
  
  $self->{depth} = $self->{curr_depth} if $self->{curr_depth} > $self->{depth};
  local $self->{curr_depth} = $self->{curr_depth} + 1;
  $self->{nodes}++;

  my %results;
  $results{$self->_result($_)}++ foreach @$instances;

lib/AI/DecisionTree.pm  view on Meta::CPAN

       scalar @$instances, " instances into multiple buckets (@{[ keys %split ]})")
    unless keys %split > 1;

  foreach my $value (keys %split) {
    $node{children}{$value} = $self->_expand_node( instances => $split{$value} );
  }
  
  return \%node;
}

sub best_attr {
  my ($self, $instances) = @_;

  # 0 is a perfect score, entropy(#instances) is the worst possible score
  
  my ($best_score, $best_attr) = (@$instances * $self->entropy( map $_->result_int, @$instances ), undef);
  my $all_attr = $self->{attributes};
  foreach my $attr (keys %$all_attr) {

    # %tallies is correlation between each attr value and result
    # %total is number of instances with each attr value

lib/AI/DecisionTree.pm  view on Meta::CPAN

    while (my ($opt, $vals) = each %tallies) {
      $score += $totals{$opt} * $self->entropy2( $vals, $totals{$opt} );
    }

    ($best_attr, $best_score) = ($attr, $score) if $score < $best_score;
  }
  
  return $best_attr;
}

sub entropy2 {
  shift;
  my ($counts, $total) = @_;

  # Entropy is defined with log base 2 - we just divide by log(2) at the end to adjust.
  my $sum = 0;
  $sum += $_ * log($_) foreach values %$counts;
  return +(log($total) - $sum/$total)/log(2);
}

sub entropy {
  shift;

  my %count;
  $count{$_}++ foreach @_;

  # Entropy is defined with log base 2 - we just divide by log(2) at the end to adjust.
  my $sum = 0;
  $sum += $_ * log($_) foreach values %count;
  return +(log(@_) - $sum/@_)/log(2);
}

sub prune_tree {
  my $self = shift;

  # We use a minimum-description-length approach.  We calculate the
  # score of each node:
  #  n = number of nodes below
  #  r = number of results (categories) in the entire tree
  #  i = number of instances in the entire tree
  #  e = number of errors below this node

  # Hypothesis description length (MML):
  #  describe tree: number of nodes + number of edges
  #  describe exceptions: num_exceptions * log2(total_num_instances) * log2(total_num_results)
  
  my $r = keys %{ $self->{results} };
  my $i = $self->{tree}{instances};
  my $exception_cost = log($r) * log($i) / log(2)**2;

  # Pruning can turn a branch into a leaf
  my $maybe_prune = sub {
    my ($self, $node) = @_;
    return unless $node->{children};  # Can't prune leaves

    my $nodes_below = $self->nodes_below($node);
    my $tree_cost = 2 * $nodes_below - 1;  # $edges_below == $nodes_below - 1
    
    my $exceptions = $self->exceptions( $node );
    my $simple_rule_exceptions = $node->{instances} - $node->{distribution}[1];

    my $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost;

lib/AI/DecisionTree.pm  view on Meta::CPAN

    if ($score < 0) {
      delete @{$node}{'children', 'split_on', 'exceptions', 'nodes_below'};
      $node->{result} = $node->{distribution}[0];
      # XXX I'm not cleaning up 'exceptions' or 'nodes_below' keys up the tree
    }
  };

  $self->_traverse($maybe_prune);
}

sub exceptions {
  my ($self, $node) = @_;
  return $node->{exceptions} if exists $node->{exeptions};
  
  my $count = 0;
  if ( exists $node->{result} ) {
    $count = $node->{instances} - $node->{distribution}[1];
  } else {
    foreach my $child ( values %{$node->{children}} ) {
      $count += $self->exceptions($child);
    }
  }
  
  return $node->{exceptions} = $count;
}

sub nodes_below {
  my ($self, $node) = @_;
  return $node->{nodes_below} if exists $node->{nodes_below};

  my $count = 0;
  $self->_traverse( sub {$count++}, $node );

  return $node->{nodes_below} = $count - 1;
}

# This is *not* for external use, I may change it.
sub _traverse {
  my ($self, $callback, $node, $parent, $node_name) = @_;
  $node ||= $self->{tree};
  
  ref($callback) ? $callback->($self, $node, $parent, $node_name) : $self->$callback($node, $parent, $node_name);
  
  return unless $node->{children};
  foreach my $child ( keys %{$node->{children}} ) {
    $self->_traverse($callback, $node->{children}{$child}, $node, $child);
  }
}

sub get_result {
  my ($self, %args) = @_;
  croak "Missing 'attributes' or 'callback' parameter" unless $args{attributes} or $args{callback};

  $self->train unless $self->{tree};
  my $tree = $self->{tree};
  
  while (1) {
    if (exists $tree->{result}) {
      my $r = $tree->{result};
      return $r unless wantarray;

lib/AI/DecisionTree.pm  view on Meta::CPAN

    
    my $instance_val = (exists $args{callback} ? $args{callback}->($tree->{split_on}) :
			exists $args{attributes}{$tree->{split_on}} ? $args{attributes}{$tree->{split_on}} :
			'<undef>');
    ## no critic (ProhibitExplicitReturnUndef)
    $tree = $tree->{children}{ $instance_val }
      or return undef;
  }
}

sub as_graphviz {
  my ($self, %args) = @_;
  my $colors = delete $args{leaf_colors} || {};
  require GraphViz;
  my $g = GraphViz->new(%args);

  my $id = 1;
  my $add_edge = sub {
    my ($self, $node, $parent, $node_name) = @_;
    # We use stringified reference names for node names, as a convenient hack.

    if ($node->{split_on}) {
      $g->add_node( "$node",
		    label => $node->{split_on},
		    shape => 'ellipse',
		  );
    } else {
      my $i = 0;

lib/AI/DecisionTree.pm  view on Meta::CPAN

    }
    $g->add_edge( "$parent" => "$node",
		  label => $node_name,
		) if $parent;
  };

  $self->_traverse( $add_edge );
  return $g;
}

sub rule_tree {
  my $self = shift;
  my ($tree) = @_ ? @_ : $self->{tree};
  
  # build tree:
  # [ question, { results => [ question, { ... } ] } ]
  
  return $tree->{result} if exists $tree->{result};
  
  return [
	  $tree->{split_on}, {
			      map { $_ => $self->rule_tree($tree->{children}{$_}) } keys %{$tree->{children}},
			     }
	 ];
}

sub rule_statements {
  my $self = shift;
  my ($stmt, $tree) = @_ ? @_ : ('', $self->{tree});
  return("$stmt -> '$tree->{result}'") if exists $tree->{result};
  
  my @out;
  my $prefix = $stmt ? "$stmt and" : "if";
  foreach my $val (keys %{$tree->{children}}) {
    push @out, $self->rule_statements("$prefix $tree->{split_on}='$val'", $tree->{children}{$val});
  }
  return @out;
}

### Some instance accessor stuff:

sub _result {
  my ($self, $instance) = @_;
  my $int = $instance->result_int;
  return $self->{results_reverse}[$int];
}

sub _delete_value {
  my ($self, $instance, $attr) = @_;
  my $val = $self->_value($instance, $attr);
  return unless defined $val;
  
  $instance->set_value($self->{attributes}{$attr}, 0);
  return $val;
}

sub _value {
  my ($self, $instance, $attr) = @_;
  return unless exists $self->{attributes}{$attr};
  my $val_int = $instance->value_int($self->{attributes}{$attr});
  return $self->{attribute_values_reverse}{$attr}[$val_int];
}



1;
__END__

t/01-simple.t  view on Meta::CPAN


{
  # Test attribute callbacks
  my %attributes = (
		    outlook => 'rain',
		    temperature => 'mild',
		    humidity => 'high',
		    wind => 'strong',
		   );

  my $result  = $dtree->get_result( callback => sub { $attributes{$_[0]} } );
  ok $result, 'no';
}


#print map "$_\n", $dtree->rule_statements;
#use YAML; print Dump $dtree;

if (eval "use GraphViz; 1") {
  my $graphviz = $dtree->as_graphviz;
  ok $graphviz;



( run in 0.260 second using v1.01-cache-2.11-cpan-a5abf4f5562 )