AI-DecisionTree

 view release on metacpan or  search on metacpan

lib/AI/DecisionTree.pm  view on Meta::CPAN


  # Hypothesis description length (MML):
  #  describe tree: number of nodes + number of edges
  #  describe exceptions: num_exceptions * log2(total_num_instances) * log2(total_num_results)
  
  my $r = keys %{ $self->{results} };
  my $i = $self->{tree}{instances};
  my $exception_cost = log($r) * log($i) / log(2)**2;

  # Pruning can turn a branch into a leaf
  my $maybe_prune = sub {
    my ($self, $node) = @_;
    return unless $node->{children};  # Can't prune leaves

    my $nodes_below = $self->nodes_below($node);
    my $tree_cost = 2 * $nodes_below - 1;  # $edges_below == $nodes_below - 1
    
    my $exceptions = $self->exceptions( $node );
    my $simple_rule_exceptions = $node->{instances} - $node->{distribution}[1];

    my $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost;
    #warn "Score = $score = -$nodes_below - ($exceptions - $simple_rule_exceptions) * $exception_cost\n";
    if ($score < 0) {
      delete @{$node}{'children', 'split_on', 'exceptions', 'nodes_below'};
      $node->{result} = $node->{distribution}[0];
      # XXX I'm not cleaning up 'exceptions' or 'nodes_below' keys up the tree
    }
  };

  $self->_traverse($maybe_prune);
}

sub exceptions {
  my ($self, $node) = @_;
  return $node->{exceptions} if exists $node->{exeptions};
  
  my $count = 0;
  if ( exists $node->{result} ) {
    $count = $node->{instances} - $node->{distribution}[1];
  } else {
    foreach my $child ( values %{$node->{children}} ) {
      $count += $self->exceptions($child);
    }
  }
  
  return $node->{exceptions} = $count;
}

sub nodes_below {
  my ($self, $node) = @_;
  return $node->{nodes_below} if exists $node->{nodes_below};

  my $count = 0;
  $self->_traverse( sub {$count++}, $node );

  return $node->{nodes_below} = $count - 1;
}

# This is *not* for external use, I may change it.
sub _traverse {
  my ($self, $callback, $node, $parent, $node_name) = @_;
  $node ||= $self->{tree};
  
  ref($callback) ? $callback->($self, $node, $parent, $node_name) : $self->$callback($node, $parent, $node_name);
  
  return unless $node->{children};
  foreach my $child ( keys %{$node->{children}} ) {
    $self->_traverse($callback, $node->{children}{$child}, $node, $child);
  }
}

sub get_result {
  my ($self, %args) = @_;
  croak "Missing 'attributes' or 'callback' parameter" unless $args{attributes} or $args{callback};

  $self->train unless $self->{tree};
  my $tree = $self->{tree};
  
  while (1) {
    if (exists $tree->{result}) {
      my $r = $tree->{result};
      return $r unless wantarray;

      my %dist = @{$tree->{distribution}};
      my $confidence = $tree->{distribution}[1] / $tree->{instances};

#      my $confidence = P(H|D) = [P(D|H)P(H)]/[P(D|H)P(H)+P(D|H')P(H')]
#                              = [P(D|H)P(H)]/P(D);
#      my $confidence = 
#      $confidence *= $self->{prior_freqs}{$r} / $self->{total_instances};
      
      return ($r, $confidence, \%dist);
    }
    
    my $instance_val = (exists $args{callback} ? $args{callback}->($tree->{split_on}) :
			exists $args{attributes}{$tree->{split_on}} ? $args{attributes}{$tree->{split_on}} :
			'<undef>');
    ## no critic (ProhibitExplicitReturnUndef)
    $tree = $tree->{children}{ $instance_val }
      or return undef;
  }
}

sub as_graphviz {
  my ($self, %args) = @_;
  my $colors = delete $args{leaf_colors} || {};
  require GraphViz;
  my $g = GraphViz->new(%args);

  my $id = 1;
  my $add_edge = sub {
    my ($self, $node, $parent, $node_name) = @_;
    # We use stringified reference names for node names, as a convenient hack.

    if ($node->{split_on}) {
      $g->add_node( "$node",
		    label => $node->{split_on},
		    shape => 'ellipse',
		  );
    } else {
      my $i = 0;
      my $distr = join ',', grep {$i++ % 2} @{$node->{distribution}};
      my %fill = (exists $colors->{$node->{result}} ?
		  (fillcolor => $colors->{$node->{result}},
		   style => 'filled') :
		  ()
		 );
      $g->add_node( "$node",
		    label => "$node->{result} ($distr)",
		    shape => 'box',
		    %fill,
		  );
    }
    $g->add_edge( "$parent" => "$node",
		  label => $node_name,
		) if $parent;
  };

  $self->_traverse( $add_edge );
  return $g;
}

sub rule_tree {
  my $self = shift;
  my ($tree) = @_ ? @_ : $self->{tree};
  
  # build tree:
  # [ question, { results => [ question, { ... } ] } ]
  
  return $tree->{result} if exists $tree->{result};
  
  return [
	  $tree->{split_on}, {
			      map { $_ => $self->rule_tree($tree->{children}{$_}) } keys %{$tree->{children}},
			     }
	 ];
}

sub rule_statements {
  my $self = shift;
  my ($stmt, $tree) = @_ ? @_ : ('', $self->{tree});
  return("$stmt -> '$tree->{result}'") if exists $tree->{result};
  
  my @out;
  my $prefix = $stmt ? "$stmt and" : "if";
  foreach my $val (keys %{$tree->{children}}) {
    push @out, $self->rule_statements("$prefix $tree->{split_on}='$val'", $tree->{children}{$val});
  }
  return @out;
}

### Some instance accessor stuff:

sub _result {
  my ($self, $instance) = @_;
  my $int = $instance->result_int;
  return $self->{results_reverse}[$int];
}

sub _delete_value {
  my ($self, $instance, $attr) = @_;
  my $val = $self->_value($instance, $attr);
  return unless defined $val;
  
  $instance->set_value($self->{attributes}{$attr}, 0);
  return $val;
}

sub _value {
  my ($self, $instance, $attr) = @_;
  return unless exists $self->{attributes}{$attr};
  my $val_int = $instance->value_int($self->{attributes}{$attr});
  return $self->{attribute_values_reverse}{$attr}[$val_int];
}


 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 1.248 second using v1.00-cache-2.02-grep-82fe00e-cpan-72ae3ad1e6da )