Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
package Algorithm::DecisionTree;
#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of
# multidimensional data.
# -------------------------------------------------------------------------------------
#use 5.10.0;
use strict;
use warnings;
use Carp;
our $VERSION = '3.43';
############################################ Constructor ##############################################
sub new {
my ($class, %args, $eval_or_boosting_mode);
if (@_ % 2 != 0) {
($class, %args) = @_;
} else {
$class = shift;
$eval_or_boosting_mode = shift;
die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
die "Only one string arg allowed in eval and boosting modes" if @_;
}
unless ($eval_or_boosting_mode) {
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
if check_for_illegal_params2(@params) == 0;
}
bless {
_training_datafile => $args{training_datafile},
_entropy_threshold => $args{entropy_threshold} || 0.01,
_max_depth_desired => exists $args{max_depth_desired} ?
$args{max_depth_desired} : undef,
_debug1 => $args{debug1} || 0,
_debug2 => $args{debug2} || 0,
_debug3 => $args{debug3} || 0,
_csv_class_column_index => $args{csv_class_column_index} || undef,
_csv_columns_for_features => $args{csv_columns_for_features} || undef,
_symbolic_to_numeric_cardinality_threshold
=> $args{symbolic_to_numeric_cardinality_threshold} || 10,
_number_of_histogram_bins => $args{number_of_histogram_bins} || undef,
_csv_cleanup_needed => $args{csv_cleanup_needed} || 0,
_training_data => [],
_root_node => undef,
_probability_cache => {},
_entropy_cache => {},
_training_data_hash => {},
_features_and_values_hash => {},
_samples_class_label_hash => {},
_class_names => [],
_class_priors => [],
_class_priors_hash => {},
_feature_names => [],
_numeric_features_valuerange_hash => {},
_sampling_points_for_numeric_feature_hash => {},
_feature_values_how_many_uniques_hash => {},
_prob_distribution_numeric_features_hash => {},
_histogram_delta_hash => {},
_num_of_histogram_bins_hash => {},
}, $class;
}
#################################### Classify with Decision Tree #######################################
## Classifies one test sample at a time using the decision tree constructed from
## your training file. The data record for the test sample must be supplied as
## shown in the scripts in the `examples' subdirectory. See the scripts
## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
sub classify {
my $self = shift;
my $root_node = shift;
my $feature_and_values = shift;
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
my @features_and_values = @$feature_and_values;
@features_and_values = @{deep_copy_array(\@features_and_values)};
die "\n\nError in the names you have used for features and/or values. " .
"Try using the csv_cleanup_needed option in the constructor call."
unless $self->check_names_used(\@features_and_values);
my @new_features_and_values = ();
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $feature_and_value (@features_and_values) {
$feature_and_value =~ /$pattern/;
my ($feature, $value) = ($1, $2);
my $newvalue = $value;
my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
my $not_all_values_float = 0;
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
sub print_nested_array {
my $arr = shift;
my @arr = @$arr;
print "[";
foreach my $item (@arr) {
if (ref $item) {
print_nested_array($item);
} else {
print "$item";
}
}
print "]";
}
sub cleanup_csv {
my $line = shift;
$line =~ tr/\/:?()[]{}'/ /;
# my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
for (@double_quoted) {
my $item = $_;
$item = substr($item, 1, -1);
$item =~ s/^\s+|,|\s+$//g;
$item = join '_', split /\s+/, $item;
substr($line, index($line, $_), length($_)) = $item;
}
my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
for (@white_spaced) {
my $item = $_;
$item =~ s/\s+/_/g;
$item =~ s/^\s*_|_\s*$//g;
substr($line, index($line, $_), length($_)) = $item;
}
$line =~ s/,\s*(?=,|$)/,NA/g;
return $line;
}
######################################### Class EvalTrainingData ########################################
## This subclass of the DecisionTree class is used to evaluate the quality of your
## training data by running a 10-fold cross-validation test on it. This test divides
## all of the training data into ten parts, with nine parts used for training a
## decision tree and one part used for testing its ability to classify correctly.
## This selection of nine parts for training and one part for testing is carried out
## in all of the ten different possible ways. This testing functionality can also
## be used to find the best values to use for the constructor parameters
## entropy_threshold, max_depth_desired, and
## symbolic_to_numeric_cardinality_threshold.
## Only the CSV training files can be evaluated in this manner (because only CSV
## training are allowed to have numeric features --- which is the more interesting
## case for evaluation analytics.
package EvalTrainingData;
@EvalTrainingData::ISA = ('Algorithm::DecisionTree');
sub new {
my $class = shift;
my $instance = Algorithm::DecisionTree->new(@_);
bless $instance, $class;
}
sub evaluate_training_data {
my $self = shift;
my $evaldebug = 0;
die "The data evaluation function in the module can only be used when your " .
"training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
print "\nWill run a 10-fold cross-validation test on your training data to test its " .
"class-discriminatory power:\n";
my %all_training_data = %{$self->{_training_data_hash}};
my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
my $fold_size = int(0.1 * (scalar keys %all_training_data));
print "fold size: $fold_size\n";
my %confusion_matrix = ();
foreach my $class_name (@{$self->{_class_names}}) {
foreach my $inner_class_name (@{$self->{_class_names}}) {
$confusion_matrix{$class_name}->{$inner_class_name} = 0;
}
}
foreach my $fold_index (0..9) {
print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
@all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
my %testing_data = ();
foreach my $x (@testing_samples) {
$testing_data{$x} = $all_training_data{$x};
}
my %training_data = ();
foreach my $x (@training_samples) {
$training_data{$x} = $all_training_data{$x};
}
my $trainingDT = Algorithm::DecisionTree->new('evalmode');
$trainingDT->{_training_data_hash} = \%training_data;
$trainingDT->{_class_names} = $self->{_class_names};
$trainingDT->{_feature_names} = $self->{_feature_names};
$trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
$trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
$trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
$self->{_symbolic_to_numeric_cardinality_threshold};
foreach my $sample_name (@training_samples) {
$trainingDT->{_samples_class_label_hash}->{$sample_name} =
$self->{_samples_class_label_hash}->{$sample_name};
}
foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
$trainingDT->{_features_and_values_hash}->{$feature} = ();
}
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
Algorithm::DecisionTree::sample_index($b)}
keys %{$trainingDT->{_training_data_hash}}) {
foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
$feature_and_value =~ /$pattern/;
my ($feature,$value) = ($1,$2);
push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
}
}
foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
my %seen = ();
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
if ($row_class_name eq $col_class_name) {
$diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
} else {
$off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
}
}
}
my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
if ($data_quality_index <= 80) {
print "\nYour training data does not possess much class discriminatory " .
"information. It could be that the classes are inherently not well " .
"separable or that your constructor parameter choices are not appropriate.\n";
} elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
print "\nYour training data possesses some class discriminatory information " .
"but it may not be sufficient for real-world applications. You might " .
"try tweaking the constructor parameters to see if that improves the " .
"class discriminations.\n";
} elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
print "\nYour training data appears to possess good class discriminatory " .
"information. Whether or not it is acceptable would depend on your " .
"application.\n";
} elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
print "\nYour training data is of excellent quality.\n";
} else {
print "\nYour training data is perfect.\n";
}
}
############################################# Class DTNode #############################################
# The nodes of the decision tree are instances of this class:
package DTNode;
use strict;
use Carp;
# $feature is the feature test at the current node. $branch_features_and_values is
# an anonymous array holding the feature names and corresponding values on the path
# from the root to the current node:
sub new {
my ($class, $feature, $entropy, $class_probabilities,
$branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
$root_or_not = '' if !defined $root_or_not;
if ($root_or_not eq 'root') {
$dt->{nodes_created} = -1;
$dt->{class_names} = undef;
}
my $self = {
_dt => $dt,
_feature => $feature,
_node_creation_entropy => $entropy,
_class_probabilities => $class_probabilities,
_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
_linked_to => [],
};
bless $self, $class;
$self->{_serial_number} = $self->get_next_serial_num();
return $self;
}
sub how_many_nodes {
my $self = shift;
return $self->{_dt}->{nodes_created} + 1;
}
sub set_class_names {
my $self = shift;
my $class_names_list = shift;
$self->{_dt}->{class_names} = $class_names_list;
}
sub get_class_names {
my $self = shift;
return $self->{_dt}->{class_names};
}
sub get_next_serial_num {
my $self = shift;
$self->{_dt}->{nodes_created} += 1;
return $self->{_dt}->{nodes_created};
}
sub get_serial_num {
my $self = shift;
$self->{_serial_number};
}
# this returns the feature test at the current node
sub get_feature {
my $self = shift;
return $self->{ _feature };
}
sub set_feature {
my $self = shift;
my $feature = shift;
$self->{_feature} = $feature;
}
sub get_node_entropy {
my $self = shift;
return $self->{_node_creation_entropy};
}
sub get_class_probabilities {
my $self = shift;
return $self->{ _class_probabilities};
}
sub get_branch_features_and_values_or_thresholds {
my $self = shift;
return $self->{_branch_features_and_values_or_thresholds};
}
sub add_to_branch_features_and_values {
my $self = shift;
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
sub display_decision_tree {
my $self = shift;
my $offset = shift;
my $serial_num = $self->get_serial_num();
if (@{$self->get_children()} > 0) {
my $feature_at_node = $self->get_feature() || " ";
my $node_creation_entropy_at_node = $self->get_node_entropy();
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
my @class_probabilities = @{$self->get_class_probabilities()};
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
my @class_names = @{$self->get_class_names()};
my @print_class_probabilities_with_class =
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
"$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
$offset .= " ";
foreach my $child (@{$self->get_children()}) {
$child->display_decision_tree($offset);
}
} else {
my $node_creation_entropy_at_node = $self->get_node_entropy();
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
my @class_probabilities = @{$self->get_class_probabilities()};
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
my @class_names = @{$self->get_class_names()};
my @print_class_probabilities_with_class =
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
"Class Probs: @print_class_probabilities_with_class\n\n";
}
}
############################## Generate Your Own Numeric Training Data #################################
############################# Class TrainingDataGeneratorNumeric ################################
## See the script generate_training_data_numeric.pl in the examples
## directory on how to use this class for generating your own numeric training and
## test data. The training and test data are generated in accordance with the
## specifications you place in the parameter file that is supplied as an argument to
## the constructor of this class.
package TrainingDataGeneratorNumeric;
use strict;
use Carp;
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params3(@params) == 0;
bless {
_output_training_csv_file => $args{'output_training_csv_file'}
|| croak("name for output_training_csv_file required"),
_output_test_csv_file => $args{'output_test_csv_file'}
|| croak("name for output_test_csv_file required"),
_parameter_file => $args{'parameter_file'}
|| croak("parameter_file required"),
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
|| croak("number_of_samples_for_training"),
_number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
|| croak("number_of_samples_for_testing"),
_debug => $args{debug} || 0,
_class_names => [],
_class_names_and_priors => {},
_features_with_value_range => {},
_features_ordered => [],
_classes_and_their_param_values => {},
}, $class;
}
sub check_for_illegal_params3 {
my @params = @_;
my @legal_params = qw / output_training_csv_file
output_test_csv_file
parameter_file
number_of_samples_for_training
number_of_samples_for_testing
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
## The training data generated by an instance of the class
## TrainingDataGeneratorNumeric is based on the specs you place in a parameter that
## you supply to the class constructor through a constructor variable called
## `parameter_file'. This method is for parsing the parameter file in order to
## order to determine the names to be used for the different data classes, their
## means, and their variances.
sub read_parameter_file_numeric {
my $self = shift;
my @class_names = ();
my %class_names_and_priors = ();
my %features_with_value_range = ();
my %classes_and_their_param_values = ();
# my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
my @params = <FILE>;
my $params = join "", @params;
my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
$params =~ /$regex/si;
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
if ($self->{_debug}) {
foreach my $record (@training_data_records) {
print "$record";
}
foreach my $record (@test_data_records) {
print "$record";
}
}
open OUTPUT, ">$self->{_output_training_csv_file}";
my @feature_names_training = @{$self->{_features_ordered}};
my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
print OUTPUT "$first_row_training\n";
foreach my $i (0..@training_data_records-1) {
my $i1 = $i+1;
my $sample_record = "\"$i1\",$training_data_records[$i]";
print OUTPUT "$sample_record";
}
close OUTPUT;
open OUTPUT, ">$self->{_output_test_csv_file}";
my @feature_names_testing = keys %{$self->{_features_with_value_range}};
my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
print OUTPUT "$first_row_testing\n";
foreach my $i (0..@test_data_records-1) {
my $i1 = $i+1;
my $sample_record = "\"$i1\",$test_data_records[$i]";
print OUTPUT "$sample_record";
}
close OUTPUT;
}
# from perl docs:
sub fisher_yates_shuffle {
my $arr = shift;
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
########################### Generate Your Own Symbolic Training Data ###############################
########################### Class TrainingDataGeneratorSymbolic #############################
## See the sample script generate_training_and_test_data_symbolic.pl for how to use
## this class for generating purely symbolic training and test data. The data is
## generated according to the specifications you place in a parameter file whose
## name you supply as one of constructor arguments.
package TrainingDataGeneratorSymbolic;
use strict;
use Carp;
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params4(@params) == 0;
bless {
_output_training_datafile => $args{'output_training_datafile'}
|| die("name for output_training_datafile required"),
_parameter_file => $args{'parameter_file'}
|| die("parameter_file required"),
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
|| die("number_of_samples_for_training required"),
_debug => $args{debug} || 0,
_class_names => [],
_class_priors => [],
_features_and_values_hash => {},
_bias_hash => {},
_training_sample_records => {},
}, $class;
}
sub check_for_illegal_params4 {
my @params = @_;
my @legal_params = qw / output_training_datafile
parameter_file
number_of_samples_for_training
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
## Read a parameter file for generating symbolic training data. See the script
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
## pass the name of the parameter file to the constructor of the
## TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
my $self = shift;
my $debug = $self->{_debug};
my $number_of_training_samples = $self->{_number_of_samples_for_training};
my $input_parameter_file = $self->{_parameter_file};
croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
my $output_file_training = $self->{_output_training_datafile};
my $output_file_testing = $self->{_output_test_datafile};
my @all_params;
my $param_string;
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
$param_string = join ' ', @all_params;
my ($class_names, $class_priors, $rest_param) =
$param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
push @{$self->{_class_names}}, @class_names;
my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
$record_string .= ",$item_parts_hash{$kee}";
}
print FILEHANDLE "$record_string\n";
$record_string = '';
}
close FILEHANDLE;
}
sub sample_index {
my $arg = shift;
$arg =~ /_(.+)$/;
return $1;
}
################################# Decision Tree Introspection #######################################
################################# Class DTIntrospection #######################################
package DTIntrospection;
## Instances constructed from this class can provide explanations for the
## classification decisions at the nodes of a decision tree.
##
## When used in the interactive mode, the decision-tree introspection made possible
## by this class provides answers to the following three questions: (1) List of the
## training samples that fall in the portion of the feature space that corresponds
## to a node of the decision tree; (2) The probabilities associated with the last
## feature test that led to the node; and (3) The class probabilities predicated on
## just the last feature test on the path to that node.
##
## CAVEAT: It is possible for a node to exist even when there are no training
## samples in the portion of the feature space that corresponds to the node. That
## is because a decision tree is based on the probability densities estimated from
## the training data. When training data is non-uniformly distributed, it is
## possible for the probability associated with a point in the feature space to be
## non-zero even when there are no training samples at or in the vicinity of that
## point.
##
## For a node to exist even where there are no training samples in the portion of
## the feature space that belongs to the node is an indication of the generalization
## ability of decision-tree based classification.
##
## When used in a non-interactive mode, an instance of this class can be used to
## create a tabular display that shows what training samples belong directly to the
## portion of the feature space that corresponds to each node of the decision tree.
## An instance of this class can also construct a tabular display that shows how the
## influence of each training sample propagates in the decision tree. For each
## training sample, this display first shows the list of nodes that came into
## existence through feature test(s) that used the data provided by that sample.
## This list for each training sample is followed by a subtree of the nodes that owe
## their existence indirectly to the training sample. A training sample influences a
## node indirectly if the node is a descendant of another node that is affected
## directly by the training sample.
use strict;
use Carp;
sub new {
my ($class, $dt) = @_;
croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
unless ref($dt) eq "Algorithm::DecisionTree";
bless {
_dt => $dt,
_root_dtnode => $dt->{_root_node},
_samples_at_nodes_hash => {},
_branch_features_to_nodes_hash => {},
_sample_to_node_mapping_direct_hash => {},
_node_serial_num_to_node_hash => {},
_awareness_raising_msg_shown => 0,
_debug => 0,
}, $class;
}
sub initialize {
my $self = shift;
croak "You must first construct the decision tree before using the DTIntrospection class"
unless $self->{_root_dtnode};
$self->recursive_descent($self->{_root_dtnode});
}
sub recursive_descent {
my $self = shift;
my $node = shift;
my $node_serial_number = $node->get_serial_num();
my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
$self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
$self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
my @samples_at_node = ();
foreach my $item (@$branch_features_and_values_or_thresholds) {
my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
unless (@samples_at_node) {
@samples_at_node = @$samples_for_feature_value_combo;
} else {
my @accum;
foreach my $sample (@samples_at_node) {
push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
}
@samples_at_node = @accum;
}
last unless @samples_at_node;
}
@samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
$self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
if (@samples_at_node) {
foreach my $sample (@samples_at_node) {
if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
$self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
} else {
push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
}
}
}
my $children = $node->get_children();
foreach my $child (@$children) {
$self->recursive_descent($child);
}
}
sub display_training_samples_at_all_nodes_direct_influence_only {
my $self = shift;
( run in 0.590 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )