Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

package Algorithm::DecisionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of 
# multidimensional data.
# -------------------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;

our $VERSION = '3.43';

############################################   Constructor  ##############################################
sub new { 
    my ($class, %args, $eval_or_boosting_mode);
    if (@_ % 2 != 0) {
        ($class, %args) = @_;
    } else {
        $class = shift;
        $eval_or_boosting_mode = shift;
        die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
        die "Only one string arg allowed in eval and boosting modes" if @_;
    }
    unless ($eval_or_boosting_mode) {
        my @params = keys %args;
        croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                   =>    $args{training_datafile}, 
        _entropy_threshold                   =>    $args{entropy_threshold} || 0.01,
        _max_depth_desired                   =>    exists $args{max_depth_desired} ? 
                                                                       $args{max_depth_desired} : undef,
        _debug1                              =>    $args{debug1} || 0,
        _debug2                              =>    $args{debug2} || 0,
        _debug3                              =>    $args{debug3} || 0,
        _csv_class_column_index              =>    $args{csv_class_column_index} || undef,
        _csv_columns_for_features            =>    $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold
                                             =>    $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins            =>    $args{number_of_histogram_bins} || undef,
        _csv_cleanup_needed                  =>    $args{csv_cleanup_needed} || 0,
        _training_data                       =>    [],
        _root_node                           =>    undef,
        _probability_cache                   =>    {},
        _entropy_cache                       =>    {},
        _training_data_hash                  =>    {},
        _features_and_values_hash            =>    {},
        _samples_class_label_hash            =>    {},
        _class_names                         =>    [],
        _class_priors                        =>    [],
        _class_priors_hash                   =>    {},
        _feature_names                       =>    [],
        _numeric_features_valuerange_hash    =>    {},
        _sampling_points_for_numeric_feature_hash      =>      {},
        _feature_values_how_many_uniques_hash          =>      {},
        _prob_distribution_numeric_features_hash       =>      {},
        _histogram_delta_hash                          =>      {},
        _num_of_histogram_bins_hash                    =>      {},
    }, $class;

}

####################################  Classify with Decision Tree  #######################################

##  Classifies one test sample at a time using the decision tree constructed from
##  your training file.  The data record for the test sample must be supplied as
##  shown in the scripts in the `examples' subdirectory.  See the scripts
##  construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
sub classify {
    my $self = shift;
    my $root_node = shift;
    my $feature_and_values = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my @features_and_values = @$feature_and_values;
    @features_and_values = @{deep_copy_array(\@features_and_values)};
    die "\n\nError in the names you have used for features and/or values.  " .
        "Try using the csv_cleanup_needed option in the constructor call." 
                        unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        my ($feature, $value) = ($1, $2);
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

sub print_nested_array {
    my $arr = shift;
    my @arr = @$arr;
    print "[";
    foreach my $item (@arr) {
        if (ref $item) {
            print_nested_array($item);
        } else {
            print "$item";
        }
    }
    print "]";
}    

sub cleanup_csv {
    my $line = shift;
    $line =~ tr/\/:?()[]{}'/          /;
#    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
    for (@double_quoted) {
        my $item = $_;
        $item = substr($item, 1, -1);
        $item =~ s/^\s+|,|\s+$//g;
        $item = join '_',  split /\s+/, $item;
        substr($line, index($line, $_), length($_)) = $item;
    }
    my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
    for (@white_spaced) {
        my $item = $_;
        $item =~ s/\s+/_/g;
        $item =~ s/^\s*_|_\s*$//g;
        substr($line, index($line, $_), length($_)) = $item;
    }
    $line =~ s/,\s*(?=,|$)/,NA/g;
    return $line;
}

######################################### Class EvalTrainingData  ########################################

##  This subclass of the DecisionTree class is used to evaluate the quality of your
##  training data by running a 10-fold cross-validation test on it. This test divides
##  all of the training data into ten parts, with nine parts used for training a
##  decision tree and one part used for testing its ability to classify correctly.
##  This selection of nine parts for training and one part for testing is carried out
##  in all of the ten different possible ways.  This testing functionality can also
##  be used to find the best values to use for the constructor parameters
##  entropy_threshold, max_depth_desired, and
##  symbolic_to_numeric_cardinality_threshold.

##  Only the CSV training files can be evaluated in this manner (because only CSV
##  training are allowed to have numeric features --- which is the more interesting
##  case for evaluation analytics.

package EvalTrainingData;

@EvalTrainingData::ISA = ('Algorithm::DecisionTree');

sub new {
    my $class = shift;
    my $instance = Algorithm::DecisionTree->new(@_);
    bless $instance, $class;
}

sub evaluate_training_data {
    my $self = shift;
    my $evaldebug = 0;
    die "The data evaluation function in the module can only be used when your " .
        "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
    print "\nWill run a 10-fold cross-validation test on your training data to test its " .
          "class-discriminatory power:\n";
    my %all_training_data = %{$self->{_training_data_hash}};
    my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=> 
                                     Algorithm::DecisionTree::sample_index($b)}  keys %all_training_data;
    my $fold_size = int(0.1 * (scalar keys %all_training_data));
    print "fold size: $fold_size\n";
    my %confusion_matrix = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        foreach my $inner_class_name (@{$self->{_class_names}}) {
            $confusion_matrix{$class_name}->{$inner_class_name} = 0;
        }
    }
    foreach my $fold_index (0..9) {
        print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n"; 
        my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
        my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],  
                     @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
        my %testing_data = ();
        foreach my $x (@testing_samples) {
            $testing_data{$x} = $all_training_data{$x};
        }
        my %training_data = ();
        foreach my $x (@training_samples) {
            $training_data{$x} = $all_training_data{$x};
        }
        my $trainingDT = Algorithm::DecisionTree->new('evalmode');
        $trainingDT->{_training_data_hash} = \%training_data;
        $trainingDT->{_class_names} = $self->{_class_names};
        $trainingDT->{_feature_names} = $self->{_feature_names};
        $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
        $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
        $trainingDT->{_symbolic_to_numeric_cardinality_threshold} = 
                                                $self->{_symbolic_to_numeric_cardinality_threshold};
        foreach my $sample_name (@training_samples) {
            $trainingDT->{_samples_class_label_hash}->{$sample_name} = 
                                                $self->{_samples_class_label_hash}->{$sample_name};
        }
        foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
            $trainingDT->{_features_and_values_hash}->{$feature} = ();
        }
        my $pattern = '(\S+)\s*=\s*(\S+)';
        foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=> 
                                Algorithm::DecisionTree::sample_index($b)}  
                          keys %{$trainingDT->{_training_data_hash}}) {
            foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
                $feature_and_value =~ /$pattern/;
                my ($feature,$value) = ($1,$2);
                push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
            }
        }
        foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
            my %seen = ();

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
            if ($row_class_name eq $col_class_name) {
                $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
            } else {
                $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
            }
        }
    }
    my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
    print "\nTraining Data Quality Index: $data_quality_index    (out of a possible maximum of 100)\n";
    if ($data_quality_index <= 80) {
        print "\nYour training data does not possess much class discriminatory " .
              "information.  It could be that the classes are inherently not well " .
              "separable or that your constructor parameter choices are not appropriate.\n";
    } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
        print "\nYour training data possesses some class discriminatory information " .
              "but it may not be sufficient for real-world applications.  You might " .
              "try tweaking the constructor parameters to see if that improves the " .
              "class discriminations.\n";
    } elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
        print  "\nYour training data appears to possess good class discriminatory " .
               "information.  Whether or not it is acceptable would depend on your " .
               "application.\n";
    } elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
        print "\nYour training data is of excellent quality.\n";
    } else {
        print "\nYour training data is perfect.\n";
    }

}


#############################################  Class DTNode  #############################################

# The nodes of the decision tree are instances of this class:

package DTNode;

use strict; 
use Carp;

# $feature is the feature test at the current node.  $branch_features_and_values is
# an anonymous array holding the feature names and corresponding values on the path
# from the root to the current node:
sub new {                                                           
    my ($class, $feature, $entropy, $class_probabilities, 
                                       $branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_; 
    $root_or_not = '' if !defined $root_or_not;
    if ($root_or_not eq 'root') {
        $dt->{nodes_created} = -1;
        $dt->{class_names} = undef;
    }
    my $self = {                                                         
            _dt                      => $dt,
            _feature                 => $feature,                                       
            _node_creation_entropy   => $entropy,
            _class_probabilities     => $class_probabilities,
            _branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
            _linked_to => [],                                          
    };
    bless $self, $class;
    $self->{_serial_number} =  $self->get_next_serial_num();
    return $self;
}

sub how_many_nodes {
    my $self = shift;
    return $self->{_dt}->{nodes_created} + 1;
}

sub set_class_names {
    my $self = shift;
    my $class_names_list = shift;
    $self->{_dt}->{class_names} = $class_names_list;
}

sub get_class_names {
    my $self = shift;
    return $self->{_dt}->{class_names};
}

sub get_next_serial_num {
    my $self = shift;
    $self->{_dt}->{nodes_created} += 1;
    return $self->{_dt}->{nodes_created};
}

sub get_serial_num {
    my $self = shift;
    $self->{_serial_number};
}

# this returns the feature test at the current node
sub get_feature {                                  
    my $self = shift;                              
    return $self->{ _feature };                    
}

sub set_feature {
    my $self = shift;
    my $feature = shift;
    $self->{_feature} = $feature;
}

sub get_node_entropy {
    my $self = shift;                              
    return $self->{_node_creation_entropy};
}

sub get_class_probabilities {                                  
    my $self = shift;                              
    return $self->{ _class_probabilities};                    
}

sub get_branch_features_and_values_or_thresholds {
    my $self = shift; 
    return $self->{_branch_features_and_values_or_thresholds};
}

sub add_to_branch_features_and_values {
    my $self = shift;                   

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN


sub display_decision_tree {
    my $self = shift;
    my $offset = shift;
    my $serial_num = $self->get_serial_num();
    if (@{$self->get_children()} > 0) {
        my $feature_at_node = $self->get_feature() || " ";
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Decision Feature: $feature_at_node    Node Creation Entropy: " ,
              "$print_node_creation_entropy_at_node   Class Probs: @print_class_probabilities_with_class\n\n";
        $offset .= "   ";
        foreach my $child (@{$self->get_children()}) {
            $child->display_decision_tree($offset);
        }
    } else {
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node   " .
              "Class Probs: @print_class_probabilities_with_class\n\n";
    }
}


##############################  Generate Your Own Numeric Training Data  #################################
#############################      Class TrainingDataGeneratorNumeric     ################################

##  See the script generate_training_data_numeric.pl in the examples
##  directory on how to use this class for generating your own numeric training and
##  test data.  The training and test data are generated in accordance with the
##  specifications you place in the parameter file that is supplied as an argument to
##  the constructor of this class.

package TrainingDataGeneratorNumeric;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params3(@params) == 0;   
    bless {
        _output_training_csv_file          =>   $args{'output_training_csv_file'} 
                                                   || croak("name for output_training_csv_file required"),
        _output_test_csv_file              =>   $args{'output_test_csv_file'} 
                                                   || croak("name for output_test_csv_file required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                         || croak("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                         || croak("number_of_samples_for_training"),
        _number_of_samples_for_testing     =>   $args{'number_of_samples_for_testing'} 
                                                         || croak("number_of_samples_for_testing"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_names_and_priors            =>    {},
        _features_with_value_range         =>    {},
        _features_ordered                  =>    [],
        _classes_and_their_param_values    =>    {},
    }, $class;
}

sub check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_training_csv_file
                            output_test_csv_file
                            parameter_file
                            number_of_samples_for_training
                            number_of_samples_for_testing
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  The training data generated by an instance of the class
##  TrainingDataGeneratorNumeric is based on the specs you place in a parameter that
##  you supply to the class constructor through a constructor variable called
##  `parameter_file'.  This method is for parsing the parameter file in order to
##  order to determine the names to be used for the different data classes, their
##  means, and their variances.
sub read_parameter_file_numeric {
    my $self = shift;
    my @class_names = ();
    my %class_names_and_priors = ();
    my %features_with_value_range = ();
    my %classes_and_their_param_values = ();
#   my $regex8 =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
    my @params = <FILE>;
    my $params = join "", @params;
    my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
    $params =~ /$regex/si;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    if ($self->{_debug}) {
        foreach my $record (@training_data_records) {
            print "$record";
        }
        foreach my $record (@test_data_records) {
            print "$record";
        }
    }
    open OUTPUT, ">$self->{_output_training_csv_file}";
    my @feature_names_training = @{$self->{_features_ordered}};
    my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
    my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
    print OUTPUT "$first_row_training\n";
    foreach my $i (0..@training_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$training_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
    open OUTPUT, ">$self->{_output_test_csv_file}";
    my @feature_names_testing = keys %{$self->{_features_with_value_range}};
    my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
    my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
    print OUTPUT "$first_row_testing\n";
    foreach my $i (0..@test_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$test_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
}

# from perl docs:                                                                         
sub fisher_yates_shuffle {
    my $arr =  shift;
    my $i = @$arr;
    while (--$i) {
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}

###########################  Generate Your Own Symbolic Training Data  ###############################
###########################     Class TrainingDataGeneratorSymbolic      #############################

##  See the sample script generate_training_and_test_data_symbolic.pl for how to use
##  this class for generating purely symbolic training and test data.  The data is
##  generated according to the specifications you place in a parameter file whose
##  name you supply as one of constructor arguments.
package TrainingDataGeneratorSymbolic;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params4(@params) == 0;   
    bless {
        _output_training_datafile          =>   $args{'output_training_datafile'} 
                                                   || die("name for output_training_datafile required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                   || die("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                   || die("number_of_samples_for_training required"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_priors                      =>    [],
        _features_and_values_hash          =>    {},
        _bias_hash                         =>    {},
        _training_sample_records           =>    {},
    }, $class;
}

sub check_for_illegal_params4 {
    my @params = @_;
    my @legal_params = qw / output_training_datafile
                            parameter_file
                            number_of_samples_for_training
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  Read a parameter file for generating symbolic training data. See the script
##  generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
##  pass the name of the parameter file to the constructor of the
##  TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
    my $self = shift;
    my $debug = $self->{_debug};
    my $number_of_training_samples = $self->{_number_of_samples_for_training};
    my $input_parameter_file = $self->{_parameter_file};
    croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
    my $output_file_training = $self->{_output_training_datafile};
    my $output_file_testing = $self->{_output_test_datafile};
    my @all_params;
    my $param_string;
    open INPUT, $input_parameter_file || "unable to open parameter file: $!";
    @all_params = <INPUT>;
    @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
    @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
    $param_string = join ' ', @all_params;
    my ($class_names, $class_priors, $rest_param) = 
              $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
    my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
    push @{$self->{_class_names}}, @class_names;
    my @class_priors =   grep {defined($_) && length($_) > 0} split /\s+/, $2;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            $record_string .= ",$item_parts_hash{$kee}";
        }
        print FILEHANDLE "$record_string\n";
        $record_string = '';
    }
    close FILEHANDLE;
}    

sub sample_index {
    my $arg = shift;
    $arg =~ /_(.+)$/;
    return $1;
}    

#################################   Decision Tree Introspection   #######################################
#################################      Class DTIntrospection      #######################################

package DTIntrospection;

##  Instances constructed from this class can provide explanations for the
##  classification decisions at the nodes of a decision tree.  
##  
##  When used in the interactive mode, the decision-tree introspection made possible
##  by this class provides answers to the following three questions: (1) List of the
##  training samples that fall in the portion of the feature space that corresponds
##  to a node of the decision tree; (2) The probabilities associated with the last
##  feature test that led to the node; and (3) The class probabilities predicated on
##  just the last feature test on the path to that node.
##  
##  CAVEAT: It is possible for a node to exist even when there are no training
##  samples in the portion of the feature space that corresponds to the node.  That
##  is because a decision tree is based on the probability densities estimated from
##  the training data. When training data is non-uniformly distributed, it is
##  possible for the probability associated with a point in the feature space to be
##  non-zero even when there are no training samples at or in the vicinity of that
##  point.
##  
##  For a node to exist even where there are no training samples in the portion of
##  the feature space that belongs to the node is an indication of the generalization
##  ability of decision-tree based classification.
##  
##  When used in a non-interactive mode, an instance of this class can be used to
##  create a tabular display that shows what training samples belong directly to the
##  portion of the feature space that corresponds to each node of the decision tree.
##  An instance of this class can also construct a tabular display that shows how the
##  influence of each training sample propagates in the decision tree.  For each
##  training sample, this display first shows the list of nodes that came into
##  existence through feature test(s) that used the data provided by that sample.
##  This list for each training sample is followed by a subtree of the nodes that owe
##  their existence indirectly to the training sample. A training sample influences a
##  node indirectly if the node is a descendant of another node that is affected
##  directly by the training sample.

use strict; 
use Carp;

sub new {                                                           
    my ($class, $dt) = @_; 
    croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
        unless ref($dt) eq "Algorithm::DecisionTree";
    bless {                                                         
        _dt                                 => $dt,
        _root_dtnode                        => $dt->{_root_node},
        _samples_at_nodes_hash              => {},
        _branch_features_to_nodes_hash      => {},
        _sample_to_node_mapping_direct_hash => {},
        _node_serial_num_to_node_hash       => {}, 
        _awareness_raising_msg_shown        => 0,
        _debug                              => 0,
    }, $class;                                                     
}

sub initialize {
    my $self = shift;
    croak "You must first construct the decision tree before using the DTIntrospection class"
        unless $self->{_root_dtnode};
    $self->recursive_descent($self->{_root_dtnode});
}

sub recursive_descent {
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    print "\nAt node $node_serial_number:  the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
    $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
    $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
    my @samples_at_node = ();
    foreach my $item (@$branch_features_and_values_or_thresholds) {
        my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
        unless (@samples_at_node) {
            @samples_at_node =  @$samples_for_feature_value_combo;
        } else {
            my @accum;
            foreach my $sample (@samples_at_node) {
                push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);  
            }
            @samples_at_node =  @accum;
        }
        last unless @samples_at_node;
    }
    @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node; 
    print "Node: $node_serial_number    the samples are: [@samples_at_node]\n"  if ($self->{_debug});
    $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
    if (@samples_at_node) {
        foreach my $sample (@samples_at_node) {
            if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
                $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number]; 
            } else {
                push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
            }
        }
    }
    my $children = $node->get_children();
    foreach my $child (@$children) {
        $self->recursive_descent($child);
    }
}

sub display_training_samples_at_all_nodes_direct_influence_only {
    my $self = shift;



( run in 0.590 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )