Algorithm-DecisionTree

Examples/classify_test_data_in_a_file.pl

use strict;
use warnings;
use Algorithm::DecisionTree;

die "This script must be called with exactly three command-line arguments:\n" .
    "     1st arg: name of the training datafile\n" .
    "     2nd arg: name of the test data file\n" .     
    "     3rd arg: the name of the output file to which class labels will be written\n" 
    unless @ARGV == 3;

my $debug = 0;

### When the following variable is set to 1, only the most probable class for each
### data record is written out to the output file.  This works only for the case
### when the output is sent to a `.txt' file.  If the output is sent to a `.csv' 
### file, you'll see all the class names and their probabilities for each data sample
### in your test datafile.
my $show_hard_classifications = 1;
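
### For illustration, a hard classification amounts to sorting the class labels
### by their estimated probabilities and keeping just the top one.  A minimal
### sketch, assuming a %classification hash of the sort returned by classify()
### (with its 'solution_path' entry already deleted):
###
###     my ($most_probable_class) =
###         sort {$classification{$b} <=> $classification{$a}} keys %classification;
###     print "$most_probable_class\n";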

my ($training_datafile, $test_datafile, $outputfile) = @ARGV;

Examples/classify_test_data_in_a_file.pl

            next if $key =~ /^\"\"$/;
            my $feature_val = $data_hash{$key}[$i-1];
            $feature_val =~ s/^\s*\"|\"\s*$//g;
            $feature_val = sprintf("%.1f",$feature_val) if $feature_val =~ /^\d+$/;
            push @feature_values, $feature_val;
        }
        $features_and_values_hash{$feature} = \@feature_values;
    }
    my %seen = ();
    @all_class_names = grep {!$seen{$_}++} values %class_for_sample_hash;
    print "\n All class names: @all_class_names\n" if $debug;
    %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    %features_and_unique_values_hash = ();
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        my @unique_values_for_feature = sort grep {$_ ne 'NA' && !$seen1{$_}++}
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        $not_all_values_float = 1 if grep {!/^\d*\.\d+$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($debug) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";
        }
        print "\nclass label for each data sample:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)}  keys %class_for_sample_hash) {
            print "$sample => $class_for_sample_hash{$sample}\n";
        }
        print "\nFeatures used: @feature_names\n\n";

ExamplesBagging/bagging_for_bulk_classification.pl

use strict;
use warnings;
use Algorithm::DecisionTreeWithBagging;

die "This script must be called with exactly three command-line arguments:\n" .
    "     1st arg: name of the training datafile\n" .
    "     2nd arg: name of the test data file\n" .     
    "     3rd arg: the name of the output file to which class labels will be written\n" 
    unless @ARGV == 3;

my $debug = 0;

my ($training_datafile, $test_datafile, $outputfile) = @ARGV;

my $training_file_class_name_in_column       = 1;
my $training_file_columns_for_feature_values = [2,3];
my $how_many_bags                            = 4;
my $bag_overlap_fraction                     = 0.2;

my (@all_class_names, @feature_names, %class_for_sample_hash, %feature_values_for_samples_hash,
    %features_and_values_hash, %features_and_unique_values_hash,

ExamplesBagging/bagging_for_bulk_classification.pl

            next if $key =~ /^\"\"$/;
            my $feature_val = $data_hash{$key}[$i-1];
            $feature_val =~ s/^\s*\"|\"\s*$//g;
            $feature_val = sprintf("%.1f",$feature_val) if $feature_val =~ /^\d+$/;
            push @feature_values, $feature_val;
        }
        $features_and_values_hash{$feature} = \@feature_values;
    }
    my %seen = ();
    @all_class_names = grep {!$seen{$_}++} values %class_for_sample_hash;
    print "\n All class names: @all_class_names\n" if $debug;
    %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    %features_and_unique_values_hash = ();
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        my @unique_values_for_feature = sort grep {$_ ne 'NA' && !$seen1{$_}++}
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        $not_all_values_float = 1 if grep {!/^\d*\.\d+$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($debug) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";
        }
        print "\nclass label for each data sample:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)}  keys %class_for_sample_hash) {
            print "$sample => $class_for_sample_hash{$sample}\n";
        }
        print "\nFeatures used: @feature_names\n\n";

ExamplesBoosting/boosting_for_bulk_classification.pl

use strict;
use warnings;
use Algorithm::BoostedDecisionTree;

die "This script must be called with exactly three command-line arguments:\n" .
    "     1st arg: name of the training datafile\n" .
    "     2nd arg: name of the test data file\n" .     
    "     3rd arg: the name of the output file to which class labels will be written\n" 
    unless @ARGV == 3;

my $debug = 0;

my ($training_datafile, $test_datafile, $outputfile) = @ARGV;

my $training_file_class_name_in_column       = 1;
my $training_file_columns_for_feature_values = [2,3];
my $how_many_stages                          = 4;

my (@all_class_names, @feature_names, %class_for_sample_hash, %feature_values_for_samples_hash,
    %features_and_values_hash, %features_and_unique_values_hash,
    %numeric_features_valuerange_hash, %feature_values_how_many_uniques_hash);

ExamplesBoosting/boosting_for_bulk_classification.pl

            next if $key =~ /^\"\"$/;
            my $feature_val = $data_hash{$key}[$i-1];
            $feature_val =~ s/^\s*\"|\"\s*$//g;
            $feature_val = sprintf("%.1f",$feature_val) if $feature_val =~ /^\d+$/;
            push @feature_values, $feature_val;
        }
        $features_and_values_hash{$feature} = \@feature_values;
    }
    my %seen = ();
    @all_class_names = grep {!$seen{$_}++} values %class_for_sample_hash;
    print "\n All class names: @all_class_names\n" if $debug;
    %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    %features_and_unique_values_hash = ();
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        my @unique_values_for_feature = sort grep {$_ ne 'NA' && !$seen1{$_}++}
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        $not_all_values_float = 1 if grep {!/^\d*\.\d+$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($debug) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";
        }
        print "\nclass label for each data sample:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)}  keys %class_for_sample_hash) {
            print "$sample => $class_for_sample_hash{$sample}\n";
        }
        print "\nFeatures used: @feature_names\n\n";

lib/Algorithm/BoostedDecisionTree.pm

sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if check_for_illegal_params(@params) == 0;
    my %dtargs = %args;
    delete $dtargs{how_many_stages};
    my $instance = Algorithm::DecisionTree->new(%dtargs);
    bless $instance, $class;
    $instance->{_how_many_stages}              =  $args{how_many_stages} || undef;
    $instance->{_stagedebug}                   =  $args{stagedebug} || 0;
    $instance->{_training_samples}             =  {map {$_ => []} 0..$args{how_many_stages}};
    $instance->{_all_trees}                    =  {map {$_ => Algorithm::DecisionTree->new(%dtargs)} 0..$args{how_many_stages}};
    $instance->{_root_nodes}                   =  {map {$_ => undef} 0..$args{how_many_stages}};
    $instance->{_sample_selection_probs}       =  {map {$_ => {}} 0..$args{how_many_stages}};
    $instance->{_trust_factors}                =  {map {$_ => undef} 0..$args{how_many_stages}};
    $instance->{_misclassified_samples}        =  {map {$_ => []} 0..$args{how_many_stages}};
    $instance->{_classifications}              =  undef;
    $instance->{_trust_weighted_decision_classes}  =  undef;
    return $instance;
}
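
##  A minimal usage sketch for this constructor, restricted to the keyword
##  arguments accepted by check_for_illegal_params().  The filename is a
##  placeholder, and the data-ingestion and base-tree construction calls that
##  must precede construct_cascade_of_trees() are elided:
##
##      my $boosted = Algorithm::BoostedDecisionTree->new(
##                            training_datafile        => 'training.csv',
##                            csv_class_column_index   => 1,
##                            csv_columns_for_features => [2,3],
##                            how_many_stages          => 4,
##                    );
##      ...    # ingest the training data and construct the base tree
##      $boosted->construct_cascade_of_trees();
##      $boosted->display_base_decision_tree();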

lib/Algorithm/BoostedDecisionTree.pm

        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        $all_record_ids_with_class_labels{$record_label} = $classname;
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $|--;
    $self->{_how_many_total_training_samples} = $record_index - 1;  # must subtract 1 for the header record
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    my @all_feature_names =   split /,/, substr($firstline, index($firstline,','));
    my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
    my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
    my %class_for_sample_hash = map {"sample_" . $_  =>  "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};     
    my @all_class_names =  sort keys %{ {map {$_ => 1} values %class_for_sample_hash } };
    $self->{_number_of_training_samples} = scalar @sample_names;
    if ($self->{_debug2}) {
        print "\nDisplaying features and their values for entire training data:\n\n";
        foreach my $fname (keys  %features_and_values_hash) {         
            print "        $fname    =>  @{$features_and_values_hash{$fname}}\n";
        }
    }
    my %features_and_unique_values_hash = ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %numeric_features_valuerange_hash   =   ();
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $feature (keys %features_and_values_hash) {

lib/Algorithm/BoostedDecisionTree.pm

    $self->{_all_trees}->{0}->{_class_names} = \@all_class_names;
    $self->{_all_trees}->{0}->{_feature_names} = \@feature_names;
    $self->{_all_trees}->{0}->{_samples_class_label_hash} = \%class_for_sample_hash;
    $self->{_all_trees}->{0}->{_training_data_hash}  =  \%feature_values_for_samples_hash;
    $self->{_all_trees}->{0}->{_features_and_values_hash}    =  \%features_and_values_hash;
    $self->{_all_trees}->{0}->{_features_and_unique_values_hash}    =  \%features_and_unique_values_hash;
    $self->{_all_trees}->{0}->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
    $self->{_all_trees}->{0}->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
    $self->{_all_training_data} = \%feature_values_for_samples_hash;    
    $self->{_all_sample_names} = [sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash];
    if ($self->{_debug1}) {
        print "\n\n===========================  data ingested for the base tree   ==================================\n\n";
        print "\nAll class names: @{$self->{_all_trees}->{0}->{_class_names}}\n";
        print "\nEach sample data record:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{0}->{_training_data_hash}}) {
            print "$kee    =>   @{$self->{_all_trees}->{0}->{_training_data_hash}->{$kee}}\n";
        }
        print "\nclass label for each data sample:\n";        
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{0}->{_samples_class_label_hash}}) {
            print "$kee    =>   $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$kee}\n";            
        }

lib/Algorithm/BoostedDecisionTree.pm


sub display_base_decision_tree {
    my $self = shift;
    $self->{_root_nodes}->{0}->display_decision_tree("     ");
}

sub construct_cascade_of_trees {
    my $self = shift;
    $self->{_training_samples}->{0} = $self->{_all_sample_names};
    $self->{_misclassified_samples}->{0} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{0}, $self->{_root_nodes}->{0});
    if ($self->{_stagedebug}) {
        $self->show_class_labels_for_misclassified_samples_in_stage(0);
        print "\n\nSamples misclassified by base classifier: @{$self->{_misclassified_samples}->{0}}\n";
        my $how_many = @{$self->{_misclassified_samples}->{0}};
        print "\nNumber of misclassified samples: $how_many\n";
    }
    my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{0}->{$_}} @{$self->{_misclassified_samples}->{0}};
    print "\nMisclassification_error_rate for base classifier: $misclassification_error_rate\n" if $self->{_stagedebug};
    $self->{_trust_factors}->{0} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
    print "\nBase class trust factor: $self->{_trust_factors}->{0}\n"  if $self->{_stagedebug};
    foreach my $stage_index (1 .. $self->{_how_many_stages} - 1) {
        print "\n\n========================== Constructing stage indexed $stage_index =========================\n"
              if $self->{_stagedebug};
        $self->{_sample_selection_probs}->{$stage_index} =  { map {$_ =>  $self->{_sample_selection_probs}->{$stage_index-1}->{$_} *   exp(-1.0 * $self->{_trust_factors}->{$stage_index - 1} *  (contained_in($_, @{$self->{_misclassified_samples}->{$st...
        my $normalizer = reduce {$a + $b} values %{$self->{_sample_selection_probs}->{$stage_index}};
        print "\nThe normalizer is: $normalizer\n"  if $self->{_stagedebug};
        map {$self->{_sample_selection_probs}->{$stage_index}->{$_}  /= $normalizer} keys %{$self->{_sample_selection_probs}->{$stage_index}};
        my @training_samples_this_stage = ();
        my $sum_of_probs = 0.0;
        foreach my $sample (sort {$self->{_sample_selection_probs}->{$stage_index}->{$b} <=> $self->{_sample_selection_probs}->{$stage_index}->{$a}} keys %{$self->{_sample_selection_probs}->{$stage_index}}) {
            $sum_of_probs += $self->{_sample_selection_probs}->{$stage_index}->{$sample};
            push @training_samples_this_stage, $sample if $sum_of_probs < 0.5;
            last if $sum_of_probs > 0.5;
        }
        $self->{_training_samples}->{$stage_index} = [sort {sample_index($a) <=> sample_index($b)} @training_samples_this_stage];
        if ($self->{_stagedebug}) {
            print "\nTraining samples for stage $stage_index: @{$self->{_training_samples}->{$stage_index}}\n\n";
            my $num_of_training_samples = @{$self->{_training_samples}->{$stage_index}};
            print "\nNumber of training samples this stage $num_of_training_samples\n\n";
        }
        # find intersection of two sets:
        my %misclassified_samples = map {$_ => 1} @{$self->{_misclassified_samples}->{$stage_index-1}};
        my @training_samples_selection_check = grep $misclassified_samples{$_}, @{$self->{_training_samples}->{$stage_index}};
        if ($self->{_stagedebug}) {
            my @training_in_misclassified = sort {sample_index($a) <=> sample_index($b)} @training_samples_selection_check;
            print "\nTraining samples in the misclassified set: @training_in_misclassified\n";
            my $how_many = @training_samples_selection_check;
            print "\nNumber_of_misclassified_samples_in_training_set: $how_many\n";
        }
        my $dt_this_stage = Algorithm::DecisionTree->new('boostingmode');
        $dt_this_stage->{_training_data_hash} = { map {$_ => $self->{_all_training_data}->{$_} } @{$self->{_training_samples}->{$stage_index}} };

        $dt_this_stage->{_class_names} = $self->{_all_trees}->{0}->{_class_names};
        $dt_this_stage->{_feature_names} = $self->{_all_trees}->{0}->{_feature_names};

lib/Algorithm/BoostedDecisionTree.pm

        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$dt_this_stage->{_training_data_hash}}) { 
            foreach my $feature_and_value (@{$dt_this_stage->{_training_data_hash}->{$sample}}) {
                $feature_and_value =~ /$pattern/;
                my ($feature, $value) = ($1, $2);
                push @{$dt_this_stage->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
            }
        }
        $dt_this_stage->{_features_and_unique_values_hash} = {map {my $feature = $_; $feature => [sort keys %{{map {$_ => 1} @{$dt_this_stage->{_features_and_values_hash}->{$feature}}}}]} keys %{$dt_this_stage->{_features_and_values_hash}}};
        $dt_this_stage->{_numeric_features_valuerange_hash} = {map {$_ => []} keys %{$self->{_all_trees}->{0}->{_numeric_features_valuerange_hash}}};
        $dt_this_stage->{_numeric_features_valuerange_hash} = {map {my $feature = $_; $feature =>  [min(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}}), max(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}})]} keys ...
        if ($self->{_stagedebug}) {
            print "\n\nPrinting features and their values in the training set:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_features_and_values_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_features_and_values_hash}->{$kee}}\n";
            }
            print "\n\nPrinting unique values for features:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_features_and_unique_values_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_features_and_unique_values_hash}->{$kee}}\n";            
            }
            print "\n\nPrinting unique value ranges for features:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_numeric_features_valuerange_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_numeric_features_valuerange_hash}->{$kee}}\n";            
            }
        }
        $dt_this_stage->{_feature_values_how_many_uniques_hash} = {map {$_ => scalar @{$dt_this_stage->{_features_and_unique_values_hash}->{$_}}} keys %{$self->{_all_trees}->{0}->{_features_and_unique_values_hash}}};
        $dt_this_stage->calculate_first_order_probabilities();
        $dt_this_stage->calculate_class_priors();
        print "\n\n>>>>>>>Done with the initialization of the tree for stage $stage_index<<<<<<<<<<\n" if $self->{_stagedebug};
        my $root_node_this_stage = $dt_this_stage->construct_decision_tree_classifier();
        $root_node_this_stage->display_decision_tree("     ") if $self->{_stagedebug};

        $self->{_all_trees}->{$stage_index} = $dt_this_stage;
        $self->{_root_nodes}->{$stage_index} = $root_node_this_stage;
        $self->{_misclassified_samples}->{$stage_index} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{$stage_index}, $self->{_root_nodes}->{$stage_index});
        if ($self->{_stagedebug}) {
            print "\nSamples misclassified by stage $stage_index classifier: @{$self->{_misclassified_samples}->{$stage_index}}\n";
            printf("\nNumber of misclassified samples: %d\n", scalar @{$self->{_misclassified_samples}->{$stage_index}});
            $self->show_class_labels_for_misclassified_samples_in_stage($stage_index);
        }
        my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{$stage_index}->{$_}} @{$self->{_misclassified_samples}->{$stage_index}};
        print "\nStage $stage_index misclassification_error_rate: $misclassification_error_rate\n" if $self->{_stagedebug};

        $self->{_trust_factors}->{$stage_index} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
        print "\nStage $stage_index trust factor: $self->{_trust_factors}->{$stage_index}\n"  if $self->{_stagedebug};
    }
}
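
##  The trust factor assigned to each stage above is the usual AdaBoost stage
##  weight.  A worked sketch of the arithmetic, with a made-up error rate:
##
##      my $error_rate   = 0.2;           # weighted misclassification rate
##      my $trust_factor = 0.5 * log((1 - $error_rate) / $error_rate);  # ~0.693
##
##  Each sample's selection probability is then scaled by exp(-$trust_factor) if
##  the sample was classified correctly and by exp(+$trust_factor) if it was
##  misclassified, after which the probabilities are renormalized to sum to 1.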

sub evaluate_one_stage_of_cascade {
    my $self = shift;
    my $trainingDT = shift;
    my $root_node = shift;
    my @misclassified_samples = ();
    foreach my $test_sample_name (@{$self->{_all_sample_names}}) {
        my @test_sample_data = @{$self->{_all_trees}->{0}->{_training_data_hash}->{$test_sample_name}};
        print "original data in $test_sample_name:@test_sample_data\n" if $self->{_stagedebug};
        @test_sample_data = grep {$_ !~ /=NA$/} @test_sample_data;
        print "$test_sample_name: @test_sample_data\n" if $self->{_stagedebug}; 
        my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
        my @solution_path = @{$classification{'solution_path'}};                                  
        delete $classification{'solution_path'};                                              
        my @which_classes = keys %classification;
        @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
        my $most_likely_class_label = $which_classes[0];
        if ($self->{_stagedebug}) {
            print "\nClassification:\n\n";
            print "     class                         probability\n";
            print "     ----------                    -----------\n";
            foreach my $which_class (@which_classes) {
                my $classstring = sprintf("%-30s", $which_class);
                my $valuestring = sprintf("%-30s", $classification{$which_class});
                print "     $classstring $valuestring\n";
            }
            print "\nSolution path in the decision tree: @solution_path\n";
            print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
        }
        my $true_class_label_for_test_sample = $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$test_sample_name};
        printf("%s:   true_class: %s    estimated_class: %s\n", $test_sample_name, $true_class_label_for_test_sample, $most_likely_class_label) if $self->{_stagedebug};
        push @misclassified_samples, $test_sample_name if $true_class_label_for_test_sample ne $most_likely_class_label;
    }
    return [sort {sample_index($a) <=> sample_index($b)} @misclassified_samples];
}

sub show_class_labels_for_misclassified_samples_in_stage {
    my $self = shift;
    my $stage_index = shift;
    die "\nYou must first call 'construct_cascade_of_trees()' before invoking 'show_class_labels_for_misclassified_samples_in_stage()'" unless @{$self->{_misclassified_samples}->{0}} > 0;
    my @classes_for_misclassified_samples = ();

lib/Algorithm/BoostedDecisionTree.pm

    my @params = @_;
    my @legal_params = qw / how_many_stages
                            training_datafile
                            entropy_threshold
                            max_depth_desired
                            csv_class_column_index
                            csv_columns_for_features
                            symbolic_to_numeric_cardinality_threshold
                            number_of_histogram_bins
                            csv_cleanup_needed
                            debug1
                            debug2
                            debug3
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }

lib/Algorithm/DecisionTree.pm

    unless ($eval_or_boosting_mode) {
        my @params = keys %args;
        croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                   =>    $args{training_datafile}, 
        _entropy_threshold                   =>    $args{entropy_threshold} || 0.01,
        _max_depth_desired                   =>    exists $args{max_depth_desired} ? 
                                                                       $args{max_depth_desired} : undef,
        _debug1                              =>    $args{debug1} || 0,
        _debug2                              =>    $args{debug2} || 0,
        _debug3                              =>    $args{debug3} || 0,
        _csv_class_column_index              =>    $args{csv_class_column_index} || undef,
        _csv_columns_for_features            =>    $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold
                                             =>    $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins            =>    $args{number_of_histogram_bins} || undef,
        _csv_cleanup_needed                  =>    $args{csv_cleanup_needed} || 0,
        _training_data                       =>    [],
        _root_node                           =>    undef,
        _probability_cache                   =>    {},
        _entropy_cache                       =>    {},

lib/Algorithm/DecisionTree.pm

        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        $not_all_values_float = 1 if grep {!/^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
                                                                       $not_all_values_float == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %classification = %{$self->recursive_descent_for_classification($root_node, 
                                                                    \@features_and_values,\%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name  with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {

lib/Algorithm/DecisionTree.pm

    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n" 
        if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        $value_for_feature = $2 if $feature_tested_at_node eq $1;
    }
    # The following clause introduced in Version 3.20
    if (!defined $value_for_feature) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    if (defined $value_for_feature) {
        if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
            print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
            my $pattern1 = '(.+)<(.+)';
            my $pattern2 = '(.+)>(.+)';
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1]; 
                if ($last_feature_and_value_on_branch =~ /$pattern1/) {
                    my ($feature, $threshold) = ($1,$2); 
                    if ($value_for_feature <= $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,

lib/Algorithm/DecisionTree.pm

                                                                            $features_and_values,\%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
            }
            return \%answer if $path_found;
        } else {
            my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
            print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n" 
                if $self->{_debug3};
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1]; 
                if ($last_feature_and_value_on_branch eq $feature_value_combo) {
                    %answer = %{$self->recursive_descent_for_classification($child,
                                                                              $features_and_values,\%answer)};
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    $path_found = 1;
                    last;
                }
            }
            return \%answer if $path_found;

lib/Algorithm/DecisionTree.pm


######################################    Decision Tree Construction  ####################################

##  At the root node, we find the best feature that yields the greatest reduction in
##  class entropy from the entropy based on just the class priors. The logic for
##  finding this feature is different for symbolic features and for numeric features.
##  That logic is built into the method shown later for best feature calculations.
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {        
        $self->determine_data_condition(); 
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) { 
        print "\nPrior class probabilities: @class_probabilities\n";
        print "\nClass names: @{$self->{_class_names}}\n";
    }
    my $entropy = $self->class_entropy_on_priors();
    print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
    my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
    $root_node->set_class_names(\@{$self->{_class_names}});
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node);
    return $root_node;
}
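
##  A minimal end-to-end sketch of how this method is typically driven.  The
##  filename is a placeholder; the calling sequence follows the module's synopsis:
##
##      my $dt = Algorithm::DecisionTree->new(
##                       training_datafile        => 'training.csv',
##                       csv_class_column_index   => 1,
##                       csv_columns_for_features => [2,3],
##               );
##      $dt->get_training_data();
##      $dt->calculate_first_order_probabilities();
##      $dt->calculate_class_priors();
##      my $root_node = $dt->construct_decision_tree_classifier();
##      my $classification = $dt->classify($root_node, ['feature1=9.1', 'feature2=high']);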

##  After the root node of the decision tree is calculated by the previous methods,
##  we invoke this method recursively to create the rest of the tree.  At each node,
##  we find the feature that achieves the largest entropy reduction with regard to
##  the partitioning of the training data samples that correspond to that node.
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
        if $self->{_debug3};
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my $existing_node_entropy = $node->get_node_entropy();
    if ($self->{_debug3}) { 
        print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
        print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
        print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
        my @class_probs = @{$node->get_class_probabilities()};
        print "\nRD4 Class probabilities: @class_probs\n";
    }
    if ($existing_node_entropy < $self->{_entropy_threshold}) { 
        print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
        return;
    }
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
                    $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug3};
    if (defined($self->{_max_depth_desired}) && 
               (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3}; 
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) { 
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} && 
              $self->{_feature_values_how_many_uniques_hash}->{$best_feature} > 
                                        $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;            # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
              \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                                                         \@class_probabilities_for_lessthan_child_node,
                              \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                                                         \@class_probabilities_for_greaterthan_child_node,
                            \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds = 
                                                          ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds = 
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds, 
                                           $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                   map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                               \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child = 
                      $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                                                         \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n"; 
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                          \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n" 
            if $self->{_debug3};
        return;
    }
}
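
##  The stopping rule above is driven by Shannon entropy.  For a node whose
##  class probabilities are known, the entropy is given by the standard formula;
##  a self-contained sketch (shannon_entropy() is an illustrative helper, not a
##  method of this module):
##
##      sub shannon_entropy {
##          my @probs = @_;
##          my $h = 0;
##          $h -= $_ * log($_) / log(2) for grep {$_ > 0} @probs;
##          return $h;
##      }
##      my $h = shannon_entropy(0.5, 0.25, 0.25);     # 1.5 bits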

##  This is the heart of the decision tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples that
##  correspond to the current node.  The search for the best feature is carried out
##  differently for symbolic features and for numeric features.  For a symbolic
##  feature, the method estimates the entropy for each value of the feature and then
##  averages out these entropies as a measure of the discriminatory power of that

lib/Algorithm/DecisionTree.pm

    my %partitioning_point_threshold = ();
    my %entropies_for_different_values_of_symbolic_feature = ();
    foreach my $feature (@{$self->{_feature_names}}) {
        $entropy_values_for_different_features{$feature} = [];
        $partitioning_point_child_entropies_hash{$feature} = {};
        $partitioning_point_threshold{$feature} = undef;
        $entropies_for_different_values_of_symbolic_feature{$feature} = [];
    }
    foreach my $i (0..@{$self->{_feature_names}}-1) {
        my $feature_name = $self->{_feature_names}->[$i];
        print "\n\nBFC1          FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
        if (contained_in($feature_name, @symbolic_features_already_used)) {
            next;
        } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
                 $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                      $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};      
            my @newvalues = ();
            if (contained_in($feature_name, @true_numeric_types_feature_names)) {
                if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                              $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
                    next;
                } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                                    $lowerbound{$feature_name} < $upperbound{$feature_name}) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
                    }

lib/Algorithm/DecisionTree.pm


                push @partitioning_entropies, $partitioning_entropy;
                $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
            }
            my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
            if ($min_entropy < $existing_node_entropy) {
                $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
                $entropy_values_for_different_features{$feature_name} = $min_entropy;
            }
        } else {
            print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};
            print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
            my %seen;
            my @values = grep {$_ ne 'NA' && !$seen{$_}++} 
                                    @{$self->{_features_and_unique_values_hash}->{$feature_name}};
            @values = sort @values;
            print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};

            my $entropy = 0;
            foreach my $value (@values) {
                my $feature_value_string = "$feature_name" . '=' . "$value";
                print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
                my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                if (@features_and_values_or_thresholds_on_branch) {
                    push @extended_attributes, $feature_value_string;
                } else {
                    @extended_attributes = ($feature_value_string);
                }
                $entropy += 
           $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) * 
           $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
                print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
                      "is $entropy\n" if $self->{_debug3};
                push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
            }
            if ($entropy < $existing_node_entropy) {
                $entropy_values_for_different_features{$feature_name} = $entropy;
            }
        }
    }
    my $min_entropy_for_best_feature;
    my $best_feature_name;
    foreach my $feature_nom (keys %entropy_values_for_different_features) { 

lib/Algorithm/DecisionTree.pm

            @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
    } else {
        @val_based_entropies_to_be_returned = ();
    }
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
    } else {
        $decision_val_to_be_returned = undef;
    }
    print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
        "@val_based_entropies_to_be_returned\n"  if $self->{_debug3};
    return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned, 
                                                                      $decision_val_to_be_returned);
}
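
##  In standard notation, the figure of merit accumulated above for a symbolic
##  feature f is the probability-weighted class entropy summed over the values
##  of that feature,
##
##      H(C|f)  =  sum over v of:  P(f=v) * H(C|f=v)
##
##  and the best feature is the one whose H(C|f) falls furthest below the
##  entropy already associated with the node.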

#########################################    Entropy Calculators     #####################################

sub class_entropy_on_priors {
    my $self = shift;
    return $self->{_entropy_cache}->{'priors'}
        if exists $self->{_entropy_cache}->{'priors'};

lib/Algorithm/DecisionTree.pm

    foreach my $class_name (@{$self->{_class_names}}) {
        my $class_name_in_cache = "prior::$class_name";
        my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
        my @all_values = values %{$self->{_samples_class_label_hash}};
        my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
        my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
        $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
        my $this_class_name_in_cache = "prior::$class_name";
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    if ($self->{_debug1}) {
        foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
            print "$class  =>  $self->{_class_priors_hash}->{$class}\n";
        }
    }
}
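
##  The class priors computed above are just relative frequencies over the
##  training samples.  A worked sketch with made-up labels:
##
##      my @labels = qw(benign benign malignant benign);
##      my %counts;
##      $counts{$_}++ for @labels;
##      my %priors = map {$_ => $counts{$_} / @labels} keys %counts;
##      # yields:   benign => 0.75,   malignant => 0.25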

sub calculate_first_order_probabilities {
    print "\nEstimating probabilities...\n";
    my $self = shift;
    foreach my $feature (@{$self->{_feature_names}}) {
        $self->probability_of_feature_value($feature, undef);   
        if ($self->{_debug2}) {
            if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
                print "\nPresenting probability distribution for a numeric feature:\n";
                foreach my $sampling_point (sort {$a <=> $b} keys 
                                   %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
                    my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
                    print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f", 
                          $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
                }
            } else {
                print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";

lib/Algorithm/DecisionTree.pm

    if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $feature_value = closest_sampling_point($feature_value, 
                                        $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_value_class;
    if (defined($feature_value)) {
        $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
    }
    if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
        print "\nNext answer returned by cache for feature $feature_name and " .
            "value $feature_value given class $class_name\n" if $self->{_debug2};
        return $self->{_probability_cache}->{$feature_value_class};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);

    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} > 
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
            $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
            @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};

lib/Algorithm/DecisionTree.pm

            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {!$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {!$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POS: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {

lib/Algorithm/DecisionTree.pm

            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {!$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {!$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POSC: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {

lib/Algorithm/DecisionTree.pm

##  time, less than 2, AND, at the same time, less than 3, is the same as the
##  probability that the feature is less than 1.  Similarly, the probability
##  that `age' is greater than 34 and also greater than 37 is the same as `age'
##  being greater than 37.
sub find_bounded_intervals_for_numeric_features {
    my $self = shift;
    my $arr = shift;    
    my @arr = @$arr;
    my @features = @{$self->{_feature_names}};
    my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;   
    print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
    my @arr3 = ();
    foreach my $feature_name (@features) {
        my @temp = ();
        foreach my $x (@arr1) {
            push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
        }
        push @arr3, \@temp if @temp > 0;
    }
    print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
    # Sort each list so that '<' entries occur before '>' entries:
    my @arr4;
    foreach my $li (@arr3) {
        my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
        push @arr4, \@sorted;
    }
    print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
    my @arr5;
    foreach my $li (@arr4) {
        my @temp1 = ();
        my @temp2 = ();
        foreach my $inner (@$li) {
            if ($inner->[1] eq '<') {
                push @temp1, $inner;
            } else {
                push @temp2, $inner;
            }
        }
        if (@temp1 > 0 && @temp2 > 0) {
            push @arr5, [\@temp1, \@temp2];
        } elsif (@temp1 > 0) {
            push @arr5, [\@temp1];
        } else {
            push @arr5, [\@temp2];
        }
    }
    print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
    my @arr6 = ();
    foreach my $li (@arr5) {
        my @temp1 = ();
        foreach my $inner (@$li) {
            my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
            push @temp1, \@sorted;
        }
        push @arr6, \@temp1;
    }
    print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
    my @arr9 = ();
    foreach my $li (@arr6) {
        foreach my $alist (@$li) {
            my @newalist = ();
            if ($alist->[0][1] eq '<') {
                push @newalist, $alist->[0];
            } else {
                push @newalist, $alist->[-1];
            }
            if ($alist->[0][1] ne $alist->[-1][1]) {
                push @newalist, $alist->[-1];
            }
            push @arr9, \@newalist;
        }
    }
    print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
    return \@arr9;

}
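##  A small worked example of the condensation performed by the method above
##  (hypothetical feature names, assuming 'g2' precedes 'grade' in the
##  feature list):
##
##      input:    ['g2>10.0', 'g2>20.0', 'grade<3.0', 'grade<2.0']
##      returns:  [ [['g2', '>', '20.0']], [['grade', '<', '2.0']] ]
##
##  That is, "g2 > 10.0 AND g2 > 20.0" collapses to the tighter bound
##  "g2 > 20.0", and "grade < 3.0 AND grade < 2.0" collapses to "grade < 2.0".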

##  This method is used to verify that you used legal feature names in the test
##  sample that you want to classify with the decision tree.
sub check_names_used {
    my $self = shift;
    my $features_and_values_test_data = shift;
    my @features_and_values_test_data = @$features_and_values_test_data;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        $all_record_ids_with_class_labels{$record_label} = $classname;
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $|--;
    $self->{_how_many_total_training_samples} = $record_index - 1;  # must subtract 1 for the header record
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    my @all_feature_names =   split /,/, substr($firstline, index($firstline,','));
    my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
    my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
    my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
    my %class_for_sample_hash = map {"sample_" . $_  =>  "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};     
    my %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($self->{_debug1}) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";
        }
        print "\nclass label for each data sample:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)}  keys %class_for_sample_hash) {
            print "$sample => $class_for_sample_hash{$sample}\n";
        }
        print "\nFeatures used: @feature_names\n\n";

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

sub check_for_illegal_params2 {
    my @params = @_;
    my @legal_params = qw / training_datafile
                            entropy_threshold
                            max_depth_desired
                            csv_class_column_index
                            csv_columns_for_features
                            symbolic_to_numeric_cardinality_threshold
                            number_of_histogram_bins
                            csv_cleanup_needed
                            debug1
                            debug2
                            debug3
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

@EvalTrainingData::ISA = ('Algorithm::DecisionTree');

sub new {
    my $class = shift;
    my $instance = Algorithm::DecisionTree->new(@_);
    bless $instance, $class;
}
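##  A minimal usage sketch for the EvalTrainingData class defined above; the
##  constructor options shown are illustrative, so adjust the datafile name
##  and the column indexes to your own CSV layout:
##
##      my $evaluator = EvalTrainingData->new(
##                          training_datafile        => 'stage3cancer.csv',
##                          csv_class_column_index   => 2,
##                          csv_columns_for_features => [3,4,5,6,7,8],
##                          entropy_threshold        => 0.01,
##                          max_depth_desired        => 8,
##                      );
##      $evaluator->get_training_data();
##      $evaluator->evaluate_training_data();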

sub evaluate_training_data {
    my $self = shift;
    my $evaldebug = 0;
    die "The data evaluation function in the module can only be used when your " .
        "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
    print "\nWill run a 10-fold cross-validation test on your training data to test its " .
          "class-discriminatory power:\n";
    my %all_training_data = %{$self->{_training_data_hash}};
    my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=> 
                                     Algorithm::DecisionTree::sample_index($b)}  keys %all_training_data;
    my $fold_size = int(0.1 * (scalar keys %all_training_data));
    print "fold size: $fold_size\n";
    my %confusion_matrix = ();

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            } else {
                @unique_values_for_feature = sort @unique_values_for_feature;
            }
            $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
        }
        foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
            my @minmaxvalues = Algorithm::DecisionTree::minmax(
                                         \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
            $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
        }
        if ($evaldebug) {
            print "\n\nprinting samples in the testing set: @testing_samples\n";
            print "\n\nPrinting features and their values in the training set:\n";
            foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
                print "$item  => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
            }
            print "\n\nPrinting unique values for features:\n";
            foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
                print "$item  => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
            }
            print "\n\nPrinting unique value ranges for features:\n";
            foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
                print "$item  => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
            }
        }
        foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
            $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} = 
                scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
        }
        $trainingDT->{_debug2} = 1 if $evaldebug;
        $trainingDT->calculate_first_order_probabilities();
        $trainingDT->calculate_class_priors();
        my $root_node = $trainingDT->construct_decision_tree_classifier();
        $root_node->display_decision_tree("     ") if $evaldebug;
        foreach my $test_sample_name (@testing_samples) {
            my @test_sample_data = @{$all_training_data{$test_sample_name}};
            print "original data in test sample: @test_sample_data\n" if $evaldebug;
            @test_sample_data = grep { $_ && $_ !~ /=NA$/ } @test_sample_data;
            print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
            my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
            my @solution_path = @{$classification{'solution_path'}};
            delete $classification{'solution_path'};
            my @which_classes = keys %classification;
            @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
            my $most_likely_class_label = $which_classes[0];
            if ($evaldebug) {
                print "\nClassification:\n\n";
                print "     class                         probability\n";
                print "     ----------                    -----------\n";
                foreach my $which_class (@which_classes) {
                    my $classstring = sprintf("%-30s", $which_class);
                    my $valuestring = sprintf("%-30s", $classification{$which_class});
                    print "     $classstring $valuestring\n";
                }
                print "\nSolution path in the decision tree: @solution_path\n";
                print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
            }
            my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
            print "$test_sample_name:    true_class: $true_class_label_for_sample    " .
                     "estimated_class: $most_likely_class_label\n"  if $evaldebug;
            $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
        }
    }
    print "\n\n       DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
    my $matrix_header = " " x 30;
    foreach my $class_name (@{$self->{_class_names}}) {  
        $matrix_header .= sprintf("%-30s", $class_name);
    }
    print "\n" . $matrix_header . "\n\n";
    foreach my $row_class_name (sort keys %confusion_matrix) {

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        _output_training_csv_file          =>   $args{'output_training_csv_file'} 
                                                   || croak("name for output_training_csv_file required"),
        _output_test_csv_file              =>   $args{'output_test_csv_file'} 
                                                   || croak("name for output_test_csv_file required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                         || croak("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                         || croak("number_of_samples_for_training required"),
        _number_of_samples_for_testing     =>   $args{'number_of_samples_for_testing'} 
                                                         || croak("number_of_samples_for_testing required"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_names_and_priors            =>    {},
        _features_with_value_range         =>    {},
        _features_ordered                  =>    [],
        _classes_and_their_param_values    =>    {},
    }, $class;
}
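##  A minimal usage sketch for the TrainingDataGeneratorNumeric class whose
##  constructor appears above; the file names and sample counts are
##  illustrative:
##
##      my $generator = TrainingDataGeneratorNumeric->new(
##                          output_training_csv_file       => 'training.csv',
##                          output_test_csv_file           => 'testing.csv',
##                          parameter_file                 => 'param_numeric.txt',
##                          number_of_samples_for_training => 3000,
##                          number_of_samples_for_testing  => 1000,
##                      );
##      $generator->read_parameter_file_numeric();
##      $generator->gen_numeric_training_and_test_data_in_csv();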

sub check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_training_csv_file
                            output_test_csv_file
                            parameter_file
                            number_of_samples_for_training
                            number_of_samples_for_testing
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    my @params = <FILE>;
    my $params = join "", @params;
    my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
    $params =~ /$regex/si;
    my ($class_names, $class_priors) = ($1, $2);
    @class_names = split ' ', $class_names; 
    my @class_priors = split ' ', $class_priors;
    foreach my $i (0..@class_names-1) {
        $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
    }
    if ($self->{_debug}) {
        foreach my $cname (keys %class_names_and_priors) {
            print "$cname  =>   $class_names_and_priors{$cname}\n";
        }
    }
    $regex = 'feature name: \w*.*?value range: [\d\. -]+';
    my @features = $params =~ /$regex/gsi;
    my @features_ordered;
    $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
    foreach my $feature (@features) {
        $feature =~ /$regex/i;
        my $feature_name = $1;
        push @features_ordered, $feature_name;
        my @value_range = split ' ', $2;
        $features_with_value_range{$feature_name} = \@value_range;
    }
    if ($self->{_debug}) {
        foreach my $fname (keys %features_with_value_range) {
            print "$fname  =>   @{$features_with_value_range{$fname}}\n";
        }
    }
    foreach my $i (0..@class_names-1) {
        $classes_and_their_param_values{$class_names[$i]} = {};
    }
    $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
    my @class_params = $params =~ /$regex/gsi;
    $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
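    ##  For reference, the regexes above assume a parameter file of roughly the
    ##  following form (a hypothetical excerpt):
    ##
    ##      class names: recurrence nonrecurrence
    ##      class priors: 0.4 0.6
    ##      feature name: g2
    ##      value range: 0.0 40.0
    ##      params for class: recurrence
    ##      mean: 20.0 30.0
    ##      covariance:
    ##           10.0  0.0
    ##            0.0 10.0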

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        $classes_and_their_param_values{$class_name}->{'mean'} =  \@class_mean;
        my $class_param_string = $3;
        my @covar_rows = split '\n', $class_param_string;
        my @covar_matrix;
        foreach my $row (@covar_rows) {
            my @row = split ' ', $row;
            push @covar_matrix, \@row;
        }
        $classes_and_their_param_values{$class_name}->{'covariance'} =  \@covar_matrix;
    }
    if ($self->{_debug}) {
        print "\nThe class parameters are:\n\n";
        foreach my $cname (keys %classes_and_their_param_values) {
            print "\nFor class name $cname:\n";
            my %params_hash = %{$classes_and_their_param_values{$cname}};
            foreach my $x (keys %params_hash) {
                if ($x eq 'mean') {
                    print "    $x   =>   @{$params_hash{$x}}\n";
                } else {
                    if ($x eq 'covariance') {
                        print "    The covariance matrix:\n";

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    }
    foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
        my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
        my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
        my @new_training_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        my @new_test_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        if ($self->{_debug}) {
            print "training data for class $class_name:\n";
            foreach my $x (@new_training_data) {print "@$x\n";}
            print "\n\ntest data for class $class_name:\n";
            foreach my $x (@new_test_data) {print "@$x\n";}
        }
        $training_samples_for_class{$class_name} = \@new_training_data;
        $test_samples_for_class{$class_name} = \@new_test_data;
    }
    my @training_data_records = ();
    my @test_data_records = ();

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        }
        foreach my $sample_index (0..$num_of_samples_for_testing-1) {
            my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
            @test_vector = map {sprintf("%.3f", $_)} @test_vector;
            my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
            push @test_data_records, $test_data_record;
        }
    }
    fisher_yates_shuffle(\@training_data_records);
    fisher_yates_shuffle(\@test_data_records);
    if ($self->{_debug}) {
        foreach my $record (@training_data_records) {
            print "$record";
        }
        foreach my $record (@test_data_records) {
            print "$record";
        }
    }
    open OUTPUT, ">$self->{_output_training_csv_file}";
    my @feature_names_training = @{$self->{_features_ordered}};
    my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params4(@params) == 0;   
    bless {
        _output_training_datafile          =>   $args{'output_training_datafile'} 
                                                   || die("name for output_training_datafile required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                   || die("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                   || die("number_of_samples_for_training required"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_priors                      =>    [],
        _features_and_values_hash          =>    {},
        _bias_hash                         =>    {},
        _training_sample_records           =>    {},
    }, $class;
}

sub check_for_illegal_params4 {
    my @params = @_;
    my @legal_params = qw / output_training_datafile
                            parameter_file
                            number_of_samples_for_training
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    }
    return $found_match_flag;
}
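##  A more compact equivalent of the nested-loop search used by the
##  check_for_illegal_params*() subroutines (a sketch, not part of the
##  module's API) is a hash-based membership test:
##
##      my %legal = map { $_ => 1 } @legal_params;
##      foreach my $param (@params) {
##          return 0 unless exists $legal{$param};
##      }
##      return 1;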

##  Read a parameter file for generating symbolic training data. See the script
##  generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
##  pass the name of the parameter file to the constructor of the
##  TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
    my $self = shift;
    my $debug = $self->{_debug};
    my $number_of_training_samples = $self->{_number_of_samples_for_training};
    my $input_parameter_file = $self->{_parameter_file};
    croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
    my $output_file_training = $self->{_output_training_datafile};
    my $output_file_testing = $self->{_output_test_datafile};
    my @all_params;
    my $param_string;
    open INPUT, $input_parameter_file or die "unable to open parameter file: $!";
    @all_params = <INPUT>;
    @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                $feature_name = $1;
                $bias_hash{$splits[0]}->{$feature_name} = [];
            } else {
                next if !defined $feature_name;
                push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
                        if defined $feature_name;
            }
        }
    }
    $self->{_bias_hash} = \%bias_hash;
    if ($debug) {
        print "\n\nClass names: @class_names\n";
        my $num_of_classes = @class_names;
        print "Class priors: @class_priors\n";
        print "Number of classes: $num_of_classes\n";
        print "\nHere are the features and their possible values:\n";
        while ( my ($k, $v) = each %features_and_values_hash ) {
            print "$k ===>  @$v\n";
        }
        print "\nHere is the biasing for each class:\n";
        while ( my ($k, $v) = each %bias_hash ) {

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    my %bias_hash  = %{$self->{_bias_hash}};
    my $how_many_training_samples = $self->{_number_of_samples_for_training};
    my $how_many_test_samples = $self->{_number_of_samples_for_testing};
    my %class_priors_to_unit_interval_map;
    my $accumulated_interval = 0;
    foreach my $i (0..@class_names-1) {
        $class_priors_to_unit_interval_map{$class_names[$i]} 
         = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
        $accumulated_interval += $class_priors[$i];
    }
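    ##  For example, with class priors of 0.4 and 0.6, the map constructed above
    ##  would read
    ##
    ##      class_1  =>  [0.0, 0.4]
    ##      class_2  =>  [0.4, 1.0]
    ##
    ##  so that a single uniform draw from [0,1] selects a class with the
    ##  desired prior probability.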
    if ($self->{_debug}) {
        print "Mapping of class priors to unit interval: \n";
        while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
            print "$k =>  @$v\n";
        }
        print "\n\n";
    }
    my $ele_index = 0;
    while ($ele_index < $how_many_training_samples) {
        my $sample_name = "sample" . "_$ele_index";
        $training_sample_records{$sample_name} = [];

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
                my $v = $value_priors_to_unit_interval_map{$value_name};
                if ( ($roll_the_dice >= $v->[0]) 
                             && ($roll_the_dice <= $v->[1]) ) {
                    push @{$training_sample_records{$sample_name}}, 
                                            $feature . "=" . $value_name;
                    $value_label = $value_name;
                    last;
                }
            }
            if ($self->{_debug}) {
                print "mapping feature value priors for '$feature' " .
                                          "to unit interval: \n";
                while ( my ($k, $v) = 
                        each %value_priors_to_unit_interval_map ) {
                    print "$k =>  @$v\n";
                }
                print "\n\n";
            }
        }
        $ele_index++;
    }
    $self->{_training_sample_records} = \%training_sample_records;
    if ($self->{_debug}) {
        print "\n\nPRINTING TRAINING RECORDS:\n\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
            print "$kee =>  @{$training_sample_records{$kee}}\n\n";
        }
    }
    my $output_training_file = $self->{_output_training_datafile};
    print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
    open FILEHANDLE, ">$output_training_file";
    my @features = sort keys %features_and_values_hash;
    my $title_string = ',class';
    foreach my $feature_name (@features) {
        $title_string .= ',' . $feature_name;
    }
    print FILEHANDLE "$title_string\n";
    my @sample_names = sort {$a <=> $b}  map { (my $s = $_) =~ s/^sample_//; $s } keys %training_sample_records;
    my $record_string = '';
    foreach my $sample_name (@sample_names) {

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
        unless ref($dt) eq "Algorithm::DecisionTree";
    bless {                                                         
        _dt                                 => $dt,
        _root_dtnode                        => $dt->{_root_node},
        _samples_at_nodes_hash              => {},
        _branch_features_to_nodes_hash      => {},
        _sample_to_node_mapping_direct_hash => {},
        _node_serial_num_to_node_hash       => {}, 
        _awareness_raising_msg_shown        => 0,
        _debug                              => 0,
    }, $class;                                                     
}

sub initialize {
    my $self = shift;
    croak "You must first construct the decision tree before using the DTIntrospection class"
        unless $self->{_root_dtnode};
    $self->recursive_descent($self->{_root_dtnode});
}
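##  A minimal usage sketch; it assumes $dt holds an Algorithm::DecisionTree
##  instance whose tree has already been constructed:
##
##      my $introspector = DTIntrospection->new($dt);
##      $introspector->initialize();
##      $introspector->display_training_samples_to_nodes_influence_propagation();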

sub recursive_descent {
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    print "\nAt node $node_serial_number:  the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
    $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
    $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
    my @samples_at_node = ();
    foreach my $item (@$branch_features_and_values_or_thresholds) {
        my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
        unless (@samples_at_node) {
            @samples_at_node =  @$samples_for_feature_value_combo;
        } else {
            my @accum;
            foreach my $sample (@samples_at_node) {
                push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);  
            }
            @samples_at_node =  @accum;
        }
        last unless @samples_at_node;
    }
    @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node; 
    print "Node: $node_serial_number    the samples are: [@samples_at_node]\n"  if ($self->{_debug});
    $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
    if (@samples_at_node) {
        foreach my $sample (@samples_at_node) {
            if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
                $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number]; 
            } else {
                push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
            }
        }
    }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        unless $self->{_root_dtnode};
    $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
}

sub recursive_descent_for_showing_samples_at_a_node {
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
        print "\nAt node $node_serial_number:  the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n"  if $self->{_debug};
        print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
    }
    map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};            
}

sub display_training_samples_to_nodes_influence_propagation {
    my $self = shift;
    foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)}  keys %{$self->{_dt}->{_training_data_hash}}) {
        if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
            my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN


Calls on each decision tree in the cascade to classify the argument C<$test_sample>.

=item B<display_classification_results_for_each_stage():>

You can call this method to display in your terminal window the classification
decision made by each decision tree in the cascade.  The method also prints out the
trust factor associated with each decision tree.  It is important to look at the
classification decision and the trust factor for each tree together, since a
classification decision that appears bizarre for a given test sample may carry
little weight in the final result if that tree's trust factor is low.  This
method is useful primarily for debugging purposes.

=item B<show_class_labels_for_misclassified_samples_in_stage( $stage_index ):>

As with the previous method, this method is useful mostly for debugging. It returns
class labels for the samples misclassified by the stage whose integer index is
supplied as an argument to the method.  Say you have 10 stages in your cascade.  The
value of the argument C<stage_index> would go from 0 to 9, with 0 corresponding to
the base tree.

=item B<trust_weighted_majority_vote_classifier():>

Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
classification decisions made by the individual trees while taking into account the
trust factors associated with the trees.  As mentioned earlier, we associate with

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

C<looking_for_needles_in_haystack> or C<how_many_training_samples_per_tree> --- is
set.  When the former is set, it creates a collection of training datasets for
C<how_many_trees> decision trees, with each dataset being a mixture of the
minority class and samples drawn randomly from the majority class.  However, when the
latter option is set, all the datasets are drawn randomly from the training database
with no particular attention given to the relative populations of the two classes.

=item B<show_training_data_for_all_trees():>

As the name implies, this method shows the training data being used for all the
decision trees.  This method is useful for debugging purposes using small datasets.

=item B<calculate_first_order_probabilities():>

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
first-order probabilities for the training dataset to be used for each decision tree.

=item B<calculate_class_priors():>

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
class priors for the training dataset to be used for each decision tree.

lib/Algorithm/DecisionTreeWithBagging.pm  view on Meta::CPAN

    delete $dtargs{bag_overlap_fraction};    
    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if check_for_illegal_params(@params) == 0;
    bless {
        _training_datafile            =>  $args{training_datafile}, 
        _csv_class_column_index       =>  $args{csv_class_column_index} || undef,
        _csv_columns_for_features     =>  $args{csv_columns_for_features} || undef,
        _how_many_bags                =>  $args{how_many_bags} || croak("you must specify how_many_bags"),
        _bag_overlap_fraction         =>  $args{bag_overlap_fraction} || 0.20, 
        _csv_cleanup_needed           =>  $args{csv_cleanup_needed} || 0,
        _debug1                       =>  $args{debug1} || 0,
        _number_of_training_samples   =>  undef,
        _segmented_training_data      =>  {},
        _all_trees                    =>  {map {$_ => Algorithm::DecisionTree->new(%dtargs)} 0..$args{how_many_bags} - 1},
        _root_nodes                   =>  [],
        _bag_sizes                    =>  [],
        _classifications              =>  undef,
    }, $class;
}
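##  A minimal usage sketch for the bagging class (method names as documented
##  for this class; the datafile name and the column indexes are illustrative):
##
##      my $dtbag = Algorithm::DecisionTreeWithBagging->new(
##                      training_datafile        => 'stage3cancer.csv',
##                      csv_class_column_index   => 2,
##                      csv_columns_for_features => [3,4,5,6,7,8],
##                      how_many_bags            => 4,
##                      bag_overlap_fraction     => 0.20,
##                  );
##      $dtbag->get_training_data_for_bagging();
##      $dtbag->calculate_first_order_probabilities();
##      $dtbag->calculate_class_priors();
##      $dtbag->construct_decision_trees_for_bags();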

##############################################  Methods  #################################################

lib/Algorithm/DecisionTreeWithBagging.pm  view on Meta::CPAN

        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        $all_record_ids_with_class_labels{$record_label} = $classname;
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $|--;
    $self->{_how_many_total_training_samples} = $record_index - 1;  # must subtract 1 for the header record
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    my @all_feature_names =   split /,/, substr($firstline, index($firstline,','));
    my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
#    my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
    my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
    my %class_for_sample_hash = map {"sample_" . $_  =>  "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
    $self->{_number_of_training_samples} = scalar @sample_names;
    fisher_yates_shuffle(\@sample_names);
    print "\nsample names for all samples: @sample_names\n" if $self->{_debug2};
    my $bag_size = int(@sample_names / $self->{_how_many_bags});
    my @data_sample_bags;
    push @data_sample_bags, [splice @sample_names, 0, $bag_size] while @sample_names;
    if (@{$data_sample_bags[-1]} < $bag_size) {
        push @{$data_sample_bags[-2]}, @{$data_sample_bags[-1]}; 
        $#data_sample_bags = @data_sample_bags - 2;
    }
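    ##  For example, with 103 samples and how_many_bags set to 4, the splice
    ##  above first yields bags of sizes 25, 25, 25, 25 and 3; the undersized
    ##  final bag is then folded into its predecessor, giving 25, 25, 25, 28.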
    $self->{_bag_sizes} = [map scalar(@$_), @data_sample_bags];
    print "bag sizes: @{$self->{_bag_sizes}}\n" if $self->{_debug2};
    my @augmented_data_sample_bags = ();
    if ($self->{_bag_overlap_fraction}) {
        my $number_of_samples_needed_from_other_bags = int(@{$data_sample_bags[0]} * $self->{_bag_overlap_fraction});
        print "number of samples needed from other bags: $number_of_samples_needed_from_other_bags\n" if $self->{_debug2};
        foreach my $i (0..$self->{_how_many_bags}-1) {
            my @samples_in_other_bags = ();
            foreach my $j (0..$self->{_how_many_bags}-1) {
                push @samples_in_other_bags, @{$data_sample_bags[$j]} if $j != $i;
            }
            print "\n\nin other bags for i=$i: @samples_in_other_bags\n" if $self->{_debug2};
            push @{$augmented_data_sample_bags[$i]}, @{$data_sample_bags[$i]};
            push @{$augmented_data_sample_bags[$i]}, map $samples_in_other_bags[rand(@samples_in_other_bags)], 0 .. $number_of_samples_needed_from_other_bags -1;
            print "\naugmented bage $i: @{$augmented_data_sample_bags[$i]}\n" if $self->{_debug2};
        }
    }
    @data_sample_bags = @augmented_data_sample_bags;
    $self->{_bag_sizes} = [map scalar(@$_), @data_sample_bags];
    my %class_for_sample_hash_bags =  map { $_ => { map { $_ => $class_for_sample_hash{$_} } @{$data_sample_bags[$_]} } } 0 .. $self->{_how_many_bags} - 1;
    if ($self->{_debug2}) {
        foreach my $bag_index (keys  %class_for_sample_hash_bags) {    
            my %keyval = %{$class_for_sample_hash_bags{$bag_index}};
            print "\nFor bag $bag_index  =>:\n";
            foreach my $sname (keys %keyval) {
                print "      $sname    =>  $keyval{$sname}\n";
            }
        }
    }
    my %feature_values_for_samples_hash_bags = map { $_ => { map { $_ => $feature_values_for_samples_hash{$_} } @{$data_sample_bags[$_]} } } 0 .. $self->{_how_many_bags} - 1;
    if ($self->{_debug2}) {
        print "\nDisplaying samples and their values in each bag:\n\n";
        foreach my $bag_index (keys  %feature_values_for_samples_hash_bags) {   
            my %keyval = %{$feature_values_for_samples_hash_bags{$bag_index}};
            print "\nFor bag $bag_index  =>:\n";
            foreach my $sname (keys %keyval) {
                print "      $sname    =>  @{$keyval{$sname}}\n";
            }
        }
    }
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};     
    if ($self->{_debug2}) {
        print "\nDisplaying features and their values for entire training data:\n\n";
        foreach my $fname (keys  %features_and_values_hash) {         
            print "        $fname    =>  @{$features_and_values_hash{$fname}}\n";
        }
    }
    my %features_and_values_hash_bags =  map { my $c = $_; { $c =>  { map { my $d = $_; {$all_feature_names[$d] => [ sort {$a cmp $b} map {my $f = $_; $f =~ /^\d+$/ ? sprintf("%.1f",$f) : $f} map {$data_hash{sample_index($_)}->[$d-1]} @{$data_sample_...
    if ($self->{_debug2}) {
        print "\nDisplaying features and their values in each bag:\n\n";
        foreach my $bag_index (keys  %features_and_values_hash_bags) {           
            my %keyval = %{$features_and_values_hash_bags{$bag_index}};
            print "\nFor bag $bag_index  =>:\n";
            foreach my $fname (keys %keyval) {
                print "      $fname    =>  @{$keyval{$fname}}\n";
            }
        }
    }
    my @all_class_names =  sort keys %{ {map {$_ => 1} values %class_for_sample_hash } };
    print "all class names: @all_class_names\n" if $self->{_debug2};
    my %numeric_features_valuerange_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
    my %feature_values_how_many_uniques_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
    my %features_and_unique_values_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $i (0 .. $self->{_how_many_bags} - 1) {
        foreach my $feature (keys %{$features_and_values_hash_bags{$i}}) {
            my %seen = ();
            my @unique_values_for_feature_in_bag =  grep { $_ ne 'NA' && !$seen{$_}++ } @{$features_and_values_hash_bags{$i}{$feature}};
            $feature_values_how_many_uniques_hash_bags{$i}->{$feature} = scalar @unique_values_for_feature_in_bag;
            my $not_all_values_float = 0;
            map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature_in_bag;
            if ($not_all_values_float == 0) {
                my @minmaxvalues = minmax(\@unique_values_for_feature_in_bag);
                $numeric_features_valuerange_hash_bags{$i}->{$feature} = \@minmaxvalues; 
            }
            $features_and_unique_values_hash_bags{$i}->{$feature} = \@unique_values_for_feature_in_bag;
        }
    }
    if ($self->{_debug2}) {
        print "\nDisplaying value ranges for numeric features in each bag:\n\n";
        foreach my $bag_index (keys  %numeric_features_valuerange_hash_bags) {        
            my %keyval = %{$numeric_features_valuerange_hash_bags{$bag_index}};
            print "\nFor bag $bag_index  =>:\n";
            foreach my $fname (keys %keyval) {
                print "      $fname    =>  @{$keyval{$fname}}\n";
            }
        }
        print "\nDisplaying number of unique values for each features in each bag:\n\n";
        foreach my $bag_index (keys  %feature_values_how_many_uniques_hash_bags) {    

lib/Algorithm/DecisionTreeWithBagging.pm  view on Meta::CPAN

    foreach my $i (0..$self->{_how_many_bags}-1) {
        $self->{_all_trees}->{$i}->{_class_names} = \@all_class_names;
        $self->{_all_trees}->{$i}->{_feature_names} = \@feature_names;
        $self->{_all_trees}->{$i}->{_samples_class_label_hash} = $class_for_sample_hash_bags{$i};
        $self->{_all_trees}->{$i}->{_training_data_hash}  =  $feature_values_for_samples_hash_bags{$i};
        $self->{_all_trees}->{$i}->{_features_and_values_hash}    =  $features_and_values_hash_bags{$i};
        $self->{_all_trees}->{$i}->{_features_and_unique_values_hash} = $features_and_unique_values_hash_bags{$i};
        $self->{_all_trees}->{$i}->{_numeric_features_valuerange_hash} = $numeric_features_valuerange_hash_bags{$i}; 
        $self->{_all_trees}->{$i}->{_feature_values_how_many_uniques_hash} = $feature_values_how_many_uniques_hash_bags{$i};
    }
    if ($self->{_debug1}) {
        foreach my $i (0..$self->{_how_many_bags}-1) {
            print "\n\n=============================   For bag $i   ==================================\n";
            print "\nAll class names: @{$self->{_all_trees}->{$i}->{_class_names}}\n";
            print "\nSamples and their feature values in each bag:\n";
            foreach my $item (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{$i}->{_training_data_hash}}) {
                print "$item  =>  @{$self->{_all_trees}->{$i}->{_training_data_hash}->{$item}}\n";
            }
            print "\nclass label for each data sample in each bag:\n";
            foreach my $item (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{$i}->{_samples_class_label_hash}} ) {
                print "$item  =>  $self->{_all_trees}->{$i}->{_samples_class_label_hash}->{$item}\n";

lib/Algorithm/DecisionTreeWithBagging.pm  view on Meta::CPAN

    my @legal_params = qw / how_many_bags
                            bag_overlap_fraction
                            training_datafile
                            entropy_threshold
                            max_depth_desired
                            csv_class_column_index
                            csv_columns_for_features
                            symbolic_to_numeric_cardinality_threshold
                            number_of_histogram_bins
                            csv_cleanup_needed
                            debug1
                            debug2
                            debug3
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

        _how_many_training_samples_per_tree    =>  $args{how_many_training_samples_per_tree},
        _training_datafile                     =>  $args{training_datafile}, 
        _csv_class_column_index                =>  $args{csv_class_column_index} || undef,
        _csv_columns_for_features              =>  $args{csv_columns_for_features} || undef,
        _how_many_trees                        =>  $args{how_many_trees} || die "must specify number of trees",
        _root_nodes                            =>  [],
        _training_data_for_trees               =>  {map {$_ => []} 0..$args{how_many_trees} - 1},
        _all_record_ids                        =>  [],
        _training_data_record_indexes          =>  {},
        _classifications                       =>  undef,
        _debug1                                =>  $args{debug1},
    }, $class;
}
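##  A minimal usage sketch (the datafile name and the column indexes are
##  illustrative; the method names are as documented for this class):
##
##      my $rt = Algorithm::RandomizedTreesForBigData->new(
##                   training_datafile               => 'my_database.csv',
##                   csv_class_column_index          => 48,
##                   csv_columns_for_features        => [24,32,33,34,41],
##                   how_many_trees                  => 5,
##                   looking_for_needles_in_haystack => 1,
##               );
##      $rt->get_training_data_for_N_trees();
##      $rt->calculate_first_order_probabilities();
##      $rt->calculate_class_priors();
##      $rt->construct_all_decision_trees();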

##############################################   Methods  ################################################
sub get_training_data_for_N_trees {
    my $self = shift;
    die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
    my @all_record_ids;
    open FILEIN, $self->{_training_datafile} or die "Unable to open $self->{_training_datafile} $!";
    my $record_index = 0;
    while (<FILEIN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
        push @{$self->{_all_record_ids}}, substr($record, 0, index($record, ','));
        $record_index++;
    }
    close FILEIN;
    $self->{_how_many_total_training_samples} = $record_index - 1;
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    print "\n\nAll record labels: @{$self->{_all_record_ids}}\n" if $self->{_debug1};
    if ($self->{_looking_for_needles_in_haystack}) {
        $self->get_training_data_for_N_trees_balanced();
    } else {
        $self->get_training_data_for_N_trees_regular();
    }
}
    
sub get_training_data_for_N_trees_balanced {
    my $self = shift;    
    die "You cannot use the contructor option 'how_many_training_samples_per_tree' if you " .

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

        $i++;
    }
    close FILEIN;
    $|--;
    my @unique_class_names = sort keys %unique_class_names;
    my $num_unique_classnames = @unique_class_names;
    die "\n\n'looking_for_needles_in_haystack' option has only been tested for the case of " .
        "two data classes.  You appear to have $num_unique_classnames data classes. If you know " .
        "that you have specified only two classes, perhaps you need to use the constructor option " .
        "'csv_cleanup_needed'. Aborting." if @unique_class_names > 2;
    print "\n\nunique class names: @unique_class_names\n" if $self->{_debug1};
    my %hist = map {$_ => 0} @unique_class_names;
    foreach my $item (@class_names) {
        foreach my $unique_val (@unique_class_names) {
            if ($item eq $unique_val) {
                $hist{$unique_val}++;
                last;
            }
        }
    }
    if ($self->{_debug1}) {
        print "\nhistogram of the values for the field : ";
        foreach my $key (sort keys %hist) {
            print "$key => $hist{$key}   ";
        }
    }
    my @histvals = values %hist;
    my @hist_minmax = minmax( \@histvals );
    my $max_number_of_trees_possible = int($hist_minmax[1] / $hist_minmax[0]);
    if ($self->{_debug1}) {      
        print "\n\nmaximum number of trees possible: $max_number_of_trees_possible\n";
    }
    die "\n\nYou have asked for more trees than can be supported by the training data. " .
        "Maxinum number of trees that can be constructed from the training file is: $max_number_of_trees_possible\n"
        if $self->{_how_many_trees} > $ max_number_of_trees_possible;
    
    my %class1 = map {$_->[0] => $_->[1]} grep {@$_ > 0} map {  $_->[1] eq $unique_class_names[0] ? [$_->[0], $_->[1]] : [] } pairmap {[$a,$b]} %all_record_ids_with_class_labels;    

    my %class2 = map {$_->[0] => $_->[1]} grep {@$_ > 0} map {  $_->[1] eq $unique_class_names[1] ? [$_->[0], $_->[1]] : [] } pairmap {[$a,$b]} %all_record_ids_with_class_labels;        
    my %minority_class = scalar(keys %class1) >= scalar(keys %class2) ? %class2 : %class1;
    my %majority_class = scalar(keys %class1) >= scalar(keys %class2) ? %class1 : %class2;
    my @minority_records = sort keys %minority_class;
    my @majority_records = sort keys %majority_class;
    print "\n\nminority records: @minority_records\n" if $self->{_debug1};
    $self->{_how_many_training_samples_per_tree} = 2 * @minority_records;
    $self->{_training_data_record_indexes}  = {map {my $t = $_; $t => [map { $majority_records[rand @majority_records] } 0 .. @minority_records - 1]}   0 .. $self->{_how_many_trees} - 1};  
    map {my $t = $_; push @{$self->{_training_data_record_indexes}->{$t}}, @minority_records} 0 .. $self->{_how_many_trees} - 1;      
    if ($self->{_debug1}) {
        print "\n Displaying records in the different training sets:\n";
        foreach my $t (sort {$a <=> $b} keys %{$self->{_training_data_record_indexes}}) {
            print "\n\n$t   =>   @{$self->{_training_data_record_indexes}->{$t}}\n";
        }
    }
    $self->_digest_training_data_all_trees();
}
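##  To illustrate the balancing above: with 30 minority-class and 300
##  majority-class records, each of the N training sets contains all 30
##  minority records plus 30 records drawn at random (with replacement)
##  from the majority class, i.e., a balanced 60-record set per tree.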

sub get_training_data_for_N_trees_regular {
    my $self = shift;        

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

        foreach my $t (keys %{$self->{_training_data_record_indexes}}) {
            push @{$self->{_training_data_for_trees}->{$t}}, $record
                if (contained_in(substr($record, 0, index($record, ',')), 
                                 @{$self->{_training_data_record_indexes}->{$t}}));
        }
        $record_index++;
    }
    close FILEIN;    
    my $splitup_data_for_trees = {map {my $t = $_; $t => [map {my $record = $_; [split /,/, $record]} @{$self->{_training_data_for_trees}->{$t}}]} 0 .. $self->{_how_many_trees} - 1};
    my $data_hash_for_all_trees = {map {my $t = $_; $t => {map {my $record = $_; my $record_lbl = shift @{$record}; $record_lbl =~ s/^\s*\"|\"\s*$//g; $record_lbl => $record} @{$splitup_data_for_trees->{$t}}}} 0 .. $self->{_how_many_trees} - 1};
    if ($self->{_debug1}) {
        foreach my $t (0 .. $self->{_how_many_trees} - 1) {
            my @record_labels = keys %{$data_hash_for_all_trees->{$t}};
            print "\n\nFor tree $t: record labels: @record_labels\n";
            for my $kee (sort keys %{$data_hash_for_all_trees->{$t}}) {
                print "$kee   ----->   @{$data_hash_for_all_trees->{$t}->{$kee}}\n";
            }
        }
    }
    my @all_feature_names = split /,/, $firstline;
    my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
    my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
    print "\n\nclass column heading: $class_column_heading\n";
    print "feature names: @feature_names\n";
    my $class_for_sample_all_trees = {map {my $t = $_; $t => {map {my $lbl = $_; "sample_$lbl" => "$class_column_heading=$data_hash_for_all_trees->{$t}->{$lbl}->[$self->{_csv_class_column_index} - 1]" } keys %{$data_hash_for_all_trees->{$t}} } }  0 ....
    if ($self->{_debug1}) {
        foreach my $t (0 .. $self->{_how_many_trees} - 1) {
            my @sample_labels = keys %{$class_for_sample_all_trees->{$t}};
            print "\n\nFor tree $t: sample labels: @sample_labels\n";    
            for my $kee (sort keys %{$class_for_sample_all_trees->{$t}}) {
                print "$kee   ----->   $class_for_sample_all_trees->{$t}->{$kee}\n";
            }
        }
    }
    my $sample_names_in_all_trees = {map {my $t = $_; $t => [map {"sample_$_"} keys %{$data_hash_for_all_trees->{$t}}]}  0 .. $self->{_how_many_trees} - 1};
    my $feature_values_for_samples_all_trees = {map {my $t = $_; $t => {map {my $key = $_; "sample_$key" => [map {my $feature_name = $all_feature_names[$_]; "$feature_name=$data_hash_for_all_trees->{$t}->{$key}->[$_-1]"} @{$self->{_csv_columns_for_fe...
    if ($self->{_debug1}) {
        foreach my $t (0 .. $self->{_how_many_trees} - 1) {
            my @sample_labels = keys %{$feature_values_for_samples_all_trees->{$t}};
            print "\n\nFor tree $t: sample labels: @sample_labels\n";    
            for my $kee (sort keys %{$feature_values_for_samples_all_trees->{$t}}) {
                print "$kee   ----->   @{$feature_values_for_samples_all_trees->{$t}->{$kee}}\n";
            }
        }
    }
    my $features_and_values_all_trees = {map {my $t = $_; $t => {map {my $i = $_; $all_feature_names[$i] => [map {my $key = $_; $data_hash_for_all_trees->{$t}->{$key}->[$i-1]} keys %{$data_hash_for_all_trees->{$t}}]} @{$self->{_csv_columns_for_featur...
    if ($self->{_debug1}) {
        foreach my $t (0 .. $self->{_how_many_trees} - 1) {
            my @feature_labels = keys %{$features_and_values_all_trees->{$t}};
            print "\n\nFor tree $t: feature labels: @feature_labels\n";    
            for my $kee (sort keys %{$features_and_values_all_trees->{$t}}) {
                print "$kee   ----->   @{$features_and_values_all_trees->{$t}->{$kee}}\n";
            }
        }
    }
    my $all_class_names_all_trees = {map {my $t = $_; my %all_class_labels_in_tree = map {$_ => 1} values %{$class_for_sample_all_trees->{$t}}; my @uniques = keys %all_class_labels_in_tree; $t => \@uniques } 0 .. $self->{_how_many_trees} - 1};
    if ($self->{_debug1}) {
        foreach my $t (0 .. $self->{_how_many_trees} - 1) {
            my @unique_class_names_for_tree = @{$all_class_names_all_trees->{$t}};
            print "\n\nFor tree $t: all unique class names: @unique_class_names_for_tree\n";    
        }
    }
    my $numeric_features_valuerange_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
    my $feature_values_how_many_uniques_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
    my $features_and_unique_values_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $t (0 .. $self->{_how_many_trees} - 1) {    

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

            $feature_values_how_many_uniques_all_trees->{$t}->{$feature} = scalar @unique_values_for_feature;
            my $not_all_values_float = 0;
            map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
            if ($not_all_values_float == 0) {
                my @minmaxvalues = minmax(\@unique_values_for_feature);
                $numeric_features_valuerange_all_trees->{$t}->{$feature} = \@minmaxvalues; 
            }
            $features_and_unique_values_all_trees->{$t}->{$feature} = \@unique_values_for_feature;            
        }
    }
    if ($self->{_debug1}) {
        print "\nDisplaying value ranges for numeric features for all trees:\n\n";
        foreach my $tree_index (keys  %{$numeric_features_valuerange_all_trees}) {        
            my %keyval = %{$numeric_features_valuerange_all_trees->{$tree_index}};
            print "\nFor tree $tree_index  =>:\n";
            foreach my $fname (keys %keyval) {
                print "      $fname    =>  @{$keyval{$fname}}\n";
            }
        }
        print "\nDisplaying number of unique values for each features for each tree:\n\n";
        foreach my $tree_index (keys  %{$feature_values_how_many_uniques_all_trees}) {    

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

    foreach my $t (0..$self->{_how_many_trees}-1) {
        $self->{_all_trees}->{$t}->{_class_names} = $all_class_names_all_trees->{$t};
        $self->{_all_trees}->{$t}->{_feature_names} = \@feature_names;
        $self->{_all_trees}->{$t}->{_samples_class_label_hash} = $class_for_sample_all_trees->{$t};
        $self->{_all_trees}->{$t}->{_training_data_hash}  =  $feature_values_for_samples_all_trees->{$t};
        $self->{_all_trees}->{$t}->{_features_and_values_hash}    =  $features_and_values_all_trees->{$t};
        $self->{_all_trees}->{$t}->{_features_and_unique_values_hash} = $features_and_unique_values_all_trees->{$t};
        $self->{_all_trees}->{$t}->{_numeric_features_valuerange_hash} = $numeric_features_valuerange_all_trees->{$t}; 
        $self->{_all_trees}->{$t}->{_feature_values_how_many_uniques_hash} = $feature_values_how_many_uniques_all_trees->{$t};
    }
    if ($self->{_debug1}) {
        foreach my $t (0..$self->{_how_many_trees}-1) {
            print "\n\n=============================   For Tree $t   ==================================\n";
            print "\nAll class names: @{$self->{_all_trees}->{$t}->{_class_names}}\n";
            print "\nSamples and their feature values for each tree:\n";
            foreach my $item (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{$t}->{_training_data_hash}}) {
                print "$item  =>  @{$self->{_all_trees}->{$t}->{_training_data_hash}->{$item}}\n";
            }
            print "\nclass label for each data sample for each tree:\n";
            foreach my $item (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{$t}->{_samples_class_label_hash}} ) {
                print "$item  =>  $self->{_all_trees}->{$t}->{_samples_class_label_hash}->{$item}\n";

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

                            entropy_threshold
                            max_depth_desired
                            csv_class_column_index
                            csv_columns_for_features
                            symbolic_to_numeric_cardinality_threshold
                            number_of_histogram_bins
                            how_many_trees
                            how_many_training_samples_per_tree
                            looking_for_needles_in_haystack
                            csv_cleanup_needed
                            debug1
                          /;
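    ## For orientation only -- a minimal, hypothetical constructor call exercising
    ## these parameters (the file name and values are made up):
    ##
    ##   my $rt = Algorithm::RandomizedTreesForBigData->new(
    ##                 training_datafile                  => 'my_training_data.csv',
    ##                 csv_class_column_index             => 2,
    ##                 csv_columns_for_features           => [3, 4, 5],
    ##                 how_many_trees                     => 5,
    ##                 how_many_training_samples_per_tree => 1000,
    ##                 csv_cleanup_needed                 => 1,
    ##             );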
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if check_for_illegal_params(@params) == 0;
    my %dtargs = %args;
    delete $dtargs{dependent_variable_column};
    delete $dtargs{predictor_columns};
    delete $dtargs{mse_threshold};
    delete $dtargs{need_data_normalization};
    delete $dtargs{jacobian_choice};
    delete $dtargs{debug1_r};
    delete $dtargs{debug2_r};
    delete $dtargs{debug3_r};
    my $instance = Algorithm::DecisionTree->new(%dtargs);
    bless $instance, $class;
    $instance->{_dependent_variable_column}       =  $args{dependent_variable_column} || undef;
    $instance->{_predictor_columns}               =  $args{predictor_columns} || 0;
    $instance->{_mse_threshold}                   =  $args{mse_threshold} || 0.01;
    $instance->{_jacobian_choice}                 =  $args{jacobian_choice} || 0;
    $instance->{_need_data_normalization}         =  $args{need_data_normalization} || 0;
    $instance->{_dependent_var}                   =  undef;
    $instance->{_dependent_var_values}            =  undef;
    $instance->{_samples_dependent_var_val_hash}  =  undef;
    $instance->{_root_node}                       =  undef;
    $instance->{_debug1_r}                        =  $args{debug1_r} || 0;
    $instance->{_debug2_r}                        =  $args{debug2_r} || 0;
    $instance->{_debug3_r}                        =  $args{debug3_r} || 0;
    $instance->{_sample_points_for_dependent_var} =  [];
    $instance->{_output_for_plots}                =  {};
    $instance->{_output_for_surface_plots}        =  {};
    return $instance;
}
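
## A minimal usage sketch for the constructor above (the file name and column
## choices are hypothetical; the parameter names are the ones it accepts):
##
##   my $rt = Algorithm::RegressionTree->new(
##                 training_datafile         => 'my_regression_data.csv',
##                 dependent_variable_column => 2,
##                 predictor_columns         => [1],
##                 mse_threshold             => 0.01,
##                 jacobian_choice           => 0,
##                 csv_cleanup_needed        => 1,
##            );
##   $rt->get_training_data_for_regression();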

##############################################  Methods  #################################################
sub get_training_data_for_regression {
    my $self = shift;
    die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        push @dependent_var_values, $parts[$self->{_dependent_variable_column}-1];
        $all_record_ids_with_dependent_var_values{$parts[0]} = $parts[$self->{_dependent_variable_column}-1];
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $self->{_how_many_total_training_samples} = $record_index;   # one less than the total record count because the header line is not a data sample
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1_r};
    my @all_feature_names =   grep $_, split /,/, substr($firstline, index($firstline,','));
    my $dependent_var_column_heading = $all_feature_names[$self->{_dependent_variable_column} - 1];
    my @feature_names = map {$all_feature_names[$_-1]} @{$self->{_predictor_columns}};
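    ## Values are stored as 'feature=value' strings; integer-looking values are
    ## normalized to one decimal place (e.g. 3 becomes 3.0) by the sprintf calls below.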
    my %dependent_var_value_for_sample_hash = map {"sample_" . $_  =>  "$dependent_var_column_heading=" . $data_hash{$_}->[$self->{_dependent_variable_column} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_-1]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : ...
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a-1] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_predictor_columns}};     
    my %numeric_features_valuerange_hash   =   ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %features_and_unique_values_hash = ();

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

        my @unique_values_for_feature =  grep {$_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($self->{_debug1_r}) {
        print "\nDependent var values: @dependent_var_values\n";
        print "\nEach sample data record:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$kee    =>   @{$feature_values_for_samples_hash{$kee}}\n";
        }
        print "\ndependent var value for each data sample:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %dependent_var_value_for_sample_hash) {
            print "$kee    =>   $dependent_var_value_for_sample_hash{$kee}\n";
        }
        print "\nfeatures and the values taken by them:\n";

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

    $self->{_features_and_unique_values_hash}  =  \%features_and_unique_values_hash;
    $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
    $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
    $self->SUPER::calculate_first_order_probabilities();
}

sub construct_XMatrix_and_YVector_all_data {
    my $self = shift;
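    ## Strip the 'feature=' prefix from each stored value to recover raw numeric rows
    ## (ordered by sample index), then append a constant 1 to every row so that the
    ## last column of X serves as the intercept term.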
    my $matrix_rows_as_lists =  [ map {my @arr = @$_; [map substr($_,index($_,'=')+1), @arr] } map {$self->{_training_data_hash}->{$_}} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}} ];
    map {push @$_, 1} @{$matrix_rows_as_lists};
    map {print "XMatrix row: @$_\n"} @{$matrix_rows_as_lists} if $self->{_debug1_r};
    my $XMatrix = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, scalar @{$matrix_rows_as_lists->[0]});
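    ## The pairmap over the list slice below interleaves row indices with row arrayrefs,
    ## so each pairmap call is $XMatrix->set_row($index, $row).  A plain-loop equivalent:
    ##     $XMatrix->set_row($_, $matrix_rows_as_lists->[$_]) for 0 .. @{$matrix_rows_as_lists} - 1;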
    pairmap {$XMatrix->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, @{$matrix_rows_as_lists} )
                       [ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
    if ($self->{_debug1_r}) {
        foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
            my @onerow = $XMatrix->row($rowindex)->as_list;
            print "XMatrix row again: @onerow\n";
        }
    }
    $self->{_XMatrix} = $XMatrix;
    my @dependent_var_values =  map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_samples_dependent_var_val_hash}};
    print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
    my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
    pairmap {$YVector->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, map {[$_]} @dependent_var_values )
                       [ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
    if ($self->{_debug1_r}) {
        foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
            my @onerow = $YVector->row($rowindex)->as_list;
            print "YVector row: @onerow\n";
        }
    }
    $self->{_YVector} = $YVector;
    return ($XMatrix, $YVector);
}

sub estimate_regression_coefficients {
    my $self = shift;
    my ($XMatrix, $YVector, $display) = @_;
    $display = 0 unless defined $display;
    my ($nrows, $ncols) = $XMatrix->dim;
    print "nrows=$nrows   ncols=$ncols\n" if $self->{_debug2_r};
    my $jacobian_choice = $self->{_jacobian_choice};
    my $X = $XMatrix->copy();
    my $y = $YVector->copy();
    if ($self->{_need_data_normalization}) {
        die "normalization feature is yet to be added to the module --- sorry";
    }
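    ## Closed-form least-squares starting point: beta0 solves the normal equations,
    ## beta = (X'X)^{-1} X'y.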
    my $beta0 = (transpose($X) * $X)->inverse() * transpose($X) * $y;
    my ($betarows, $betacols) = $beta0->dim;
    die "Something has gone wrong with the calculation of beta coefficients" if $betacols > 1;
    if ($jacobian_choice == 0) {

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

                    push @xvalues, "@onerow";
                }
                $self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
            } else {
                print "no display when the overall dimensionality of the data exceeds 3\n";
            }
        }
        return ($error, $beta0);    
    }
    my $beta = $beta0; 
    if ($self->{_debug2_r}) {
        print "\ndisplaying beta0 matrix\n";
        display_matrix($beta);
    }
    my $gamma = 0.1;
    my $iterate_again_flag = 1;
    my $delta = 0.001;
    my $master_iteration_index = 0;
    $|++;
    while (1) {
        print "*" unless $master_interation_index++ % 100;
        last unless $iterate_again_flag;
        $gamma *= 0.1;
        $beta0 = 0.99 * $beta0;
        print "\n\n======== starting iterations with gamma= $gamma ===========\n\n\n" if $self->{_debug2_r};
        $beta = $beta0;
        my $beta_old = Math::GSL::Matrix->new($betarows, 1)->zero;
        my $error_old = sum( map abs, ($y - ($X * $beta_old) )->col(0)->as_list ) / $nrows;   
        my $error;
        foreach my $iteration (0 .. 1499) {
            print "." unless $iteration % 100;
            $beta_old = $beta->copy;
            my $jacobian;
            if ($jacobian_choice == 1) {
                $jacobian = $X;

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

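            ## Gradient-descent update for the squared-error cost:
            ## beta <- beta_old + 2*gamma*J'*(y - X*beta), with J the chosen Jacobian.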
            $beta = $beta_old + 2 * $gamma * transpose($jacobian) * ( $y - ($X * $beta) );
            $error = sum( map abs, ($y - ($X * $beta) )->col(0)->as_list ) / $nrows;   
            if ($error > $error_old) {
                $iterate_again_flag = 0 if vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old));
                last;
            }
            if ($self->{_debug2_r}) {
                print "\n\niteration: $iteration   gamma: $gamma   current error: $error\n";
                print "\nnew beta:\n";
                display_matrix $beta;
            }
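            ## Convergence test: stop when the relative change in beta falls below 1e-5.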
            if ( vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old)) ) { 
                print "iterations used: $iteration with gamma: $gamma\n" if $self->{_debug2_r};
                $iterate_again_flag = 0;
                last;
            }
            $error_old = $error;
        }
    }
    display_matrix($beta) if $self->{_debug2_r};
    my $predictions = $X * $beta;
    my @error_distribution = ($y - ($X * $beta))->as_list;
    my $total_abs_error =  sum map abs, @error_distribution;       # sum of absolute (not squared) residuals
    my $error = $total_abs_error / $nrows;
    if ($display) {
        if ($ncols == 2) {
            my @xvalues = $X->col(0)->as_list;
            my @yvalues = $predictions->col(0)->as_list;
            $self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
        } elsif ($ncols == 3) {

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

        print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
        return if $error <= $self->{_mse_threshold}; 
    }
    my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) = 
                                               $self->best_feature_calculator(\@copy_of_path_attributes);
    return unless defined $best_feature_partitioning_point;
    print "\nBest feature found: $best_feature\n";
    print "Best feature partitioning_point: $best_feature_partitioning_point\n";
    print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug2_r}; 
    return if (defined $self->{_max_depth_desired}) && 
                            (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired}); 
    if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
        my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
        my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
        my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
        my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
        push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

                }
            }
        }
    }
    my @training_samples_at_node = keys %training_samples_at_node;
    my $matrix_rows_as_lists =  [ map {my @arr = @$_; [map substr($_,index($_,'=')+1), @arr] } map {$self->{_training_data_hash}->{$_}} sort {sample_index($a) <=> sample_index($b)} @training_samples_at_node ];
    map {push @$_, 1} @{$matrix_rows_as_lists};
    my $XMatrix = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, scalar @{$matrix_rows_as_lists->[0]});
    pairmap {$XMatrix->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, @{$matrix_rows_as_lists} )
                       [ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
    if ($self->{_debug3_r}) {
        print "\nXMatrix as its transpose:";        
        my $displayX = transpose($XMatrix);
        display_matrix($displayX);
    }
    my @dependent_var_values =  map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} @training_samples_at_node;
    print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
    my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
    pairmap {$YVector->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, map {[$_]} @dependent_var_values )
                       [ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
    if ($self->{_debug3_r}) {
        print "\nYVector:";  
        my $displayY = transpose($YVector);
        display_matrix($displayY);
    }
    my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector);
    if ($self->{_debug3_r}) {
        display_matrix($beta);
        print("\n\nerror distribution at node: ", $error);
    }
    return ($error,$beta,$XMatrix,$YVector);
}

#-----------------------------    Predict with Regression Tree   ------------------------------

sub predictions_for_all_data_used_for_regression_estimation {
    my $self = shift;

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

    my $feature_and_values = shift;
    my $answer = shift;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my $leaf_node_prediction = $node->node_prediction_from_features_and_values($feature_and_values); 
        $answer->{'prediction'} = $leaf_node_prediction;
        push @{$answer->{'solution_path'}}, $node->get_serial_num();
        return $answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nFeature tested at node for prediction: $feature_tested_at_node\n" if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    my ($feature,$value);
    foreach my $feature_and_value (@$feature_and_values) {
        next unless $feature_and_value =~ /$pattern/;
        ($feature,$value) =  ($1, $2);
        if ($feature eq $feature_tested_at_node) {
            $value_for_feature = $value;
        }

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / training_datafile
                            max_depth_desired
                            dependent_variable_column
                            predictor_columns
                            mse_threshold
                            need_data_normalization
                            jacobian_choice
                            csv_cleanup_needed
                            debug1_r
                            debug2_r
                            debug3_r
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }


