Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/BoostedDecisionTree.pm  view on Meta::CPAN

}

##############################################  Methods  #################################################
sub get_training_data_for_base_tree {
    my $self = shift;
    die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
    my %class_names = ();
    my %all_record_ids_with_class_labels;
    my $firstline;
    my %data_hash;
    $|++;
    open FILEIN, $self->{_training_datafile};
    my $record_index = 0;
    my $firsetline;
    while (<FILEIN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my $record =  $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
        if ($record_index == 0) {
            $firstline = $record;
            $record_index++;
            next;
        }
        my @parts = split /,/, $record;
        my $classname = $parts[$self->{_csv_class_column_index}];
        $class_names{$classname} = 1;
        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        $all_record_ids_with_class_labels{$record_label} = $classname;
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $|--;
    $self->{_how_many_total_training_samples} = $record_index - 1;  # must subtract 1 for the header record
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    my @all_feature_names =   split /,/, substr($firstline, index($firstline,','));
    my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
    my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
    my %class_for_sample_hash = map {"sample_" . $_  =>  "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};     
    my @all_class_names =  sort keys %{ {map {$_ => 1} values %class_for_sample_hash } };
    $self->{_number_of_training_samples} = scalar @sample_names;
    if ($self->{_debug2}) {
        print "\nDisplaying features and their values for entire training data:\n\n";
        foreach my $fname (keys  %features_and_values_hash) {         
            print "        $fname    =>  @{$features_and_values_hash{$fname}}\n";
        }
    }
    my %features_and_unique_values_hash = ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %numeric_features_valuerange_hash   =   ();
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $feature (keys %features_and_values_hash) {
        my %seen = ();
        my @unique_values_for_feature =  grep {$_ if $_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    $self->{_all_trees}->{0}->{_class_names} = \@all_class_names;
    $self->{_all_trees}->{0}->{_feature_names} = \@feature_names;
    $self->{_all_trees}->{0}->{_samples_class_label_hash} = \%class_for_sample_hash;
    $self->{_all_trees}->{0}->{_training_data_hash}  =  \%feature_values_for_samples_hash;
    $self->{_all_trees}->{0}->{_features_and_values_hash}    =  \%features_and_values_hash;
    $self->{_all_trees}->{0}->{_features_and_unique_values_hash}    =  \%features_and_unique_values_hash;
    $self->{_all_trees}->{0}->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
    $self->{_all_trees}->{0}->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
    $self->{_all_training_data} = \%feature_values_for_samples_hash;    
    $self->{_all_sample_names} = [sort {sample_index($a) cmp sample_index($b)} keys %feature_values_for_samples_hash];
    if ($self->{_debug1}) {
        print "\n\n===========================  data ingested for the base tree   ==================================\n\n";
        print "\nAll class names: @{$self->{_all_trees}->{0}->{_class_names}}\n";
        print "\nEach sample data record:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{0}->{_training_data_hash}}) {
            print "$kee    =>   @{$self->{_all_trees}->{0}->{_training_data_hash}->{$kee}}\n";
        }
        print "\nclass label for each data sample:\n";        
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_all_trees}->{0}->{_samples_class_label_hash}}) {
            print "$kee    =>   $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$kee}\n";            
        }
        print "\nfeatures and the values taken by them:\n";
        for my $kee  (sort keys %{$self->{_all_trees}->{0}->{_features_and_values_hash}}) {
            print "$kee    =>   @{$self->{_all_trees}->{0}->{_features_and_values_hash}->{$kee}}\n";                        
        }
        print "\nnumeric features and their ranges:\n";
        for my $kee  (sort keys %{$self->{_all_trees}->{0}->{_numeric_features_valuerange_hash}}) {
            print "$kee    =>   @{$self->{_all_trees}->{0}->{_numeric_features_valuerange_hash}->{$kee}}\n";
        }
        print "\nunique values for the features:\n";
        for my $kee  (sort keys %{$self->{_all_trees}->{0}->{_features_and_unique_values_hash}}) {
            print "$kee    =>   @{$self->{_all_trees}->{0}->{_features_and_unique_values_hash}->{$kee}}\n";  
        }
        print "\nnumber of unique values in each feature:\n";        
        for my $kee  (sort keys %{$self->{_all_trees}->{0}->{_feature_values_how_many_uniques_hash}}) {
            print "$kee    =>   $self->{_all_trees}->{0}->{_feature_values_how_many_uniques_hash}->{$kee}\n";
        }
    }
}

sub show_training_data_for_base_tree {
    my $self = shift;
    $self->{_all_trees}->{0}->show_training_data();
}

sub calculate_first_order_probabilities_and_class_priors {
    my $self = shift;
    $self->{_all_trees}->{0}->calculate_first_order_probabilities();
    $self->{_all_trees}->{0}->calculate_class_priors();
    $self->{_sample_selection_probs}->{0} =  {map { $_ => 1.0/@{$self->{_all_sample_names}} } @{$self->{_all_sample_names}}};
}

sub construct_base_decision_tree {
    my $self = shift;
    $self->{_root_nodes}->{0} = $self->{_all_trees}->{0}->construct_decision_tree_classifier();



( run in 1.825 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )