Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/BoostedDecisionTree.pm  view on Meta::CPAN

        $dt_this_stage->{_max_depth_desired} = $self->{_all_trees}->{0}->{_max_depth_desired};        
        $dt_this_stage->{_symbolic_to_numeric_cardinality_threshold} = $self->{_all_trees}->{0}->{_symbolic_to_numeric_cardinality_threshold};
        $dt_this_stage->{_samples_class_label_hash} = {map {$_ => $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$_}} keys %{$dt_this_stage->{_training_data_hash}}};
        $dt_this_stage->{_features_and_values_hash} = {map {$_ => []} keys %{$self->{_all_trees}->{0}->{_features_and_values_hash}}};
        my $pattern = '(\S+)\s*=\s*(\S+)';        
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$dt_this_stage->{_training_data_hash}}) { 
            foreach my $feature_and_value (@{$dt_this_stage->{_training_data_hash}->{$sample}}) {
                $feature_and_value =~ /$pattern/;
                my ($feature, $value) = ($1, $2);
                push @{$dt_this_stage->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
            }
        }
        $dt_this_stage->{_features_and_unique_values_hash} = {map {my $feature = $_; $feature => [sort keys %{{map {$_ => 1} @{$dt_this_stage->{_features_and_values_hash}->{$feature}}}}]} keys %{$dt_this_stage->{_features_and_values_hash}}};
        $dt_this_stage->{_numeric_features_valuerange_hash} = {map {$_ => []} keys %{$self->{_all_trees}->{0}->{_numeric_features_valuerange_hash}}};
        $dt_this_stage->{_numeric_features_valuerange_hash} = {map {my $feature = $_; $feature =>  [min(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}}), max(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}})]} keys ...
        if ($self->{_stagedebug}) {
            print "\n\nPrinting features and their values in the training set:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_features_and_values_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_features_and_values_hash}->{$kee}}\n";
            }
            print "\n\nPrinting unique values for features:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_features_and_unique_values_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_features_and_unique_values_hash}->{$kee}}\n";            
            }
            print "\n\nPrinting unique value ranges for features:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_numeric_features_valuerange_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_numeric_features_valuerange_hash}->{$kee}}\n";            
            }
        }
        $dt_this_stage->{_feature_values_how_many_uniques_hash} = {map {$_ => undef} keys %{$self->{_all_trees}->{0}->{_features_and_unique_values_hash}}};
        $dt_this_stage->{_feature_values_how_many_uniques_hash} = {map {$_ => scalar @{$dt_this_stage->{_features_and_unique_values_hash}->{$_}}} keys %{$self->{_all_trees}->{0}->{_features_and_unique_values_hash}}};
        $dt_this_stage->calculate_first_order_probabilities();
        $dt_this_stage->calculate_class_priors();
        print "\n\n>>>>>>>Done with the initialization of the tree for stage $stage_index<<<<<<<<<<\n" if $self->{_stagedebug};
        my $root_node_this_stage = $dt_this_stage->construct_decision_tree_classifier();
        $root_node_this_stage->display_decision_tree("     ") if $self->{_stagedebug};

        $self->{_all_trees}->{$stage_index} = $dt_this_stage;
        $self->{_root_nodes}->{$stage_index} = $root_node_this_stage;
        $self->{_misclassified_samples}->{$stage_index} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{$stage_index}, $self->{_root_nodes}->{$stage_index});
        if ($self->{_stagedebug}) {
            print "\nSamples misclassified by stage $stage_index classifier: @{$self->{_misclassified_samples}->{$stage_index}}\n";
            printf("\nNumber of misclassified samples: %d\n", scalar @{$self->{_misclassified_samples}->{$stage_index}});
            $self->show_class_labels_for_misclassified_samples_in_stage($stage_index);
        }
        my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{$stage_index}->{$_}} @{$self->{_misclassified_samples}->{$stage_index}};
        print "\nStage $stage_index misclassification_error_rate: $misclassification_error_rate\n" if $self->{_stagedebug};

        $self->{_trust_factors}->{$stage_index} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
        print "\nStage $stage_index trust factor: $self->{_trust_factors}->{$stage_index}\n"  if $self->{_stagedebug};
    }
}

sub evaluate_one_stage_of_cascade {
    my $self = shift;
    my $trainingDT = shift;
    my $root_node = shift;
    my @misclassified_samples = ();
    foreach my $test_sample_name (@{$self->{_all_sample_names}}) {
        my @test_sample_data = @{$self->{_all_trees}->{0}->{_training_data_hash}->{$test_sample_name}};
        print "original data in $test_sample_name:@test_sample_data\n" if $self->{_stagedebug};
        @test_sample_data = map {$_ if $_ !~ /=NA$/} @test_sample_data;
        print "$test_sample_name: @test_sample_data\n" if $self->{_stagedebug}; 
        my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
        my @solution_path = @{$classification{'solution_path'}};                                  
        delete $classification{'solution_path'};                                              
        my @which_classes = keys %classification;
        @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
        my $most_likely_class_label = $which_classes[0];
        if ($self->{_stagedebug}) {
            print "\nClassification:\n\n";
            print "     class                         probability\n";
            print "     ----------                    -----------\n";
            foreach my $which_class (@which_classes) {
                my $classstring = sprintf("%-30s", $which_class);
                my $valuestring = sprintf("%-30s", $classification{$which_class});
                print "     $classstring $valuestring\n";
            }
            print "\nSolution path in the decision tree: @solution_path\n";
            print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
        }
        my $true_class_label_for_test_sample = $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$test_sample_name};
        printf("%s:   true_class: %s    estimated_class: %s\n", $test_sample_name, $true_class_label_for_test_sample, $most_likely_class_label) if $self->{_stagedebug};
        push @misclassified_samples, $test_sample_name if $true_class_label_for_test_sample ne $most_likely_class_label;
    }
    return [sort {sample_index($a) <=> sample_index($b)} @misclassified_samples];
}

sub show_class_labels_for_misclassified_samples_in_stage {
    my $self = shift;
    my $stage_index = shift;
    die "\nYou must first call 'construct_cascade_of_trees()' before invoking 'show_class_labels_for_misclassified_samples_in_stage()'" unless @{$self->{_misclassified_samples}->{0}} > 0;
    my @classes_for_misclassified_samples = ();
    my @just_class_labels = ();

    for my $sample (@{$self->{_misclassified_samples}->{$stage_index}}) {    
        my $true_class_label_for_sample = $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$sample};            
        push @classes_for_misclassified_samples, sprintf("%s => %s", $sample, $true_class_label_for_sample);
        push @just_class_labels, $true_class_label_for_sample; 
    }
    print "\nSamples misclassified by the classifier for Stage $stage_index: @{$self->{_misclassified_samples}->{$stage_index}}\n";
    my $how_many = @{$self->{_misclassified_samples}->{$stage_index}};
    print "\nNumber of misclassified samples: $how_many\n";
    print "\nShowing class labels for samples misclassified by stage $stage_index: ";
    print "\nClass labels for samples: @classes_for_misclassified_samples\n";
    my @class_names_unique =  sort keys %{{map {$_ => 1} @just_class_labels}};
    print "\nClass names (unique) for misclassified samples: @class_names_unique\n";
    print "\nFinished displaying class labels for samples misclassified by stage $stage_index\n\n";
}

sub display_decision_trees_for_different_stages {
    my $self = shift;
    print "\nDisplaying the decisions trees for all stages:\n\n";
    foreach my $i (0..$self->{_how_many_stages}-1) {
        print "\n\n=============================   For stage $i   ==================================\n\n";
        $self->{_root_nodes}->{$i}->display_decision_tree("     ");
    }
    print "\n==================================================================================\n\n\n";
}

sub classify_with_boosting {



( run in 0.690 second using v1.01-cache-2.11-cpan-6b5c3043376 )