Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        foreach my $sample_name (@training_samples) {
            $trainingDT->{_samples_class_label_hash}->{$sample_name} = 
                                                $self->{_samples_class_label_hash}->{$sample_name};
        }
        foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
            $trainingDT->{_features_and_values_hash}->{$feature} = ();
        }
        my $pattern = '(\S+)\s*=\s*(\S+)';
        foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=> 
                                Algorithm::DecisionTree::sample_index($b)}  
                          keys %{$trainingDT->{_training_data_hash}}) {
            foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
                $feature_and_value =~ /$pattern/;
                my ($feature,$value) = ($1,$2);
                push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
            }
        }
        foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
            my %seen = ();
            my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} 
                                                @{$trainingDT->{_features_and_values_hash}->{$feature}}; 
            if (Algorithm::DecisionTree::contained_in($feature, 
                                                keys %{$self->{_numeric_features_valuerange_hash}})) {
                @unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
            } else {
                @unique_values_for_feature = sort @unique_values_for_feature;
            }
            $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
        }
        foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
            my @minmaxvalues = Algorithm::DecisionTree::minmax(
                                         \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
            $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
        }
        if ($evaldebug) {
            print "\n\nprinting samples in the testing set: @testing_samples\n";
            print "\n\nPrinting features and their values in the training set:\n";
            foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
                print "$item  => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
            }
            print "\n\nPrinting unique values for features:\n";
            foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
                print "$item  => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
            }
            print "\n\nPrinting unique value ranges for features:\n";
            foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
                print "$item  => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
            }
        }
        foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
            $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} = 
                scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
        }
        $trainingDT->{_debug2} = 1 if $evaldebug;
        $trainingDT->calculate_first_order_probabilities();
        $trainingDT->calculate_class_priors();
        my $root_node = $trainingDT->construct_decision_tree_classifier();
        $root_node->display_decision_tree("     ") if $evaldebug;
        foreach my $test_sample_name (@testing_samples) {
            my @test_sample_data = @{$all_training_data{$test_sample_name}};
            print "original data in test sample: @test_sample_data\n" if $evaldebug;
            @test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
            print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
            my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
            my @solution_path = @{$classification{'solution_path'}};
            delete $classification{'solution_path'};
            my @which_classes = keys %classification;
            @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
            my $most_likely_class_label = $which_classes[0];
            if ($evaldebug) {
                print "\nClassification:\n\n";
                print "     class                         probability\n";
                print "     ----------                    -----------\n";
                foreach my $which_class (@which_classes) {
                    my $classstring = sprintf("%-30s", $which_class);
                    my $valuestring = sprintf("%-30s", $classification{$which_class});
                    print "     $classstring $valuestring\n";
                }
                print "\nSolution path in the decision tree: @solution_path\n";
                print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
            }
            my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
            print "$test_sample_name:    true_class: $true_class_label_for_sample    " .
                     "estimated_class: $most_likely_class_label\n"  if $evaldebug;
            $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
        }
    }
    print "\n\n       DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
    my $matrix_header = " " x 30;
    foreach my $class_name (@{$self->{_class_names}}) {  
        $matrix_header .= sprintf("%-30s", $class_name);
    }
    print "\n" . $matrix_header . "\n\n";
    foreach my $row_class_name (sort keys %confusion_matrix) {
        my $row_display = sprintf("%-30s", $row_class_name);
        foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
            $row_display .= sprintf( "%-30u",  $confusion_matrix{$row_class_name}->{$col_class_name} );
        }
        print "$row_display\n\n";
    }
    print "\n\n";
    my ($diagonal_sum, $off_diagonal_sum) = (0,0);
    foreach my $row_class_name (sort keys %confusion_matrix) {
        foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
            if ($row_class_name eq $col_class_name) {
                $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
            } else {
                $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
            }
        }
    }
    my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
    print "\nTraining Data Quality Index: $data_quality_index    (out of a possible maximum of 100)\n";
    if ($data_quality_index <= 80) {
        print "\nYour training data does not possess much class discriminatory " .
              "information.  It could be that the classes are inherently not well " .
              "separable or that your constructor parameter choices are not appropriate.\n";
    } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
        print "\nYour training data possesses some class discriminatory information " .
              "but it may not be sufficient for real-world applications.  You might " .
              "try tweaking the constructor parameters to see if that improves the " .



( run in 0.869 second using v1.01-cache-2.11-cpan-6b5c3043376 )