# Algorithm::DecisionTree -- excerpt of lib/Algorithm/DecisionTree.pm
# (as viewed on MetaCPAN; release navigation links removed)
# --- Per-fold setup: copy the true class labels of this fold's training
# samples from the parent object into the fold-local decision-tree object.
foreach my $sample_name (@training_samples) {
$trainingDT->{_samples_class_label_hash}->{$sample_name} =
$self->{_samples_class_label_hash}->{$sample_name};
}
# Reset the per-feature value lists for the fold.
# NOTE(review): assigning an empty list to a scalar hash slot stores undef,
# not an empty arrayref; the push below autovivifies the arrayref anyway,
# so this acts as a "clear" -- confirm that is the intent.
foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
$trainingDT->{_features_and_values_hash}->{$feature} = ();
}
# Each training record is a list of "feature=value" strings; parse them
# (iterating samples in numeric sample-index order) and rebuild the
# feature -> [observed values] map from this fold's training samples only.
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
Algorithm::DecisionTree::sample_index($b)}
keys %{$trainingDT->{_training_data_hash}}) {
foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
# NOTE(review): the match result is unchecked; per perlre, if an entry
# fails to match, $1/$2 keep values from the previous successful match,
# silently duplicating the prior feature/value pair -- verify entries
# are guaranteed to be well-formed.
$feature_and_value =~ /$pattern/;
my ($feature,$value) = ($1,$2);
# 'NA' marks a missing value and is excluded from the value lists.
push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
}
}
# Derive the sorted unique-value list for every feature: numeric features
# sort numerically, symbolic features lexically.
foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
my %seen = ();
# NOTE(review): "grep {$_ if cond}" makes the value itself part of the
# filter, so a false-but-valid value such as "0" or "" would be dropped
# -- verify no feature takes such values (List::Util::uniq avoids this).
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
@{$trainingDT->{_features_and_values_hash}->{$feature}};
if (Algorithm::DecisionTree::contained_in($feature,
keys %{$self->{_numeric_features_valuerange_hash}})) {
@unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
} else {
@unique_values_for_feature = sort @unique_values_for_feature;
}
$trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
}
# For numeric features, record the [min, max] range seen in this fold.
foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
my @minmaxvalues = Algorithm::DecisionTree::minmax(
\@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
$trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
}
# Optional diagnostic dump of the fold's feature tables.
if ($evaldebug) {
print "\n\nprinting samples in the testing set: @testing_samples\n";
print "\n\nPrinting features and their values in the training set:\n";
foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
}
print "\n\nPrinting unique values for features:\n";
foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
}
print "\n\nPrinting unique value ranges for features:\n";
foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
}
}
# Cache the count of unique values per feature.
# NOTE(review): this iterates the keys of $self's hash but reads and writes
# $trainingDT's -- the sibling loops above iterate $trainingDT's keys, so
# this looks like it was meant to as well; confirm the two key sets agree.
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
$trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
}
# Train: compute probabilities and priors, then build this fold's tree.
$trainingDT->{_debug2} = 1 if $evaldebug;
$trainingDT->calculate_first_order_probabilities();
$trainingDT->calculate_class_priors();
my $root_node = $trainingDT->construct_decision_tree_classifier();
$root_node->display_decision_tree(" ") if $evaldebug;
# Evaluate: classify every held-out test sample with the fold's tree and
# tally the (true class, predicted class) pair in the confusion matrix.
foreach my $test_sample_name (@testing_samples) {
my @test_sample_data = @{$all_training_data{$test_sample_name}};
print "original data in test sample: @test_sample_data\n" if $evaldebug;
# Drop features whose value is missing ('NA') before classification.
# NOTE(review): same "grep {$_ if ...}" value-dropping caveat as above.
@test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
# classify() returns class => probability pairs plus a 'solution_path'
# entry; pull the path out so only class labels remain as keys.
my @solution_path = @{$classification{'solution_path'}};
delete $classification{'solution_path'};
my @which_classes = keys %classification;
# Sort classes by descending probability; the first is the prediction.
@which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
my $most_likely_class_label = $which_classes[0];
if ($evaldebug) {
print "\nClassification:\n\n";
print "     class                         probability\n";
print "     ----------                    -----------\n";
foreach my $which_class (@which_classes) {
my $classstring = sprintf("%-30s", $which_class);
my $valuestring = sprintf("%-30s", $classification{$which_class});
print "     $classstring $valuestring\n";
}
print "\nSolution path in the decision tree: @solution_path\n";
print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
}
my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
print "$test_sample_name:    true_class: $true_class_label_for_sample    " .
"estimated_class: $most_likely_class_label\n" if $evaldebug;
$confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
}
# End of the cross-validation fold loop (opened before this excerpt).
}
# --- Render the confusion matrix accumulated over all folds, one
# fixed-width (30-char) column per class.
print "\n\n       DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
my $matrix_header = " " x 30;
foreach my $class_name (@{$self->{_class_names}}) {
$matrix_header .= sprintf("%-30s", $class_name);
}
print "\n" . $matrix_header . "\n\n";
# NOTE(review): row cells come from the keys actually present in
# %confusion_matrix, while the header comes from _class_names; if some
# (true, predicted) pair never occurred, that cell is absent and the row's
# columns shift out of alignment with the header -- verify the matrix is
# pre-initialized with zeros for every class pair elsewhere.
foreach my $row_class_name (sort keys %confusion_matrix) {
my $row_display = sprintf("%-30s", $row_class_name);
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
$row_display .= sprintf( "%-30u",  $confusion_matrix{$row_class_name}->{$col_class_name} );
}
print "$row_display\n\n";
}
print "\n\n";
# Data Quality Index = percentage of all classifications that landed on the
# diagonal (correct), i.e. 100 * correct / total.
my ($diagonal_sum, $off_diagonal_sum) = (0,0);
foreach my $row_class_name (sort keys %confusion_matrix) {
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
if ($row_class_name eq $col_class_name) {
$diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
} else {
$off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
}
}
}
# NOTE(review): dies with an illegal-division-by-zero error if the matrix
# is empty (no samples classified) -- confirm callers guarantee data.
my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
print "\nTraining Data Quality Index: $data_quality_index   (out of a possible maximum of 100)\n";
# Translate the numeric quality index into human-readable guidance.
# (This if/elsif chain, and the string being built in the elsif branch,
# continue past the end of this excerpt.)
if ($data_quality_index <= 80) {
print "\nYour training data does not possess much class discriminatory " .
"information.  It could be that the classes are inherently not well " .
"separable or that your constructor parameter choices are not appropriate.\n";
} elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
print "\nYour training data possesses some class discriminatory information " .
"but it may not be sufficient for real-world applications.  You might " .
"try tweaking the constructor parameters to see if that improves the " .
# ( run in 0.869 second using v1.01-cache-2.11-cpan-6b5c3043376 )  -- MetaCPAN page-render footer, not part of the module source