Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/BoostedDecisionTree.pm view on Meta::CPAN
}
}
}
# Asks the base tree -- the entry stored under key 0 of _all_trees -- to
# show its training data.  Whatever show_training_data() returns is
# handed back to the caller unchanged.
sub show_training_data_for_base_tree {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}->{0};
    return $base_tree->show_training_data();
}
# Runs calculate_first_order_probabilities() and then
# calculate_class_priors() on the base tree (key 0 of _all_trees), and
# afterwards stores under _sample_selection_probs->{0} a hash that maps
# every name in _all_sample_names to 1/N, where N is the number of
# entries in _all_sample_names.
sub calculate_first_order_probabilities_and_class_priors {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}->{0};
    $base_tree->calculate_first_order_probabilities();
    $base_tree->calculate_class_priors();
    my $sample_names = $self->{_all_sample_names};
    my $how_many_samples = scalar @{$sample_names};
    my %selection_prob_for;
    foreach my $sample_name (@{$sample_names}) {
        # The division is done inside the loop on purpose: with an empty
        # sample list it is never evaluated, so no divide-by-zero occurs.
        $selection_prob_for{$sample_name} = 1.0 / $how_many_samples;
    }
    # This assignment is the last statement, so its value (the hash
    # reference) is what the sub returns.
    $self->{_sample_selection_probs}->{0} = \%selection_prob_for;
}
# Calls construct_decision_tree_classifier() on the base tree (key 0 of
# _all_trees) in scalar context and records the result under
# _root_nodes->{0}.  The stored value is also the sub's return value.
sub construct_base_decision_tree {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}->{0};
    my $root_node = $base_tree->construct_decision_tree_classifier();
    # Last statement: the assigned value doubles as the return value.
    $self->{_root_nodes}->{0} = $root_node;
}
# Asks the base root node (key 0 of _root_nodes) to display its decision
# tree, passing a one-space string as the only argument.  The result of
# display_decision_tree() is returned to the caller unchanged.
sub display_base_decision_tree {
    my ($self) = @_;
    my $root_node = $self->{_root_nodes}->{0};
    # NOTE(review): presumably the starting indentation/prefix for the
    # printout -- confirm against display_decision_tree().
    my $prefix = " ";
    return $root_node->display_decision_tree($prefix);
}
sub construct_cascade_of_trees {
my $self = shift;
$self->{_training_samples}->{0} = $self->{_all_sample_names};
$self->{_misclassified_samples}->{0} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{0}, $self->{_root_nodes}->{0});
if ($self->{_stagedebug}) {
$self->show_class_labels_for_misclassified_samples_in_stage(0);
print "\n\nSamples misclassified by base classifier: @{$self->{_misclassified_samples}->{0}}\n";
my $how_many = @{$self->{_misclassified_samples}->{0}};
print "\nNumber of misclassified samples: $how_many\n";
}
my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{0}->{$_}} @{$self->{_misclassified_samples}->{0}};
print "\nMisclassification_error_rate for base classifier: $misclassification_error_rate\n" if $self->{_stagedebug};
$self->{_trust_factors}->{0} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
print "\nBase class trust factor: $self->{_trust_factors}->{0}\n" if $self->{_stagedebug};
foreach my $stage_index (1 .. $self->{_how_many_stages} - 1) {
print "\n\n========================== Constructing stage indexed $stage_index =========================\n"
if $self->{_stagedebug};
$self->{_sample_selection_probs}->{$stage_index} = { map {$_ => $self->{_sample_selection_probs}->{$stage_index-1}->{$_} * exp(-1.0 * $self->{_trust_factors}->{$stage_index - 1} * (contained_in($_, @{$self->{_misclassified_samples}->{$st...
my $normalizer = reduce {$a + $b} values %{$self->{_sample_selection_probs}->{$stage_index}};
print "\nThe normalizer is: $normalizer\n" if $self->{_stagedebug};
map {$self->{_sample_selection_probs}->{$stage_index}->{$_} /= $normalizer} keys %{$self->{_sample_selection_probs}->{$stage_index}};
my @training_samples_this_stage = ();
my $sum_of_probs = 0.0;
foreach my $sample (sort {$self->{_sample_selection_probs}->{$stage_index}->{$b} <=> $self->{_sample_selection_probs}->{$stage_index}->{$a}} keys %{$self->{_sample_selection_probs}->{$stage_index}}) {
$sum_of_probs += $self->{_sample_selection_probs}->{$stage_index}->{$sample};
push @training_samples_this_stage, $sample if $sum_of_probs < 0.5;
last if $sum_of_probs > 0.5;
}
$self->{_training_samples}->{$stage_index} = [sort {sample_index($a) <=> sample_index($b)} @training_samples_this_stage];
if ($self->{_stagedebug}) {
print "\nTraining samples for stage $stage_index: @{$self->{_training_samples}->{$stage_index}}\n\n";
my $num_of_training_samples = @{$self->{_training_samples}->{$stage_index}};
print "\nNumber of training samples this stage $num_of_training_samples\n\n";
}
# find intersection of two sets:
my %misclassified_samples = map {$_ => 1} @{$self->{_misclassified_samples}->{$stage_index-1}};
my @training_samples_selection_check = grep $misclassified_samples{$_}, @{$self->{_training_samples}->{$stage_index}};
if ($self->{_stagedebug}) {
my @training_in_misclassified = sort {sample_index($a) <=> sample_index($b)} @training_samples_selection_check;
print "\nTraining samples in the misclassified set: @training_in_misclassified\n";
my $how_many = @training_samples_selection_check;
print "\nNumber_of_miscalssified_samples_in_training_set: $how_many\n";
}
my $dt_this_stage = Algorithm::DecisionTree->new('boostingmode');
$dt_this_stage->{_training_data_hash} = { map {$_ => $self->{_all_training_data}->{$_} } @{$self->{_training_samples}->{$stage_index}} };
$dt_this_stage->{_class_names} = $self->{_all_trees}->{0}->{_class_names};
$dt_this_stage->{_feature_names} = $self->{_all_trees}->{0}->{_feature_names};
$dt_this_stage->{_entropy_threshold} = $self->{_all_trees}->{0}->{_entropy_threshold};
$dt_this_stage->{_max_depth_desired} = $self->{_all_trees}->{0}->{_max_depth_desired};
$dt_this_stage->{_symbolic_to_numeric_cardinality_threshold} = $self->{_all_trees}->{0}->{_symbolic_to_numeric_cardinality_threshold};
$dt_this_stage->{_samples_class_label_hash} = {map {$_ => $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$_}} keys %{$dt_this_stage->{_training_data_hash}}};
$dt_this_stage->{_features_and_values_hash} = {map {$_ => []} keys %{$self->{_all_trees}->{0}->{_features_and_values_hash}}};
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$dt_this_stage->{_training_data_hash}}) {
foreach my $feature_and_value (@{$dt_this_stage->{_training_data_hash}->{$sample}}) {
$feature_and_value =~ /$pattern/;
my ($feature, $value) = ($1, $2);
push @{$dt_this_stage->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
}
}
$dt_this_stage->{_features_and_unique_values_hash} = {map {my $feature = $_; $feature => [sort keys %{{map {$_ => 1} @{$dt_this_stage->{_features_and_values_hash}->{$feature}}}}]} keys %{$dt_this_stage->{_features_and_values_hash}}};
$dt_this_stage->{_numeric_features_valuerange_hash} = {map {$_ => []} keys %{$self->{_all_trees}->{0}->{_numeric_features_valuerange_hash}}};
$dt_this_stage->{_numeric_features_valuerange_hash} = {map {my $feature = $_; $feature => [min(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}}), max(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}})]} keys ...
if ($self->{_stagedebug}) {
print "\n\nPrinting features and their values in the training set:\n\n";
foreach my $kee (sort keys %{$dt_this_stage->{_features_and_values_hash}}) {
print "$kee => @{$dt_this_stage->{_features_and_values_hash}->{$kee}}\n";
}
print "\n\nPrinting unique values for features:\n\n";
foreach my $kee (sort keys %{$dt_this_stage->{_features_and_unique_values_hash}}) {
print "$kee => @{$dt_this_stage->{_features_and_unique_values_hash}->{$kee}}\n";
}
print "\n\nPrinting unique value ranges for features:\n\n";
foreach my $kee (sort keys %{$dt_this_stage->{_numeric_features_valuerange_hash}}) {
print "$kee => @{$dt_this_stage->{_numeric_features_valuerange_hash}->{$kee}}\n";
}
}
$dt_this_stage->{_feature_values_how_many_uniques_hash} = {map {$_ => undef} keys %{$self->{_all_trees}->{0}->{_features_and_unique_values_hash}}};
$dt_this_stage->{_feature_values_how_many_uniques_hash} = {map {$_ => scalar @{$dt_this_stage->{_features_and_unique_values_hash}->{$_}}} keys %{$self->{_all_trees}->{0}->{_features_and_unique_values_hash}}};
$dt_this_stage->calculate_first_order_probabilities();
$dt_this_stage->calculate_class_priors();
print "\n\n>>>>>>>Done with the initialization of the tree for stage $stage_index<<<<<<<<<<\n" if $self->{_stagedebug};
my $root_node_this_stage = $dt_this_stage->construct_decision_tree_classifier();
$root_node_this_stage->display_decision_tree(" ") if $self->{_stagedebug};
$self->{_all_trees}->{$stage_index} = $dt_this_stage;
$self->{_root_nodes}->{$stage_index} = $root_node_this_stage;
$self->{_misclassified_samples}->{$stage_index} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{$stage_index}, $self->{_root_nodes}->{$stage_index});
if ($self->{_stagedebug}) {
print "\nSamples misclassified by stage $stage_index classifier: @{$self->{_misclassified_samples}->{$stage_index}}\n";
printf("\nNumber of misclassified samples: %d\n", scalar @{$self->{_misclassified_samples}->{$stage_index}});
$self->show_class_labels_for_misclassified_samples_in_stage($stage_index);
}
my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{$stage_index}->{$_}} @{$self->{_misclassified_samples}->{$stage_index}};
( run in 0.738 second using v1.01-cache-2.11-cpan-39bf76dae61 )