ExamplesRandomizedTrees/classify_database_records.pl
### and checks whether the majority vote classification returned by all the
### decision trees agrees with the true labels for the data samples used for
### evaluation.
### The script shown below has the following outputs:
###
### --- It shows for each test sample the class label as calculated by
### RandomizedTreesForBigData and the class label as present in the training
### database.
###
### --- It presents the overall classification error.
###
### --- It presents the confusion matrix that is obtained by aggregating the
### calculated-class-labels versus the true-class-labels.
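###
### For concreteness, the confusion matrix printed at the end has the layout
### shown below (the counts are purely hypothetical):
###
###                          classified as NOT at risk    classified as at risk
###     Known to be NOT at risk:        1400                        43
###     Known to be at risk:              62                       495
###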
### IMPORTANT IMPORTANT IMPORTANT IMPORTANT IMPORTANT IMPORTANT IMPORTANT
###
### It is relatively common for real-life databases to have no values at all
### for some of the fields. When a test record extracted randomly from the database
### does NOT contain a value for one of the fields that went into the construction
ExamplesRandomizedTrees/classify_database_records.pl
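## Skip any test record that carries 'NA' (no value available) in one or more of its
## fields; such records are not sent to the decision trees for classification: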
map {$unknown_value_for_a_feature_flag = 1 if $_ =~ /=NA$/} @test_sample;
next if $unknown_value_for_a_feature_flag;
$rt->classify_with_all_trees( \@test_sample );
my $classification = $rt->get_majority_vote_classification();
printf("\nclassification for %5d: %10s original classification: %s", $record_index, $classification, $record_ids_with_class_labels{$record_index});
$original_classifications{$record_index} = $record_ids_with_class_labels{$record_index};
$classification =~ /=(.+)$/;
$calculated_classifications{$record_index} = $1;
}
my $total_errors = 0;
my @confusion_matrix_row1 = (0,0);
my @confusion_matrix_row2 = (0,0);
print "\n\nCalculating the error rate and the confusion matrix:\n";
foreach my $record_index (sort keys %calculated_classifications) {
$total_errors += 1 if $original_classifications{$record_index} ne $calculated_classifications{$record_index};
if ($original_classifications{$record_index} eq $class_names_used_in_database[1]) {
if ($calculated_classifications{$record_index} eq $class_names_used_in_database[1]) {
$confusion_matrix_row1[0] += 1;
} else {
$confusion_matrix_row1[1] += 1;
}
}
if ($original_classifications{$record_index} eq $class_names_used_in_database[0]) {
if ($calculated_classifications{$record_index} eq $class_names_used_in_database[1]) {
$confusion_matrix_row2[0] += 1;
} else {
$confusion_matrix_row2[1] += 1;
}
}
}
my $percentage_errors = ($total_errors * 100.0) / scalar keys %calculated_classifications;
print "\n\nClassification error rate: $percentage_errors\n";
print "\nConfusion Matrix:\n\n";
printf("%50s %25s\n", "classified as NOT at risk", "classified as at risk");
printf("Known to be NOT at risk: %10d %35d\n\n", @confusion_matrix_row1); #(G)
printf("Known to be at risk:%15d %35d\n\n", @confusion_matrix_row2); #(H)
#============== Now interact with the user for classifying additional records ==========
if ($interaction_needed) {
while (1) {
lib/Algorithm/BoostedDecisionTree.pm
sub construct_cascade_of_trees {
my $self = shift;
$self->{_training_samples}->{0} = $self->{_all_sample_names};
$self->{_misclassified_samples}->{0} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{0}, $self->{_root_nodes}->{0});
if ($self->{_stagedebug}) {
$self->show_class_labels_for_misclassified_samples_in_stage(0);
print "\n\nSamples misclassified by base classifier: @{$self->{_misclassified_samples}->{0}}\n";
my $how_many = @{$self->{_misclassified_samples}->{0}};
print "\nNumber of misclassified samples: $how_many\n";
}
my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{0}->{$_}} @{$self->{_misclassified_samples}->{0}};
print "\nMisclassification_error_rate for base classifier: $misclassification_error_rate\n" if $self->{_stagedebug};
$self->{_trust_factors}->{0} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
print "\nBase class trust factor: $self->{_trust_factors}->{0}\n" if $self->{_stagedebug};
foreach my $stage_index (1 .. $self->{_how_many_stages} - 1) {
print "\n\n========================== Constructing stage indexed $stage_index =========================\n"
if $self->{_stagedebug};
$self->{_sample_selection_probs}->{$stage_index} = { map {$_ => $self->{_sample_selection_probs}->{$stage_index-1}->{$_} * exp(-1.0 * $self->{_trust_factors}->{$stage_index - 1} * (contained_in($_, @{$self->{_misclassified_samples}->{$st...
my $normalizer = reduce {$a + $b} values %{$self->{_sample_selection_probs}->{$stage_index}};
print "\nThe normalizer is: $normalizer\n" if $self->{_stagedebug};
map {$self->{_sample_selection_probs}->{$stage_index}->{$_} /= $normalizer} keys %{$self->{_sample_selection_probs}->{$stage_index}};
my @training_samples_this_stage = ();
my $sum_of_probs = 0.0;
lib/Algorithm/BoostedDecisionTree.pm
$root_node_this_stage->display_decision_tree(" ") if $self->{_stagedebug};
$self->{_all_trees}->{$stage_index} = $dt_this_stage;
$self->{_root_nodes}->{$stage_index} = $root_node_this_stage;
$self->{_misclassified_samples}->{$stage_index} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{$stage_index}, $self->{_root_nodes}->{$stage_index});
if ($self->{_stagedebug}) {
print "\nSamples misclassified by stage $stage_index classifier: @{$self->{_misclassified_samples}->{$stage_index}}\n";
printf("\nNumber of misclassified samples: %d\n", scalar @{$self->{_misclassified_samples}->{$stage_index}});
$self->show_class_labels_for_misclassified_samples_in_stage($stage_index);
}
my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{$stage_index}->{$_}} @{$self->{_misclassified_samples}->{$stage_index}};
print "\nStage $stage_index misclassification_error_rate: $misclassification_error_rate\n" if $self->{_stagedebug};
$self->{_trust_factors}->{$stage_index} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
print "\nStage $stage_index trust factor: $self->{_trust_factors}->{$stage_index}\n" if $self->{_stagedebug};
}
}
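## A worked illustration (with hypothetical numbers) of the trust-factor formula used
## above:  if the probability-weighted misclassification error rate of a stage comes
## out to eps = 0.25, then its trust factor is
##
##     alpha  =  0.5 * log((1 - eps)/eps)  =  0.5 * log(3)  ~=  0.549
##
## In the AdaBoost-style update carried out by the loop above, the sample selection
## probabilities are then rescaled so that samples misclassified by the stage become
## more likely to be drawn for training the next stage, after which the probabilities
## are renormalized (the division by $normalizer) so that they again sum to 1.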
sub evaluate_one_stage_of_cascade {
my $self = shift;
my $trainingDT = shift;
my $root_node = shift;
my @misclassified_samples = ();
foreach my $test_sample_name (@{$self->{_all_sample_names}}) {
lib/Algorithm/DecisionTree.pm
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
lib/Algorithm/DecisionTree.pm
my $log_prob = undef;
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
if ($prob >= 0.0001 && $prob <= 0.999) {
$log_prob = log($prob) / log(2.0);
} elsif ($prob < 0.0001) {
$log_prob = 0;
} elsif ($prob > 0.999) {
$log_prob = 0;
} else {
die "An error has occurred in log_prob calculation";
}
$entropy += -1.0 * $prob * $log_prob;
}
if (abs($entropy) < 0.0000001) {
$entropy = 0.0;
}
$self->{_entropy_cache}->{$sequence} = $entropy;
return $entropy;
}
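## The loop above accumulates the Shannon entropy of the class distribution at the
## node, H = - sum_over_classes  p * log2(p), with probabilities very close to 0 or 1
## contributing zero to the sum; the computed entropy is cached, keyed on the
## sequence of features and values or thresholds.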
lib/Algorithm/DecisionTree.pm
my $log_prob = undef;
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
if ($prob >= 0.0001 && $prob <= 0.999) {
$log_prob = log($prob) / log(2.0);
} elsif ($prob < 0.0001) {
$log_prob = 0;
} elsif ($prob > 0.999) {
$log_prob = 0;
} else {
die "An error has occurred in log_prob calculation";
}
$entropy += -1.0 * $prob * $log_prob;
}
if (abs($entropy) < 0.0000001) {
$entropy = 0.0;
}
$self->{_entropy_cache}->{$sequence} = $entropy;
return $entropy;
}
lib/Algorithm/DecisionTree.pm
my $log_prob = undef;
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
$class_name, \@array_of_features_and_values_or_thresholds);
if ($prob >= 0.0001 && $prob <= 0.999) {
$log_prob = log($prob) / log(2.0);
} elsif ($prob < 0.0001) {
$log_prob = 0;
} elsif ($prob > 0.999) {
$log_prob = 0;
} else {
die "An error has occurred in log_prob calculation";
}
$entropy += -1.0 * $prob * $log_prob;
}
if (abs($entropy) < 0.0000001) {
$entropy = 0.0;
}
$self->{_entropy_cache}->{$sequence} = $entropy;
return $entropy;
}
lib/Algorithm/DecisionTree.pm
## This method is used to verify that you used legal feature names in the test
## sample that you want to classify with the decision tree.
sub check_names_used {
my $self = shift;
my $features_and_values_test_data = shift;
my @features_and_values_test_data = @$features_and_values_test_data;
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $feature_and_value (@features_and_values_test_data) {
$feature_and_value =~ /$pattern/;
my ($feature,$value) = ($1,$2);
die "Your test data has formatting error" unless defined($feature) && defined($value);
return 0 unless contained_in($feature, @{$self->{_feature_names}});
}
return 1;
}
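## For example, a test sample is handed to the classifier as an array of
## 'feature=value' strings.  The sketch below is only an illustration: the feature
## names are hypothetical and must match those in your training datafile, and $dt and
## $root_node are assumed to be an already constructed Algorithm::DecisionTree
## instance and the root node of the decision tree built from it:
##
##     my @test_sample = ('g2=4.2', 'grade=2.3', 'gleason=4', 'eet=1.7', 'age=55.0', 'ploidy=diploid');
##     my $classification = $dt->classify($root_node, \@test_sample);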
####################################### Data Condition Calculator ######################################
## This method estimates the worst-case fan-out of the decision tree taking into
## account the number of values (and therefore the number of branches emanating from
## a node) for the symbolic features.
lib/Algorithm/DecisionTree.pm
can be seen by calling C<display_all_plots()>, which is the method mentioned below.
=item B<display_all_plots():>
This method displays the results obtained by calling the prediction method of the
previous entry. This method also creates a hardcopy of the plots and saves it as a
C<.png> disk file. The name of this output file is always C<regression_plots.png>.
=item B<mse_for_tree_regression_for_all_training_samples( $root_node ):>
This method carries out an error analysis of the predictions for the samples in your
training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
regression, the MSE for the data samples at each of the leaf nodes of the regression
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
The parameter C<$root_node> in the call syntax is what is returned by the call to
C<construct_regression_tree()>.
=item B<bulk_predictions_for_data_in_a_csv_file( $root_node, $filename, $columns ):>
Call this method if you want to apply the regression tree to all your test data in a
disk file. The predictions for all of the test samples in the disk file are written
lib/Algorithm/RegressionTree.pm
my $jacobian_choice = $self->{_jacobian_choice};
my $X = $XMatrix->copy();
my $y = $YVector->copy();
if ($self->{_need_data_normalization}) {
die "normalization feature is yet to be added to the module --- sorry";
}
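## The next statement computes the closed-form least-squares estimate of the
## regression coefficients:  beta0 = (X'X)^{-1} X'y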
my $beta0 = (transpose($X) * $X)->inverse() * transpose($X) * $y;
my ($betarows, $betacols) = $beta0->dim;
die "Something has gone wrong with the calculation of beta coefficients" if $betacols > 1;
if ($jacobian_choice == 0) {
# my $error = sum(abs_vector_as_list( $y - ($X * $beta) )) / $nrows;
my $error = sum( map abs, ($y - ($X * $beta0) )->col(0)->as_list ) / $nrows;
my $predictions = $X * $beta0;
if ($display) {
if ($ncols == 2) {
my @xvalues = $X->col(0)->as_list;
my @yvalues = $predictions->col(0)->as_list;
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
} elsif ($ncols == 3) {
my @xvalues;
my @yvalues = $predictions->col(0)->as_list;
foreach my $row_index (0 .. $X->rows - 1) {
my @onerow = $X->row($row_index)->as_list;
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta0);
}
my $beta = $beta0;
if ($self->{_debug2_r}) {
print "\ndisplaying beta0 matrix\n";
display_matrix($beta);
}
my $gamma = 0.1;
my $iterate_again_flag = 1;
my $delta = 0.001;
my $master_interation_index = 0;
$|++;
while (1) {
print "*" unless $master_interation_index++ % 100;
last unless $iterate_again_flag;
$gamma *= 0.1;
$beta0 = 0.99 * $beta0;
print "\n\n======== starting iterations with gamma= $gamma ===========\n\n\n" if $self->{_debug2_r};
$beta = $beta0;
my $beta_old = Math::GSL::Matrix->new($betarows, 1)->zero;
my $error_old = sum( map abs, ($y - ($X * $beta_old) )->col(0)->as_list ) / $nrows;
my $error;
foreach my $iteration (0 .. 1499) {
print "." unless $iteration % 100;
$beta_old = $beta->copy;
my $jacobian;
if ($jacobian_choice == 1) {
$jacobian = $X;
} elsif ($jacobian_choice == 2) {
my $x_times_delta_beta = $delta * $X * $beta;
$jacobian = Math::GSL::Matrix->new($nrows, $ncols);
foreach my $i (0 .. $nrows - 1) {
my @row = ($x_times_delta_beta->get_elem($i,0)) x $ncols;
$jacobian->set_row($i, \@row);
}
$jacobian = (1.0/$delta) * $jacobian;
} else {
die "wrong choice for the jacobian_choice";
}
# $beta = $beta_old + 2 * $gamma * transpose($X) * ( $y - ($X * $beta) );
$beta = $beta_old + 2 * $gamma * transpose($jacobian) * ( $y - ($X * $beta) );
$error = sum( map abs, ($y - ($X * $beta) )->col(0)->as_list ) / $nrows;
if ($error > $error_old) {
if (vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old))) {
$iterate_again_flag = 0;
last;
} else {
last;
}
}
if ($self->{_debug2_r}) {
print "\n\niteration: $iteration gamma: $gamma current error: $error\n";
print "\nnew beta:\n";
display_matrix $beta;
}
if ( vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old)) ) {
print "iterations used: $iteration with gamma: $gamma\n" if $self->{_debug2_r};
$iterate_again_flag = 0;
last;
}
$error_old = $error;
}
}
display_matrix($beta) if $self->{_debug2_r};
my $predictions = $X * $beta;
my @error_distribution = ($y - ($X * $beta))->as_list;
my $squared_error = sum map abs, @error_distribution;
my $error = $squared_error / $nrows;
if ($display) {
if ($ncols == 2) {
my @xvalues = $X->col(0)->as_list;
my @yvalues = $predictions->col(0)->as_list;
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
} elsif ($ncols == 3) {
my @xvalues;
my @yvalues = $predictions->col(0)->as_list;
foreach my $row_index (0 .. $X->rows - 1) {
my @onerow = $X->row($row_index)->as_list;
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta);
}
##------------------------------- Construct Regression Tree ------------------------------------
## At the root node, you do ordinary linear regression for the entire dataset so that you
## can later compare the linear regression fit with the results obtained through the
## regression tree. Subsequently, you call the recursive_descent() method to construct
## the tree.
sub construct_regression_tree {
my $self = shift;
print "\nConstructing regression tree...\n";
my $root_node = RTNode->new(undef, undef, undef, [], $self, 'root');
my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector, 1);
$root_node->set_node_XMatrix($XMatrix);
$root_node->set_node_YVector($YVector);
$root_node->set_node_error($error);
$root_node->set_node_beta($beta);
$root_node->set_num_data_points($XMatrix->cols);
print "\nerror at root: $error\n";
print "\nbeta at root:\n";
display_matrix($beta);
$self->{_root_node} = $root_node;
$self->recursive_descent($root_node) if $self->{_max_depth_desired} > 0;
return $root_node;
}
## We first look for a feature, along with its partitioning point, that yields the
## largest reduction in MSE compared to the MSE at the parent node. This feature and
## its partitioning point are then used to create two child nodes in the tree.
sub recursive_descent {
my $self = shift;
my $node = shift;
print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n";
my $node_serial_number = $node->get_serial_num();
my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
if (@features_and_values_or_thresholds_on_branch > 0) {
my ($error,$beta,$XMatrix,$YVector) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@copy_of_path_attributes);
$node->set_node_XMatrix($XMatrix);
$node->set_node_YVector($YVector);
$node->set_node_error($error);
$node->set_node_beta($beta);
$node->set_num_data_points($XMatrix->cols);
print "\nNODE SERIAL NUMBER: $node_serial_number\n";
print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
return if $error <= $self->{_mse_threshold};
}
my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) =
$self->best_feature_calculator(\@copy_of_path_attributes);
return unless defined $best_feature_partitioning_point;
print "\nBest feature found: $best_feature\n";
print "Best feature partitioning_point: $best_feature_partitioning_point\n";
print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
$node->set_feature($best_feature);
$node->display_node() if $self->{_debug2_r};
return if (defined $self->{_max_depth_desired}) &&
(@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired});
if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
$feature_threshold_combo_for_less_than;
push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
$feature_threshold_combo_for_greater_than;
lib/Algorithm/RegressionTree.pm
my $right_child_node = RTNode->new(undef, undef, undef,
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
$node->add_child_link($right_child_node);
$self->recursive_descent($right_child_node);
}
}
## This is the heart of the regression tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples at the
## current node. The partitioning criterion is that the largest of the MSE's in
## the two partitions should be smaller than the error associated with the parent
## node.
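## For instance (with hypothetical numbers), if partitioning on a feature 'xcoord' at
## the threshold 0.5 yields an MSE of 2.1 for the '<' partition and 3.4 for the '>'
## partition, the partitioning error recorded for that threshold is max(2.1, 3.4) =
## 3.4.  The threshold retained for 'xcoord' is the one that minimizes this max, and
## the feature chosen at the node is the one with the smallest such minmax error.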
sub best_feature_calculator {
my $self = shift;
my $features_and_values_or_thresholds_on_branch = shift;
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
print "\n\nfeatures_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
if (@features_and_values_or_thresholds_on_branch == 0) {
my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
foreach my $i (0 .. @{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
my @partitioning_errors;
my %partitioning_error_hash;
foreach my $value (@values[10 .. $#values - 10]) {
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
my @for_left_child;
my @for_right_child;
if (@features_and_values_or_thresholds_on_branch) {
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_left_child, $feature_and_less_than_value_string;
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_right_child, $feature_and_greater_than_value_string;
} else {
@for_left_child = ($feature_and_less_than_value_string);
@for_right_child = ($feature_and_greater_than_value_string);
}
my ($error1,$beta1,$XMatrix1,$YVector1) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
my ($error2,$beta2,$XMatrix2,$YVector2) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
my $partitioning_error = max($error1, $error2);
push @partitioning_errors, $partitioning_error;
$partitioning_error_hash{$partitioning_error} = $value;
}
my $min_max_error_for_feature = min(@partitioning_errors);
$best_partition_point_for_feature_hash->{$feature_name} =
$partitioning_error_hash{$min_max_error_for_feature};
$best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
}
my $best_feature_name;
my $best_feature_paritioning_point;
my $best_minmax_error_at_partitioning_point;
foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
if (! defined $best_minmax_error_at_partitioning_point) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
} elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
}
}
my $best_feature_partitioning_point = $best_partition_point_for_feature_hash->{$best_feature_name};
return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
} else {
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @true_numeric_types;
my @symbolic_types;
my @true_numeric_types_feature_names;
my @symbolic_types_feature_names;
foreach my $item (@features_and_values_or_thresholds_on_branch) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
lib/Algorithm/RegressionTree.pm
foreach my $item (@bounded_intervals_numeric_types) {
foreach my $feature_grouping (@$item) {
if ($feature_grouping->[1] eq '>') {
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
} else {
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
}
}
}
my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
foreach my $i (0 .. @{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
my @newvalues;
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
next;
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
lib/Algorithm/RegressionTree.pm
foreach my $x (@values) {
push @newvalues, $x if $x > $lowerbound{$feature_name};
}
} else {
die "Error is bound specifications in best feature calculator";
}
} else {
@newvalues = @{deep_copy_array(\@values)};
}
next if @newvalues < 30;
my @partitioning_errors;
my %partitioning_error_hash;
foreach my $value (@newvalues[10 .. $#newvalues - 10]) {
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
my @for_left_child;
my @for_right_child;
if (@features_and_values_or_thresholds_on_branch) {
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_left_child, $feature_and_less_than_value_string;
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_right_child, $feature_and_greater_than_value_string;
} else {
@for_left_child = ($feature_and_less_than_value_string);
@for_right_child = ($feature_and_greater_than_value_string);
}
my ($error1,$beta1,$XMatrix1,$YVector1) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
my ($error2,$beta2,$XMatrix2,$YVector2) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
my $partitioning_error = max($error1, $error2);
push @partitioning_errors, $partitioning_error;
$partitioning_error_hash{$partitioning_error} = $value;
}
my $min_max_error_for_feature = min(@partitioning_errors);
$best_partition_point_for_feature_hash->{$feature_name} =
$partitioning_error_hash{$min_max_error_for_feature};
$best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
}
my $best_feature_name;
my $best_feature_paritioning_point;
my $best_minmax_error_at_partitioning_point;
foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
if (! defined $best_minmax_error_at_partitioning_point) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
} elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
}
}
my $best_feature_partitioning_point = $best_partition_point_for_feature_hash->{$best_feature_name};
return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
}
}
## This method requires that all truly numeric types only be expressed as '<' or '>'
## constructs in the array of branch features and thresholds
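## For example (with hypothetical feature names), a branch sequence such as
## ('age>47', 'age<65', 'ploidy=diploid') is legal here: the truly numeric feature
## 'age' appears only through '<' and '>' thresholds, while the symbolic feature
## 'ploidy' appears only through an '=' value.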
sub _error_for_given_sequence_of_features_and_values_or_thresholds{
my $self = shift;
my $array_of_features_and_values_or_thresholds = shift;
if (@$array_of_features_and_values_or_thresholds == 0) {
my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
my ($errors,$beta) = $self->estimate_regression_coefficients($XMatrix,$YVector);
return ($errors,$beta,$XMatrix,$YVector)
}
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @true_numeric_types;
my @symbolic_types;
my @true_numeric_types_feature_names;
my @symbolic_types_feature_names;
foreach my $item (@$array_of_features_and_values_or_thresholds) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
lib/Algorithm/RegressionTree.pm
my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
my $feature_involved = substr($feature_and_val, 0, index($feature_and_val,'=') );
if (($feature_name eq $feature_involved) &&
($value_for_feature > $lowerbound{$feature_name})) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
} else {
die "Ill formatted call to the '_error_for_given_sequence_...' method";
}
}
foreach my $feature_and_value (@symbolic_types) {
if ($feature_and_value =~ /$pattern1/) {
my ($feature,$value) = ($1,$2);
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $feature_in_sample = substr($feature_and_val, 0, index($feature_and_val,'=') );
my $value_in_sample = substr($feature_and_val, index($feature_and_val,'=')+1 );
lib/Algorithm/RegressionTree.pm
my @dependent_var_values = map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} @training_samples_at_node;
print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
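## Fill the YVector row by row from the dependent-variable values; the list slice
## below interleaves the row indices with the corresponding single-element rows so
## that pairmap sees them as (index, value) pairs: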
pairmap {$YVector->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, map {[$_]} @dependent_var_values )
[ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
if ($self->{_debug3_r}) {
print "\nYVector:";
my $displayY = transpose($YVector);
display_matrix($displayY);
}
my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector);
if ($self->{_debug3_r}) {
display_matrix($beta);
print("\n\nerror distribution at node: ", $error);
}
return ($error,$beta,$XMatrix,$YVector);
}
#----------------------------- Predict with Regression Tree ------------------------------
sub predictions_for_all_data_used_for_regression_estimation {
my $self = shift;
my $root_node = shift;
my %predicted_values;
my %leafnode_for_values;
my $ncols = $self->{_XMatrix}->cols;
lib/Algorithm/RegressionTree.pm
close FILEIN;
close FILEOUT;
}
sub mse_for_tree_regression_for_all_training_samples {
my $self = shift;
my $root_node = shift;
my %predicted_values;
my %dependent_var_values;
my %leafnode_for_values;
my $total_error = 0.0;
my %samples_at_leafnode;
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my $pattern = '(\S+)\s*=\s*(\S+)';
my @features_and_vals;
my $newvalue;
foreach my $feature_and_val (@{$self->{_training_data_hash}->{$sample}}) {
$feature_and_val =~ /$pattern/;
my ($feature,$value) = ($1, $2);
if (contained_in($feature, @{$self->{_feature_names}})) {
my $new_feature_and_value = "$feature=$value";
lib/Algorithm/RegressionTree.pm
}
$newvalue = $value;
}
my $answer = $self->prediction_for_single_data_point($root_node, \@features_and_vals);
$predicted_values{"@features_and_vals"} = $answer->{'prediction'};
$self->{_samples_dependent_var_val_hash}->{$sample} =~ /$pattern/;
my ($dep_feature,$dep_value) = ($1, $2);
$dependent_var_values{"@features_and_vals"} = $dep_value;
my $leafnode_for_sample = $answer->{'solution_path'}->[-1];
$leafnode_for_values{"@features_and_vals"} = $answer->{'solution_path'}->[-1];
my $error_for_sample = abs($predicted_values{"@features_and_vals"} - $dependent_var_values{"@features_and_vals"});
$total_error += $error_for_sample;
if (exists $samples_at_leafnode{$leafnode_for_sample}) {
push @{$samples_at_leafnode{$leafnode_for_sample}}, $sample;
} else {
$samples_at_leafnode{$leafnode_for_sample} = [$sample];
}
}
my @leafnodes_used = keys %{{map {$_ => 1} values %leafnode_for_values}};
my $errors_at_leafnode = { map {$_ => 0.0} @leafnodes_used };
foreach my $kee (keys %predicted_values) {
foreach my $leaf (@leafnodes_used) {
$errors_at_leafnode->{$leaf} += abs($predicted_values{$kee} - $dependent_var_values{$kee})
if $leaf == $leafnode_for_values{$kee};
}
}
my $total_error_per_data_point = $total_error / (scalar keys %{$self->{_training_data_hash}});
print "\n\nTree Regression: Total MSE per sample with tree regression: $total_error_per_data_point\n";
foreach my $leafnode (@leafnodes_used) {
my $error_per_data_point = $errors_at_leafnode->{$leafnode} / @{$samples_at_leafnode{$leafnode}};
print " MSE per sample at leafnode $leafnode: $error_per_data_point\n";
}
my $error_with_linear_regression = $self->{_root_node}->get_node_error();
print "For comparision, the MSE per sample error with Linear Regression: $error_with_linear_regression\n";
}
## Calculates the predicted value for the dependent variable from the given values
## for all the predictor variables.
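## A hypothetical call might look as follows (the feature names are placeholders for
## those used in your own training datafile):
##
##     my $answer = $rt->prediction_for_single_data_point($root_node, ['xcoord=30.2', 'ycoord=49.6']);
##     print "predicted value:   $answer->{'prediction'}\n";
##     print "leaf node reached: $answer->{'solution_path'}->[-1]\n";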
sub prediction_for_single_data_point {
my $self = shift;
my $root_node = shift;
my $features_and_values = shift;
die "Error in the names you have used for features and/or values when calling " .
"prediction_for_single_data_point()" unless $self->_check_names_used($features_and_values);
lib/Algorithm/RegressionTree.pm
## This method is used to verify that you used legal feature names in the test
## sample for which you want a prediction from the regression tree.
sub _check_names_used {
my $self = shift;
my $features_and_values_test_data = shift;
my @features_and_values_test_data = @$features_and_values_test_data;
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $feature_and_value (@features_and_values_test_data) {
$feature_and_value =~ /$pattern/;
my ($feature,$value) = ($1,$2);
die "Your test data has formatting error" unless defined($feature) && defined($value);
return 0 unless contained_in($feature, @{$self->{_feature_names}});
}
return 1;
}
sub display_all_plots {
my $self = shift;
my $ncols = $self->{_XMatrix}->cols;
unlink "regression_plots.png" if -e "regression_plots.png";
my $master_datafile = $self->{_training_datafile};
lib/Algorithm/RegressionTree.pm
# The nodes of a regression tree are instances of this class:
package RTNode;
use strict;
use Carp;
# $feature is the feature test at the current node. $branch_features_and_values is
# an anonymous array holding the feature names and corresponding values on the path
# from the root to the current node:
sub new {
my ($class, $feature, $error, $beta, $branch_features_and_values_or_thresholds, $rt, $root_or_not) = @_;
$root_or_not = '' if !defined $root_or_not;
if ($root_or_not eq 'root') {
$rt->{nodes_created} = -1;
$rt->{class_names} = undef;
}
my $self = {
_rt => $rt,
_feature => $feature,
_error => $error,
_beta => $beta,
_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
_num_data_points => undef,
_XMatrix => undef,
_YVector => undef,
_linked_to => [],
};
bless $self, $class;
$self->{_serial_number} = $self->get_next_serial_num();
return $self;
lib/Algorithm/RegressionTree.pm
my $self = shift;
my $yvector = shift;
$self->{_YVector} = $yvector;
}
sub get_node_YVector {
my $self = shift;
return $self->{_YVector};
}
sub set_node_error {
my $self = shift;
my $error = shift;
$self->{_error} = $error;
}
sub get_node_error {
my $self = shift;
return $self->{_error};
}
sub set_node_beta {
my $self = shift;
my $beta = shift;
$self->{_beta} = $beta;
}
sub get_node_beta {
my $self = shift;