Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
$node->add_child_link($child_node);
$self->recursive_descent($child_node);
} else {
print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
}
}
}
} else {
print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
if $self->{_debug3};
return;
}
}
## This is the heart of the decision tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples that
## correspond to the current node. The search for the best feature is carried out
## differently for symbolic features and for numeric features. For a symbolic
## feature, the method estimates the entropy for each value of the feature and then
## averages out these entropies as a measure of the discriminatory power of that
## feature. For a numeric feature, on the other hand, it estimates the entropy
## reduction that can be achieved if one were to partition the set of training samples
## for each possible threshold. For a numeric feature, all possible sampling points
## relevant to the node in question are considered as candidates for thresholds.
sub best_feature_calculator {
my $self = shift;
my $features_and_values_or_thresholds_on_branch = shift;
my $existing_node_entropy = shift;
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @all_symbolic_features = ();
foreach my $feature_name (@{$self->{_feature_names}}) {
push @all_symbolic_features, $feature_name
if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
}
my @symbolic_features_already_used = ();
foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
}
my @symbolic_features_not_yet_used;
foreach my $x (@all_symbolic_features) {
push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
}
my @true_numeric_types = ();
my @symbolic_types = ();
my @true_numeric_types_feature_names = ();
my @symbolic_types_feature_names = ();
foreach my $item (@features_and_values_or_thresholds_on_branch) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
my (%upperbound, %lowerbound);
foreach my $feature (@true_numeric_types_feature_names) {
$upperbound{$feature} = undef;
$lowerbound{$feature} = undef;
}
foreach my $item (@bounded_intervals_numeric_types) {
foreach my $feature_grouping (@$item) {
if ($feature_grouping->[1] eq '>') {
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
} else {
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
}
}
}
my %entropy_values_for_different_features = ();
my %partitioning_point_child_entropies_hash = ();
my %partitioning_point_threshold = ();
my %entropies_for_different_values_of_symbolic_feature = ();
foreach my $feature (@{$self->{_feature_names}}) {
$entropy_values_for_different_features{$feature} = [];
$partitioning_point_child_entropies_hash{$feature} = {};
$partitioning_point_threshold{$feature} = undef;
$entropies_for_different_values_of_symbolic_feature{$feature} = [];
}
foreach my $i (0..@{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
if (contained_in($feature_name, @symbolic_features_already_used)) {
next;
} elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
$self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
$self->{_symbolic_to_numeric_cardinality_threshold}) {
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
my @newvalues = ();
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
next;
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
foreach my $x (@values) {
push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
}
} elsif (defined($upperbound{$feature_name})) {
foreach my $x (@values) {
push @newvalues, $x if $x <= $upperbound{$feature_name};
}
} elsif (defined($lowerbound{$feature_name})) {
foreach my $x (@values) {
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
foreach my $class (@class_names) {
my $prob = $self->prior_probability_for_class($class);
my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
$log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
$log_prob = 0 if $prob > 0.999; # since log(1) = 0
if (!defined $entropy) {
$entropy = -1.0 * $prob * $log_prob;
next;
}
$entropy += -1.0 * $prob * $log_prob;
}
$self->{_entropy_cache}->{'priors'} = $entropy;
return $entropy;
}
## Diagnostic scanner: prints the class entropies obtained by thresholding the
## given numeric feature at every one of its sampling points, both for the
## "less than" and the "greater than" partitions.  Output only; returns nothing
## useful.
sub entropy_scanner_for_a_numeric_feature {
    local $| = 1;                       # flush the progress dots as they are printed
    my ($self, $feature) = @_;
    my @thresholds = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
    my (@less_than_entropies, @greater_than_entropies);
    for my $threshold (@thresholds) {
        print ". ";
        push @less_than_entropies,
             $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $threshold);
        push @greater_than_entropies,
             $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $threshold);
    }
    print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
          "@less_than_entropies\n";
    print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
          "@greater_than_entropies\n";
}
## Returns the class entropy for the branch described by the features/values/
## thresholds in $arr extended with the condition "$feature < $threshold".
## Results are memoized in _entropy_cache, keyed on the colon-joined sequence.
sub class_entropy_for_less_than_threshold_for_feature {
    my ($self, $arr, $feature, $threshold) = @_;
    my $combo = $feature . '<' . $threshold;
    my $cache_key = join(":", @$arr) . ":" . $combo;
    return $self->{_entropy_cache}->{$cache_key} if exists $self->{_entropy_cache}->{$cache_key};
    # Extend a fresh copy of the branch with the new threshold condition:
    my @extended_branch = @{deep_copy_array($arr)};
    push @extended_branch, $combo;
    my $total_entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
                       $class_name, \@extended_branch);
        my $log_prob;
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001 || $prob > 0.999) {
            # p*log(p) -> 0 as p -> 0, and log(1) = 0 near p = 1
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $total_entropy += -1.0 * $prob * $log_prob;
    }
    # Clamp floating-point dust to an exact zero:
    $total_entropy = 0.0 if abs($total_entropy) < 0.0000001;
    $self->{_entropy_cache}->{$cache_key} = $total_entropy;
    return $total_entropy;
}
## Returns the class entropy for the branch described by the features/values/
## thresholds in $arr extended with the condition "$feature > $threshold".
## Memoized in _entropy_cache under the colon-joined sequence string.
sub class_entropy_for_greater_than_threshold_for_feature {
    my $self = shift;
    my ($arr, $feature, $threshold) = @_;
    my $combo = $feature . '>' . $threshold;
    my $sequence = join(":", @$arr) . ":" . $combo;
    if (exists $self->{_entropy_cache}->{$sequence}) {
        return $self->{_entropy_cache}->{$sequence};
    }
    my @branch = @{deep_copy_array($arr)};     # do not disturb the caller's array
    push @branch, $combo;
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
                       $class_name, \@branch);
        my $log_prob;
        if ($prob < 0.0001 || $prob > 0.999) {
            # p*log(p) -> 0 as p -> 0, and log(1) = 0 near p = 1
            $log_prob = 0;
        } elsif ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    # Clamp floating-point dust to an exact zero:
    $entropy = 0.0 if abs($entropy) < 0.0000001;
    return $self->{_entropy_cache}->{$sequence} = $entropy;
}
## Returns the class entropy for an arbitrary branch given as a sequence of
## "feature=value", "feature<threshold", and "feature>threshold" strings.
## Memoized in _entropy_cache under the colon-joined sequence string.
sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
    my ($self, $seq_ref) = @_;
    my @sequence = @$seq_ref;
    my $cache_key = join ":", @sequence;
    return $self->{_entropy_cache}->{$cache_key}
        if exists $self->{_entropy_cache}->{$cache_key};
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
                       $class_name, \@sequence);
        # p*log(p) -> 0 as p -> 0, and log(1) = 0 near p = 1; anything else
        # (e.g. a NaN probability) is an internal error.
        my $log_prob = ($prob >= 0.0001 && $prob <= 0.999) ? log($prob) / log(2.0)
                     : ($prob < 0.0001 || $prob > 0.999)   ? 0
                     : die "An error has occurred in log_prob calculation";
        $entropy += -1.0 * $prob * $log_prob;
    }
    # Clamp floating-point dust to an exact zero:
    $entropy = 0.0 if abs($entropy) < 0.0000001;
    $self->{_entropy_cache}->{$cache_key} = $entropy;
    return $entropy;
}
##################################### Probability Calculators ########################################
## Returns the prior probability of $class, i.e. the fraction of the training
## samples carrying that class label.  On the first call the priors for ALL
## classes are computed and cached under "prior::<class>" in
## _probability_cache; later calls are served from the cache.
sub prior_probability_for_class {
    my ($self, $class) = @_;
    my $cache_key = "prior" . '::' . $class;
    unless (exists $self->{_probability_cache}->{$cache_key}) {
        my @labels = values %{$self->{_samples_class_label_hash}};
        my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
        # Populate the cache for every class in one pass over the labels:
        foreach my $class_name (@{$self->{_class_names}}) {
            my $matches = grep {$_ eq $class_name} @labels;
            $self->{_probability_cache}->{"prior" . '::' . $class_name} =
                (1.0 * $matches) / $total_num_of_samples;
        }
    }
    return $self->{_probability_cache}->{$cache_key};
}
## Computes the prior probability P(class) for every class from the labels
## stored in _samples_class_label_hash, recording the results both in
## _class_priors_hash and in _probability_cache (under "prior::<class>", the
## same key used by prior_probability_for_class()).  A no-op when the priors
## have already been computed.
sub calculate_class_priors {
    my $self = shift;
    return if scalar keys %{$self->{_class_priors_hash}} > 1;
    # These quantities are the same for every class -- compute them once
    # instead of once per loop iteration:
    my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
    my @all_values = values %{$self->{_samples_class_label_hash}};
    foreach my $class_name (@{$self->{_class_names}}) {
        my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
        my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
        $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
        $self->{_probability_cache}->{"prior::$class_name"} = $prior_for_this_class;
    }
    if ($self->{_debug1}) {
        foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
            print "$class => $self->{_class_priors_hash}->{$class}\n";
        }
    }
}
sub calculate_first_order_probabilities {
print "\nEstimating probabilities...\n";
my $self = shift;
foreach my $feature (@{$self->{_feature_names}}) {
$self->probability_of_feature_value($feature, undef);
if ($self->{_debug2}) {
if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
print "\nPresenting probability distribution for a numeric feature:\n";
foreach my $sampling_point (sort {$a <=> $b} keys
%{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
my @arr5;
foreach my $li (@arr4) {
my @temp1 = ();
my @temp2 = ();
foreach my $inner (@$li) {
if ($inner->[1] eq '<') {
push @temp1, $inner;
} else {
push @temp2, $inner;
}
}
if (@temp1 > 0 && @temp2 > 0) {
push @arr5, [\@temp1, \@temp2];
} elsif (@temp1 > 0) {
push @arr5, [\@temp1];
} else {
push @arr5, [\@temp2];
}
}
print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
my @arr6 = ();
foreach my $li (@arr5) {
my @temp1 = ();
foreach my $inner (@$li) {
my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
push @temp1, \@sorted;
}
push @arr6, \@temp1;
}
print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
my @arr9 = ();
foreach my $li (@arr6) {
foreach my $alist (@$li) {
my @newalist = ();
if ($alist->[0][1] eq '<') {
push @newalist, $alist->[0];
} else {
push @newalist, $alist->[-1];
}
if ($alist->[0][1] ne $alist->[-1][1]) {
push @newalist, $alist->[-1];
}
push @arr9, \@newalist;
}
}
print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
return \@arr9;
}
## This method is used to verify that you used legal feature names in the test
## sample that you want to classify with the decision tree.
## Verifies that every "feature = value" entry in the test sample uses a
## feature name known to the decision tree.  Returns 1 when all names are
## legal, 0 when an unknown feature name is encountered; dies on a formatting
## error.
sub check_names_used {
    my $self = shift;
    my $features_and_values_test_data = shift;
    my @features_and_values_test_data = @$features_and_values_test_data;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values_test_data) {
        # Capture in list context: a failed match yields an empty list, so
        # $feature/$value stay undef.  (Reading $1/$2 after an unchecked match
        # would hand back stale captures from a previous iteration.)
        my ($feature, $value) = $feature_and_value =~ /$pattern/;
        die "Your test data has formatting error" unless defined($feature) && defined($value);
        return 0 unless contained_in($feature, @{$self->{_feature_names}});
    }
    return 1;
}
####################################### Data Condition Calculator ######################################
## This method estimates the worst-case fan-out of the decision tree taking into
## account the number of values (and therefore the number of branches emanating from
## a node) for the symbolic features.
## Estimates the worst-case fan-out of the decision tree from the number of
## unique values of the symbolic features, warns the user when the estimated
## node count is unmanageable, and asks for confirmation before proceeding.
sub determine_data_condition {
    my $self = shift;
    my $num_of_features = scalar @{$self->{_feature_names}};
    my @values = ();
    my @number_of_values;
    foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
        # Only symbolic features contribute to the branching-factor estimate:
        next if contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
        my @values_for_this_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        push @values, @values_for_this_feature;
        # Record the per-feature count -- NOT the running size of @values --
        # so that the maximum taken below is truly the largest number of
        # values for any single symbolic feature:
        push @number_of_values, scalar @values_for_this_feature;
    }
    return if ! @values;
    print "Number of features: $num_of_features\n";
    my @minmax = minmax(\@number_of_values);
    my $max_num_values = $minmax[1];
    print "Largest number of values for symbolic features is: $max_num_values\n";
    my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
    print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
          "nodes. The exact number of nodes created depends critically on " .
          "the entropy_threshold used for node expansion (the default value " .
          "for this threshold is 0.01) and on the value set for max_depth_desired " .
          "for the depth of the tree.\n";
    if ($estimated_number_of_nodes > 10000) {
        print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
              "large value for entropy_threshold and/or a small value " .
              "for max_depth_desired to reduce the number of nodes created.\n";
        print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
        my $answer = <STDIN>;
        # STDIN can return undef at EOF (e.g. input redirected from an
        # exhausted file); treat that as a refusal rather than looping forever.
        die "\nNo answer received on standard input\n" unless defined $answer;
        $answer =~ s/\r?\n?$//;
        while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
            print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
            $answer = <STDIN>;
            die "\nNo answer received on standard input\n" unless defined $answer;
            $answer =~ s/\r?\n?$//;
        }
        die unless $answer =~ /y(es)?/i;
    }
}
#################################### Read Training Data From File ######################################
sub get_training_data {
my $self = shift;
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
my %class_names = ();
my %all_record_ids_with_class_labels;
my $firstline;
my %data_hash;
$|++;
open FILEIN, $self->{_training_datafile} || die "unable to open $self->{_training_datafile}: $!";
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
approximation to the Jacobian. That, however, is at a cost of much longer
computation time. B<NOTE:> For most cases, using 0 for C<jacobian_choice> is the
best choice. See my tutorial "I<Linear Regression and Regression Trees>" for why
that is the case.
=back
=head2 B<Methods defined for C<RegressionTree> class>
=over 8
=item B<get_training_data_for_regression():>
Only CSV training datafiles are allowed. Additionally, the first record in the file
must list the names of the fields, and the first column must contain an integer ID
for each record.
=item B<construct_regression_tree():>
As the name implies, this is the method that constructs a regression tree.
=item B<display_regression_tree(" "):>
Displays the regression tree, as the name implies. The white-space string argument
specifies the offset to use in displaying the child nodes in relation to a parent
node.
=item B<prediction_for_single_data_point( $root_node, $test_sample ):>
You call this method after you have constructed a regression tree if you want to
calculate the prediction for one sample. The parameter C<$root_node> is what is
returned by the call C<construct_regression_tree()>. The formatting of the argument
bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
this case, the C<$test_sample> parameter will be bound to a list that will look like
['xvar1 = 23.4', 'xvar2 = 12.9']
Arbitrary amount of white space, including none, on the two sides of the equality
symbol is allowed in the construct shown above. A call to this method returns a
dictionary with two key-value pairs. One of the keys is called C<solution_path> and
the other C<prediction>. The value associated with key C<solution_path> is the path
in the regression tree to the leaf node that yielded the prediction. And the value
associated with the key C<prediction> is the answer you are looking for.
=item B<predictions_for_all_data_used_for_regression_estimation( $root_node ):>
This call calculates the predictions for all of the predictor variables data in your
training file. The parameter C<$root_node> is what is returned by the call to
C<construct_regression_tree()>. The values for the dependent variable thus predicted
can be seen by calling C<display_all_plots()>, which is the method mentioned below.
=item B<display_all_plots():>
This method displays the results obtained by calling the prediction method of the
previous entry. This method also creates a hardcopy of the plots and saves it as a
C<.png> disk file. The name of this output file is always C<regression_plots.png>.
=item B<mse_for_tree_regression_for_all_training_samples( $root_node ):>
This method carries out an error analysis of the predictions for the samples in your
training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
regression, the MSE for the data samples at each of the leaf nodes of the regression
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
The parameter C<$root_node> in the call syntax is what is returned by the call to
C<construct_regression_tree()>.
=item B<bulk_predictions_for_data_in_a_csv_file( $root_node, $filename, $columns ):>
Call this method if you want to apply the regression tree to all your test data in a
disk file. The predictions for all of the test samples in the disk file are written
out to another file whose name is the same as that of the test file except for the
addition of C<_output> in the name of the file. The parameter C<$filename> is the
name of the disk file that contains the test data. And the parameter C<$columns> is a
list of the column indices for the predictor variables in the test file.
=back
=head1 GENERATING SYNTHETIC TRAINING DATA
The module file contains the following additional classes: (1)
C<TrainingDataGeneratorNumeric>, and (2) C<TrainingDataGeneratorSymbolic> for
generating synthetic training data.
The class C<TrainingDataGeneratorNumeric> outputs one CSV file for the
training data and another one for the test data for experimenting with numeric
features. The numeric values are generated using a multivariate Gaussian
distribution whose mean and covariance are specified in a parameter file. See the
file C<param_numeric.txt> in the C<Examples> directory for an example of such a
parameter file. Note that the dimensionality of the data is inferred from the
information you place in the parameter file.
The class C<TrainingDataGeneratorSymbolic> generates synthetic training data for the
purely symbolic case. The relative frequencies of the different possible values for
the features are controlled by the biasing information you place in a parameter file.
See C<param_symbolic.txt> for an example of such a file.
=head1 THE C<Examples> DIRECTORY
See the C<Examples> directory in the distribution for how to construct a decision
tree, and how to then classify new data using the decision tree. To become more
familiar with the module, run the scripts
construct_dt_and_classify_one_sample_case1.pl
construct_dt_and_classify_one_sample_case2.pl
construct_dt_and_classify_one_sample_case3.pl
construct_dt_and_classify_one_sample_case4.pl
The first script is for the purely symbolic case, the second for the case that
involves both numeric and symbolic features, the third for the case of purely numeric
features, and the last for the case when the training data is synthetically generated
by the script C<generate_training_data_numeric.pl>.
Next run the following script as it is for bulk classification of data records placed
in a CSV file:
classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
The script first constructs a decision tree using the training data in the training
file supplied by the first argument file C<training4.csv>. The script then
( run in 1.803 second using v1.01-cache-2.11-cpan-0bd6704ced7 )