view release on metacpan or search on metacpan
Examples/evaluate_training_data1.pl view on Meta::CPAN
#!/usr/bin/env perl
## evaluate_training_data1.pl
## This script is for testing the class discriminatory power of the training data
## contained in the file `stage3cancer.csv'.
## Through the class EvalTrainingData as shown below, this script runs a 10-fold
## cross-validation test on the training data. This test divides all of the
## training data into ten parts, with nine parts used for training a decision tree
## and one part used for testing its ability to classify correctly. This selection
## of nine parts for training and one part for testing is carried out in all of the
## ten different possible ways.
## A script like this can also be used to test the appropriateness of your choices
## for the constructor parameters entropy_threshold, max_depth_desired, and
## symbolic_to_numeric_cardinality_threshold.
use strict;
use warnings;
use Algorithm::DecisionTree;
Examples/evaluate_training_data2.pl view on Meta::CPAN
## This script is for testing the class discriminatory power of the training data
## contained in the training files `training.csv', `training2.csv', and
## `training3.csv'.
## The three training files mentioned above contain two Gaussian classes with
## increasing degrees of overlap between them.
## Through the class EvalTrainingData as shown below, this script runs a 10-fold
## cross-validation test on the training data. This test divides all of the
## training data into ten parts, with nine parts used for training a decision tree
## and one part used for testing its ability to classify correctly. This selection
## of nine parts for training and one part for testing is carried out in all of the
## ten different possible ways.
## A script like this can also be used to test the appropriateness of your choices
## for the constructor parameters entropy_threshold, max_depth_desired, and
## symbolic_to_numeric_cardinality_threshold.
use strict;
use warnings;
use Algorithm::DecisionTree;
lib/Algorithm/BoostedDecisionTree.pm view on Meta::CPAN
if check_for_illegal_params(@params) == 0;
my %dtargs = %args;
delete $dtargs{how_many_stages};
my $instance = Algorithm::DecisionTree->new(%dtargs);
bless $instance, $class;
$instance->{_how_many_stages} = $args{how_many_stages} || undef;
$instance->{_stagedebug} = $args{stagedebug} || 0;
$instance->{_training_samples} = {map {$_ => []} 0..$args{how_many_stages}};
$instance->{_all_trees} = {map {$_ => Algorithm::DecisionTree->new(%dtargs)} 0..$args{how_many_stages}};
$instance->{_root_nodes} = {map {$_ => undef} 0..$args{how_many_stages}};
$instance->{_sample_selection_probs} = {map {$_ => {}} 0..$args{how_many_stages}};
$instance->{_trust_factors} = {map {$_ => undef} 0..$args{how_many_stages}};
$instance->{_misclassified_samples} = {map {$_ => []} 0..$args{how_many_stages}};
$instance->{_classifications} = undef;
$instance->{_trust_weighted_decision_classes} = undef;
bless $instance, $class;
}
############################################## Methods #################################################
sub get_training_data_for_base_tree {
my $self = shift;
lib/Algorithm/BoostedDecisionTree.pm view on Meta::CPAN
##  Print the training data held by the base (stage-0) decision tree.
sub show_training_data_for_base_tree {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}{0};
    $base_tree->show_training_data();
}
##  Compute the first-order feature probabilities and the class priors for the
##  base (stage-0) tree, and initialize the stage-0 sample-selection
##  distribution to be uniform over all of the training samples.
sub calculate_first_order_probabilities_and_class_priors {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}{0};
    $base_tree->calculate_first_order_probabilities();
    $base_tree->calculate_class_priors();
    my @all_samples  = @{ $self->{_all_sample_names} };
    my $uniform_prob = 1.0 / @all_samples;
    $self->{_sample_selection_probs}{0} = { map { $_ => $uniform_prob } @all_samples };
}
##  Build the base (stage-0) decision tree and record its root node so that the
##  later cascade stages can be evaluated against it.
sub construct_base_decision_tree {
    my ($self) = @_;
    $self->{_root_nodes}{0} = $self->{_all_trees}{0}->construct_decision_tree_classifier();
}
sub display_base_decision_tree {
my $self = shift;
$self->{_root_nodes}->{0}->display_decision_tree(" ");
lib/Algorithm/BoostedDecisionTree.pm view on Meta::CPAN
sub construct_cascade_of_trees {
my $self = shift;
$self->{_training_samples}->{0} = $self->{_all_sample_names};
$self->{_misclassified_samples}->{0} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{0}, $self->{_root_nodes}->{0});
if ($self->{_stagedebug}) {
$self->show_class_labels_for_misclassified_samples_in_stage(0);
print "\n\nSamples misclassified by base classifier: @{$self->{_misclassified_samples}->{0}}\n";
my $how_many = @{$self->{_misclassified_samples}->{0}};
print "\nNumber of misclassified samples: $how_many\n";
}
my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{0}->{$_}} @{$self->{_misclassified_samples}->{0}};
print "\nMisclassification_error_rate for base classifier: $misclassification_error_rate\n" if $self->{_stagedebug};
$self->{_trust_factors}->{0} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
print "\nBase class trust factor: $self->{_trust_factors}->{0}\n" if $self->{_stagedebug};
foreach my $stage_index (1 .. $self->{_how_many_stages} - 1) {
print "\n\n========================== Constructing stage indexed $stage_index =========================\n"
if $self->{_stagedebug};
$self->{_sample_selection_probs}->{$stage_index} = { map {$_ => $self->{_sample_selection_probs}->{$stage_index-1}->{$_} * exp(-1.0 * $self->{_trust_factors}->{$stage_index - 1} * (contained_in($_, @{$self->{_misclassified_samples}->{$st...
my $normalizer = reduce {$a + $b} values %{$self->{_sample_selection_probs}->{$stage_index}};
print "\nThe normalizer is: $normalizer\n" if $self->{_stagedebug};
map {$self->{_sample_selection_probs}->{$stage_index}->{$_} /= $normalizer} keys %{$self->{_sample_selection_probs}->{$stage_index}};
my @training_samples_this_stage = ();
my $sum_of_probs = 0.0;
foreach my $sample (sort {$self->{_sample_selection_probs}->{$stage_index}->{$b} <=> $self->{_sample_selection_probs}->{$stage_index}->{$a}} keys %{$self->{_sample_selection_probs}->{$stage_index}}) {
$sum_of_probs += $self->{_sample_selection_probs}->{$stage_index}->{$sample};
push @training_samples_this_stage, $sample if $sum_of_probs < 0.5;
last if $sum_of_probs > 0.5;
}
$self->{_training_samples}->{$stage_index} = [sort {sample_index($a) <=> sample_index($b)} @training_samples_this_stage];
if ($self->{_stagedebug}) {
print "\nTraining samples for stage $stage_index: @{$self->{_training_samples}->{$stage_index}}\n\n";
my $num_of_training_samples = @{$self->{_training_samples}->{$stage_index}};
print "\nNumber of training samples this stage $num_of_training_samples\n\n";
}
# find intersection of two sets:
my %misclassified_samples = map {$_ => 1} @{$self->{_misclassified_samples}->{$stage_index-1}};
my @training_samples_selection_check = grep $misclassified_samples{$_}, @{$self->{_training_samples}->{$stage_index}};
if ($self->{_stagedebug}) {
my @training_in_misclassified = sort {sample_index($a) <=> sample_index($b)} @training_samples_selection_check;
print "\nTraining samples in the misclassified set: @training_in_misclassified\n";
my $how_many = @training_samples_selection_check;
print "\nNumber_of_miscalssified_samples_in_training_set: $how_many\n";
}
my $dt_this_stage = Algorithm::DecisionTree->new('boostingmode');
$dt_this_stage->{_training_data_hash} = { map {$_ => $self->{_all_training_data}->{$_} } @{$self->{_training_samples}->{$stage_index}} };
$dt_this_stage->{_class_names} = $self->{_all_trees}->{0}->{_class_names};
$dt_this_stage->{_feature_names} = $self->{_all_trees}->{0}->{_feature_names};
$dt_this_stage->{_entropy_threshold} = $self->{_all_trees}->{0}->{_entropy_threshold};
$dt_this_stage->{_max_depth_desired} = $self->{_all_trees}->{0}->{_max_depth_desired};
$dt_this_stage->{_symbolic_to_numeric_cardinality_threshold} = $self->{_all_trees}->{0}->{_symbolic_to_numeric_cardinality_threshold};
lib/Algorithm/BoostedDecisionTree.pm view on Meta::CPAN
$root_node_this_stage->display_decision_tree(" ") if $self->{_stagedebug};
$self->{_all_trees}->{$stage_index} = $dt_this_stage;
$self->{_root_nodes}->{$stage_index} = $root_node_this_stage;
$self->{_misclassified_samples}->{$stage_index} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{$stage_index}, $self->{_root_nodes}->{$stage_index});
if ($self->{_stagedebug}) {
print "\nSamples misclassified by stage $stage_index classifier: @{$self->{_misclassified_samples}->{$stage_index}}\n";
printf("\nNumber of misclassified samples: %d\n", scalar @{$self->{_misclassified_samples}->{$stage_index}});
$self->show_class_labels_for_misclassified_samples_in_stage($stage_index);
}
my $misclassification_error_rate = reduce {$a+$b} map {$self->{_sample_selection_probs}->{$stage_index}->{$_}} @{$self->{_misclassified_samples}->{$stage_index}};
print "\nStage $stage_index misclassification_error_rate: $misclassification_error_rate\n" if $self->{_stagedebug};
$self->{_trust_factors}->{$stage_index} = 0.5 * log((1-$misclassification_error_rate)/$misclassification_error_rate);
print "\nStage $stage_index trust factor: $self->{_trust_factors}->{$stage_index}\n" if $self->{_stagedebug};
}
}
sub evaluate_one_stage_of_cascade {
my $self = shift;
my $trainingDT = shift;
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
$line =~ s/,\s*(?=,|$)/,NA/g;
return $line;
}
######################################### Class EvalTrainingData ########################################
## This subclass of the DecisionTree class is used to evaluate the quality of your
## training data by running a 10-fold cross-validation test on it. This test divides
## all of the training data into ten parts, with nine parts used for training a
## decision tree and one part used for testing its ability to classify correctly.
## This selection of nine parts for training and one part for testing is carried out
## in all of the ten different possible ways. This testing functionality can also
## be used to find the best values to use for the constructor parameters
## entropy_threshold, max_depth_desired, and
## symbolic_to_numeric_cardinality_threshold.
## Only the CSV training files can be evaluated in this manner (because only CSV
## training files are allowed to have numeric features --- which is the more
## interesting case for evaluation analytics).
package EvalTrainingData;
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
data samples in a CSV file. The bulk classifications obtained can be written out to
either a CSV file or to a regular text file. See the script
C<classify_test_data_in_a_file_numeric.pl> in the C<Examples> directory for how to
classify all of your test data records in a CSV file. This version also includes
improved code for generating synthetic numeric/symbolic training and test data
records for experimenting with the decision tree classifier.
B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
cross-validation test on the data. This test divides all of the training data into
ten parts, with nine parts used for training a decision tree and one part used for
testing its ability to classify correctly. This selection of nine parts for training
and one part for testing is carried out in all of the ten different ways that are
possible. This testing functionality in Version 2.1 can also be used to find the
best values to use for the constructor parameters C<entropy_threshold>,
C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.
B<Version 2.0 is a major rewrite of this module.> Now you can use both numeric and
symbolic features for constructing a decision tree. A feature is numeric if it can
take any floating-point value over an interval.
B<Version 1.71> fixes a bug in the code that was triggered by 0 being declared as one of
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
that each dimension of the data measures. You then use the training data to carve up
the feature space into different regions, each corresponding to a different class.
Subsequently, when you try to classify a new data sample, you locate it in the
feature space and find the class label of the region to which it belongs. One can
also give the new data point the same class label as that of the nearest training
sample. This is referred to as the nearest neighbor classification. There exist
hundreds of variations of varying power on these two basic approaches to the
classification of multidimensional data.
A decision tree classifier works differently. When you construct a decision tree,
you select for the root node a feature test that partitions the training data in a
way that causes maximal disambiguation of the class labels associated with the data.
In terms of information content as measured by entropy, such a feature test would
cause maximum reduction in class entropy in going from all of the training data taken
together to the data as partitioned by the feature test. You then drop from the root
node a set of child nodes, one for each partition of the training data created by the
feature test at the root node. When your features are purely symbolic, you'll have
one child node for each value of the feature chosen for the feature test at the root.
When the test at the root involves a numeric feature, you find the decision threshold
for the feature that best bipartitions the data and you drop from the root node two
child nodes, one for each partition. Now at each child node you pose the same
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
=head1 TESTING THE QUALITY OF YOUR TRAINING DATA
Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
training data to test its ability to discriminate between the classes mentioned in
the training file.
The 10-fold cross-validation test divides all of the training data into ten parts,
with nine parts used for training a decision tree and one part used for testing its
ability to classify correctly. This selection of nine parts for training and one part
for testing is carried out in all of the ten different possible ways.
The following code fragment illustrates how you invoke the testing function of the
EvalTrainingData class:
my $training_datafile = "training.csv";
my $eval_data = EvalTrainingData->new(
training_datafile => $training_datafile,
csv_class_column_index => 1,
csv_columns_for_features => [2,3],
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
See the example scripts in the directory C<bagging_examples> for how to call these
methods for classifying individual samples and for bulk classification when you place
all your test samples in a single file.
=head1 USING BOOSTING
Starting with Version 3.20, you can use the class C<BoostedDecisionTree> for
constructing a boosted decision-tree classifier. Boosting results in a cascade of
decision trees in which each decision tree is constructed with samples that are
mostly those that are misclassified by the previous decision tree. To be precise,
you create a probability distribution over the training samples for the selection of
samples for training each decision tree in the cascade. To start out, the
distribution is uniform over all of the samples. Subsequently, this probability
distribution changes according to the misclassifications by each tree in the cascade:
if a sample is misclassified by a given tree in the cascade, the probability of its
being selected for training the next tree is increased significantly. You also
associate a trust factor with each decision tree depending on its power to classify
correctly all of the training data samples. After a cascade of decision trees is
constructed in this manner, you construct a final classifier that calculates the
class label for a test data sample by taking into account the classification
decisions made by each individual tree in the cascade, the decisions being weighted
by the trust factors associated with the individual classifiers. These boosting
notions --- generally referred to as the AdaBoost algorithm --- are based on a now
celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
large database of training samples:
(1) Consider a two-class problem for which the training database is grossly
imbalanced in how many majority-class samples it contains vis-a-vis the number of
minority class samples. Let's assume for a moment that the ratio of majority class
samples to minority class samples is 1000 to 1. Let's also assume that you have a
test dataset that is drawn randomly from the same population mixture from which the
training database was created. Now consider a stupid data classification program
that classifies everything as belonging to the majority class. If you measure the
classification accuracy rate as the ratio of the number of samples correctly
classified to the total number of test samples selected randomly from the population,
this classifier would work with an accuracy of about 99.9%.
(2) Let's now consider another situation in which we are faced with a huge training
database but in which every class is equally well represented. Feeding all the data
into a single decision tree would be akin to polling all of the population of the
United States for measuring the Coke-versus-Pepsi preference in the country. You are
likely to get better results if you construct multiple decision trees, each trained
with a collection of training samples drawn randomly from the training database.
After you have created all the decision trees, your final classification decision
could then be based on, say, majority voting by the trees.
lib/Algorithm/RandomizedTreesForBigData.pm view on Meta::CPAN
package Algorithm::RandomizedTreesForBigData;
#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::RandomizedTreesForBigData is a Perl module for inducing multiple decision
# trees using randomized selection of samples from a large training data file.
# -------------------------------------------------------------------------------------
#use lib 'blib/lib', 'blib/arch';
#use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(pairmap);
use Algorithm::DecisionTree 3.43;