Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
sub check_for_illegal_params2 {
my @params = @_;
my @legal_params = qw / training_datafile
entropy_threshold
max_depth_desired
csv_class_column_index
csv_columns_for_features
symbolic_to_numeric_cardinality_threshold
number_of_histogram_bins
csv_cleanup_needed
debug1
debug2
debug3
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub print_array_with_msg {
my $message = shift;
my $arr = shift;
print "\n$message: ";
print_nested_array( $arr );
}
sub print_nested_array {
my $arr = shift;
my @arr = @$arr;
print "[";
foreach my $item (@arr) {
if (ref $item) {
print_nested_array($item);
} else {
print "$item";
}
}
print "]";
}
sub cleanup_csv {
my $line = shift;
$line =~ tr/\/:?()[]{}'/ /;
# my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
for (@double_quoted) {
my $item = $_;
$item = substr($item, 1, -1);
$item =~ s/^\s+|,|\s+$//g;
$item = join '_', split /\s+/, $item;
substr($line, index($line, $_), length($_)) = $item;
}
my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
for (@white_spaced) {
my $item = $_;
$item =~ s/\s+/_/g;
$item =~ s/^\s*_|_\s*$//g;
substr($line, index($line, $_), length($_)) = $item;
}
$line =~ s/,\s*(?=,|$)/,NA/g;
return $line;
}
######################################### Class EvalTrainingData ########################################
## This subclass of the DecisionTree class is used to evaluate the quality of your
## training data by running a 10-fold cross-validation test on it. This test divides
## all of the training data into ten parts, with nine parts used for training a
## decision tree and one part used for testing its ability to classify correctly.
## This selection of nine parts for training and one part for testing is carried out
## in all of the ten different possible ways. This testing functionality can also
## be used to find the best values to use for the constructor parameters
## entropy_threshold, max_depth_desired, and
## symbolic_to_numeric_cardinality_threshold.
## Only the CSV training files can be evaluated in this manner (because only CSV
## training are allowed to have numeric features --- which is the more interesting
## case for evaluation analytics.
package EvalTrainingData;
@EvalTrainingData::ISA = ('Algorithm::DecisionTree');
sub new {
my $class = shift;
my $instance = Algorithm::DecisionTree->new(@_);
bless $instance, $class;
}
sub evaluate_training_data {
my $self = shift;
my $evaldebug = 0;
die "The data evaluation function in the module can only be used when your " .
"training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
print "\nWill run a 10-fold cross-validation test on your training data to test its " .
"class-discriminatory power:\n";
my %all_training_data = %{$self->{_training_data_hash}};
my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
my $fold_size = int(0.1 * (scalar keys %all_training_data));
print "fold size: $fold_size\n";
my %confusion_matrix = ();
foreach my $class_name (@{$self->{_class_names}}) {
foreach my $inner_class_name (@{$self->{_class_names}}) {
$confusion_matrix{$class_name}->{$inner_class_name} = 0;
}
}
foreach my $fold_index (0..9) {
print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
@all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
my %testing_data = ();
foreach my $x (@testing_samples) {
$testing_data{$x} = $all_training_data{$x};
}
my %training_data = ();
foreach my $x (@training_samples) {
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
return $self->{ _class_probabilities};
}
sub get_branch_features_and_values_or_thresholds {
my $self = shift;
return $self->{_branch_features_and_values_or_thresholds};
}
sub add_to_branch_features_and_values {
my $self = shift;
my $feature_and_value = shift;
push @{$self->{ _branch_features_and_values }}, $feature_and_value;
}
sub get_children {
my $self = shift;
return $self->{_linked_to};
}
sub add_child_link {
my ($self, $new_node, ) = @_;
push @{$self->{_linked_to}}, $new_node;
}
sub delete_all_links {
my $self = shift;
$self->{_linked_to} = undef;
}
sub display_node {
my $self = shift;
my $feature_at_node = $self->get_feature() || " ";
my $node_creation_entropy_at_node = $self->get_node_entropy();
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
my @class_probabilities = @{$self->get_class_probabilities()};
my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
my $serial_num = $self->get_serial_num();
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
print "\n\nNODE $serial_num" .
":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
"\n Class probabilities at current node: @class_probabilities_for_display" .
"\n Entropy at current node: $print_node_creation_entropy_at_node" .
"\n Best feature test at current node: $feature_at_node\n\n";
}
sub display_decision_tree {
my $self = shift;
my $offset = shift;
my $serial_num = $self->get_serial_num();
if (@{$self->get_children()} > 0) {
my $feature_at_node = $self->get_feature() || " ";
my $node_creation_entropy_at_node = $self->get_node_entropy();
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
my @class_probabilities = @{$self->get_class_probabilities()};
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
my @class_names = @{$self->get_class_names()};
my @print_class_probabilities_with_class =
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
"$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
$offset .= " ";
foreach my $child (@{$self->get_children()}) {
$child->display_decision_tree($offset);
}
} else {
my $node_creation_entropy_at_node = $self->get_node_entropy();
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
my @class_probabilities = @{$self->get_class_probabilities()};
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
my @class_names = @{$self->get_class_names()};
my @print_class_probabilities_with_class =
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
"Class Probs: @print_class_probabilities_with_class\n\n";
}
}
############################## Generate Your Own Numeric Training Data #################################
############################# Class TrainingDataGeneratorNumeric ################################
## See the script generate_training_data_numeric.pl in the examples
## directory on how to use this class for generating your own numeric training and
## test data. The training and test data are generated in accordance with the
## specifications you place in the parameter file that is supplied as an argument to
## the constructor of this class.
package TrainingDataGeneratorNumeric;
use strict;
use Carp;
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params3(@params) == 0;
bless {
_output_training_csv_file => $args{'output_training_csv_file'}
|| croak("name for output_training_csv_file required"),
_output_test_csv_file => $args{'output_test_csv_file'}
|| croak("name for output_test_csv_file required"),
_parameter_file => $args{'parameter_file'}
|| croak("parameter_file required"),
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
|| croak("number_of_samples_for_training"),
_number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
|| croak("number_of_samples_for_testing"),
_debug => $args{debug} || 0,
_class_names => [],
_class_names_and_priors => {},
_features_with_value_range => {},
_features_ordered => [],
_classes_and_their_param_values => {},
}, $class;
}
sub check_for_illegal_params3 {
my @params = @_;
my @legal_params = qw / output_training_csv_file
output_test_csv_file
parameter_file
number_of_samples_for_training
number_of_samples_for_testing
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
"--- perhaps a misspelling\n"
if check_for_illegal_params4(@params) == 0;
bless {
_output_training_datafile => $args{'output_training_datafile'}
|| die("name for output_training_datafile required"),
_parameter_file => $args{'parameter_file'}
|| die("parameter_file required"),
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
|| die("number_of_samples_for_training required"),
_debug => $args{debug} || 0,
_class_names => [],
_class_priors => [],
_features_and_values_hash => {},
_bias_hash => {},
_training_sample_records => {},
}, $class;
}
sub check_for_illegal_params4 {
my @params = @_;
my @legal_params = qw / output_training_datafile
parameter_file
number_of_samples_for_training
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
## Read a parameter file for generating symbolic training data. See the script
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
## pass the name of the parameter file to the constructor of the
## TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
my $self = shift;
my $debug = $self->{_debug};
my $number_of_training_samples = $self->{_number_of_samples_for_training};
my $input_parameter_file = $self->{_parameter_file};
croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
my $output_file_training = $self->{_output_training_datafile};
my $output_file_testing = $self->{_output_test_datafile};
my @all_params;
my $param_string;
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
$param_string = join ' ', @all_params;
my ($class_names, $class_priors, $rest_param) =
$param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
push @{$self->{_class_names}}, @class_names;
my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
push @{$self->{_class_priors}}, @class_priors;
my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
my %features_and_values_hash;
my @features = split /(feature[:])/, $feature_string;
@features = grep {defined($_) && length($_) > 0} @features;
foreach my $item (@features) {
next if $item =~ /feature/;
my @splits = split / /, $item;
@splits = grep {defined($_) && length($_) > 0} @splits;
foreach my $i (0..@splits-1) {
if ($i == 0) {
$features_and_values_hash{$splits[0]} = [];
} else {
next if $splits[$i] =~ /values/;
push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
}
}
}
$self->{_features_and_values_hash} = \%features_and_values_hash;
my %bias_hash = %{$self->{_bias_hash}};
my @biases = split /(bias[:]\s*class[:])/, $bias_string;
@biases = grep {defined($_) && length($_) > 0} @biases;
foreach my $item (@biases) {
next if $item =~ /bias/;
my @splits = split /\s+/, $item;
@splits = grep {defined($_) && length($_) > 0} @splits;
my $feature_name;
foreach my $i (0..@splits-1) {
if ($i == 0) {
$bias_hash{$splits[0]} = {};
} elsif ($splits[$i] =~ /(^.+)[:]$/) {
$feature_name = $1;
$bias_hash{$splits[0]}->{$feature_name} = [];
} else {
next if !defined $feature_name;
push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
if defined $feature_name;
}
}
}
$self->{_bias_hash} = \%bias_hash;
if ($debug) {
print "\n\nClass names: @class_names\n";
my $num_of_classes = @class_names;
print "Class priors: @class_priors\n";
print "Number of classes: $num_of_classes\n";
print "\nHere are the features and their possible values:\n";
while ( my ($k, $v) = each %features_and_values_hash ) {
print "$k ===> @$v\n";
}
print "\nHere is the biasing for each class:\n";
while ( my ($k, $v) = each %bias_hash ) {
print "$k:\n";
while ( my ($k1, $v1) = each %$v ) {
print " $k1 ===> @$v1\n";
}
}
}
}
## This method generates training data according to the specifications placed in a
## parameter file that is read by the previous method.
sub gen_symbolic_training_data {
my $self = shift;
my @class_names = @{$self->{_class_names}};
my @class_priors = @{$self->{_class_priors}};
my %training_sample_records;
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
my %bias_hash = %{$self->{_bias_hash}};
my $how_many_training_samples = $self->{_number_of_samples_for_training};
my $how_many_test_samples = $self->{_number_of_samples_for_testing};
my %class_priors_to_unit_interval_map;
my $accumulated_interval = 0;
foreach my $i (0..@class_names-1) {
$class_priors_to_unit_interval_map{$class_names[$i]}
= [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
$accumulated_interval += $class_priors[$i];
}
if ($self->{_debug}) {
print "Mapping of class priors to unit interval: \n";
while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
print "$k => @$v\n";
}
print "\n\n";
}
my $ele_index = 0;
while ($ele_index < $how_many_training_samples) {
my $sample_name = "sample" . "_$ele_index";
$training_sample_records{$sample_name} = [];
# Generate class label for this training sample:
my $roll_the_dice = rand(1.0);
my $class_label;
foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
my $v = $class_priors_to_unit_interval_map{$class_name};
if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
push @{$training_sample_records{$sample_name}},
"class=" . $class_name;
$class_label = $class_name;
last;
}
}
foreach my $feature (keys %features_and_values_hash) {
my @values = @{$features_and_values_hash{$feature}};
my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
my $no_bias = 1.0 / @values;
$bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
my %value_priors_to_unit_interval_map;
my @splits = split /\s*=\s*/, $bias_string;
my $chosen_for_bias_value = $splits[0];
my $chosen_bias = $splits[1];
my $remaining_bias = 1 - $chosen_bias;
my $remaining_portion_bias = $remaining_bias / (@values -1);
@splits = grep {defined($_) && length($_) > 0} @splits;
my $accumulated = 0;
foreach my $i (0..@values-1) {
if ($values[$i] eq $chosen_for_bias_value) {
$value_priors_to_unit_interval_map{$values[$i]}
= [$accumulated, $accumulated + $chosen_bias];
$accumulated += $chosen_bias;
} else {
$value_priors_to_unit_interval_map{$values[$i]}
= [$accumulated, $accumulated + $remaining_portion_bias];
$accumulated += $remaining_portion_bias;
}
}
my $roll_the_dice = rand(1.0);
my $value_label;
foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
my $v = $value_priors_to_unit_interval_map{$value_name};
if ( ($roll_the_dice >= $v->[0])
&& ($roll_the_dice <= $v->[1]) ) {
push @{$training_sample_records{$sample_name}},
$feature . "=" . $value_name;
$value_label = $value_name;
last;
}
}
if ($self->{_debug}) {
print "mapping feature value priors for '$feature' " .
"to unit interval: \n";
while ( my ($k, $v) =
each %value_priors_to_unit_interval_map ) {
print "$k => @$v\n";
}
print "\n\n";
}
}
$ele_index++;
}
$self->{_training_sample_records} = \%training_sample_records;
if ($self->{_debug}) {
print "\n\nPRINTING TRAINING RECORDS:\n\n";
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
print "$kee => @{$training_sample_records{$kee}}\n\n";
}
}
my $output_training_file = $self->{_output_training_datafile};
print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
open FILEHANDLE, ">$output_training_file";
my @features = sort keys %features_and_values_hash;
my $title_string = ',class';
foreach my $feature_name (@features) {
$title_string .= ',' . $feature_name;
}
print FILEHANDLE "$title_string\n";
my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
my $record_string = '';
foreach my $sample_name (@sample_names) {
$record_string .= "$sample_name,";
my @record = @{$training_sample_records{"sample_$sample_name"}};
my %item_parts_hash;
foreach my $item (@record) {
my @splits = grep $_, split /=/, $item;
( run in 1.318 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )