Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
if check_for_illegal_params2(@params) == 0;
}
bless {
_training_datafile => $args{training_datafile},
_entropy_threshold => $args{entropy_threshold} || 0.01,
_max_depth_desired => exists $args{max_depth_desired} ?
$args{max_depth_desired} : undef,
_debug1 => $args{debug1} || 0,
_debug2 => $args{debug2} || 0,
_debug3 => $args{debug3} || 0,
_csv_class_column_index => $args{csv_class_column_index} || undef,
_csv_columns_for_features => $args{csv_columns_for_features} || undef,
_symbolic_to_numeric_cardinality_threshold
=> $args{symbolic_to_numeric_cardinality_threshold} || 10,
_number_of_histogram_bins => $args{number_of_histogram_bins} || undef,
_csv_cleanup_needed => $args{csv_cleanup_needed} || 0,
_training_data => [],
_root_node => undef,
_probability_cache => {},
_entropy_cache => {},
_training_data_hash => {},
_features_and_values_hash => {},
_samples_class_label_hash => {},
_class_names => [],
_class_priors => [],
_class_priors_hash => {},
_feature_names => [],
_numeric_features_valuerange_hash => {},
_sampling_points_for_numeric_feature_hash => {},
_feature_values_how_many_uniques_hash => {},
_prob_distribution_numeric_features_hash => {},
_histogram_delta_hash => {},
_num_of_histogram_bins_hash => {},
}, $class;
}
#################################### Classify with Decision Tree #######################################
## Classifies one test sample at a time using the decision tree constructed from
## your training file. The data record for the test sample must be supplied as
## shown in the scripts in the `examples' subdirectory. See the scripts
## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
sub classify {
my $self = shift;
my $root_node = shift;
my $feature_and_values = shift;
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
my @features_and_values = @$feature_and_values;
@features_and_values = @{deep_copy_array(\@features_and_values)};
die "\n\nError in the names you have used for features and/or values. " .
"Try using the csv_cleanup_needed option in the constructor call."
unless $self->check_names_used(\@features_and_values);
my @new_features_and_values = ();
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $feature_and_value (@features_and_values) {
$feature_and_value =~ /$pattern/;
my ($feature, $value) = ($1, $2);
my $newvalue = $value;
my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
my $not_all_values_float = 0;
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
$not_all_values_float == 0) {
$newvalue = closest_sampling_point($value, \@unique_values_for_feature);
}
push @new_features_and_values, "$feature" . '=' . "$newvalue";
}
@features_and_values = @new_features_and_values;
print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
my %answer = ();
foreach my $class_name (@{$self->{_class_names}}) {
$answer{$class_name} = undef;
}
$answer{'solution_path'} = [];
my %classification = %{$self->recursive_descent_for_classification($root_node,
\@features_and_values,\%answer)};
@{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
if ($self->{_debug3}) {
print "\nCL2 The classification:\n";
foreach my $class_name (@{$self->{_class_names}}) {
print " $class_name with probability $classification{$class_name}\n";
}
}
my %classification_for_display = ();
foreach my $item (keys %classification) {
if ($item ne 'solution_path') {
$classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
} else {
my @outlist = ();
foreach my $x (@{$classification{$item}}) {
push @outlist, "NODE$x";
}
$classification_for_display{$item} = \@outlist;
}
}
return \%classification_for_display;
}
sub recursive_descent_for_classification {
my $self = shift;
my $node = shift;
my $features_and_values = shift;
my $answer = shift;
my @features_and_values = @$features_and_values;
my %answer = %$answer;
my @children = @{$node->get_children()};
if (@children == 0) {
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
foreach my $i (0..@{$self->{_class_names}}-1) {
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
}
push @{$answer{'solution_path'}}, $node->get_serial_num();
return \%answer;
}
my $feature_tested_at_node = $node->get_feature();
print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
if $self->{_debug3};
my $value_for_feature;
my $path_found;
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $feature_and_value (@features_and_values) {
$feature_and_value =~ /$pattern/;
$value_for_feature = $2 if $feature_tested_at_node eq $1;
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
} else {
@extended_branch_features_and_values_or_thresholds =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @extended_branch_features_and_values_or_thresholds,
$feature_value_combos[$feature_and_value_index];
}
my @class_probabilities =
map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
\@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
my $class_entropy_for_child =
$self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
\@extended_branch_features_and_values_or_thresholds);
if ($self->{_debug3}) {
print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
print "\nRD20 class entropy for child: $class_entropy_for_child\n";
}
if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
my $child_node = DTNode->new(undef, $class_entropy_for_child,
\@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
$node->add_child_link($child_node);
$self->recursive_descent($child_node);
} else {
print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
}
}
}
} else {
print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
if $self->{_debug3};
return;
}
}
## This is the heart of the decision tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples that
## correspond to the current node. The search for the best feature is carried out
## differently for symbolic features and for numeric features. For a symbolic
## feature, the method estimates the entropy for each value of the feature and then
## averages out these entropies as a measure of the discriminatory power of that
## features. For a numeric feature, on the other hand, it estimates the entropy
## reduction that can be achieved if were to partition the set of training samples
## for each possible threshold. For a numeric feature, all possible sampling points
## relevant to the node in question are considered as candidates for thresholds.
sub best_feature_calculator {
my $self = shift;
my $features_and_values_or_thresholds_on_branch = shift;
my $existing_node_entropy = shift;
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @all_symbolic_features = ();
foreach my $feature_name (@{$self->{_feature_names}}) {
push @all_symbolic_features, $feature_name
if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
}
my @symbolic_features_already_used = ();
foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
}
my @symbolic_features_not_yet_used;
foreach my $x (@all_symbolic_features) {
push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
}
my @true_numeric_types = ();
my @symbolic_types = ();
my @true_numeric_types_feature_names = ();
my @symbolic_types_feature_names = ();
foreach my $item (@features_and_values_or_thresholds_on_branch) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
my (%upperbound, %lowerbound);
foreach my $feature (@true_numeric_types_feature_names) {
$upperbound{$feature} = undef;
$lowerbound{$feature} = undef;
}
foreach my $item (@bounded_intervals_numeric_types) {
foreach my $feature_grouping (@$item) {
if ($feature_grouping->[1] eq '>') {
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
} else {
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
}
}
}
my %entropy_values_for_different_features = ();
my %partitioning_point_child_entropies_hash = ();
my %partitioning_point_threshold = ();
my %entropies_for_different_values_of_symbolic_feature = ();
foreach my $feature (@{$self->{_feature_names}}) {
$entropy_values_for_different_features{$feature} = [];
$partitioning_point_child_entropies_hash{$feature} = {};
$partitioning_point_threshold{$feature} = undef;
$entropies_for_different_values_of_symbolic_feature{$feature} = [];
}
foreach my $i (0..@{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
if (contained_in($feature_name, @symbolic_features_already_used)) {
next;
} elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
$self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
$self->{_symbolic_to_numeric_cardinality_threshold}) {
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
}
die unless $answer =~ /y(es)?/i;
}
}
#################################### Read Training Data From File ######################################
sub get_training_data {
my $self = shift;
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
my %class_names = ();
my %all_record_ids_with_class_labels;
my $firstline;
my %data_hash;
$|++;
open FILEIN, $self->{_training_datafile} || die "unable to open $self->{_training_datafile}: $!";
my $record_index = 0;
my $firsetline;
while (<FILEIN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
if ($record_index == 0) {
$firstline = $record;
$record_index++;
next;
}
my @parts = split /,/, $record;
my $classname = $parts[$self->{_csv_class_column_index}];
$class_names{$classname} = 1;
my $record_label = shift @parts;
$record_label =~ s/^\s*\"|\"\s*$//g;
$data_hash{$record_label} = \@parts;
$all_record_ids_with_class_labels{$record_label} = $classname;
print "." if $record_index % 10000 == 0;
$record_index++;
}
close FILEIN;
$|--;
$self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
my @sample_names = map {"sample_$_"} keys %data_hash;
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
my %numeric_features_valuerange_hash = ();
my %feature_values_how_many_uniques_hash = ();
my %features_and_unique_values_hash = ();
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $feature (keys %features_and_values_hash) {
my %seen1 = ();
my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
@{$features_and_values_hash{$feature}};
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
my $not_all_values_float = 0;
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
if ($not_all_values_float == 0) {
my @minmaxvalues = minmax(\@unique_values_for_feature);
$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
}
$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($self->{_debug1}) {
print "\nAll class names: @all_class_names\n";
print "\nEach sample data record:\n";
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
}
print "\nclass label for each data sample:\n";
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
print "$sample => $class_for_sample_hash{$sample}\n";
}
print "\nFeatures used: @feature_names\n\n";
print "\nfeatures and the values taken by them:\n";
foreach my $feature (sort keys %features_and_values_hash) {
print "$feature => @{$features_and_values_hash{$feature}}\n";
}
print "\nnumeric features and their ranges:\n";
foreach my $feature (sort keys %numeric_features_valuerange_hash) {
print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
}
print "\nnumber of unique values in each feature:\n";
foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
}
}
$self->{_class_names} = \@all_class_names;
$self->{_feature_names} = \@feature_names;
$self->{_samples_class_label_hash} = \%class_for_sample_hash;
$self->{_training_data_hash} = \%feature_values_for_samples_hash;
$self->{_features_and_values_hash} = \%features_and_values_hash;
$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
}
sub show_training_data {
my $self = shift;
my @class_names = @{$self->{_class_names}};
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
my %training_data_hash = %{$self->{_training_data_hash}};
print "\n\nClass Names: @class_names\n";
print "\n\nFeatures and Their Values:\n\n";
while ( my ($k, $v) = each %features_and_values_hash ) {
print "$k ---> @{$features_and_values_hash{$k}}\n";
}
print "\n\nSamples vs. Class Labels:\n\n";
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
print "$kee => $samples_class_label_hash{$kee}\n";
}
print "\n\nTraining Samples:\n\n";
foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
keys %training_data_hash) {
print "$kee => @{$training_data_hash{$kee}}\n";
}
}
( run in 0.464 second using v1.01-cache-2.11-cpan-5a3173703d6 )