Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
my $min;
my $index;
foreach my $i (0..@{$arr}-1) {
if ( (!defined $min) || ($arr->[$i] < $min) ) {
$index = $i;
$min = $arr->[$i];
}
}
return ($min, $index);
}
# Returns an array of two values, the min and the max, of an array of floats
sub minmax {
my $arr = shift;
my ($min, $max);
foreach my $i (0..@{$arr}-1) {
if ( (!defined $min) || ($arr->[$i] < $min) ) {
$min = $arr->[$i];
}
if ( (!defined $max) || ($arr->[$i] > $max) ) {
$max = $arr->[$i];
}
}
return ($min, $max);
}
# checks whether an element is in an array:
sub contained_in {
my $ele = shift;
my @array = @_;
my $count = 0;
map {$count++ if $ele eq $_} @array;
return $count;
}
# Meant only for an array of strings (no nesting):
sub deep_copy_array {
my $ref_in = shift;
my $ref_out;
return [] if scalar @$ref_in == 0;
foreach my $i (0..@{$ref_in}-1) {
$ref_out->[$i] = $ref_in->[$i];
}
return $ref_out;
}
sub check_for_illegal_params2 {
my @params = @_;
my @legal_params = qw / training_datafile
entropy_threshold
max_depth_desired
csv_class_column_index
csv_columns_for_features
symbolic_to_numeric_cardinality_threshold
number_of_histogram_bins
csv_cleanup_needed
debug1
debug2
debug3
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub print_array_with_msg {
my $message = shift;
my $arr = shift;
print "\n$message: ";
print_nested_array( $arr );
}
sub print_nested_array {
my $arr = shift;
my @arr = @$arr;
print "[";
foreach my $item (@arr) {
if (ref $item) {
print_nested_array($item);
} else {
print "$item";
}
}
print "]";
}
sub cleanup_csv {
my $line = shift;
$line =~ tr/\/:?()[]{}'/ /;
# my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
for (@double_quoted) {
my $item = $_;
$item = substr($item, 1, -1);
$item =~ s/^\s+|,|\s+$//g;
$item = join '_', split /\s+/, $item;
substr($line, index($line, $_), length($_)) = $item;
}
my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
for (@white_spaced) {
my $item = $_;
$item =~ s/\s+/_/g;
$item =~ s/^\s*_|_\s*$//g;
substr($line, index($line, $_), length($_)) = $item;
}
$line =~ s/,\s*(?=,|$)/,NA/g;
return $line;
}
######################################### Class EvalTrainingData ########################################
## This subclass of the DecisionTree class is used to evaluate the quality of your
## training data by running a 10-fold cross-validation test on it. This test divides
## all of the training data into ten parts, with nine parts used for training a
## decision tree and one part used for testing its ability to classify correctly.
## This selection of nine parts for training and one part for testing is carried out
## in all of the ten different possible ways. This testing functionality can also
## be used to find the best values to use for the constructor parameters
## entropy_threshold, max_depth_desired, and
## symbolic_to_numeric_cardinality_threshold.
## Only the CSV training files can be evaluated in this manner (because only CSV
## training are allowed to have numeric features --- which is the more interesting
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
my @class_names = @{$self->get_class_names()};
my @print_class_probabilities_with_class =
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
"Class Probs: @print_class_probabilities_with_class\n\n";
}
}
############################## Generate Your Own Numeric Training Data #################################
############################# Class TrainingDataGeneratorNumeric ################################
## See the script generate_training_data_numeric.pl in the examples
## directory on how to use this class for generating your own numeric training and
## test data. The training and test data are generated in accordance with the
## specifications you place in the parameter file that is supplied as an argument to
## the constructor of this class.
package TrainingDataGeneratorNumeric;
use strict;
use Carp;
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params3(@params) == 0;
bless {
_output_training_csv_file => $args{'output_training_csv_file'}
|| croak("name for output_training_csv_file required"),
_output_test_csv_file => $args{'output_test_csv_file'}
|| croak("name for output_test_csv_file required"),
_parameter_file => $args{'parameter_file'}
|| croak("parameter_file required"),
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
|| croak("number_of_samples_for_training"),
_number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
|| croak("number_of_samples_for_testing"),
_debug => $args{debug} || 0,
_class_names => [],
_class_names_and_priors => {},
_features_with_value_range => {},
_features_ordered => [],
_classes_and_their_param_values => {},
}, $class;
}
sub check_for_illegal_params3 {
my @params = @_;
my @legal_params = qw / output_training_csv_file
output_test_csv_file
parameter_file
number_of_samples_for_training
number_of_samples_for_testing
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
## The training data generated by an instance of the class
## TrainingDataGeneratorNumeric is based on the specs you place in a parameter that
## you supply to the class constructor through a constructor variable called
## `parameter_file'. This method is for parsing the parameter file in order to
## order to determine the names to be used for the different data classes, their
## means, and their variances.
sub read_parameter_file_numeric {
my $self = shift;
my @class_names = ();
my %class_names_and_priors = ();
my %features_with_value_range = ();
my %classes_and_their_param_values = ();
# my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
my @params = <FILE>;
my $params = join "", @params;
my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
$params =~ /$regex/si;
my ($class_names, $class_priors) = ($1, $2);
@class_names = split ' ', $class_names;
my @class_priors = split ' ', $class_priors;
foreach my $i (0..@class_names-1) {
$class_names_and_priors{$class_names[$i]} = $class_priors[$i];
}
if ($self->{_debug}) {
foreach my $cname (keys %class_names_and_priors) {
print "$cname => $class_names_and_priors{$cname}\n";
}
}
$regex = 'feature name: \w*.*?value range: [\d\. -]+';
my @features = $params =~ /$regex/gsi;
my @features_ordered;
$regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
foreach my $feature (@features) {
$feature =~ /$regex/i;
my $feature_name = $1;
push @features_ordered, $feature_name;
my @value_range = split ' ', $2;
$features_with_value_range{$feature_name} = \@value_range;
}
if ($self->{_debug}) {
foreach my $fname (keys %features_with_value_range) {
print "$fname => @{$features_with_value_range{$fname}}\n";
}
}
foreach my $i (0..@class_names-1) {
$classes_and_their_param_values{$class_names[$i]} = {};
}
$regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
my @class_params = $params =~ /$regex/gsi;
$regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
foreach my $class_param (@class_params) {
$class_param =~ /$regex/gsi;
my $class_name = $1;
my @class_mean = split ' ', $2;
$classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
my $class_param_string = $3;
my @covar_rows = split '\n', $class_param_string;
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
print OUTPUT "$first_row_testing\n";
foreach my $i (0..@test_data_records-1) {
my $i1 = $i+1;
my $sample_record = "\"$i1\",$test_data_records[$i]";
print OUTPUT "$sample_record";
}
close OUTPUT;
}
# from perl docs:
sub fisher_yates_shuffle {
my $arr = shift;
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
########################### Generate Your Own Symbolic Training Data ###############################
########################### Class TrainingDataGeneratorSymbolic #############################
## See the sample script generate_training_and_test_data_symbolic.pl for how to use
## this class for generating purely symbolic training and test data. The data is
## generated according to the specifications you place in a parameter file whose
## name you supply as one of constructor arguments.
package TrainingDataGeneratorSymbolic;
use strict;
use Carp;
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params4(@params) == 0;
bless {
_output_training_datafile => $args{'output_training_datafile'}
|| die("name for output_training_datafile required"),
_parameter_file => $args{'parameter_file'}
|| die("parameter_file required"),
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
|| die("number_of_samples_for_training required"),
_debug => $args{debug} || 0,
_class_names => [],
_class_priors => [],
_features_and_values_hash => {},
_bias_hash => {},
_training_sample_records => {},
}, $class;
}
sub check_for_illegal_params4 {
my @params = @_;
my @legal_params = qw / output_training_datafile
parameter_file
number_of_samples_for_training
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
## Read a parameter file for generating symbolic training data. See the script
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
## pass the name of the parameter file to the constructor of the
## TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
my $self = shift;
my $debug = $self->{_debug};
my $number_of_training_samples = $self->{_number_of_samples_for_training};
my $input_parameter_file = $self->{_parameter_file};
croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
my $output_file_training = $self->{_output_training_datafile};
my $output_file_testing = $self->{_output_test_datafile};
my @all_params;
my $param_string;
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
$param_string = join ' ', @all_params;
my ($class_names, $class_priors, $rest_param) =
$param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
push @{$self->{_class_names}}, @class_names;
my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
push @{$self->{_class_priors}}, @class_priors;
my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
my %features_and_values_hash;
my @features = split /(feature[:])/, $feature_string;
@features = grep {defined($_) && length($_) > 0} @features;
foreach my $item (@features) {
next if $item =~ /feature/;
my @splits = split / /, $item;
@splits = grep {defined($_) && length($_) > 0} @splits;
foreach my $i (0..@splits-1) {
if ($i == 0) {
$features_and_values_hash{$splits[0]} = [];
} else {
next if $splits[$i] =~ /values/;
push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
}
}
}
$self->{_features_and_values_hash} = \%features_and_values_hash;
my %bias_hash = %{$self->{_bias_hash}};
my @biases = split /(bias[:]\s*class[:])/, $bias_string;
@biases = grep {defined($_) && length($_) > 0} @biases;
foreach my $item (@biases) {
next if $item =~ /bias/;
my @splits = split /\s+/, $item;
@splits = grep {defined($_) && length($_) > 0} @splits;
my $feature_name;
foreach my $i (0..@splits-1) {
if ($i == 0) {
$bias_hash{$splits[0]} = {};
} elsif ($splits[$i] =~ /(^.+)[:]$/) {
$feature_name = $1;
$bias_hash{$splits[0]}->{$feature_name} = [];
} else {
( run in 1.199 second using v1.01-cache-2.11-cpan-6b5c3043376 )