Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
## samples. Your training and test data can be of any number of of dimensions, can
## have any mean, and any covariance. The training and test data must obviously be
## drawn from the same distribution.
sub gen_numeric_training_and_test_data_and_write_to_csv {
use Math::Random;
my $self = shift;
my %training_samples_for_class;
my %test_samples_for_class;
foreach my $class_name (@{$self->{_class_names}}) {
$training_samples_for_class{$class_name} = [];
$test_samples_for_class{$class_name} = [];
}
foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
my @new_training_data = Math::Random::random_multivariate_normal(
$self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
@mean, @covariance );
my @new_test_data = Math::Random::random_multivariate_normal(
$self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
@mean, @covariance );
if ($self->{_debug}) {
print "training data for class $class_name:\n";
foreach my $x (@new_training_data) {print "@$x\n";}
print "\n\ntest data for class $class_name:\n";
foreach my $x (@new_test_data) {print "@$x\n";}
}
$training_samples_for_class{$class_name} = \@new_training_data;
$test_samples_for_class{$class_name} = \@new_test_data;
}
my @training_data_records = ();
my @test_data_records = ();
foreach my $class_name (keys %training_samples_for_class) {
my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
$self->{_class_names_and_priors}->{$class_name};
my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
$self->{_class_names_and_priors}->{$class_name};
foreach my $sample_index (0..$num_of_samples_for_training-1) {
my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
@training_vector = map {sprintf("%.3f", $_)} @training_vector;
my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
push @training_data_records, $training_data_record;
}
foreach my $sample_index (0..$num_of_samples_for_testing-1) {
my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
@test_vector = map {sprintf("%.3f", $_)} @test_vector;
my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
push @test_data_records, $test_data_record;
}
}
fisher_yates_shuffle(\@training_data_records);
fisher_yates_shuffle(\@test_data_records);
if ($self->{_debug}) {
foreach my $record (@training_data_records) {
print "$record";
}
foreach my $record (@test_data_records) {
print "$record";
}
}
open OUTPUT, ">$self->{_output_training_csv_file}";
my @feature_names_training = @{$self->{_features_ordered}};
my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
print OUTPUT "$first_row_training\n";
foreach my $i (0..@training_data_records-1) {
my $i1 = $i+1;
my $sample_record = "\"$i1\",$training_data_records[$i]";
print OUTPUT "$sample_record";
}
close OUTPUT;
open OUTPUT, ">$self->{_output_test_csv_file}";
my @feature_names_testing = keys %{$self->{_features_with_value_range}};
my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
print OUTPUT "$first_row_testing\n";
foreach my $i (0..@test_data_records-1) {
my $i1 = $i+1;
my $sample_record = "\"$i1\",$test_data_records[$i]";
print OUTPUT "$sample_record";
}
close OUTPUT;
}
# from perl docs:
sub fisher_yates_shuffle {
my $arr = shift;
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
########################### Generate Your Own Symbolic Training Data ###############################
########################### Class TrainingDataGeneratorSymbolic #############################
## See the sample script generate_training_and_test_data_symbolic.pl for how to use
## this class for generating purely symbolic training and test data. The data is
## generated according to the specifications you place in a parameter file whose
## name you supply as one of constructor arguments.
package TrainingDataGeneratorSymbolic;
use strict;
use Carp;
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params4(@params) == 0;
bless {
_output_training_datafile => $args{'output_training_datafile'}
|| die("name for output_training_datafile required"),
_parameter_file => $args{'parameter_file'}
|| die("parameter_file required"),
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
|| die("number_of_samples_for_training required"),
_debug => $args{debug} || 0,
_class_names => [],
_class_priors => [],
_features_and_values_hash => {},
_bias_hash => {},
_training_sample_records => {},
}, $class;
}
sub check_for_illegal_params4 {
my @params = @_;
my @legal_params = qw / output_training_datafile
parameter_file
number_of_samples_for_training
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
## Read a parameter file for generating symbolic training data. See the script
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
## pass the name of the parameter file to the constructor of the
## TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
my $self = shift;
my $debug = $self->{_debug};
my $number_of_training_samples = $self->{_number_of_samples_for_training};
my $input_parameter_file = $self->{_parameter_file};
croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
my $output_file_training = $self->{_output_training_datafile};
my $output_file_testing = $self->{_output_test_datafile};
my @all_params;
my $param_string;
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
$param_string = join ' ', @all_params;
my ($class_names, $class_priors, $rest_param) =
$param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
push @{$self->{_class_names}}, @class_names;
my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
push @{$self->{_class_priors}}, @class_priors;
my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
my %features_and_values_hash;
my @features = split /(feature[:])/, $feature_string;
@features = grep {defined($_) && length($_) > 0} @features;
foreach my $item (@features) {
next if $item =~ /feature/;
my @splits = split / /, $item;
@splits = grep {defined($_) && length($_) > 0} @splits;
foreach my $i (0..@splits-1) {
if ($i == 0) {
$features_and_values_hash{$splits[0]} = [];
} else {
next if $splits[$i] =~ /values/;
push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
}
}
}
$self->{_features_and_values_hash} = \%features_and_values_hash;
my %bias_hash = %{$self->{_bias_hash}};
my @biases = split /(bias[:]\s*class[:])/, $bias_string;
@biases = grep {defined($_) && length($_) > 0} @biases;
foreach my $item (@biases) {
next if $item =~ /bias/;
my @splits = split /\s+/, $item;
@splits = grep {defined($_) && length($_) > 0} @splits;
my $feature_name;
foreach my $i (0..@splits-1) {
if ($i == 0) {
$bias_hash{$splits[0]} = {};
} elsif ($splits[$i] =~ /(^.+)[:]$/) {
$feature_name = $1;
$bias_hash{$splits[0]}->{$feature_name} = [];
} else {
next if !defined $feature_name;
push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
if defined $feature_name;
}
}
}
$self->{_bias_hash} = \%bias_hash;
if ($debug) {
print "\n\nClass names: @class_names\n";
my $num_of_classes = @class_names;
print "Class priors: @class_priors\n";
print "Number of classes: $num_of_classes\n";
print "\nHere are the features and their possible values:\n";
while ( my ($k, $v) = each %features_and_values_hash ) {
print "$k ===> @$v\n";
}
print "\nHere is the biasing for each class:\n";
while ( my ($k, $v) = each %bias_hash ) {
( run in 1.628 second using v1.01-cache-2.11-cpan-e1769b4cff6 )