Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

##  samples. Your training and test data can be of any number of of dimensions, can
##  have any mean, and any covariance.  The training and test data must obviously be
##  drawn from the same distribution.
sub gen_numeric_training_and_test_data_and_write_to_csv {
    use Math::Random;
    my $self = shift;
    my %training_samples_for_class;
    my %test_samples_for_class;
    foreach my $class_name (@{$self->{_class_names}}) {
        $training_samples_for_class{$class_name} = [];
        $test_samples_for_class{$class_name} = [];
    }
    foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
        my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
        my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
        my @new_training_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        my @new_test_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        if ($self->{_debug}) {
            print "training data for class $class_name:\n";
            foreach my $x (@new_training_data) {print "@$x\n";}
            print "\n\ntest data for class $class_name:\n";
            foreach my $x (@new_test_data) {print "@$x\n";}
        }
        $training_samples_for_class{$class_name} = \@new_training_data;
        $test_samples_for_class{$class_name} = \@new_test_data;
    }
    my @training_data_records = ();
    my @test_data_records = ();
    foreach my $class_name (keys %training_samples_for_class) {
        my $num_of_samples_for_training = $self->{_number_of_samples_for_training} * 
                                         $self->{_class_names_and_priors}->{$class_name};
        my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} * 
                                         $self->{_class_names_and_priors}->{$class_name};
        foreach my $sample_index (0..$num_of_samples_for_training-1) {
            my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
            @training_vector = map {sprintf("%.3f", $_)} @training_vector;
            my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
            push @training_data_records, $training_data_record;
        }
        foreach my $sample_index (0..$num_of_samples_for_testing-1) {
            my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
            @test_vector = map {sprintf("%.3f", $_)} @test_vector;
            my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
            push @test_data_records, $test_data_record;
        }
    }
    fisher_yates_shuffle(\@training_data_records);
    fisher_yates_shuffle(\@test_data_records);
    if ($self->{_debug}) {
        foreach my $record (@training_data_records) {
            print "$record";
        }
        foreach my $record (@test_data_records) {
            print "$record";
        }
    }
    open OUTPUT, ">$self->{_output_training_csv_file}";
    my @feature_names_training = @{$self->{_features_ordered}};
    my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
    my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
    print OUTPUT "$first_row_training\n";
    foreach my $i (0..@training_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$training_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
    open OUTPUT, ">$self->{_output_test_csv_file}";
    my @feature_names_testing = keys %{$self->{_features_with_value_range}};
    my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
    my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
    print OUTPUT "$first_row_testing\n";
    foreach my $i (0..@test_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$test_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
}

# from perl docs:                                                                         
sub fisher_yates_shuffle {
    my $arr =  shift;
    my $i = @$arr;
    while (--$i) {
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}

###########################  Generate Your Own Symbolic Training Data  ###############################
###########################     Class TrainingDataGeneratorSymbolic      #############################

##  See the sample script generate_training_and_test_data_symbolic.pl for how to use
##  this class for generating purely symbolic training and test data.  The data is
##  generated according to the specifications you place in a parameter file whose
##  name you supply as one of constructor arguments.
package TrainingDataGeneratorSymbolic;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params4(@params) == 0;   
    bless {
        _output_training_datafile          =>   $args{'output_training_datafile'} 
                                                   || die("name for output_training_datafile required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                   || die("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                   || die("number_of_samples_for_training required"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_priors                      =>    [],
        _features_and_values_hash          =>    {},
        _bias_hash                         =>    {},
        _training_sample_records           =>    {},
    }, $class;
}

sub check_for_illegal_params4 {
    my @params = @_;
    my @legal_params = qw / output_training_datafile
                            parameter_file
                            number_of_samples_for_training
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  Read a parameter file for generating symbolic training data. See the script
##  generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
##  pass the name of the parameter file to the constructor of the
##  TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
    my $self = shift;
    my $debug = $self->{_debug};
    my $number_of_training_samples = $self->{_number_of_samples_for_training};
    my $input_parameter_file = $self->{_parameter_file};
    croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
    my $output_file_training = $self->{_output_training_datafile};
    my $output_file_testing = $self->{_output_test_datafile};
    my @all_params;
    my $param_string;
    open INPUT, $input_parameter_file || "unable to open parameter file: $!";
    @all_params = <INPUT>;
    @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
    @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
    $param_string = join ' ', @all_params;
    my ($class_names, $class_priors, $rest_param) = 
              $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
    my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
    push @{$self->{_class_names}}, @class_names;
    my @class_priors =   grep {defined($_) && length($_) > 0} split /\s+/, $2;
    push @{$self->{_class_priors}}, @class_priors;    
    my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
    my %features_and_values_hash;
    my @features = split /(feature[:])/, $feature_string;
    @features = grep {defined($_) && length($_) > 0} @features;
    foreach my $item (@features) {
        next if $item =~ /feature/;
        my @splits = split / /, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $features_and_values_hash{$splits[0]} = [];
            } else {
                next if $splits[$i] =~ /values/;
                push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
            }
        }
    }
    $self->{_features_and_values_hash} = \%features_and_values_hash;
    my %bias_hash = %{$self->{_bias_hash}};
    my @biases = split /(bias[:]\s*class[:])/, $bias_string;
    @biases = grep {defined($_) && length($_) > 0} @biases;
    foreach my $item (@biases) {
        next if $item =~ /bias/;
        my @splits = split /\s+/, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        my $feature_name;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $bias_hash{$splits[0]} = {};
            } elsif ($splits[$i] =~ /(^.+)[:]$/) {
                $feature_name = $1;
                $bias_hash{$splits[0]}->{$feature_name} = [];
            } else {
                next if !defined $feature_name;
                push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
                        if defined $feature_name;
            }
        }
    }
    $self->{_bias_hash} = \%bias_hash;
    if ($debug) {
        print "\n\nClass names: @class_names\n";
        my $num_of_classes = @class_names;
        print "Class priors: @class_priors\n";
        print "Number of classes: $num_of_classes\n";
        print "\nHere are the features and their possible values:\n";
        while ( my ($k, $v) = each %features_and_values_hash ) {
            print "$k ===>  @$v\n";
        }
        print "\nHere is the biasing for each class:\n";
        while ( my ($k, $v) = each %bias_hash ) {



( run in 1.628 second using v1.01-cache-2.11-cpan-e1769b4cff6 )