Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/RegressionTree.pm view on Meta::CPAN
$instance->{_dependent_var} = undef;
$instance->{_dependent_var_values} = undef;
$instance->{_samples_dependent_var_val_hash} = undef;
$instance->{_root_node} = undef;
$instance->{_debug1_r} = $args{debug1_r} || 0;
$instance->{_debug2_r} = $args{debug2_r} || 0;
$instance->{_debug3_r} = $args{debug3_r} || 0;
$instance->{_sample_points_for_dependent_var} = [];
$instance->{_output_for_plots} = {};
$instance->{_output_for_surface_plots} = {};
bless $instance, $class;
}
############################################## Methods #################################################
sub get_training_data_for_regression {
my $self = shift;
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
my @dependent_var_values;
my %all_record_ids_with_dependent_var_values;
my $firstline;
my %data_hash;
$|++;
open FILEIN, $self->{_training_datafile};
my $record_index = 0;
while (<FILEIN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
if ($record_index == 0) {
$firstline = $record;
$record_index++;
next;
}
my @parts = split /,/, $record;
my $record_label = shift @parts;
$record_label =~ s/^\s*\"|\"\s*$//g;
$data_hash{$record_label} = \@parts;
push @dependent_var_values, $parts[$self->{_dependent_variable_column}-1];
$all_record_ids_with_dependent_var_values{$parts[0]} = $parts[$self->{_dependent_variable_column}-1];
print "." if $record_index % 10000 == 0;
$record_index++;
}
close FILEIN;
$self->{_how_many_total_training_samples} = $record_index; #it's less by 1 from total num of records; okay
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1_r};
my @all_feature_names = grep $_, split /,/, substr($firstline, index($firstline,','));
my $dependent_var_column_heading = $all_feature_names[$self->{_dependent_variable_column} - 1];
my @feature_names = map {$all_feature_names[$_-1]} @{$self->{_predictor_columns}};
my %dependent_var_value_for_sample_hash = map {"sample_" . $_ => "$dependent_var_column_heading=" . $data_hash{$_}->[$self->{_dependent_variable_column} - 1 ] } keys %data_hash;
my @sample_names = map {"sample_$_"} keys %data_hash;
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_-1]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : ...
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a-1] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_predictor_columns}};
my %numeric_features_valuerange_hash = ();
my %feature_values_how_many_uniques_hash = ();
my %features_and_unique_values_hash = ();
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $feature (keys %features_and_values_hash) {
my %seen = ();
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
my $not_all_values_float = 0;
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
if ($not_all_values_float == 0) {
my @minmaxvalues = minmax(\@unique_values_for_feature);
$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
}
$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($self->{_debug1_r}) {
print "\nDependent var values: @dependent_var_values\n";
print "\nEach sample data record:\n";
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
print "$kee => @{$feature_values_for_samples_hash{$kee}}\n";
}
print "\ndependent var value for each data sample:\n";
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %dependent_var_value_for_sample_hash) {
print "$kee => $dependent_var_value_for_sample_hash{$kee}\n";
}
print "\nfeatures and the values taken by them:\n";
for my $kee (sort keys %features_and_values_hash) {
print "$kee => @{$features_and_values_hash{$kee}}\n";
}
print "\nnumeric features and their ranges:\n";
for my $kee (sort keys %numeric_features_valuerange_hash) {
print "$kee => @{$numeric_features_valuerange_hash{$kee}}\n";
}
print "\nnumber of unique values in each feature:\n";
for my $kee (sort keys %feature_values_how_many_uniques_hash) {
print "$kee => $feature_values_how_many_uniques_hash{$kee}\n";
}
}
$self->{_XMatrix} = undef;
$self->{_YVector} = undef;
$self->{_dependent_var} = $dependent_var_column_heading;
$self->{_dependent_var_values} = \@dependent_var_values;
$self->{_feature_names} = \@feature_names;
$self->{_samples_dependent_var_val_hash} = \%dependent_var_value_for_sample_hash;
$self->{_training_data_hash} = \%feature_values_for_samples_hash;
$self->{_features_and_values_hash} = \%features_and_values_hash;
$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
$self->SUPER::calculate_first_order_probabilities();
}
sub construct_XMatrix_and_YVector_all_data {
my $self = shift;
my $matrix_rows_as_lists = [ map {my @arr = @$_; [map substr($_,index($_,'=')+1), @arr] } map {$self->{_training_data_hash}->{$_}} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}} ];
map {push @$_, 1} @{$matrix_rows_as_lists};
map {print "XMatrix row: @$_\n"} @{$matrix_rows_as_lists} if $self->{_debug1_r};
my $XMatrix = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, scalar @{$matrix_rows_as_lists->[0]});
pairmap {$XMatrix->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, @{$matrix_rows_as_lists} )
[ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
if ($self->{_debug1_r}) {
foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
my @onerow = $XMatrix->row($rowindex)->as_list;
print "XMatrix row again: @onerow\n";
}
}
$self->{_XMatrix} = $XMatrix;
my @dependent_var_values = map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_samples_dependent_var_val_hash}};
print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
( run in 0.751 second using v1.01-cache-2.11-cpan-ceb78f64989 )