Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

package Algorithm::RegressionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::RegressionTree is a Perl module for constructing regression trees.  It calls
# on the main Algorithm::DecisionTree module for some of its functionality.
# -------------------------------------------------------------------------------------

#use lib 'blib/lib', 'blib/arch';

#use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Algorithm::DecisionTree 3.43;
use List::Util qw(reduce min max pairmap sum);
use Math::GSL::Matrix;
use Graphics::GnuplotIF;

our $VERSION = '3.43';

@Algorithm::RegressionTree::ISA = ('Algorithm::DecisionTree');

############################################   Constructor  ##############################################
sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if check_for_illegal_params(@params) == 0;
    my %dtargs = %args;
    delete $dtargs{dependent_variable_column};
    delete $dtargs{predictor_columns};
    delete $dtargs{mse_threshold};
    delete $dtargs{need_data_normalization};
    delete $dtargs{jacobian_choice};
    delete $dtargs{debug1_r};
    delete $dtargs{debug2_r};
    delete $dtargs{debug3_r};
    my $instance = Algorithm::DecisionTree->new(%dtargs);
    bless $instance, $class;
    $instance->{_dependent_variable_column}       =  $args{dependent_variable_column} || undef;
    $instance->{_predictor_columns}               =  $args{predictor_columns} || 0;
    $instance->{_mse_threshold}                   =  $args{mse_threshold} || 0.01;
    $instance->{_jacobian_choice}                 =  $args{jacobian_choice} || 0;
    $instance->{_need_data_normalization}         =  $args{need_data_normalization} || 0;
    $instance->{_dependent_var}                   =  undef;
    $instance->{_dependent_var_values}            =  undef;
    $instance->{_samples_dependent_var_val_hash}  =  undef;
    $instance->{_root_node}                       =  undef;
    $instance->{_debug1_r}                        =  $args{debug1_r} || 0;
    $instance->{_debug2_r}                        =  $args{debug2_r} || 0;
    $instance->{_debug3_r}                        =  $args{debug3_r} || 0;
    $instance->{_sample_points_for_dependent_var} =  [];
    $instance->{_output_for_plots}                =  {};
    $instance->{_output_for_surface_plots}        =  {};
    bless $instance, $class;
}

##############################################  Methods  #################################################
sub get_training_data_for_regression {
    my $self = shift;
    die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
    my @dependent_var_values;
    my %all_record_ids_with_dependent_var_values;
    my $firstline;
    my %data_hash;
    $|++;
    open FILEIN, $self->{_training_datafile};
    my $record_index = 0;
    while (<FILEIN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
        if ($record_index == 0) {
            $firstline = $record;
            $record_index++;
            next;
        }
        my @parts = split /,/, $record;
        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        push @dependent_var_values, $parts[$self->{_dependent_variable_column}-1];
        $all_record_ids_with_dependent_var_values{$parts[0]} = $parts[$self->{_dependent_variable_column}-1];
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $self->{_how_many_total_training_samples} = $record_index; #it's less by 1 from total num of records; okay
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1_r};
    my @all_feature_names =   grep $_, split /,/, substr($firstline, index($firstline,','));
    my $dependent_var_column_heading = $all_feature_names[$self->{_dependent_variable_column} - 1];
    my @feature_names = map {$all_feature_names[$_-1]} @{$self->{_predictor_columns}};
    my %dependent_var_value_for_sample_hash = map {"sample_" . $_  =>  "$dependent_var_column_heading=" . $data_hash{$_}->[$self->{_dependent_variable_column} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_-1]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : ...
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a-1] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_predictor_columns}};     
    my %numeric_features_valuerange_hash   =   ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %features_and_unique_values_hash = ();
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $feature (keys %features_and_values_hash) {
        my %seen = ();
        my @unique_values_for_feature =  grep {$_ if $_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($self->{_debug1_r}) {
        print "\nDependent var values: @dependent_var_values\n";
        print "\nEach sample data record:\n";

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

    die "vector_norm() can only be called for a single column matrix" if $cols > 1;
    my @norm = (transpose($vec) * $vec)->as_list;
    return sqrt($norm[0]);
}

sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        my $row = $matrix->row($i);
        my @row_as_list = $row->as_list;
        map { printf("%.4f ", $_) } @row_as_list;
        print "\n";
    }
    print "\n\n";
}

# Meant only for an array of strings (no nesting):
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out;
    return [] if scalar @$ref_in == 0;
    foreach my $i (0..@{$ref_in}-1) {
        $ref_out->[$i] = $ref_in->[$i];
    }
    return $ref_out;
}


#############################################  Class RTNode  #############################################

# The nodes of a regression tree are instances of this class:
package RTNode;

use strict; 
use Carp;

# $feature is the feature test at the current node.  $branch_features_and_values is
# an anonymous array holding the feature names and corresponding values on the path
# from the root to the current node:
sub new {                                                           
    my ($class, $feature, $error, $beta, $branch_features_and_values_or_thresholds, $rt, $root_or_not) = @_; 
    $root_or_not = '' if !defined $root_or_not;
    if ($root_or_not eq 'root') {
        $rt->{nodes_created} = -1;
        $rt->{class_names} = undef;
    }
    my $self = {                                                         
            _rt                      => $rt,
            _feature                 => $feature,                                       
            _error                   => $error,                                       
            _beta                    => $beta,                                       
            _branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
            _num_data_points         => undef,                                       
            _XMatrix                 => undef,
            _YVector                 => undef,
            _linked_to               => [],                                          
    };
    bless $self, $class;
    $self->{_serial_number} =  $self->get_next_serial_num();
    return $self;
}

sub node_prediction_from_features_and_values {
    my $self = shift;
    my $feature_and_values = shift;
    my $ncols = $self->{_XMatrix}->cols;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    my ($feature,$value);
    my @Xlist;
    foreach my $feature_name (@{$self->{_rt}->{_feature_names}}) {
        foreach my $feature_and_value (@{$feature_and_values}) {
            $feature_and_value =~ /$pattern/;
            my ($feature, $value) = ($1, $2);
            push @Xlist, $value if $feature_name eq $feature; 
        }
    }
    push @Xlist, 1;
    my $dataMatrix = Math::GSL::Matrix->new(1, $ncols);
    $dataMatrix->set_row(0, \@Xlist);
    my $prediction = $dataMatrix * $self->get_node_beta();
    return $prediction->get_elem(0,0);
}

sub node_prediction_from_data_as_matrix {
    my $self = shift;
    my $dataMatrix = shift;
    my $prediction = $dataMatrix * $self->get_node_beta();
    return $prediction->get_elem(0,0);
}

sub node_prediction_from_data_as_list {
    my $self = shift;
    my $data_as_list = shift;
    my @data_arr =  @{$data_as_list};
    my $ncols = $self->{_XMatrix}->cols;
    die "wrong number of elements in data list" if @data_arr != $ncols - 1;
    push @data_arr, 1;
    my $dataMatrix = Math::GSL::Matrix->new(1, $self->{_XMatrix}->cols);
    my $prediction = $dataMatrix * $self->get_node_beta();
    return $prediction->get_elem(0,0);
}

sub how_many_nodes {
    my $self = shift;
    return $self->{_rt}->{nodes_created} + 1;
}

sub get_num_data_points {
    my $self = shift;
    return $self->{_num_data_points};
}  

sub set_num_data_points {
    my $self = shift;
    my $how_many = shift;
    $self->{_num_data_points} = $how_many;
}



( run in 1.734 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )