Algorithm-DecisionTree


lib/Algorithm/RegressionTree.pm

    close FILEIN;    
    $self->{_how_many_total_training_samples} = $record_index;   # one less than the total record count because the header line is not a training sample
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1_r};
    # the substr starts at the first comma, so split yields a leading empty field that grep filters out
    my @all_feature_names = grep $_, split /,/, substr($firstline, index($firstline,','));
    my $dependent_var_column_heading = $all_feature_names[$self->{_dependent_variable_column} - 1];
    my @feature_names = map {$all_feature_names[$_-1]} @{$self->{_predictor_columns}};
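    # Internally, every sample is keyed as "sample_N" and each of its feature values
    # is stored as a "name=value" string.  For a feature named height, an entry might
    # look like this (illustrative values only):
    #     sample_3  =>  [ "height=64.0", "weight=131.0" ]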
    my %dependent_var_value_for_sample_hash = map {"sample_" . $_  =>  "$dependent_var_column_heading=" . $data_hash{$_}->[$self->{_dependent_variable_column} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_-1]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1]) : $data_hash{$sampleID}->[$_-1]}} @{$self->{_predictor_columns}}]} keys %data_hash;
    my %features_and_values_hash = map { my $a = $_; ($all_feature_names[$a-1] => [map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash]) } @{$self->{_predictor_columns}};
    my %numeric_features_valuerange_hash   =   ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %features_and_unique_values_hash = ();
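    # $numregex below recognizes optionally signed integers, decimals, and exponent
    # forms such as "14", "-2.5", "+1e-4" (spaces are permitted after the sign)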
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $feature (keys %features_and_values_hash) {
        my %seen = ();
        my @unique_values_for_feature =  grep {$_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        $not_all_values_float = 1 if grep {$_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
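    # At this point the bookkeeping hashes might hold entries such as these
    # (illustrative values only):
    #     %numeric_features_valuerange_hash        height => [ 62.0, 74.0 ]
    #     %feature_values_how_many_uniques_hash    height => 13
    #     %features_and_unique_values_hash         height => [ "64.0", "71.0", ... ]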
    if ($self->{_debug1_r}) {
        print "\nDependent var values: @dependent_var_values\n";
        print "\nEach sample data record:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$kee    =>   @{$feature_values_for_samples_hash{$kee}}\n";
        }
        print "\ndependent var value for each data sample:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %dependent_var_value_for_sample_hash) {
            print "$kee    =>   $dependent_var_value_for_sample_hash{$kee}\n";
        }
        print "\nfeatures and the values taken by them:\n";
        for my $kee  (sort keys %features_and_values_hash) {
            print "$kee    =>   @{$features_and_values_hash{$kee}}\n";                        
        }
        print "\nnumeric features and their ranges:\n";
        for my $kee  (sort keys %numeric_features_valuerange_hash) {
            print "$kee    =>   @{$numeric_features_valuerange_hash{$kee}}\n";
        }
        print "\nnumber of unique values in each feature:\n";        
        for my $kee  (sort keys %feature_values_how_many_uniques_hash) {
            print "$kee    =>   $feature_values_how_many_uniques_hash{$kee}\n";
        }
    }
    $self->{_XMatrix}  =  undef;
    $self->{_YVector}  =  undef;
    $self->{_dependent_var} = $dependent_var_column_heading;
    $self->{_dependent_var_values} = \@dependent_var_values;
    $self->{_feature_names} = \@feature_names;
    $self->{_samples_dependent_var_val_hash}  = \%dependent_var_value_for_sample_hash;
    $self->{_training_data_hash}  =  \%feature_values_for_samples_hash;
    $self->{_features_and_values_hash}  = \%features_and_values_hash;
    $self->{_features_and_unique_values_hash}  =  \%features_and_unique_values_hash;
    $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
    $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
    $self->SUPER::calculate_first_order_probabilities();
}
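#  A typical calling sequence for this part of the module might look as follows.
#  This is a hedged sketch: the constructor keys are inferred from the underscore-
#  prefixed attributes used above, the method name get_training_data_for_regression()
#  is presumed from context, and all option values shown are purely illustrative.
#
#      use Algorithm::RegressionTree;
#      my $rt = Algorithm::RegressionTree->new(
#                     training_datafile         => 'mydata.csv',
#                     dependent_variable_column => 2,
#                     predictor_columns         => [1],
#                     jacobian_choice           => 0,
#               );
#      $rt->get_training_data_for_regression();
#      my ($XMatrix, $YVector) = $rt->construct_XMatrix_and_YVector_all_data();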

sub construct_XMatrix_and_YVector_all_data {
    my $self = shift;
    # strip the "name=" prefixes and order the rows by sample index to obtain purely numeric matrix rows
    my $matrix_rows_as_lists =  [ map {my @arr = @$_; [map substr($_,index($_,'=')+1), @arr] } map {$self->{_training_data_hash}->{$_}} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}} ];
    push @$_, 1 foreach @{$matrix_rows_as_lists};    # append a 1 to every row for the intercept term
    map {print "XMatrix row: @$_\n"} @{$matrix_rows_as_lists} if $self->{_debug1_r};
    my $XMatrix = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, scalar @{$matrix_rows_as_lists->[0]});
    foreach my $row_index (0 .. @{$matrix_rows_as_lists} - 1) {
        $XMatrix->set_row($row_index, $matrix_rows_as_lists->[$row_index]);
    }
    if ($self->{_debug1_r}) {
        foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
            my @onerow = $XMatrix->row($rowindex)->as_list;
            print "XMatrix row again: @onerow\n";
        }
    }
    $self->{_XMatrix} = $XMatrix;
    my @dependent_var_values =  map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_samples_dependent_var_val_hash}};
    print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
    my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
    foreach my $row_index (0 .. @dependent_var_values - 1) {
        $YVector->set_row($row_index, [$dependent_var_values[$row_index]]);
    }
    if ($self->{_debug1_r}) {
        foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
            my @onerow = $YVector->row($rowindex)->as_list;
            print "YVector row: @onerow\n";
        }
    }
    $self->{_YVector} = $YVector;
    return ($XMatrix, $YVector);
}
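#  For two training samples with a single predictor x, the matrices built above
#  would look like this (illustrative values):
#
#      XMatrix = [ x_1  1 ]        YVector = [ y_1 ]
#                [ x_2  1 ]                  [ y_2 ]
#
#  where the trailing column of 1s carries the intercept term of the linear model.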

sub estimate_regression_coefficients {
    my $self = shift;
    my ($XMatrix, $YVector, $display) = @_;
    $display = 0 unless defined $display;
    my ($nrows, $ncols) = $XMatrix->dim;
    print "nrows=$nrows   ncols=$ncols\n" if $self->{_debug2_r};
    my $jacobian_choice = $self->{_jacobian_choice};
    my $X = $XMatrix->copy();
    my $y = $YVector->copy();
    if ($self->{_need_data_normalization}) {
        die "normalization feature is yet to be added to the module --- sorry";
    }
    my $beta0 = (transpose($X) * $X)->inverse() * transpose($X) * $y;
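    # The line above solves the normal equations:  beta0 = (X'X)^{-1} X'y,
    # the closed-form ordinary-least-squares estimate of the coefficients.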
    my ($betarows, $betacols) = $beta0->dim;
    die "Something has gone wrong with the calculation of beta coefficients" if $betacols > 1;
    if ($jacobian_choice == 0) {
        my $error = sum( map abs, ($y - ($X * $beta0))->col(0)->as_list ) / $nrows;   # mean absolute prediction error over the training samples
        my $predictions = $X * $beta0;
        if ($display) {
            if ($ncols == 2) {
                my @xvalues = $X->col(0)->as_list;
                my @yvalues = $predictions->col(0)->as_list;
                $self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
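                # each _output_for_plots entry is keyed by a running 1-based count and holds
                # [\@xvalues, \@yvalues], presumably consumed later by a plotting routine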
            } elsif ($ncols == 3) {
                my @xvalues;
                my @yvalues = $predictions->col(0)->as_list;
                foreach my $row_index (0 .. $X->rows - 1) {


