Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

    my @dependent_var_values =  map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_samples_dependent_var_val_hash}};
    print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
    my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
    pairmap {$YVector->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, map {[$_]} @dependent_var_values )
                       [ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
    if ($self->{_debug1_r}) {
        foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
            my @onerow = $YVector->row($rowindex)->as_list;
            print "YVector row: @onerow\n";
        }
    }
    $self->{_YVector} = $YVector;
    return ($XMatrix, $YVector);
}

sub estimate_regression_coefficients {
    ##  Estimate the coefficient vector beta of the linear model  y = X * beta.
    ##
    ##  Arguments:
    ##     $XMatrix -- Math::GSL::Matrix of predictor values (one row per sample)
    ##     $YVector -- Math::GSL::Matrix column vector of dependent-variable values
    ##     $display -- optional flag; when true, prediction data is recorded in
    ##                 $self->{_output_for_plots} (2 columns) or
    ##                 $self->{_output_for_surface_plots} (3 columns) for later plotting
    ##
    ##  Returns the two-element list ($error, $beta): the mean absolute prediction
    ##  error over the training samples and the estimated coefficient column vector.
    ##
    ##  When $self->{_jacobian_choice} is 0 the closed-form least-squares solution
    ##  is returned directly; otherwise that solution seeds an iterative
    ##  gradient-descent refinement.
    my $self = shift;
    my ($XMatrix, $YVector, $display) = @_;
    $display = 0 unless defined $display;
    my ($nrows, $ncols) = $XMatrix->dim;
    print "nrows=$nrows   ncols=$ncols\n" if $self->{_debug2_r};
    my $jacobian_choice = $self->{_jacobian_choice};
    ## Work on copies so the caller's matrices are left untouched.
    my $X = $XMatrix->copy();
    my $y = $YVector->copy();
    if ($self->{_need_data_normalization}) {
        die "normalization feature is yet to be added to the module --- sorry";
    }
    ## Closed-form least-squares solution via the normal equations:
    ##     beta0 = (X' X)^-1 X' y
    my $beta0 = (transpose($X) * $X)->inverse() * transpose($X) * $y;
    my ($betarows, $betacols) = $beta0->dim;
    die "Something has gone wrong with the calculation of beta coefficients" if $betacols > 1;
    ## jacobian_choice == 0 means no iterative refinement: return the
    ## closed-form solution together with its mean absolute error.
    if ($jacobian_choice == 0) {
#        my $error = sum(abs_vector_as_list( $y - ($X * $beta) )) / $nrows;   
        my $error = sum( map abs, ($y - ($X * $beta0) )->col(0)->as_list ) / $nrows;   
        my $predictions = $X * $beta0;
        if ($display) {
            if ($ncols == 2) {
                ## One predictor variable: record (x, predicted y) pairs for a 2D plot.
                my @xvalues = $X->col(0)->as_list;
                my @yvalues = $predictions->col(0)->as_list;
                $self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
            } elsif ($ncols == 3) {
                ## Two predictor variables: record ("x1 x2", predicted y) pairs for
                ## a surface plot.  The pop drops the last column of each row
                ## (presumably the constant/bias term -- confirm against the
                ## XMatrix construction code).
                my @xvalues;
                my @yvalues = $predictions->col(0)->as_list;
                foreach my $row_index (0 .. $X->rows - 1) {
                    my @onerow = $X->row($row_index)->as_list;
                    pop @onerow;
                    push @xvalues, "@onerow";
                }
                $self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
            } else {
                print "no display when the overall dimensionality of the data exceeds 3\n";
            }
        }
        return ($error, $beta0);    
    }
    ## Otherwise refine beta0 iteratively by gradient descent.
    my $beta = $beta0; 
    if ($self->{_debug2_r}) {
        print "\ndisplaying beta0 matrix\n";
        display_matrix($beta);
    }
    my $gamma = 0.1;              ## descent step size; shrunk 10x on each outer pass
    my $iterate_again_flag = 1;
    my $delta = 0.001;            ## finite-difference step used when jacobian_choice == 2
    my $master_interation_index = 0;
    ## Unbuffer STDOUT so the "*" and "." progress marks appear immediately.
    $|++;
    ## Outer loop: each pass restarts the descent with a 10x smaller step size
    ## and a slightly scaled-down starting beta, until the inner loop detects
    ## convergence and clears $iterate_again_flag.
    while (1) {
        print "*" unless $master_interation_index++ % 100;
        last unless $iterate_again_flag;
        $gamma *= 0.1;
        $beta0 = 0.99 * $beta0;
        print "\n\n======== starting iterations with gamma= $gamma ===========\n\n\n" if $self->{_debug2_r};
        $beta = $beta0;
        my $beta_old = Math::GSL::Matrix->new($betarows, 1)->zero;
        my $error_old = sum( map abs, ($y - ($X * $beta_old) )->col(0)->as_list ) / $nrows;   
        my $error;
        ## Inner loop: up to 1500 gradient-descent steps at the current gamma.
        foreach my $iteration (0 .. 1499) {
            print "." unless $iteration % 100;
            $beta_old = $beta->copy;
            my $jacobian;
            if ($jacobian_choice == 1) {
                ## For a linear model the exact Jacobian is X itself.
                $jacobian = $X;
            } elsif ($jacobian_choice == 2) {      
                ## Finite-difference approximation of the Jacobian: each row is
                ## the i-th element of (delta * X * beta) replicated $ncols times,
                ## then scaled back by 1/delta.
                my $x_times_delta_beta = $delta * $X * $beta;
                $jacobian = Math::GSL::Matrix->new($nrows, $ncols);
                foreach my $i (0 .. $nrows - 1) {
                    my @row = ($x_times_delta_beta->get_elem($i,0)) x $ncols;
                    $jacobian->set_row($i, \@row);
                }
                $jacobian = (1.0/$delta) * $jacobian;
            } else {
                die "wrong choice for the jacobian_choice";
            }
#            $beta = $beta_old + 2 * $gamma * transpose($X) * ( $y - ($X * $beta) );
            ## Gradient-descent update:  beta = beta_old + 2 * gamma * J' * residual
            $beta = $beta_old + 2 * $gamma * transpose($jacobian) * ( $y - ($X * $beta) );
            $error = sum( map abs, ($y - ($X * $beta) )->col(0)->as_list ) / $nrows;   
            if ($error > $error_old) {
                ## The step increased the error.  If beta has essentially stopped
                ## moving we are fully converged; either way abandon this pass
                ## (a smaller gamma will be tried unless the flag was cleared).
                if (vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old))) {
                    $iterate_again_flag = 0;
                    last;
                } else {
                    last;
                }
            }
            if ($self->{_debug2_r}) {
                print "\n\niteration: $iteration   gamma: $gamma   current error: $error\n";
                print "\nnew beta:\n";
                display_matrix $beta;
            }
            ## Converged: the relative change in beta is negligible.
            if ( vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old)) ) { 
                print "iterations used: $iteration with gamma: $gamma\n" if $self->{_debug2_r};
                $iterate_again_flag = 0;
                last;
            }
            $error_old = $error;
        }
    }
    display_matrix($beta) if $self->{_debug2_r};
    my $predictions = $X * $beta;
    ## NOTE(review): despite its name, $squared_error below is a sum of ABSOLUTE
    ## residuals, so $error is the mean absolute error -- consistent with the
    ## error computed in the closed-form branch above.
    my @error_distribution = ($y - ($X * $beta))->as_list;
    my $squared_error =  sum map abs, @error_distribution;
    my $error = $squared_error / $nrows;
    if ($display) {
        ## Same plot-data bookkeeping as in the closed-form branch.
        if ($ncols == 2) {
            my @xvalues = $X->col(0)->as_list;
            my @yvalues = $predictions->col(0)->as_list;
            $self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
        } elsif ($ncols == 3) {
            my @xvalues;
            my @yvalues = $predictions->col(0)->as_list;
            foreach my $row_index (0 .. $X->rows - 1) {
                my @onerow = $X->row($row_index)->as_list;
                pop @onerow;
                push @xvalues, "@onerow";
            }
            $self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
        } else {
            print "no display when the overall dimensionality of the data exceeds 3\n";
        }
    }
    return ($error, $beta);
}

##-------------------------------  Construct Regression Tree  ------------------------------------


##  At the root node, you do ordinary linear regression for the entire dataset so that you
##  can later compare the linear regression fit with the results obtained through the 
##  regression tree.  Subsequently, you call the recursive_descent() method to construct
##  the tree.

sub construct_regression_tree {
    ##  Build the regression tree.  A plain linear regression is first fit over
    ##  the entire dataset at the root node -- so that the straight-line fit can
    ##  later be compared with the tree-based fit -- and then the tree is grown
    ##  recursively when a positive maximum depth is desired.
    ##  Returns the root RTNode; also stored in $self->{_root_node}.
    my $self = shift;
    print "\nConstructing regression tree...\n";
    my $root = RTNode->new(undef, undef, undef, [], $self, 'root');
    my ($xmat, $yvec) = $self->construct_XMatrix_and_YVector_all_data();
    ## Third argument 1 asks for plot data to be recorded.
    my ($error, $beta) = $self->estimate_regression_coefficients($xmat, $yvec, 1);
    $root->set_node_XMatrix($xmat);
    $root->set_node_YVector($yvec);
    $root->set_node_error($error);
    $root->set_node_beta($beta);
    $root->set_num_data_points($xmat->cols);
    print "\nerror at root: $error\n";
    print "\nbeta at root:\n";
    display_matrix($beta);
    $self->{_root_node} = $root;
    $self->recursive_descent($root) if $self->{_max_depth_desired} > 0;
    return $root;
}

##  We first look for a feature, along with its partitioning point, that yields the 
##  largest reduction in MSE compared to the MSE at the parent node.  This feature and

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

            }
        }
        $arg_string = $arg_string =~ /^(.*),[ ]+$/;
        $arg_string = $1;
        $hardcopy_plot->gnuplot_cmd( "splot $arg_string" );
        $gplot->gnuplot_cmd( "splot $arg_string" );
        $gplot->gnuplot_pause(-1);
    } else {
        die "no visual displays for regression from more then 2 predictor vars";
    }   
}  

sub DESTROY {
    ## Clean up the scratch files ("__temp_*") left in the working directory.
    unlink $_ for glob "__temp_*";
}

############################################## Utility Routines ##########################################
# checks whether an element is in an array:
sub contained_in {
    ## Return the number of times $ele occurs (string equality) in the rest of
    ## the argument list; the count doubles as a truthy "is contained" flag,
    ## exactly as before.
    my $ele = shift;
    ## grep in scalar context counts the matches directly; the original used
    ## map in void context purely for its side effect on a counter, which is
    ## an anti-pattern.
    return scalar grep { $ele eq $_ } @_;
}

sub minmax {
    ## Return the smallest and largest values in the array ref $arr as the
    ## two-element list ($min, $max).  Both are undef for an empty array.
    my $arr = shift;
    my ($min, $max);
    for my $val (@$arr) {
        $min = $val if !defined($min) || $val < $min;
        $max = $val if !defined($max) || $val > $max;
    }
    return ($min, $max);
}

sub sample_index {
    ## Extract the index portion of a sample name such as "sample_23"
    ## (everything after the first underscore).
    ## The original read $1 without testing whether the match succeeded,
    ## which can silently return a stale capture left over from an earlier
    ## regex; binding the match in list context yields undef on failure.
    my $arg = shift;
    my ($index) = $arg =~ /_(.+)$/;
    return $index;
}    

sub check_for_illegal_params {
    ## Verify that every supplied constructor parameter name is recognized.
    ## Returns 1 when all params are legal (vacuously true for an empty list),
    ## 0 as soon as an unrecognized one is found.
    my @params = @_;
    my @legal_params = qw / training_datafile
                            max_depth_desired
                            dependent_variable_column
                            predictor_columns
                            mse_threshold
                            need_data_normalization
                            jacobian_choice
                            csv_cleanup_needed
                            debug1_r
                            debug2_r
                            debug3_r
                          /;
    ## Start at 1 so an empty @params reports "all legal"; the original left
    ## the flag undef (falsy) in that case, wrongly signaling illegal params.
    my $found_match_flag = 1;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        ## No point scanning further once one illegal param is found.
        last unless $found_match_flag;
    }
    return $found_match_flag;
}

sub cleanup_csv {
    ##  Normalize one line of a CSV file so that downstream code can split it
    ##  naively on commas:
    ##    1. punctuation that would confuse parsing is blanked out,
    ##    2. double-quoted fields have embedded commas/whitespace collapsed to '_',
    ##    3. whitespace inside remaining fields becomes '_',
    ##    4. empty fields are filled with the string "NA".
    ##  Returns the cleaned line.
    my $line = shift;
    ## Replace each of the characters  / : ? ( ) [ ] { } '  with a space.
    $line =~ tr/\/:?()[]{}'/          /;
#    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
    ## Collect every double-quoted substring occurring after the first comma
    ## (the text before the first comma is left untouched).
    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
    for (@double_quoted) {
        my $item = $_;
        ## Strip the surrounding double quotes.
        $item = substr($item, 1, -1);
        ## Drop leading/trailing whitespace and any embedded commas.
        $item =~ s/^\s+|,|\s+$//g;
        ## Join the remaining words with underscores.
        $item = join '_',  split /\s+/, $item;
        ## Splice the cleaned text back over the original quoted span.
        substr($line, index($line, $_), length($_)) = $item;
    }
    ## For each comma-delimited field that still contains whitespace: replace
    ## whitespace runs with '_' and trim stray underscores at either end.
    my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
    for (@white_spaced) {
        my $item = $_;
        $item =~ s/\s+/_/g;
        $item =~ s/^\s*_|_\s*$//g;
        substr($line, index($line, $_), length($_)) = $item;
    }
    ## Any field that is now empty (or only whitespace) becomes "NA".
    $line =~ s/,\s*(?=,|$)/,NA/g;
    return $line;
}

sub transpose {
    ## Return a new Math::GSL::Matrix that is the transpose of $matrix:
    ## column $i of the result is row $i of the input.
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    my $result = Math::GSL::Matrix->new($ncols, $nrows);
    for my $i (0 .. $nrows - 1) {
        $result->set_col($i, [ $matrix->row($i)->as_list ]);
    }
    return $result;
}

sub vector_subtract {
    ## Element-wise difference of two equal-length array refs, returned as a
    ## list.  Dies when the lengths differ.
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#{$vec1};
}

sub vector_norm {
    ## Euclidean (L2) norm of a single-column Math::GSL::Matrix; dies when the
    ## argument has more than one column.
    my $vec = shift;       # assume it to be a column vector
    my ($rows, $cols) = $vec->dim;
    die "vector_norm() can only be called for a single column matrix" if $cols > 1;
    ## Sum of squares computed element-wise is the same dot product the
    ## original obtained via transpose($vec) * $vec.
    my $sum_of_squares = 0;
    $sum_of_squares += $_ * $_ for $vec->as_list;
    return sqrt($sum_of_squares);
}

sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();



( run in 0.869 second using v1.01-cache-2.11-cpan-6b5c3043376 )