Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/RegressionTree.pm view on Meta::CPAN
my @dependent_var_values = map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_samples_dependent_var_val_hash}};
print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
pairmap {$YVector->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, map {[$_]} @dependent_var_values )
[ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
if ($self->{_debug1_r}) {
foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
my @onerow = $YVector->row($rowindex)->as_list;
print "YVector row: @onerow\n";
}
}
$self->{_YVector} = $YVector;
return ($XMatrix, $YVector);
}
##  Estimates the regression coefficient vector beta that fits $YVector (the
##  dependent-variable column vector) to $XMatrix (predictor matrix, one sample
##  per row).  When $self->{_jacobian_choice} is 0 the closed-form least-squares
##  solution is returned directly; otherwise that solution seeds a gradient
##  descent whose Jacobian is the X matrix itself (choice 1) or a numerically
##  estimated one (choice 2).
##
##  Arguments: $XMatrix, $YVector -- Math::GSL::Matrix objects
##             $display -- optional flag (default 0); when true, prediction data
##                         is stashed in _output_for_plots (2 columns) or
##                         _output_for_surface_plots (3 columns) for later plotting.
##  Returns:   ($error, $beta) -- mean absolute prediction error and the
##             coefficient column vector.
sub estimate_regression_coefficients {
my $self = shift;
my ($XMatrix, $YVector, $display) = @_;
$display = 0 unless defined $display;
my ($nrows, $ncols) = $XMatrix->dim;
print "nrows=$nrows ncols=$ncols\n" if $self->{_debug2_r};
my $jacobian_choice = $self->{_jacobian_choice};
##  Work on copies so the caller's matrices are never mutated.
my $X = $XMatrix->copy();
my $y = $YVector->copy();
if ($self->{_need_data_normalization}) {
die "normalization feature is yet to be added to the module --- sorry";
}
##  Closed-form least-squares solution:  beta0 = (X'X)^-1 X'y
my $beta0 = (transpose($X) * $X)->inverse() * transpose($X) * $y;
my ($betarows, $betacols) = $beta0->dim;
die "Something has gone wrong with the calculation of beta coefficients" if $betacols > 1;
##  Choice 0: no iterative refinement -- report the closed-form fit as-is.
if ($jacobian_choice == 0) {
# my $error = sum(abs_vector_as_list( $y - ($X * $beta) )) / $nrows;
##  Mean absolute residual over all samples.
my $error = sum( map abs, ($y - ($X * $beta0) )->col(0)->as_list ) / $nrows;
my $predictions = $X * $beta0;
if ($display) {
##  $ncols == 2 means one predictor plus the intercept column: 2-D plot data.
if ($ncols == 2) {
my @xvalues = $X->col(0)->as_list;
my @yvalues = $predictions->col(0)->as_list;
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
##  $ncols == 3: two predictors plus intercept -- surface-plot data.
} elsif ($ncols == 3) {
my @xvalues;
my @yvalues = $predictions->col(0)->as_list;
foreach my $row_index (0 .. $X->rows - 1) {
my @onerow = $X->row($row_index)->as_list;
##  Drop the trailing intercept column; keep the two predictor values
##  as a single space-joined string.
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta0);
}
##  Choices 1 and 2: gradient descent seeded with the closed-form solution.
my $beta = $beta0;
if ($self->{_debug2_r}) {
print "\ndisplaying beta0 matrix\n";
display_matrix($beta);
}
##  $gamma is the descent step size; each outer pass shrinks it by 10x.
my $gamma = 0.1;
my $iterate_again_flag = 1;
##  $delta is the perturbation used for the numerical Jacobian (choice 2).
my $delta = 0.001;
##  (Note: "interation" is a typo for "iteration", kept as-is.)
my $master_interation_index = 0;
##  Unbuffer STDOUT so the progress markers below appear immediately.
$|++;
##  Outer loop: restart the descent with a smaller gamma until convergence.
while (1) {
print "*" unless $master_interation_index++ % 100;
last unless $iterate_again_flag;
$gamma *= 0.1;
##  Nudge the starting point slightly off the previous solution.
$beta0 = 0.99 * $beta0;
print "\n\n======== starting iterations with gamma= $gamma ===========\n\n\n" if $self->{_debug2_r};
$beta = $beta0;
my $beta_old = Math::GSL::Matrix->new($betarows, 1)->zero;
my $error_old = sum( map abs, ($y - ($X * $beta_old) )->col(0)->as_list ) / $nrows;
my $error;
##  Inner loop: up to 1500 descent steps at the current gamma.
foreach my $iteration (0 .. 1499) {
print "." unless $iteration % 100;
$beta_old = $beta->copy;
my $jacobian;
if ($jacobian_choice == 1) {
##  Linear model: the Jacobian of X*beta w.r.t. beta is X itself.
$jacobian = $X;
} elsif ($jacobian_choice == 2) {
##  Numerical estimate: finite-difference perturbation of the predictions.
my $x_times_delta_beta = $delta * $X * $beta;
$jacobian = Math::GSL::Matrix->new($nrows, $ncols);
foreach my $i (0 .. $nrows - 1) {
my @row = ($x_times_delta_beta->get_elem($i,0)) x $ncols;
$jacobian->set_row($i, \@row);
}
$jacobian = (1.0/$delta) * $jacobian;
} else {
die "wrong choice for the jacobian_choice";
}
# $beta = $beta_old + 2 * $gamma * transpose($X) * ( $y - ($X * $beta) );
##  Gradient-descent update on the squared-error objective.
$beta = $beta_old + 2 * $gamma * transpose($jacobian) * ( $y - ($X * $beta) );
$error = sum( map abs, ($y - ($X * $beta) )->col(0)->as_list ) / $nrows;
##  If the error grew, this gamma overshoots: stop entirely when beta has
##  essentially converged, otherwise abandon this pass and retry with a
##  smaller gamma (both branches leave the inner loop).
if ($error > $error_old) {
if (vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old))) {
$iterate_again_flag = 0;
last;
} else {
last;
}
}
if ($self->{_debug2_r}) {
print "\n\niteration: $iteration gamma: $gamma current error: $error\n";
print "\nnew beta:\n";
display_matrix $beta;
}
##  Convergence test: relative change in beta below 1e-5.
if ( vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old)) ) {
print "iterations used: $iteration with gamma: $gamma\n" if $self->{_debug2_r};
$iterate_again_flag = 0;
last;
}
$error_old = $error;
}
}
display_matrix($beta) if $self->{_debug2_r};
my $predictions = $X * $beta;
my @error_distribution = ($y - ($X * $beta))->as_list;
##  NOTE(review): despite its name, $squared_error sums ABSOLUTE residuals,
##  so $error below is the mean absolute error, consistent with the
##  jacobian_choice == 0 branch above.
my $squared_error = sum map abs, @error_distribution;
my $error = $squared_error / $nrows;
##  Same plot-data stashing as in the closed-form branch.
if ($display) {
if ($ncols == 2) {
my @xvalues = $X->col(0)->as_list;
my @yvalues = $predictions->col(0)->as_list;
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
} elsif ($ncols == 3) {
my @xvalues;
my @yvalues = $predictions->col(0)->as_list;
foreach my $row_index (0 .. $X->rows - 1) {
my @onerow = $X->row($row_index)->as_list;
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta);
}
##------------------------------- Construct Regression Tree ------------------------------------
## At the root node, you do ordinary linear regression for the entire dataset so that you
## can later compare the linear regression fit with the results obtained through the
## regression tree. Subsequently, you call the recursive_descent() method to construct
## the tree.
##  Builds the regression tree.  The root node is first fitted with ordinary
##  linear regression over the entire dataset (so the tree-based fit can later
##  be compared against a plain linear-regression fit); recursive_descent()
##  then grows the rest of the tree unless _max_depth_desired is zero.
##  Returns the root RTNode.
sub construct_regression_tree {
    my $self = shift;
    print "\nConstructing regression tree...\n";
    my $root = RTNode->new(undef, undef, undef, [], $self, 'root');
    my ($xmat, $yvec) = $self->construct_XMatrix_and_YVector_all_data();
    ##  Third argument 1 asks for plot data to be recorded for the root fit.
    my ($fit_error, $fit_beta) = $self->estimate_regression_coefficients($xmat, $yvec, 1);
    $root->set_node_XMatrix($xmat);
    $root->set_node_YVector($yvec);
    $root->set_node_error($fit_error);
    $root->set_node_beta($fit_beta);
    $root->set_num_data_points($xmat->cols);
    print "\nerror at root: $fit_error\n";
    print "\nbeta at root:\n";
    display_matrix($fit_beta);
    $self->{_root_node} = $root;
    $self->recursive_descent($root) if $self->{_max_depth_desired} > 0;
    return $root;
}
## We first look for a feature, along with its partitioning point, that yields the
## largest reduction in MSE compared to the MSE at the parent node. This feature and
lib/Algorithm/RegressionTree.pm view on Meta::CPAN
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
$hardcopy_plot->gnuplot_cmd( "splot $arg_string" );
$gplot->gnuplot_cmd( "splot $arg_string" );
$gplot->gnuplot_pause(-1);
} else {
die "no visual displays for regression from more then 2 predictor vars";
}
}
##  Destructor: remove the temporary scratch files created for gnuplot output.
sub DESTROY {
    my @scratch_files = glob "__temp_*";
    unlink @scratch_files;
}
############################################## Utility Routines ##########################################
# checks whether an element is in an array:
##  Returns the number of times $ele occurs (by string equality) among the
##  remaining arguments; the count is truthy exactly when the element is
##  present, so callers may use the result as a boolean membership test.
sub contained_in {
    my $ele = shift;
    ##  grep in scalar context yields the match count directly; the previous
    ##  version used map in void context purely for its side effect on a
    ##  counter, which is a well-known Perl anti-pattern.
    return scalar grep { $ele eq $_ } @_;
}
##  Returns the smallest and largest values in the array ref $arr as the
##  two-element list ($min, $max).  Both are undef for an empty array.
##  Comparisons are numeric.
sub minmax {
    my $arr = shift;
    my ($min, $max);
    for my $val (@$arr) {
        $min = $val if !defined($min) || $val < $min;
        $max = $val if !defined($max) || $val > $max;
    }
    return ($min, $max);
}
##  Extracts the numeric index from a sample name of the form "sample_17"
##  (everything after the last underscore).  Returns undef when the name
##  contains no underscore-delimited suffix.
sub sample_index {
    my $arg = shift;
    ##  Bind the capture in list context: the previous version returned $1
    ##  unconditionally, which on a failed match silently hands back a stale
    ##  capture left over from an earlier, unrelated regex.
    my ($index) = $arg =~ /_(.+)$/;
    return $index;
}
##  Verifies that every name passed in is a recognized constructor option.
##  Returns 1 when all names are legal, 0 as soon as an unknown name is
##  encountered, and undef when called with no names at all.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / training_datafile
                            max_depth_desired
                            dependent_variable_column
                            predictor_columns
                            mse_threshold
                            need_data_normalization
                            jacobian_choice
                            csv_cleanup_needed
                            debug1_r
                            debug2_r
                            debug3_r
                          /;
    ##  Left undef when @params is empty, matching the original's return value.
    my $found_match_flag;
    for my $param (@params) {
        $found_match_flag = (grep { $param eq $_ } @legal_params) ? 1 : 0;
        last unless $found_match_flag;
    }
    return $found_match_flag;
}
##  Normalizes one line of CSV text: certain punctuation characters become
##  spaces, the contents of double-quoted fields are collapsed into single
##  underscore-joined tokens, whitespace embedded in unquoted fields becomes
##  underscores, and empty fields are replaced with the string "NA".
##  Returns the cleaned line; the transformations are applied in order and
##  each later step depends on the earlier ones.
sub cleanup_csv {
my $line = shift;
##  Map /:?()[]{}' to spaces so they cannot confuse the field parsing below.
$line =~ tr/\/:?()[]{}'/ /;
# my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
##  Collect quoted fields, searching only past the first comma so the first
##  column is left alone.  The * (vs the older +) also matches empty "" fields.
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
for (@double_quoted) {
my $item = $_;
##  Strip the surrounding quotes, then remove edge whitespace and any commas.
$item = substr($item, 1, -1);
$item =~ s/^\s+|,|\s+$//g;
##  Collapse internal whitespace runs into single underscores.
$item = join '_', split /\s+/, $item;
##  Splice the cleaned token back over the original quoted text in $line.
substr($line, index($line, $_), length($_)) = $item;
}
##  Now handle unquoted fields that still contain whitespace.
my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
for (@white_spaced) {
my $item = $_;
$item =~ s/\s+/_/g;
##  Drop any underscore the previous substitution left at either edge.
$item =~ s/^\s*_|_\s*$//g;
substr($line, index($line, $_), length($_)) = $item;
}
##  Empty fields (",," or a trailing ",") become ",NA".
$line =~ s/,\s*(?=,|$)/,NA/g;
return $line;
}
##  Returns a new Math::GSL::Matrix that is the transpose of $matrix.
##  The argument matrix is not modified.
sub transpose {
    my $matrix = shift;
    my $num_rows = $matrix->rows();
    my $num_cols = $matrix->cols();
    my $transposed = Math::GSL::Matrix->new($num_cols, $num_rows);
    ##  Column j of the original becomes row j of the result.
    for my $col_index (0 .. $num_cols - 1) {
        my @column = $matrix->col($col_index)->as_list;
        $transposed->set_row($col_index, \@column);
    }
    return $transposed;
}
##  Element-wise difference of two equal-length array refs: returns the list
##  whose i-th element is $vec1->[i] - $vec2->[i].  Dies when the lengths
##  differ.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#{$vec1};
}
##  Euclidean (L2) norm of a column vector represented as a single-column
##  Math::GSL::Matrix.  Dies if the argument has more than one column.
sub vector_norm {
my $vec = shift; # assume it to be a column vector
my ($rows, $cols) = $vec->dim;
die "vector_norm() can only be called for a single column matrix" if $cols > 1;
##  v'v is a 1x1 matrix whose single entry is the sum of squared components.
my @norm = (transpose($vec) * $vec)->as_list;
return sqrt($norm[0]);
}
sub display_matrix {
my $matrix = shift;
my $nrows = $matrix->rows();
my $ncols = $matrix->cols();
( run in 0.869 second using v1.01-cache-2.11-cpan-6b5c3043376 )