Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

                $iterate_again_flag = 0;
                last;
            }
            $error_old = $error;
        }
    }
    display_matrix($beta) if $self->{_debug2_r};
    my $predictions = $X * $beta;
    my @error_distribution = ($y - ($X * $beta))->as_list;
    my $squared_error =  sum map abs, @error_distribution;
    my $error = $squared_error / $nrows;
    if ($display) {
        if ($ncols == 2) {
            my @xvalues = $X->col(0)->as_list;
            my @yvalues = $predictions->col(0)->as_list;
            $self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
        } elsif ($ncols == 3) {
            my @xvalues;
            my @yvalues = $predictions->col(0)->as_list;
            foreach my $row_index (0 .. $X->rows - 1) {
                my @onerow = $X->row($row_index)->as_list;
                pop @onerow;
                push @xvalues, "@onerow";
            }
            $self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
        } else {
            print "no display when the overall dimensionality of the data exceeds 3\n";
        }
    }
    return ($error, $beta);
}

##-------------------------------  Construct Regression Tree  ------------------------------------


##  At the root node, you do ordinary linear regression for the entire dataset so that you
##  can later compare the linear regression fit with the results obtained through the 
##  regression tree.  Subsequently, you call the recursive_descent() method to construct
##  the tree.

sub construct_regression_tree {
    my $self = shift;
    print "\nConstructing regression tree...\n";
    my $root_node = RTNode->new(undef, undef, undef, [], $self, 'root');
    my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
    my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector, 1); 
    $root_node->set_node_XMatrix($XMatrix);
    $root_node->set_node_YVector($YVector);
    $root_node->set_node_error($error);
    $root_node->set_node_beta($beta);
    $root_node->set_num_data_points($XMatrix->cols);
    print "\nerror at root: $error\n";
    print "\nbeta at root:\n";
    display_matrix($beta);
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node) if $self->{_max_depth_desired} > 0;
    return $root_node;
}

##  We first look for a feature, along with its partitioning point, that yields the 
##  largest reduction in MSE compared to the MSE at the parent node.  This feature and
##  its partitioning point are then used to create two child nodes in the tree.
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n";
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    if (@features_and_values_or_thresholds_on_branch > 0) {
        my ($error,$beta,$XMatrix,$YVector) = 
          $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@copy_of_path_attributes);
        $node->set_node_XMatrix($XMatrix);
        $node->set_node_YVector($YVector);
        $node->set_node_error($error);
        $node->set_node_beta($beta);
        $node->set_num_data_points($XMatrix->cols);
        print "\nNODE SERIAL NUMBER: $node_serial_number\n";
        print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
        return if $error <= $self->{_mse_threshold}; 
    }
    my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) = 
                                               $self->best_feature_calculator(\@copy_of_path_attributes);
    return unless defined $best_feature_partitioning_point;
    print "\nBest feature found: $best_feature\n";
    print "Best feature partitioning_point: $best_feature_partitioning_point\n";
    print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug2_r}; 
    return if (defined $self->{_max_depth_desired}) && 
                            (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired}); 
    if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
        my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
        my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
        my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
        my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
        push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
        push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
        my $left_child_node = RTNode->new(undef, undef, undef, 
                          \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
        $node->add_child_link($left_child_node);
        $self->recursive_descent($left_child_node);
        my $right_child_node = RTNode->new(undef, undef, undef, 
                        \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
        $node->add_child_link($right_child_node);
        $self->recursive_descent($right_child_node);
    }
}

##  This is the heart of the regression tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples at the
##  current node.  The partitioning criterion is that the largest of the MSE's in 
##  the two partitions should be smaller than the error associated with the parent
##  node.
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    print "\n\nfeatures_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
    if (@features_and_values_or_thresholds_on_branch == 0) {
        my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        foreach my $i (0 .. @{$self->{_feature_names}}-1) {
            my $feature_name = $self->{_feature_names}->[$i];
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @partitioning_errors;
            my %partitioning_error_hash;
            foreach my $value (@values[10 .. $#values - 10]) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my ($error1,$beta1,$XMatrix1,$YVector1) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
                my ($error2,$beta2,$XMatrix2,$YVector2) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
                my $partitioning_error = max($error1, $error2);
                push @partitioning_errors, $partitioning_error;
                $partitioning_error_hash{$partitioning_error} = $value;
            }
            my $min_max_error_for_feature = min(@partitioning_errors);
            $best_partition_point_for_feature_hash->{$feature_name} = 
                                             $partitioning_error_hash{$min_max_error_for_feature};
            $best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
        }
        my $best_feature_name;
        my $best_feature_paritioning_point;
        my $best_minmax_error_at_partitioning_point;
        foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
            if (! defined $best_minmax_error_at_partitioning_point) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            } elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            }
        }
        my $best_feature_partitioning_point =  $best_partition_point_for_feature_hash->{$best_feature_name};
        return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
    } else {
        my $pattern1 = '(.+)=(.+)';
        my $pattern2 = '(.+)<(.+)';
        my $pattern3 = '(.+)>(.+)';
        my @true_numeric_types;



( run in 0.411 second using v1.01-cache-2.11-cpan-e1769b4cff6 )