Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/RegressionTree.pm view on Meta::CPAN
$iterate_again_flag = 0;
last;
}
$error_old = $error;
}
}
display_matrix($beta) if $self->{_debug2_r};
my $predictions = $X * $beta;
my @error_distribution = ($y - ($X * $beta))->as_list;
my $squared_error = sum map abs, @error_distribution;
my $error = $squared_error / $nrows;
if ($display) {
if ($ncols == 2) {
my @xvalues = $X->col(0)->as_list;
my @yvalues = $predictions->col(0)->as_list;
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
} elsif ($ncols == 3) {
my @xvalues;
my @yvalues = $predictions->col(0)->as_list;
foreach my $row_index (0 .. $X->rows - 1) {
my @onerow = $X->row($row_index)->as_list;
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta);
}
##------------------------------- Construct Regression Tree ------------------------------------
## At the root node, you do ordinary linear regression for the entire dataset so that you
## can later compare the linear regression fit with the results obtained through the
## regression tree. Subsequently, you call the recursive_descent() method to construct
## the tree.
sub construct_regression_tree {
my $self = shift;
print "\nConstructing regression tree...\n";
my $root_node = RTNode->new(undef, undef, undef, [], $self, 'root');
my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector, 1);
$root_node->set_node_XMatrix($XMatrix);
$root_node->set_node_YVector($YVector);
$root_node->set_node_error($error);
$root_node->set_node_beta($beta);
$root_node->set_num_data_points($XMatrix->cols);
print "\nerror at root: $error\n";
print "\nbeta at root:\n";
display_matrix($beta);
$self->{_root_node} = $root_node;
$self->recursive_descent($root_node) if $self->{_max_depth_desired} > 0;
return $root_node;
}
## We first look for a feature, along with its partitioning point, that yields the
## largest reduction in MSE compared to the MSE at the parent node. This feature and
## its partitioning point are then used to create two child nodes in the tree.
sub recursive_descent {
my $self = shift;
my $node = shift;
print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n";
my $node_serial_number = $node->get_serial_num();
my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
if (@features_and_values_or_thresholds_on_branch > 0) {
my ($error,$beta,$XMatrix,$YVector) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@copy_of_path_attributes);
$node->set_node_XMatrix($XMatrix);
$node->set_node_YVector($YVector);
$node->set_node_error($error);
$node->set_node_beta($beta);
$node->set_num_data_points($XMatrix->cols);
print "\nNODE SERIAL NUMBER: $node_serial_number\n";
print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
return if $error <= $self->{_mse_threshold};
}
my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) =
$self->best_feature_calculator(\@copy_of_path_attributes);
return unless defined $best_feature_partitioning_point;
print "\nBest feature found: $best_feature\n";
print "Best feature partitioning_point: $best_feature_partitioning_point\n";
print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
$node->set_feature($best_feature);
$node->display_node() if $self->{_debug2_r};
return if (defined $self->{_max_depth_desired}) &&
(@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired});
if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
$feature_threshold_combo_for_less_than;
push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
$feature_threshold_combo_for_greater_than;
my $left_child_node = RTNode->new(undef, undef, undef,
\@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
$node->add_child_link($left_child_node);
$self->recursive_descent($left_child_node);
my $right_child_node = RTNode->new(undef, undef, undef,
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
$node->add_child_link($right_child_node);
$self->recursive_descent($right_child_node);
}
}
## This is the heart of the regression tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples at the
## current node. The partitioning criterion is that the largest of the MSE's in
## the two partitions should be smaller than the error associated with the parent
## node.
sub best_feature_calculator {
my $self = shift;
my $features_and_values_or_thresholds_on_branch = shift;
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
print "\n\nfeatures_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
if (@features_and_values_or_thresholds_on_branch == 0) {
my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
foreach my $i (0 .. @{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
my @partitioning_errors;
my %partitioning_error_hash;
foreach my $value (@values[10 .. $#values - 10]) {
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
my @for_left_child;
my @for_right_child;
if (@features_and_values_or_thresholds_on_branch) {
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_left_child, $feature_and_less_than_value_string;
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_right_child, $feature_and_greater_than_value_string;
} else {
@for_left_child = ($feature_and_less_than_value_string);
@for_right_child = ($feature_and_greater_than_value_string);
}
my ($error1,$beta1,$XMatrix1,$YVector1) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
my ($error2,$beta2,$XMatrix2,$YVector2) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
my $partitioning_error = max($error1, $error2);
push @partitioning_errors, $partitioning_error;
$partitioning_error_hash{$partitioning_error} = $value;
}
my $min_max_error_for_feature = min(@partitioning_errors);
$best_partition_point_for_feature_hash->{$feature_name} =
$partitioning_error_hash{$min_max_error_for_feature};
$best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
}
my $best_feature_name;
my $best_feature_paritioning_point;
my $best_minmax_error_at_partitioning_point;
foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
if (! defined $best_minmax_error_at_partitioning_point) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
} elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
}
}
my $best_feature_partitioning_point = $best_partition_point_for_feature_hash->{$best_feature_name};
return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
} else {
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @true_numeric_types;
( run in 0.411 second using v1.01-cache-2.11-cpan-e1769b4cff6 )