Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/RegressionTree.pm view on Meta::CPAN
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta);
}
##------------------------------- Construct Regression Tree ------------------------------------
## At the root node, you do ordinary linear regression for the entire dataset so that you
## can later compare the linear regression fit with the results obtained through the
## regression tree. Subsequently, you call the recursive_descent() method to construct
## the tree.
sub construct_regression_tree {
my $self = shift;
print "\nConstructing regression tree...\n";
my $root_node = RTNode->new(undef, undef, undef, [], $self, 'root');
my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector, 1);
$root_node->set_node_XMatrix($XMatrix);
$root_node->set_node_YVector($YVector);
$root_node->set_node_error($error);
$root_node->set_node_beta($beta);
$root_node->set_num_data_points($XMatrix->cols);
print "\nerror at root: $error\n";
print "\nbeta at root:\n";
display_matrix($beta);
$self->{_root_node} = $root_node;
$self->recursive_descent($root_node) if $self->{_max_depth_desired} > 0;
return $root_node;
}
## We first look for a feature, along with its partitioning point, that yields the
## largest reduction in MSE compared to the MSE at the parent node. This feature and
## its partitioning point are then used to create two child nodes in the tree.
sub recursive_descent {
my $self = shift;
my $node = shift;
print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n";
my $node_serial_number = $node->get_serial_num();
my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
if (@features_and_values_or_thresholds_on_branch > 0) {
my ($error,$beta,$XMatrix,$YVector) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@copy_of_path_attributes);
$node->set_node_XMatrix($XMatrix);
$node->set_node_YVector($YVector);
$node->set_node_error($error);
$node->set_node_beta($beta);
$node->set_num_data_points($XMatrix->cols);
print "\nNODE SERIAL NUMBER: $node_serial_number\n";
print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
return if $error <= $self->{_mse_threshold};
}
my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) =
$self->best_feature_calculator(\@copy_of_path_attributes);
return unless defined $best_feature_partitioning_point;
print "\nBest feature found: $best_feature\n";
print "Best feature partitioning_point: $best_feature_partitioning_point\n";
print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
$node->set_feature($best_feature);
$node->display_node() if $self->{_debug2_r};
return if (defined $self->{_max_depth_desired}) &&
(@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired});
if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
$feature_threshold_combo_for_less_than;
push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
$feature_threshold_combo_for_greater_than;
my $left_child_node = RTNode->new(undef, undef, undef,
\@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
$node->add_child_link($left_child_node);
$self->recursive_descent($left_child_node);
my $right_child_node = RTNode->new(undef, undef, undef,
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
$node->add_child_link($right_child_node);
$self->recursive_descent($right_child_node);
}
}
## This is the heart of the regression tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples at the
## current node. The partitioning criterion is that the largest of the MSE's in
## the two partitions should be smaller than the error associated with the parent
## node.
sub best_feature_calculator {
my $self = shift;
my $features_and_values_or_thresholds_on_branch = shift;
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
print "\n\nfeatures_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
if (@features_and_values_or_thresholds_on_branch == 0) {
my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
foreach my $i (0 .. @{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
my @partitioning_errors;
my %partitioning_error_hash;
foreach my $value (@values[10 .. $#values - 10]) {
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
my @for_left_child;
my @for_right_child;
if (@features_and_values_or_thresholds_on_branch) {
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_left_child, $feature_and_less_than_value_string;
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_right_child, $feature_and_greater_than_value_string;
} else {
@for_left_child = ($feature_and_less_than_value_string);
@for_right_child = ($feature_and_greater_than_value_string);
}
my ($error1,$beta1,$XMatrix1,$YVector1) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
my ($error2,$beta2,$XMatrix2,$YVector2) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
my $partitioning_error = max($error1, $error2);
push @partitioning_errors, $partitioning_error;
$partitioning_error_hash{$partitioning_error} = $value;
}
my $min_max_error_for_feature = min(@partitioning_errors);
$best_partition_point_for_feature_hash->{$feature_name} =
$partitioning_error_hash{$min_max_error_for_feature};
$best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
}
my $best_feature_name;
my $best_feature_paritioning_point;
my $best_minmax_error_at_partitioning_point;
foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
if (! defined $best_minmax_error_at_partitioning_point) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
} elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
}
}
my $best_feature_partitioning_point = $best_partition_point_for_feature_hash->{$best_feature_name};
return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
} else {
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @true_numeric_types;
my @symbolic_types;
my @true_numeric_types_feature_names;
my @symbolic_types_feature_names;
foreach my $item (@features_and_values_or_thresholds_on_branch) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
my (%upperbound, %lowerbound);
foreach my $feature (@true_numeric_types_feature_names) {
$upperbound{$feature} = undef;
$lowerbound{$feature} = undef;
}
foreach my $item (@bounded_intervals_numeric_types) {
foreach my $feature_grouping (@$item) {
if ($feature_grouping->[1] eq '>') {
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
} else {
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
}
}
}
my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
foreach my $i (0 .. @{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
my @newvalues;
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
next;
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
foreach my $x (@values) {
push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
}
} elsif (defined($upperbound{$feature_name})) {
foreach my $x (@values) {
push @newvalues, $x if $x <= $upperbound{$feature_name};
}
} elsif (defined($lowerbound{$feature_name})) {
foreach my $x (@values) {
push @newvalues, $x if $x > $lowerbound{$feature_name};
}
} else {
die "Error is bound specifications in best feature calculator";
}
} else {
@newvalues = @{deep_copy_array(\@values)};
}
next if @newvalues < 30;
my @partitioning_errors;
my %partitioning_error_hash;
foreach my $value (@newvalues[10 .. $#newvalues - 10]) {
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
my @for_left_child;
my @for_right_child;
if (@features_and_values_or_thresholds_on_branch) {
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_left_child, $feature_and_less_than_value_string;
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_right_child, $feature_and_greater_than_value_string;
} else {
@for_left_child = ($feature_and_less_than_value_string);
@for_right_child = ($feature_and_greater_than_value_string);
}
my ($error1,$beta1,$XMatrix1,$YVector1) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
my ($error2,$beta2,$XMatrix2,$YVector2) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
my $partitioning_error = max($error1, $error2);
push @partitioning_errors, $partitioning_error;
$partitioning_error_hash{$partitioning_error} = $value;
}
my $min_max_error_for_feature = min(@partitioning_errors);
$best_partition_point_for_feature_hash->{$feature_name} =
$partitioning_error_hash{$min_max_error_for_feature};
$best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
}
my $best_feature_name;
my $best_feature_paritioning_point;
my $best_minmax_error_at_partitioning_point;
foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
if (! defined $best_minmax_error_at_partitioning_point) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
} elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
}
}
my $best_feature_partitioning_point = $best_partition_point_for_feature_hash->{$best_feature_name};
return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
}
}
## This method requires that all truly numeric types only be expressed as '<' or '>'
## constructs in the array of branch features and thresholds
sub _error_for_given_sequence_of_features_and_values_or_thresholds{
my $self = shift;
my $array_of_features_and_values_or_thresholds = shift;
if (@$array_of_features_and_values_or_thresholds == 0) {
my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
my ($errors,$beta) = $self->estimate_regression_coefficients($XMatrix,$YVector);
return ($errors,$beta,$XMatrix,$YVector)
}
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @true_numeric_types;
my @symbolic_types;
my @true_numeric_types_feature_names;
my @symbolic_types_feature_names;
foreach my $item (@$array_of_features_and_values_or_thresholds) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
my (%upperbound, %lowerbound);
foreach my $feature (@true_numeric_types_feature_names) {
$upperbound{$feature} = undef;
$lowerbound{$feature} = undef;
}
foreach my $item (@bounded_intervals_numeric_types) {
foreach my $feature_grouping (@$item) {
if ($feature_grouping->[1] eq '>') {
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
} else {
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
}
}
}
my %training_samples_at_node;
foreach my $feature_name (@true_numeric_types_feature_names) {
if ((defined $lowerbound{$feature_name}) && (defined $upperbound{$feature_name}) &&
($upperbound{$feature_name} <= $lowerbound{$feature_name})) {
return (undef,undef,undef,undef);
} elsif ((defined $lowerbound{$feature_name}) && (defined $upperbound{$feature_name})) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
my $feature_involved = substr($feature_and_val, 0, index($feature_and_val,'=') );
if (($feature_name eq $feature_involved) &&
($lowerbound{$feature_name} < $value_for_feature) &&
($value_for_feature <= $upperbound{$feature_name})) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
} elsif ((defined $upperbound{$feature_name}) && (! defined $lowerbound{$feature_name})) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
my $feature_involved = substr($feature_and_val, 0, index($feature_and_val,'=') );
if (($feature_name eq $feature_involved) &&
($value_for_feature <= $upperbound{$feature_name})) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
} elsif ((defined $lowerbound{$feature_name}) && (! defined $upperbound{$feature_name})) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
my $feature_involved = substr($feature_and_val, 0, index($feature_and_val,'=') );
if (($feature_name eq $feature_involved) &&
($value_for_feature > $lowerbound{$feature_name})) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
( run in 0.585 second using v1.01-cache-2.11-cpan-ceb78f64989 )