Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

                pop @onerow;
                push @xvalues, "@onerow";
            }
            $self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
        } else {
            print "no display when the overall dimensionality of the data exceeds 3\n";
        }
    }
    return ($error, $beta);
}

##-------------------------------  Construct Regression Tree  ------------------------------------


##  At the root node, you do ordinary linear regression for the entire dataset so that you
##  can later compare the linear regression fit with the results obtained through the 
##  regression tree.  Subsequently, you call the recursive_descent() method to construct
##  the tree.

sub construct_regression_tree {
    my $self = shift;
    print "\nConstructing regression tree...\n";
    my $root_node = RTNode->new(undef, undef, undef, [], $self, 'root');
    my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
    my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector, 1); 
    $root_node->set_node_XMatrix($XMatrix);
    $root_node->set_node_YVector($YVector);
    $root_node->set_node_error($error);
    $root_node->set_node_beta($beta);
    $root_node->set_num_data_points($XMatrix->cols);
    print "\nerror at root: $error\n";
    print "\nbeta at root:\n";
    display_matrix($beta);
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node) if $self->{_max_depth_desired} > 0;
    return $root_node;
}

##  We first look for a feature, along with its partitioning point, that yields the 
##  largest reduction in MSE compared to the MSE at the parent node.  This feature and
##  its partitioning point are then used to create two child nodes in the tree.
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n";
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    if (@features_and_values_or_thresholds_on_branch > 0) {
        my ($error,$beta,$XMatrix,$YVector) = 
          $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@copy_of_path_attributes);
        $node->set_node_XMatrix($XMatrix);
        $node->set_node_YVector($YVector);
        $node->set_node_error($error);
        $node->set_node_beta($beta);
        $node->set_num_data_points($XMatrix->cols);
        print "\nNODE SERIAL NUMBER: $node_serial_number\n";
        print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
        return if $error <= $self->{_mse_threshold}; 
    }
    my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) = 
                                               $self->best_feature_calculator(\@copy_of_path_attributes);
    return unless defined $best_feature_partitioning_point;
    print "\nBest feature found: $best_feature\n";
    print "Best feature partitioning_point: $best_feature_partitioning_point\n";
    print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug2_r}; 
    return if (defined $self->{_max_depth_desired}) && 
                            (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired}); 
    if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
        my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
        my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
        my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
        my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
        push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
        push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
        my $left_child_node = RTNode->new(undef, undef, undef, 
                          \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
        $node->add_child_link($left_child_node);
        $self->recursive_descent($left_child_node);
        my $right_child_node = RTNode->new(undef, undef, undef, 
                        \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
        $node->add_child_link($right_child_node);
        $self->recursive_descent($right_child_node);
    }
}

##  This is the heart of the regression tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples at the
##  current node.  The partitioning criterion is that the largest of the MSE's in 
##  the two partitions should be smaller than the error associated with the parent
##  node.
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    print "\n\nfeatures_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
    if (@features_and_values_or_thresholds_on_branch == 0) {
        my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        foreach my $i (0 .. @{$self->{_feature_names}}-1) {
            my $feature_name = $self->{_feature_names}->[$i];
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @partitioning_errors;
            my %partitioning_error_hash;
            foreach my $value (@values[10 .. $#values - 10]) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my ($error1,$beta1,$XMatrix1,$YVector1) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
                my ($error2,$beta2,$XMatrix2,$YVector2) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
                my $partitioning_error = max($error1, $error2);
                push @partitioning_errors, $partitioning_error;
                $partitioning_error_hash{$partitioning_error} = $value;
            }
            my $min_max_error_for_feature = min(@partitioning_errors);
            $best_partition_point_for_feature_hash->{$feature_name} = 
                                             $partitioning_error_hash{$min_max_error_for_feature};
            $best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
        }
        my $best_feature_name;
        my $best_feature_paritioning_point;
        my $best_minmax_error_at_partitioning_point;
        foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
            if (! defined $best_minmax_error_at_partitioning_point) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            } elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            }
        }
        my $best_feature_partitioning_point =  $best_partition_point_for_feature_hash->{$best_feature_name};
        return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
    } else {
        my $pattern1 = '(.+)=(.+)';
        my $pattern2 = '(.+)<(.+)';
        my $pattern3 = '(.+)>(.+)';
        my @true_numeric_types;
        my @symbolic_types;
        my @true_numeric_types_feature_names;
        my @symbolic_types_feature_names;
        foreach my $item (@features_and_values_or_thresholds_on_branch) {
            if ($item =~ /$pattern2/) {
                push @true_numeric_types, $item;
                push @true_numeric_types_feature_names, $1;
            } elsif ($item =~ /$pattern3/) {
                push @true_numeric_types, $item;
                push @true_numeric_types_feature_names, $1;
            } elsif ($item =~ /$pattern1/) {
                push @symbolic_types, $item;
                push @symbolic_types_feature_names, $1;
            } else {
                die "format error in the representation of feature and values or thresholds";
            }
        }
        my %seen = ();
        @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
        %seen = ();
        @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
        my @bounded_intervals_numeric_types = 
                           @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
        # Calculate the upper and the lower bounds to be used when searching for the best
        # threshold for each of the numeric features that are in play at the current node:
        my (%upperbound, %lowerbound);
        foreach my $feature (@true_numeric_types_feature_names) {
            $upperbound{$feature} = undef;
            $lowerbound{$feature} = undef;
        }
        foreach my $item (@bounded_intervals_numeric_types) {
            foreach my $feature_grouping (@$item) {
                if ($feature_grouping->[1] eq '>') {
                    $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
                } else {
                    $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
                }
            }
        }
        my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        foreach my $i (0 .. @{$self->{_feature_names}}-1) {
            my $feature_name = $self->{_feature_names}->[$i];
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @newvalues;
            if (contained_in($feature_name, @true_numeric_types_feature_names)) {
                if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                              $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
                    next;
                } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                                    $lowerbound{$feature_name} < $upperbound{$feature_name}) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($upperbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($lowerbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name};
                    }
                } else {
                    die "Error is bound specifications in best feature calculator";
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues < 30;
            my @partitioning_errors;
            my %partitioning_error_hash;
            foreach my $value (@newvalues[10 .. $#newvalues - 10]) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my ($error1,$beta1,$XMatrix1,$YVector1) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
                my ($error2,$beta2,$XMatrix2,$YVector2) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
                my $partitioning_error = max($error1, $error2);
                push @partitioning_errors, $partitioning_error;
                $partitioning_error_hash{$partitioning_error} = $value;
            }
            my $min_max_error_for_feature = min(@partitioning_errors);
            $best_partition_point_for_feature_hash->{$feature_name} = 
                                             $partitioning_error_hash{$min_max_error_for_feature};
            $best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
        }
        my $best_feature_name;
        my $best_feature_paritioning_point;
        my $best_minmax_error_at_partitioning_point;
        foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
            if (! defined $best_minmax_error_at_partitioning_point) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            } elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            }
        }
        my $best_feature_partitioning_point =  $best_partition_point_for_feature_hash->{$best_feature_name};
        return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
    }
}

##  This method requires that all truly numeric types only be expressed as '<' or '>'
##  constructs in the array of branch features and thresholds
sub _error_for_given_sequence_of_features_and_values_or_thresholds{
    my $self = shift;
    my $array_of_features_and_values_or_thresholds = shift;
    if (@$array_of_features_and_values_or_thresholds == 0) { 
        my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
        my ($errors,$beta) = $self->estimate_regression_coefficients($XMatrix,$YVector);
        return ($errors,$beta,$XMatrix,$YVector)
    }
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @true_numeric_types;
    my @symbolic_types;
    my @true_numeric_types_feature_names;
    my @symbolic_types_feature_names;
    foreach my $item (@$array_of_features_and_values_or_thresholds) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern1/) {
            push @symbolic_types, $item;
            push @symbolic_types_feature_names, $1;
        } else {
            die "format error in the representation of feature and values or thresholds";
        }
    }
    my %seen = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
    %seen = ();
    @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
    my @bounded_intervals_numeric_types = 
                           @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature (@true_numeric_types_feature_names) {
        $upperbound{$feature} = undef;
        $lowerbound{$feature} = undef;
    }
    foreach my $item (@bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    my %training_samples_at_node;
    foreach my $feature_name (@true_numeric_types_feature_names) {
        if ((defined $lowerbound{$feature_name}) && (defined $upperbound{$feature_name}) && 
                                     ($upperbound{$feature_name} <= $lowerbound{$feature_name})) {
            return (undef,undef,undef,undef); 
        } elsif ((defined $lowerbound{$feature_name}) && (defined $upperbound{$feature_name})) {
            foreach my $sample (keys %{$self->{_training_data_hash}}) {
                my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
                foreach my $feature_and_val (@feature_val_pairs) {
                    my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
                    my $feature_involved =  substr($feature_and_val, 0, index($feature_and_val,'=') );
                    if (($feature_name eq $feature_involved) && 
                        ($lowerbound{$feature_name} < $value_for_feature) && 
                        ($value_for_feature <= $upperbound{$feature_name})) {
                        $training_samples_at_node{$sample} = 1;
                        last;
                    }
                }
            }  
        } elsif ((defined $upperbound{$feature_name}) && (! defined $lowerbound{$feature_name})) {
            foreach my $sample (keys %{$self->{_training_data_hash}}) {
                my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
                foreach my $feature_and_val (@feature_val_pairs) {
                    my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
                    my $feature_involved =  substr($feature_and_val, 0, index($feature_and_val,'=') );
                    if (($feature_name eq $feature_involved) && 
                        ($value_for_feature <= $upperbound{$feature_name})) {
                        $training_samples_at_node{$sample} = 1;
                        last;
                    }
                }
            }  
        } elsif ((defined $lowerbound{$feature_name}) && (! defined $upperbound{$feature_name})) {
            foreach my $sample (keys %{$self->{_training_data_hash}}) {
                my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
                foreach my $feature_and_val (@feature_val_pairs) {
                    my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
                    my $feature_involved =  substr($feature_and_val, 0, index($feature_and_val,'=') );
                    if (($feature_name eq $feature_involved) && 
                        ($value_for_feature > $lowerbound{$feature_name})) {
                        $training_samples_at_node{$sample} = 1;
                        last;
                    }
                }
            }  



( run in 0.585 second using v1.01-cache-2.11-cpan-ceb78f64989 )