Algorithm-DecisionTree


Examples/README  view on Meta::CPAN

 classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv



(4) Let's now talk about how you can deal with features that, statistically
    speaking, are not so "nice" --- features with heavy-tailed distributions
    over large value ranges.  As mentioned in the HTML-based API documentation
    for this module, such features can create problems with the estimation of
    the probability distributions associated with them, the main difficulty
    being how best to sample the value range.

    Beginning with Version 2.22, you have two options for dealing with such
    features.  You can go with the default behavior of the module, which is
    to sample the value range for such a feature over a maximum of 500
    points.  Or you can supply an additional constructor option that sets a
    user-defined value for the number of sampling points.  The name of the
    option is "number_of_histogram_bins".  The
    following script

          construct_dt_for_heavytailed.pl
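
    As a rough sketch of supplying this option to the constructor (the column
    indexes below are placeholders that you would need to adjust to your own
    CSV file), the call could look like:

          my $dt = Algorithm::DecisionTree->new(
                         training_datafile        => 'training4.csv',
                         csv_class_column_index   => 1,
                         csv_columns_for_features => [2,3,4,5],
                         number_of_histogram_bins => 100,
                         csv_cleanup_needed       => 1,
                   );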

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

######################################    Decision Tree Construction  ####################################

##  At the root node, we find the best feature that yields the greatest reduction in
##  class entropy from the entropy based on just the class priors. The logic for
##  finding this feature is different for symbolic features and for numeric features.
##  That logic is built into the method shown later for best feature calculations.
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {        
        $self->determine_data_condition(); 
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) { 
        print "\nPrior class probabilities: @class_probabilities\n";

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
        print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
        my @class_probs = @{$node->get_class_probabilities()};
        print "\nRD4 Class probabilities: @class_probs\n";
    }
    if ($existing_node_entropy < $self->{_entropy_threshold}) { 
        print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
        return;
    }
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
                    $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug3};
    if (defined($self->{_max_depth_desired}) && 
               (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3}; 
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) { 
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} && 
              $self->{_feature_values_how_many_uniques_hash}->{$best_feature} > 
                                        $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;            # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
              \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                                                         \@class_probabilities_for_lessthan_child_node,
                              \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                                                         \@class_probabilities_for_greaterthan_child_node,
                            \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds = 
                                                          ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds = 

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n" 
            if $self->{_debug3};
        return;
    }
}

##  This is the heart of the decision tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples that
##  correspond to the current node.  The search for the best feature is carried out
##  differently for symbolic features and for numeric features.  For a symbolic
##  feature, the method estimates the entropy for each value of the feature and then
##  averages out these entropies as a measure of the discriminatory power of that
##  feature.  For a numeric feature, on the other hand, it estimates the entropy
##  reduction that can be achieved if we were to partition the set of training
##  samples at each possible threshold.  For a numeric feature, all possible sampling
##  points relevant to the node in question are considered as candidates for thresholds.
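##  As a made-up illustration of the weighted entropy computed for a candidate
##  numeric threshold (these numbers are not produced by the module): if a threshold
##  t for feature f sends the samples satisfying f<t to one side with class entropy
##  0.4 and probability 0.7, and the samples satisfying f>t to the other side with
##  class entropy 0.9 and probability 0.3, the partitioning entropy recorded for t
##  is 0.7*0.4 + 0.3*0.9 = 0.55.  The threshold with the smallest such value becomes
##  the candidate partitioning point for that feature.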
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my $existing_node_entropy = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @all_symbolic_features = ();
    foreach my $feature_name (@{$self->{_feature_names}}) {
        push @all_symbolic_features, $feature_name 

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        } else {
            die "format error in the representation of feature and values or thresholds";
        }
    }
    my %seen = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
    %seen = ();
    @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
    my @bounded_intervals_numeric_types = 
                       @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature (@true_numeric_types_feature_names) {
        $upperbound{$feature} = undef;
        $lowerbound{$feature} = undef;
    }
    foreach my $item (@bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                    }
                } elsif (defined($upperbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($lowerbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name};
                    }
                } else {
                    die "Error is bound specifications in best feature calculator";
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues == 0;
            my @partitioning_entropies = ();            
            foreach my $value (@newvalues) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
                                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $partitioning_entropy = $entropy1 * 
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
                                           $entropy2 *
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);

                push @partitioning_entropies, $partitioning_entropy;
                $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
            }
            my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
            if ($min_entropy < $existing_node_entropy) {
                $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
                $entropy_values_for_different_features{$feature_name} = $min_entropy;
            }
        } else {
            print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};
            print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
            my %seen;
            my @values = grep {$_ ne 'NA' && !$seen{$_}++} 
                                    @{$self->{_features_and_unique_values_hash}->{$feature_name}};
            @values = sort @values;
            print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

           $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
                print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
                      "is $entropy\n" if $self->{_debug3};
                push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
            }
            if ($entropy < $existing_node_entropy) {
                $entropy_values_for_different_features{$feature_name} = $entropy;
            }
        }
    }
    my $min_entropy_for_best_feature;
    my $best_feature_name;
    foreach my $feature_nom (keys %entropy_values_for_different_features) { 
        if (!defined($best_feature_name)) {
            $best_feature_name = $feature_nom;
            $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
        } else {
            if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
                $best_feature_name = $feature_nom;
                $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
            }
        }
    }
    my $threshold_for_best_feature;
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
    } else {
        $threshold_for_best_feature = undef;
    }
    my $best_feature_entropy = $min_entropy_for_best_feature;
    my @val_based_entropies_to_be_returned;
    my $decision_val_to_be_returned;
    if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} && 
          $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} > 
                                    $self->{_symbolic_to_numeric_cardinality_threshold}) {
        @val_based_entropies_to_be_returned = 
            @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
    } else {
        @val_based_entropies_to_be_returned = ();
    }
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
    } else {
        $decision_val_to_be_returned = undef;
    }
    print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
        "@val_based_entropies_to_be_returned\n"  if $self->{_debug3};
    return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned, 
                                                                      $decision_val_to_be_returned);
}

#########################################    Entropy Calculators     #####################################

sub class_entropy_on_priors {
    my $self = shift;
    return $self->{_entropy_cache}->{'priors'} 
        if exists $self->{_entropy_cache}->{"priors"};
    my @class_names = @{$self->{_class_names}};

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POS: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POSC: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

}

######################################### Class EvalTrainingData  ########################################

##  This subclass of the DecisionTree class is used to evaluate the quality of your
##  training data by running a 10-fold cross-validation test on it. This test divides
##  all of the training data into ten parts, with nine parts used for training a
##  decision tree and one part used for testing its ability to classify correctly.
##  This selection of nine parts for training and one part for testing is carried out
##  in all of the ten different possible ways.  This testing functionality can also
##  be used to find the best values to use for the constructor parameters
##  entropy_threshold, max_depth_desired, and
##  symbolic_to_numeric_cardinality_threshold.

##  Only CSV training files can be evaluated in this manner (because only CSV
##  training files are allowed to have numeric features --- which is the more
##  interesting case for evaluation analytics).

package EvalTrainingData;

@EvalTrainingData::ISA = ('Algorithm::DecisionTree');

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

classify all of your test data records in a CSV file.  This version also includes
improved code for generating synthetic numeric/symbolic training and test data
records for experimenting with the decision tree classifier.

B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
cross-validation test on the data.  This test divides all of the training data into
ten parts, with nine parts used for training a decision tree and one part used for
testing its ability to classify correctly. This selection of nine parts for training
and one part for testing is carried out in all of the ten different ways that are
possible.  This testing functionality in Version 2.1 can also be used to find the
best values to use for the constructor parameters C<entropy_threshold>,
C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.

B<Version 2.0 is a major rewrite of this module.> Now you can use both numeric and
symbolic features for constructing a decision tree. A feature is numeric if it can
take any floating-point value over an interval.

B<Version 1.71> fixes a bug in the code that was triggered by 0 being declared as one of
the feature values in the training datafile. Version 1.71 also includes an additional
safety feature that is useful for training datafiles that contain a very large number
of features.  The new version makes sure that the number of values you declare for

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

A decision tree classifier works differently.  When you construct a decision tree,
you select for the root node a feature test that partitions the training data in a
way that causes maximal disambiguation of the class labels associated with the data.
In terms of information content as measured by entropy, such a feature test would
cause maximum reduction in class entropy in going from all of the training data taken
together to the data as partitioned by the feature test.  You then drop from the root
node a set of child nodes, one for each partition of the training data created by the
feature test at the root node. When your features are purely symbolic, you'll have
one child node for each value of the feature chosen for the feature test at the root.
When the test at the root involves a numeric feature, you find the decision threshold
for the feature that best bipartitions the data and you drop from the root node two
child nodes, one for each partition.  Now at each child node you pose the same
question that you posed when you found the best feature to use at the root: Which
feature at the child node in question would maximally disambiguate the class labels
associated with the training data corresponding to that child node?
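
Purely to illustrate the arithmetic behind this entropy-based feature selection (the
snippet below is self-contained, uses made-up numbers, and does not call any of this
module's methods), the reduction in class entropy achieved by a candidate feature
test can be computed like this:

    use strict;
    use warnings;
    use List::Util qw(sum);

    # entropy, in bits, of a discrete probability distribution
    sub entropy { return -sum( map { $_ > 0 ? $_ * log($_) / log(2) : 0 } @_ ) }

    # class entropy at the node, from the class priors (toy numbers):
    my $h_node = entropy(0.6, 0.4);

    # a candidate feature test sends 50% of the samples to each partition;
    # the class distributions in the partitions are (0.9, 0.1) and (0.3, 0.7):
    my $h_split = 0.5 * entropy(0.9, 0.1) + 0.5 * entropy(0.3, 0.7);

    printf "entropy at node: %.3f   after split: %.3f   gain: %.3f\n",
           $h_node, $h_split, $h_node - $h_split;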

As the reader would expect, the two key steps in any approach to decision-tree based
classification are the construction of the decision tree itself from a file
containing the training data, and then using the decision tree thus obtained for
classifying new data.

What is cool about decision tree classification is that it gives you soft
classification, meaning it may associate more than one class label with a given data

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

L<https://engineering.purdue.edu/kak/Tutorials/DecisionTreeClassifiers.pdf>

This module also allows you to generate your own synthetic training and test
data. Generating your own training data, using it for constructing a decision-tree
classifier, and subsequently testing the classifier on a synthetically generated
test set of data is a good way to develop greater proficiency with decision trees.


=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE

If you are new to the concept of a decision tree, its practical utility is best
understood with an example that only involves symbolic features. However, as
mentioned earlier, versions of the module higher than 2.0 allow you to use both
symbolic and numeric features.

Consider the following scenario: Let's say you are running a small investment company
that employs a team of stockbrokers who make buy/sell decisions for the customers of
your company.  Assume that your company has asked the traders to make each investment
decision on the basis of the following four criteria:

  price_to_earnings_ratio   (P_to_E)

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    $dt->get_training_data(); 
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();

Subsequently, you would construct a decision tree by calling:

    my $root_node = $dt->construct_decision_tree_classifier();

Now you and your company (with practically no employees) are ready to service the
customers again. Suppose your computer needs to make a buy/sell decision about an
investment prospect that is best described by:

    price_to_earnings_ratio  =  low
    price_to_sales_ratio     =  very_low
    return_on_equity         =  none
    market_share             =  medium    

All that your computer would need to do would be to construct a data vector like

   my @data =   qw / P_to_E=low
                     P_to_S=very_low

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                                  csv_cleanup_needed => 1,
                    );
    $eval_data->get_training_data();
    $eval_data->evaluate_training_data();

The last statement above prints out a Confusion Matrix and the value of the Training
Data Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
cross-validation test.

This testing functionality can also be used to find the best values to use for the
constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.

The following two scripts in the C<Examples> directory illustrate the use of the
C<EvalTrainingData> class for testing the quality of your data:

    evaluate_training_data1.pl
    evaluate_training_data2.pl


lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

when you have a very large training dataset:> In general, the larger the dataset, the
smaller the smallest difference between any two values for a numeric feature in
relation to the overall range of values for that feature. In such cases, the module
may use too large a number of bins for estimating the probabilities and that may slow
down the calculation of the decision tree.  You can get around this difficulty by
explicitly giving a value to the 'C<number_of_histogram_bins>' parameter.

=back


You can choose the best values to use for the last three constructor parameters by
running a 10-fold cross-validation test on your training data through the class
C<EvalTrainingData> that comes with Versions 2.1 and higher of this module.  See the
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.

=over

=item B<get_training_data():>

After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
you must now read in the training data that is the file named in the call to the

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN


With regard to the soft classifications returned by this classifier, if the
probability distributions for the different classes overlap in the underlying feature
space, you would want the classifier to return all of the applicable class labels for
a data vector along with the corresponding class probabilities.  Another reason why
the decision tree classifier may associate significant probabilities with multiple
class labels is that you used an inadequate number of training samples to induce the
decision tree.  The good thing is that the classifier does not lie to you (unlike,
say, a hard classification rule that would return a single class label corresponding
to the partitioning of the underlying feature space).  The decision tree classifier
gives you the best classification that can be made given the training data you fed
into it.
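
The following is only a sketch of consuming such a soft classification.  The
structure of the returned hash (class names as keys, plus a C<'solution_path'> entry
listing the serial numbers of the nodes visited) follows the code excerpt shown
earlier on this page, but the class names and probabilities here are made up:

    use strict;
    use warnings;

    # a toy stand-in for the hash reference returned for one test sample:
    my $classification = { 'buy'           => 0.72,
                           'sell'          => 0.28,
                           'solution_path' => [0, 2, 5] };

    my @solution_path = @{$classification->{'solution_path'}};
    delete $classification->{'solution_path'};
    foreach my $class (sort { $classification->{$b} <=> $classification->{$a} }
                            keys %$classification) {
        printf "%-10s  probability: %.4f\n", $class, $classification->{$class};
    }
    print "solution path through the tree: @solution_path\n";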


=head1 USING BAGGING

Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
comes with the module to incorporate bagging in your decision tree based
classification.  Bagging means constructing multiple decision trees for different
(possibly overlapping) segments of the data extracted from your training dataset and
then aggregating the decisions made by the individual decision trees for the final

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN


The last parameter, C<jacobian_choice>, must be set to 0, 1, or 2.  Its default
value is 0. When this parameter equals 0, the regression coefficients are calculated
using the linear least-squares method and no further "refinement" of the
coefficients is carried out using gradient descent.  This is the fastest way to
calculate the regression coefficients.  When C<jacobian_choice> is set to 1, you get
a weak version of gradient descent in which the Jacobian is set to the "design
matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
approximation to the Jacobian.  That, however, comes at the cost of much longer
computation time.  B<NOTE:> For most cases, using 0 for C<jacobian_choice> is the
best choice.  See my tutorial "I<Linear Regression and Regression Trees>" for why
that is the case.  A constructor call showing where this parameter goes is sketched
just after this parameter list.

=back
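
As a sketch of where C<jacobian_choice> and the other constructor parameters
mentioned above fit in, a C<RegressionTree> constructor call could look like the
following.  The file name, column indexes, and threshold values are placeholders,
and the two column-selection parameter names shown here are assumptions to be
checked against the C<RegressionTree> documentation:

    use Algorithm::RegressionTree;

    my $rt = Algorithm::RegressionTree->new(
                          training_datafile         => 'gendata.csv',
                          dependent_variable_column => 2,
                          predictor_columns         => [1],
                          mse_threshold             => 0.01,
                          max_depth_desired         => 2,
                          jacobian_choice           => 0,
                          csv_cleanup_needed        => 1,
             );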

=head2 B<Methods defined for C<RegressionTree> class>

=over 8

=item B<get_training_data_for_regression():>

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

          $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@copy_of_path_attributes);
        $node->set_node_XMatrix($XMatrix);
        $node->set_node_YVector($YVector);
        $node->set_node_error($error);
        $node->set_node_beta($beta);
        $node->set_num_data_points($XMatrix->cols);
        print "\nNODE SERIAL NUMBER: $node_serial_number\n";
        print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
        return if $error <= $self->{_mse_threshold}; 
    }
    my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) = 
                                               $self->best_feature_calculator(\@copy_of_path_attributes);
    return unless defined $best_feature_partitioning_point;
    print "\nBest feature found: $best_feature\n";
    print "Best feature partitioning_point: $best_feature_partitioning_point\n";
    print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug2_r}; 
    return if (defined $self->{_max_depth_desired}) && 
                            (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired}); 
    if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
        my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
        my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
        my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
        my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
        push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
        push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
        my $left_child_node = RTNode->new(undef, undef, undef, 
                          \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
        $node->add_child_link($left_child_node);
        $self->recursive_descent($left_child_node);
        my $right_child_node = RTNode->new(undef, undef, undef, 
                        \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
        $node->add_child_link($right_child_node);
        $self->recursive_descent($right_child_node);
    }
}

##  This is the heart of the regression tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples at the
##  current node.  The partitioning criterion is that the largest of the MSE's in 
##  the two partitions should be smaller than the error associated with the parent
##  node.
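##  As a made-up illustration of this minmax criterion: if a candidate partitioning
##  point t for feature f yields an MSE of 0.8 in the f<t partition and 0.3 in the
##  f>t partition, the partitioning error recorded for t is max(0.8, 0.3) = 0.8.
##  The partitioning point chosen for f is the one with the smallest such maximum,
##  and the feature returned is the one whose best partitioning point has the
##  smallest minmax error overall.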
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    print "\n\nfeatures_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
    if (@features_and_values_or_thresholds_on_branch == 0) {
        my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        foreach my $i (0 .. @{$self->{_feature_names}}-1) {
            my $feature_name = $self->{_feature_names}->[$i];
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @partitioning_errors;
            my %partitioning_error_hash;
            foreach my $value (@values[10 .. $#values - 10]) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

                }
                my ($error1,$beta1,$XMatrix1,$YVector1) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
                my ($error2,$beta2,$XMatrix2,$YVector2) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
                my $partitioning_error = max($error1, $error2);
                push @partitioning_errors, $partitioning_error;
                $partitioning_error_hash{$partitioning_error} = $value;
            }
            my $min_max_error_for_feature = min(@partitioning_errors);
            $best_partition_point_for_feature_hash->{$feature_name} = 
                                             $partitioning_error_hash{$min_max_error_for_feature};
            $best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
        }
        my $best_feature_name;
        my $best_minmax_error_at_partitioning_point;
        foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
            if (! defined $best_minmax_error_at_partitioning_point) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            } elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            }
        }
        my $best_feature_partitioning_point =  $best_partition_point_for_feature_hash->{$best_feature_name};
        return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
    } else {
        my $pattern1 = '(.+)=(.+)';
        my $pattern2 = '(.+)<(.+)';
        my $pattern3 = '(.+)>(.+)';
        my @true_numeric_types;
        my @symbolic_types;
        my @true_numeric_types_feature_names;
        my @symbolic_types_feature_names;
        foreach my $item (@features_and_values_or_thresholds_on_branch) {
            if ($item =~ /$pattern2/) {

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

            } else {
                die "format error in the representation of feature and values or thresholds";
            }
        }
        my %seen = ();
        @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
        %seen = ();
        @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
        my @bounded_intervals_numeric_types = 
                           @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
        # Calculate the upper and the lower bounds to be used when searching for the best
        # threshold for each of the numeric features that are in play at the current node:
        my (%upperbound, %lowerbound);
        foreach my $feature (@true_numeric_types_feature_names) {
            $upperbound{$feature} = undef;
            $lowerbound{$feature} = undef;
        }
        foreach my $item (@bounded_intervals_numeric_types) {
            foreach my $feature_grouping (@$item) {
                if ($feature_grouping->[1] eq '>') {
                    $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
                } else {
                    $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
                }
            }
        }
        my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
        foreach my $i (0 .. @{$self->{_feature_names}}-1) {
            my $feature_name = $self->{_feature_names}->[$i];
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @newvalues;
            if (contained_in($feature_name, @true_numeric_types_feature_names)) {
                if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                              $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
                    next;
                } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                                    $lowerbound{$feature_name} < $upperbound{$feature_name}) {

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

                    }
                } elsif (defined($upperbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($lowerbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name};
                    }
                } else {
                    die "Error is bound specifications in best feature calculator";
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues < 30;
            my @partitioning_errors;
            my %partitioning_error_hash;
            foreach my $value (@newvalues[10 .. $#newvalues - 10]) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

                }
                my ($error1,$beta1,$XMatrix1,$YVector1) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
                my ($error2,$beta2,$XMatrix2,$YVector2) = 
                        $self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
                my $partitioning_error = max($error1, $error2);
                push @partitioning_errors, $partitioning_error;
                $partitioning_error_hash{$partitioning_error} = $value;
            }
            my $min_max_error_for_feature = min(@partitioning_errors);
            $best_partition_point_for_feature_hash->{$feature_name} = 
                                             $partitioning_error_hash{$min_max_error_for_feature};
            $best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
        }
        my $best_feature_name;
        my $best_minmax_error_at_partitioning_point;
        foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
            if (! defined $best_minmax_error_at_partitioning_point) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            } elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
                $best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
                $best_feature_name = $feature;
            }
        }
        my $best_feature_partitioning_point =  $best_partition_point_for_feature_hash->{$best_feature_name};
        return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
    }
}

##  This method requires that all truly numeric types only be expressed as '<' or '>'
##  constructs in the array of branch features and thresholds.
sub _error_for_given_sequence_of_features_and_values_or_thresholds{
    my $self = shift;
    my $array_of_features_and_values_or_thresholds = shift;
    if (@$array_of_features_and_values_or_thresholds == 0) { 
        my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

        } else {
            die "format error in the representation of feature and values or thresholds";
        }
    }
    my %seen = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
    %seen = ();
    @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
    my @bounded_intervals_numeric_types = 
                           @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature (@true_numeric_types_feature_names) {
        $upperbound{$feature} = undef;
        $lowerbound{$feature} = undef;
    }
    foreach my $item (@bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];


