Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                     \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
            if ($branch_attribute =~ /$pattern2/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature > $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                     \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
        }
        return \%answer if $path_found;
    } else {
        my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
        while (1) {
            print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
            print "\nEnter a value from the list (@possible_values_for_feature): ";
            $user_value_for_feature = <STDIN>;
            $user_value_for_feature =~ s/\r?\n?$//;
            $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
            my $answer_found = 0;
            if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
                $answer_found = 1;
                last;
            }
            last if $answer_found;
            print("You entered illegal value. Let's try again");
        }
        $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute eq $feature_value_combo) {
                %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                     \%answer, \%scratchpad_for_numerics)};
                $path_found = 1;
                push @{$answer{'solution_path'}}, $node->get_serial_num();
                last;
            }
        }
        return \%answer if $path_found;
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

######################################    Decision Tree Construction  ####################################

##  At the root node, we find the best feature that yields the greatest reduction in
##  class entropy from the entropy based on just the class priors. The logic for
##  finding this feature is different for symbolic features and for numeric features.
##  That logic is built into the method shown later for best feature calculations.
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {        
        $self->determine_data_condition(); 
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) { 
        print "\nPrior class probabilities: @class_probabilities\n";
        print "\nClass names: @{$self->{_class_names}}\n";
    }
    my $entropy = $self->class_entropy_on_priors();
    print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
    my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
    $root_node->set_class_names(\@{$self->{_class_names}});
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node);
    return $root_node;
}

##  After the root node of the decision tree is calculated by the previous methods,
##  we invoke this method recursively to create the rest of the tree.  At each node,
##  we find the feature that achieves the largest entropy reduction with regard to
##  the partitioning of the training data samples that correspond to that node.
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
        if $self->{_debug3};
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my $existing_node_entropy = $node->get_node_entropy();
    if ($self->{_debug3}) { 
        print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
        print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
        print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
        my @class_probs = @{$node->get_class_probabilities()};
        print "\nRD4 Class probabilities: @class_probs\n";
    }
    if ($existing_node_entropy < $self->{_entropy_threshold}) { 
        print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
        return;
    }
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
                    $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug3};
    if (defined($self->{_max_depth_desired}) && 
               (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3}; 
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) { 
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} && 
              $self->{_feature_values_how_many_uniques_hash}->{$best_feature} > 
                                        $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;            # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
              \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                                                         \@class_probabilities_for_lessthan_child_node,
                              \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                                                         \@class_probabilities_for_greaterthan_child_node,
                            \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds = 
                                                          ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds = 
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds, 
                                           $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                   map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                               \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child = 
                      $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                                                         \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n"; 
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                          \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n" 
            if $self->{_debug3};
        return;
    }
}

##  This is the heart of the decision tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples that
##  correspond to the current node.  The search for the best feature is carried out
##  differently for symbolic features and for numeric features.  For a symbolic
##  feature, the method estimates the entropy for each value of the feature and then
##  averages out these entropies as a measure of the discriminatory power of that
##  features.  For a numeric feature, on the other hand, it estimates the entropy
##  reduction that can be achieved if were to partition the set of training samples
##  for each possible threshold.  For a numeric feature, all possible sampling points
##  relevant to the node in question are considered as candidates for thresholds.
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my $existing_node_entropy = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @all_symbolic_features = ();
    foreach my $feature_name (@{$self->{_feature_names}}) {
        push @all_symbolic_features, $feature_name 
            if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
    }
    my @symbolic_features_already_used = ();  
    foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
        push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
    }
    my @symbolic_features_not_yet_used;
    foreach my $x (@all_symbolic_features) {
        push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
    }
    my @true_numeric_types = ();
    my @symbolic_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@features_and_values_or_thresholds_on_branch) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern1/) {
            push @symbolic_types, $item;
            push @symbolic_types_feature_names, $1;
        } else {
            die "format error in the representation of feature and values or thresholds";
        }
    }
    my %seen = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
    %seen = ();
    @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
    my @bounded_intervals_numeric_types = 
                       @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature (@true_numeric_types_feature_names) {
        $upperbound{$feature} = undef;
        $lowerbound{$feature} = undef;
    }
    foreach my $item (@bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    my %entropy_values_for_different_features = ();
    my %partitioning_point_child_entropies_hash = ();
    my %partitioning_point_threshold = ();
    my %entropies_for_different_values_of_symbolic_feature = ();
    foreach my $feature (@{$self->{_feature_names}}) {
        $entropy_values_for_different_features{$feature} = [];
        $partitioning_point_child_entropies_hash{$feature} = {};
        $partitioning_point_threshold{$feature} = undef;
        $entropies_for_different_values_of_symbolic_feature{$feature} = [];
    }
    foreach my $i (0..@{$self->{_feature_names}}-1) {
        my $feature_name = $self->{_feature_names}->[$i];
        print "\n\nBFC1          FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
        if (contained_in($feature_name, @symbolic_features_already_used)) {
            next;
        } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
                 $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                      $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};      
            my @newvalues = ();
            if (contained_in($feature_name, @true_numeric_types_feature_names)) {
                if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                              $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
                    next;
                } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                                    $lowerbound{$feature_name} < $upperbound{$feature_name}) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($upperbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($lowerbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name};
                    }
                } else {
                    die "Error is bound specifications in best feature calculator";
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues == 0;
            my @partitioning_entropies = ();            
            foreach my $value (@newvalues) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
                                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
                                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $partitioning_entropy = $entropy1 * 
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
                                           $entropy2 *
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);

                push @partitioning_entropies, $partitioning_entropy;
                $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
            }
            my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
            if ($min_entropy < $existing_node_entropy) {
                $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
                $entropy_values_for_different_features{$feature_name} = $min_entropy;
            }
        } else {
            print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};
            print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
            my %seen;
            my @values = grep {$_ ne 'NA' && !$seen{$_}++} 
                                    @{$self->{_features_and_unique_values_hash}->{$feature_name}};
            @values = sort @values;
            print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};

            my $entropy = 0;
            foreach my $value (@values) {
                my $feature_value_string = "$feature_name" . '=' . "$value";
                print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
                my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                if (@features_and_values_or_thresholds_on_branch) {
                    push @extended_attributes, $feature_value_string;
                } else {
                    @extended_attributes = ($feature_value_string);
                }
                $entropy += 
           $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) * 
           $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
                print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
                      "is $entropy\n" if $self->{_debug3};
                push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
            }
            if ($entropy < $existing_node_entropy) {
                $entropy_values_for_different_features{$feature_name} = $entropy;
            }
        }
    }
    my $min_entropy_for_best_feature;
    my $best_feature_name;
    foreach my $feature_nom (keys %entropy_values_for_different_features) { 
        if (!defined($best_feature_name)) {
            $best_feature_name = $feature_nom;
            $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
        } else {
            if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
                $best_feature_name = $feature_nom;
                $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
            }
        }
    }
    my $threshold_for_best_feature;
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
    } else {
        $threshold_for_best_feature = undef;
    }
    my $best_feature_entropy = $min_entropy_for_best_feature;
    my @val_based_entropies_to_be_returned;
    my $decision_val_to_be_returned;
    if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} && 
          $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} > 
                                    $self->{_symbolic_to_numeric_cardinality_threshold}) {
        @val_based_entropies_to_be_returned = 
            @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
    } else {
        @val_based_entropies_to_be_returned = ();
    }
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
    } else {
        $decision_val_to_be_returned = undef;
    }
    print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
        "@val_based_entropies_to_be_returned\n"  if $self->{_debug3};
    return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned, 
                                                                      $decision_val_to_be_returned);
}

#########################################    Entropy Calculators     #####################################

sub class_entropy_on_priors {
    my $self = shift;
    return $self->{_entropy_cache}->{'priors'} 
        if exists $self->{_entropy_cache}->{"priors"};
    my @class_names = @{$self->{_class_names}};
    my $entropy;
    foreach my $class (@class_names) {
        my $prob = $self->prior_probability_for_class($class);
        my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
        $log_prob = 0 if $prob < 0.0001;           # since X.log(X)->0 as X->0
        $log_prob = 0 if $prob > 0.999;            # since log(1) = 0
        if (!defined $entropy) {
            $entropy = -1.0 * $prob * $log_prob; 
            next;
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    $self->{_entropy_cache}->{'priors'} = $entropy;
    return $entropy;
}

sub entropy_scanner_for_a_numeric_feature {
    local $| = 1;
    my $self = shift;
    my $feature = shift;
    my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
    my @entropies_for_less_than_thresholds = ();
    my @entropies_for_greater_than_thresholds = ();
    foreach my $point (@all_sampling_points) {
        print ". ";
        push @entropies_for_less_than_thresholds, 
                         $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
        push @entropies_for_greater_than_thresholds,
                      $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
    }
    print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ". 
                                                                "@entropies_for_less_than_thresholds\n";
    print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ". 
                                                             "@entropies_for_greater_than_thresholds\n";
}   

sub class_entropy_for_less_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '<' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence}  if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds = 
                                       @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
    my $entropy = 0;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        push @data_samples_for_class, $sample_name 
                  if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
    }
    my @actual_feature_values_for_samples_in_class = ();
    foreach my $sample (@data_samples_for_class) {
        foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
            my $pattern = '(.+)=(.+)';
            $feature_and_value =~ /$pattern/;
            my ($feature,$value) = ($1,$2);
            push @actual_feature_values_for_samples_in_class, $value
                                    if $feature eq $feature_name && $value ne 'NA';
        }
    }
    my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
    # The condition in the assignment that follows was a bug correction in Version 3.20
    my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
    $self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
    return $probability;
}

# This method requires that all truly numeric types only be expressed as '<' or '>'
# constructs in the array of branch features and thresholds
sub probability_of_a_sequence_of_features_and_values_or_thresholds {
    my $self = shift;
    my $arr = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    return if scalar @array_of_features_and_values_or_thresholds == 0;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
    my $probability = undef;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @true_numeric_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@array_of_features_and_values_or_thresholds) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } else {
            push @symbolic_types, $item;
            $item =~ /$pattern1/;
            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POS: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    foreach my $feature_name (@true_numeric_types_feature_names) {
        if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) && 
                          $upperbound{$feature_name} <= $lowerbound{$feature_name}) { 
            return 0;
        } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability = $self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                 $upperbound{$feature_name}) -
                   $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
            } else {
                $probability *= ($self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                   $upperbound{$feature_name}) -
                 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
            }
        } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
            if (! $probability) {
                $probability = $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                 $upperbound{$feature_name});
            } else {
                $probability *= $self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                  $upperbound{$feature_name});
            }
        } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                 $lowerbound{$feature_name});
            } else {
                $probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                $lowerbound{$feature_name}));
            }
        } else {
            die("Ill formatted call to 'probability_of_sequence' method");
        }
    }
    foreach my $feature_and_value (@symbolic_types) {
        if ($feature_and_value =~ /$pattern1/) {
            my ($feature,$value) = ($1,$2);
            if (! $probability) {        
                $probability = $self->probability_of_feature_value($feature, $value);
            } else {
                $probability *= $self->probability_of_feature_value($feature, $value);
            }
        }
    }
    $self->{_probability_cache}->{$sequence} = $probability;
    return $probability;
}

##  The following method requires that all truly numeric types only be expressed as
##  '<' or '>' constructs in the array of branch features and thresholds
sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
    my $self = shift;
    my $arr = shift;
    my $class_name = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    return if scalar @array_of_features_and_values_or_thresholds == 0;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    my $sequence_with_class = "$sequence" . "::" . $class_name;
    return $self->{_probability_cache}->{$sequence_with_class} 
                      if exists $self->{_probability_cache}->{$sequence_with_class};
    my $probability = undef;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @true_numeric_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@array_of_features_and_values_or_thresholds) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } else {
            push @symbolic_types, $item;
            $item =~ /$pattern1/;
            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POSC: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    foreach my $feature_name (@true_numeric_types_feature_names) {
        if ($lowerbound{$feature_name} && $upperbound{$feature_name} && 
                          $upperbound{$feature_name} <= $lowerbound{$feature_name}) { 
            return 0;
        } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
            if (! $probability) {

                $probability =   $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $upperbound{$feature_name}, $class_name) -
                                 $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $lowerbound{$feature_name}, $class_name);
            } else {
                $probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $upperbound{$feature_name}, $class_name) -
                                 $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $lowerbound{$feature_name}, $class_name))
            }
        } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
            if (! $probability) {
                $probability =   $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                               $upperbound{$feature_name}, $class_name);
            } else {
                $probability *=  $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $upperbound{$feature_name}, $class_name);
            }
        } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability =   1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                               $lowerbound{$feature_name}, $class_name);
            } else {
                $probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                               $lowerbound{$feature_name}, $class_name));
            }
        } else {
            die("Ill formatted call to 'probability of sequence given class' method");
        }
    }
    foreach my $feature_and_value (@symbolic_types) {
        if ($feature_and_value =~ /$pattern1/) {
            my ($feature,$value) = ($1,$2);
            if (! $probability) {        
                $probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
            } else {
                $probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
            }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub print_array_with_msg {
    my $message = shift;
    my $arr = shift;
    print "\n$message: ";
    print_nested_array( $arr );
}

sub print_nested_array {
    my $arr = shift;
    my @arr = @$arr;
    print "[";
    foreach my $item (@arr) {
        if (ref $item) {
            print_nested_array($item);
        } else {
            print "$item";
        }
    }
    print "]";
}    

sub cleanup_csv {
    my $line = shift;
    $line =~ tr/\/:?()[]{}'/          /;
#    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
    for (@double_quoted) {
        my $item = $_;
        $item = substr($item, 1, -1);
        $item =~ s/^\s+|,|\s+$//g;
        $item = join '_',  split /\s+/, $item;
        substr($line, index($line, $_), length($_)) = $item;
    }
    my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
    for (@white_spaced) {
        my $item = $_;
        $item =~ s/\s+/_/g;
        $item =~ s/^\s*_|_\s*$//g;
        substr($line, index($line, $_), length($_)) = $item;
    }
    $line =~ s/,\s*(?=,|$)/,NA/g;
    return $line;
}

######################################### Class EvalTrainingData  ########################################

##  This subclass of the DecisionTree class is used to evaluate the quality of your
##  training data by running a 10-fold cross-validation test on it. This test divides
##  all of the training data into ten parts, with nine parts used for training a
##  decision tree and one part used for testing its ability to classify correctly.
##  This selection of nine parts for training and one part for testing is carried out
##  in all of the ten different possible ways.  This testing functionality can also
##  be used to find the best values to use for the constructor parameters
##  entropy_threshold, max_depth_desired, and
##  symbolic_to_numeric_cardinality_threshold.

##  Only the CSV training files can be evaluated in this manner (because only CSV
##  training are allowed to have numeric features --- which is the more interesting
##  case for evaluation analytics.

package EvalTrainingData;

@EvalTrainingData::ISA = ('Algorithm::DecisionTree');

sub new {
    my $class = shift;
    my $instance = Algorithm::DecisionTree->new(@_);
    bless $instance, $class;
}

sub evaluate_training_data {
    my $self = shift;
    my $evaldebug = 0;
    die "The data evaluation function in the module can only be used when your " .
        "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
    print "\nWill run a 10-fold cross-validation test on your training data to test its " .
          "class-discriminatory power:\n";
    my %all_training_data = %{$self->{_training_data_hash}};
    my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=> 
                                     Algorithm::DecisionTree::sample_index($b)}  keys %all_training_data;
    my $fold_size = int(0.1 * (scalar keys %all_training_data));
    print "fold size: $fold_size\n";
    my %confusion_matrix = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        foreach my $inner_class_name (@{$self->{_class_names}}) {
            $confusion_matrix{$class_name}->{$inner_class_name} = 0;
        }
    }
    foreach my $fold_index (0..9) {
        print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n"; 
        my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
        my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],  
                     @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
        my %testing_data = ();
        foreach my $x (@testing_samples) {
            $testing_data{$x} = $all_training_data{$x};
        }
        my %training_data = ();
        foreach my $x (@training_samples) {
            $training_data{$x} = $all_training_data{$x};
        }
        my $trainingDT = Algorithm::DecisionTree->new('evalmode');
        $trainingDT->{_training_data_hash} = \%training_data;
        $trainingDT->{_class_names} = $self->{_class_names};
        $trainingDT->{_feature_names} = $self->{_feature_names};
        $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
        $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
        $trainingDT->{_symbolic_to_numeric_cardinality_threshold} = 
                                                $self->{_symbolic_to_numeric_cardinality_threshold};
        foreach my $sample_name (@training_samples) {
            $trainingDT->{_samples_class_label_hash}->{$sample_name} = 
                                                $self->{_samples_class_label_hash}->{$sample_name};
        }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

of the data.  The 'examples' directory of this version includes additional scripts
that illustrate these enhancements to the introspection capability.  See the section
"The Introspection API" for a declaration of the introspection related methods, old
and new.

B<Version 2.30:> In response to requests from several users, this version includes a new
capability: You can now ask the module to introspect about the classification
decisions returned by the decision tree.  Toward that end, the module includes a new
class named C<DTIntrospection>.  Perhaps the most important bit of information you
are likely to seek through DT introspection is the list of the training samples that
fall directly in the portion of the feature space that is assigned to a node.
B<CAVEAT:> When training samples are non-uniformly distributed in the underlying
feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE.  B<(This is an important
part of the generalization achieved by probabilistic modeling of the training data.)>
For additional information related to DT introspection, see the section titled
"DECISION TREE INTROSPECTION" in this documentation page.

B<Version 2.26> fixes a bug in the part of the module that some folks use for generating
synthetic data for experimenting with decision tree construction and classification.
In the class C<TrainingDataGeneratorNumeric> that is a part of the module, there
was a problem with the order in which the features were recorded from the
user-supplied parameter file.  The basic code for decision tree construction and
classification remains unchanged.

B<Version 2.25> further downshifts the required version of Perl for this module.  This
was a result of testing the module with Version 5.10.1 of Perl.  Only one statement
in the module code needed to be changed for the module to work with the older version
of Perl.

B<Version 2.24> fixes the C<Makefile.PL> restriction on the required Perl version.  This
version should work with Perl versions 5.14.0 and higher.

B<Version 2.23> changes the required version of Perl from 5.18.0 to 5.14.0.  Everything
else remains the same.

B<Version 2.22> should prove more robust when the probability distribution for the
values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
observations can occur with significant probabilities.  A new option in the
DecisionTree constructor lets the user specify the precision with which the
probability distributions are estimated for such features.

B<Version 2.21> fixes a bug that was caused by the explicitly set zero values for
numerical features being misconstrued as "false" in the conditional statements in
some of the method definitions.

B<Version 2.2> makes it easier to write code for classifying in one go all of your test
data samples in a CSV file.  The bulk classifications obtained can be written out to
either a CSV file or to a regular text file.  See the script
C<classify_test_data_in_a_file_numeric.pl> in the C<Examples> directory for how to
classify all of your test data records in a CSV file.  This version also includes
improved code for generating synthetic numeric/symbolic training and test data
records for experimenting with the decision tree classifier.

B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
cross-validation test on the data.  This test divides all of the training data into
ten parts, with nine parts used for training a decision tree and one part used for
testing its ability to classify correctly. This selection of nine parts for training
and one part for testing is carried out in all of the ten different ways that are
possible.  This testing functionality in Version 2.1 can also be used to find the
best values to use for the constructor parameters C<entropy_threshold>,
C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.

B<Version 2.0 is a major rewrite of this module.> Now you can use both numeric and
symbolic features for constructing a decision tree. A feature is numeric if it can
take any floating-point value over an interval.

B<Version 1.71> fixes a bug in the code that was triggered by 0 being declared as one of
the features values in the training datafile. Version 1.71 also include an additional
safety feature that is useful for training datafiles that contain a very large number
of features.  The new version makes sure that the number of values you declare for
each sample record matches the number of features declared at the beginning of the
training datafile.

B<Version 1.7> includes safety checks on the consistency of the data you place in your
training datafile. When a training file contains thousands of samples, it is
difficult to manually check that you used the same class names in your sample records
that you declared at top of your training file or that the values you have for your
features are legal vis-a-vis the earlier declarations of the values in the training
file.  Another safety feature incorporated in this version is the non-consideration
of classes that are declared at the top of the training file but that have no sample
records in the file.

B<Version 1.6> uses probability caching much more extensively compared to the previous
versions.  This should result in faster construction of large decision trees.
Another new feature in Version 1.6 is the use of a decision tree for interactive
classification. In this mode, after you have constructed a decision tree from the
training data, the user is prompted for answers to the questions pertaining to the
feature tests at the nodes of the tree.

Some key elements of the documentation were cleaned up and made more readable in
B<Version 1.41>.  The implementation code remains unchanged from Version 1.4.

B<Version 1.4> should make things faster (and easier) for folks who want to use this
module with training data that creates very large decision trees (that is, trees with
tens of thousands or more decision nodes).  The speedup in Version 1.4 has been
achieved by eliminating duplicate calculation of probabilities as the tree grows.  In
addition, this version provides an additional constructor parameter,
C<max_depth_desired> for controlling the size of the decision tree.  This is in
addition to the tree size control achieved by the parameter C<entropy_threshold> that
was introduced in Version 1.3.  Since large decision trees can take a long time to
create, you may find yourself wishing you could store the tree you just created in a
disk file and that, subsequently, you could use the stored tree for classification
work.  The C<Examples> directory contains two scripts, C<store_dt_on_disk.pl> and
C<classify_from_disk_stored_dt.pl>, that show how you can do exactly that with the
help of Perl's C<Storable> module.

B<Version 1.3> addresses the issue that arises when the header of a training datafile
declares a certain possible value for a feature but that (feature,value) pair does
NOT show up anywhere in the training data.  Version 1.3 also makes it possible for a
user to control the size of the decision tree by changing the value of the parameter
C<entropy_threshold.> Additionally, Version 1.3 includes a method called
C<determine_data_condition()> that displays useful information regarding the size and
some other attributes of the training data.  It also warns the user if the training
data might result in a decision tree that would simply be much too large --- unless
the user controls the size with the entropy_threshold parameter.

In addition to the removal of a couple of serious bugs, B<version 1.2> incorporates a
number of enhancements: (1) Version 1.2 includes checks on the names of the features
and values used in test data --- this is the data you want to classify with the
decision tree classifier constructed by this module.  (2) Version 1.2 includes a
separate constructor for generating test data.  To make it easier to generate test
data whose probabilistic parameters may not be identical to that used for the
training data, I have used separate routines for generating the test data.  (3)
Version 1.2 also includes in its examples directory a script that classifies the test
data in a file and outputs the class labels into another file.  This is for folks who
do not wish to write their own scripts using this module. (4) Version 1.2 also
includes addition to the documentation regarding the issue of numeric values for
features.

=head1 DESCRIPTION

B<Algorithm::DecisionTree> is a I<perl5> module for constructing a decision tree from
a training datafile containing multidimensional data.  In one form or another,
decision trees have been around for about fifty years.  From a statistical
perspective, they are closely related to classification and regression by recursive
partitioning of multidimensional data.  Early work that demonstrated the usefulness
of such partitioning of data for classification and regression can be traced to the
work of Terry Therneau in the early 1980's in the statistics community, and to the
work of Ross Quinlan in the mid 1990's in the machine learning community.

For those not familiar with decision tree ideas, the traditional way to classify
multidimensional data is to start with a feature space whose dimensionality is the
same as that of the data.  Each feature in this space corresponds to the attribute
that each dimension of the data measures.  You then use the training data to carve up
the feature space into different regions, each corresponding to a different class.
Subsequently, when you try to classify a new data sample, you locate it in the
feature space and find the class label of the region to which it belongs.  One can
also give the new data point the same class label as that of the nearest training
sample. This is referred to as the nearest neighbor classification.  There exist
hundreds of variations of varying power on these two basic approaches to the
classification of multidimensional data.

A decision tree classifier works differently.  When you construct a decision tree,
you select for the root node a feature test that partitions the training data in a
way that causes maximal disambiguation of the class labels associated with the data.
In terms of information content as measured by entropy, such a feature test would
cause maximum reduction in class entropy in going from all of the training data taken
together to the data as partitioned by the feature test.  You then drop from the root
node a set of child nodes, one for each partition of the training data created by the
feature test at the root node. When your features are purely symbolic, you'll have
one child node for each value of the feature chosen for the feature test at the root.
When the test at the root involves a numeric feature, you find the decision threshold
for the feature that best bipartitions the data and you drop from the root node two
child nodes, one for each partition.  Now at each child node you pose the same
question that you posed when you found the best feature to use at the root: Which
feature at the child node in question would maximally disambiguate the class labels
associated with the training data corresponding to that child node?

As the reader would expect, the two key steps in any approach to decision-tree based
classification are the construction of the decision tree itself from a file
containing the training data, and then using the decision tree thus obtained for
classifying new data.

What is cool about decision tree classification is that it gives you soft
classification, meaning it may associate more than one class label with a given data
vector.  When this happens, it may mean that your classes are indeed overlapping in
the underlying feature space.  It could also mean that you simply have not supplied
sufficient training data to the decision tree classifier.  For a tutorial
introduction to how a decision tree is constructed and used, visit
L<https://engineering.purdue.edu/kak/Tutorials/DecisionTreeClassifiers.pdf>

This module also allows you to generate your own synthetic training and test
data. Generating your own training data, using it for constructing a decision-tree
classifier, and subsequently testing the classifier on a synthetically generated
test set of data is a good way to develop greater proficiency with decision trees.


=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE

If you are new to the concept of a decision tree, their practical utility is best
understood with an example that only involves symbolic features. However, as
mentioned earlier, versions of the module higher than 2.0 allow you to use both
symbolic and numeric features.

Consider the following scenario: Let's say you are running a small investment company
that employs a team of stockbrokers who make buy/sell decisions for the customers of
your company.  Assume that your company has asked the traders to make each investment
decision on the basis of the following four criteria:

  price_to_earnings_ratio   (P_to_E)

  price_to_sales_ratio      (P_to_S)

  return_on_equity          (R_on_E)

  market_share              (MS)

Since you are the boss, you keep track of the buy/sell decisions made by the
individual traders.  But one unfortunate day, all of your traders decide to quit
because you did not pay them enough.  So what do you do?  If you had a module like
the one here, you could still run your company and do so in such a way that, on the
average, would do better than any of the individual traders who worked for your
company.  This is what you do: You pool together the individual trader buy/sell
decisions you have accumulated during the last one year.  This pooled information is
likely to look like:


  example      buy/sell     P_to_E     P_to_S     R_on_E      MS
  ============================================================+=

  example_1     buy          high       low        medium    low
  example_2     buy          medium     medium     low       low
  example_3     sell         low        medium     low       high
  ....
  ....

This data, when formatted according to CSV, would constitute your training file. You
could feed this file into the module by calling:

    my $dt = Algorithm::DecisionTree->new( 
                     training_datafile => $training_datafile,
                     csv_class_column_index => 1,
                     csv_columns_for_features => [2,3,4,5],
             );
    $dt->get_training_data(); 
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();

Subsequently, you would construct a decision tree by calling:

    my $root_node = $dt->construct_decision_tree_classifier();

Now you and your company (with practically no employees) are ready to service the
customers again. Suppose your computer needs to make a buy/sell decision about an
investment prospect that is best described by:

    price_to_earnings_ratio  =  low
    price_to_sales_ratio     =  very_low
    return_on_equity         =  none
    market_share             =  medium    

All that your computer would need to do would be to construct a data vector like

   my @data =   qw / P_to_E=low
                     P_to_S=very_low
                     R_on_E=none
                     MS=medium /;

and call the decision tree classifier you just constructed by

    $dt->classify($root_node, \@data); 

The answer returned will be 'buy' and 'sell', along with the associated
probabilities.  So if the probability of 'buy' is considerably greater than the
probability of 'sell', that's what you should instruct your computer to do.

The chances are that, on the average, this approach would beat the performance of any
of your individual traders who worked for you previously since the buy/sell decisions
made by the computer would be based on the collective wisdom of all your previous
traders.  B<DISCLAIMER: There is obviously a lot more to good investing than what is
captured by the silly little example here. However, it does nicely the convey the
sense in which the current module could be used.>

=head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES

A feature is symbolic when its values are compared using string comparison operators.
By the same token, a feature is numeric when its values are compared using numeric
comparison operators.  Having said that, features that take only a small number of
numeric values in the training data can be treated symbolically provided you are
careful about handling their values in the test data.  At the least, you have to set
the test data value for such a feature to its closest value in the training data.
The module does that automatically for you for those numeric features for which the
number different numeric values is less than a user-specified threshold.  For those
numeric features that the module is allowed to treat symbolically, this snapping of
the values of the features in the test data to the small set of values in the training
data is carried out automatically by the module.  That is, after a user has told the
module which numeric features to treat symbolically, the user need not worry about
how the feature values appear in the test data.

The constructor parameter C<symbolic_to_numeric_cardinality_threshold> let's you tell
the module when to consider an otherwise numeric feature symbolically. Suppose you
set this parameter to 10, that means that all numeric looking features that take 10
or fewer different values in the training datafile will be considered to be symbolic
features by the module.  See the tutorial at
L<https://engineering.purdue.edu/kak/Tutorials/DecisionTreeClassifiers.pdf> for
further information on the implementation issues related to the symbolic and numeric
features.

=head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES

For the purpose of estimating the probabilities, it is necessary to sample the range
of values taken on by a numerical feature. For features with "nice" statistical
properties, this sampling interval is set to the median of the differences between
the successive feature values in the training data.  (Obviously, as you would expect,
you first sort all the values for a feature before computing the successive
differences.)  This logic will not work for the sort of a feature described below.

Consider a feature whose values are heavy-tailed, and, at the same time, the values
span a million to one range.  What I mean by heavy-tailed is that rare values can
occur with significant probabilities.  It could happen that most of the values for
such a feature are clustered at one of the two ends of the range. At the same time,
there may exist a significant number of values near the end of the range that is less
populated.  (Typically, features related to human economic activities --- such as
wealth, incomes, etc. --- are of this type.)  With the logic described in the
previous paragraph, you could end up with a sampling interval that is much too small,
which could result in millions of sampling points for the feature if you are not
careful.

Beginning with Version 2.22, you have two options in dealing with such features.  You
can choose to go with the default behavior of the module, which is to sample the
value range for such a feature over a maximum of 500 points.  Or, you can supply an
additional option to the constructor that sets a user-defined value for the number of
points to use.  The name of the option is C<number_of_histogram_bins>.  The following
script 

    construct_dt_for_heavytailed.pl 

in the C<Examples> directory shows an example of how to call the constructor of the
module with the C<number_of_histogram_bins> option.


=head1 TESTING THE QUALITY OF YOUR TRAINING DATA

Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
training data to test its ability to discriminate between the classes mentioned in
the training file.

The 10-fold cross-validation test divides all of the training data into ten parts,
with nine parts used for training a decision tree and one part used for testing its
ability to classify correctly. This selection of nine parts for training and one part
for testing is carried out in all of the ten different possible ways.

The following code fragment illustrates how you invoke the testing function of the
EvalTrainingData class:

    my $training_datafile = "training.csv";                                         
    my $eval_data = EvalTrainingData->new(
                                  training_datafile => $training_datafile,
                                  csv_class_column_index => 1,
                                  csv_columns_for_features => [2,3],
                                  entropy_threshold => 0.01,
                                  max_depth_desired => 3,
                                  symbolic_to_numeric_cardinality_threshold => 10,
                                  csv_cleanup_needed => 1,
                    );
    $eval_data->get_training_data();
    $eval_data->evaluate_training_data()

The last statement above prints out a Confusion Matrix and the value of Training Data
Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
cross-validation test.

This testing functionality can also be used to find the best values to use for the
constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.

The following two scripts in the C<Examples> directory illustrate the use of the
C<EvalTrainingData> class for testing the quality of your data:

    evaluate_training_data1.pl
    evaluate_training_data2.pl


=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS

Assuming your training data is good, the quality of the results you get from a
decision tree would depend on the choices you make for the constructor parameters
C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.  You can optimize your choices for
these parameters by running the 10-fold cross-validation test that is made available
in Versions 2.2 and higher through the new class C<EvalTrainingData> that is included
in the module file.  A description of how to run this test is in the previous section
of this document.


=head1 DECISION TREE INTROSPECTION

Starting with Version 2.30, you can ask the C<DTIntrospection> class of the module to
explain the classification decisions made at the different nodes of the decision
tree.

Perhaps the most important bit of information you are likely to seek through DT
introspection is the list of the training samples that fall directly in the portion
of the feature space that is assigned to a node.

However, note that, when training samples are non-uniformly distributed in the
underlying feature space, it is possible for a node to exist even when there are no
training samples in the portion of the feature space assigned to the node.  That is
because the decision tree is constructed from the probability densities estimated
from the training data.  When the training samples are non-uniformly distributed, it
is entirely possible for the estimated probability densities to be non-zero in a
small region around a point even when there are no training samples specifically in
that region.  (After you have created a statistical model for, say, the height
distribution of people in a community, the model may return a non-zero probability
for the height values in a small interval even if the community does not include a
single individual whose height falls in that interval.)

That a decision-tree node can exist even when there are no training samples in that
portion of the feature space that belongs to the node is an important indication of
the generalization ability of a decision-tree-based classifier.

In light of the explanation provided above, before the DTIntrospection class supplies
any answers at all, it asks you to accept the fact that features can take on non-zero
probabilities at a point in the feature space even though there are zero training
samples at that point (or in a small region around that point).  If you do not accept
this rudimentary fact, the introspection class will not yield any answers (since you
are not going to believe the answers anyway).

The point made above implies that the path leading to a node in the decision tree may
test a feature for a certain value or threshold despite the fact that the portion of
the feature space assigned to that node is devoid of any training data.

See the following three scripts in the Examples directory for how to carry out DT

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN


=over 8

=item C<training_datafile>:

This parameter supplies the name of the file that contains the training data.

=item C<csv_class_column_index>:

When using a CSV file for your training data, this parameter supplies the zero-based
column index for the column that contains the class label for each data record in the
training file.


=item C<csv_cleanup_needed>:

You need to set this parameter to 1 if your CSV file has double quoted strings (which
may include commas) as values for the fields and if such values are allowed to
include commas for, presumably, better readability.

=item C<csv_columns_for_features>:

When using a CSV file for your training data, this parameter supplies a list of
columns corresponding to the features you wish to use for decision tree construction.
Each column is specified by its zero-based index.

=item C<entropy_threshold>:

This parameter sets the granularity with which the entropies are sampled by the
module.  For example, a feature test at a node in the decision tree is acceptable if
the entropy gain achieved by the test exceeds this threshold.  The larger the value
you choose for this parameter, the smaller the tree.  Its default value is 0.001.

=item C<max_depth_desired>:

This parameter sets the maximum depth of the decision tree.  For obvious reasons, the
smaller the value you choose for this parameter, the smaller the tree.

=item C<symbolic_to_numeric_cardinality_threshold>:

This parameter allows the module to treat an otherwise numeric feature symbolically
if the number of different values the feature takes in the training data file does
not exceed the value of this parameter.

=item C<number_of_histogram_bins>:

This parameter gives the user the option to set the number of points at which the
value range for a feature should be sampled for estimating the probabilities.  This
parameter is effective only for those features that occupy a large value range and
whose probability distributions are heavy tailed.  B<This parameter is also important
when you have a very large training dataset:> In general, the larger the dataset, the
smaller the smallest difference between any two values for a numeric feature in
relation to the overall range of values for that feature. In such cases, the module
may use too large a number of bins for estimating the probabilities and that may slow
down the calculation of the decision tree.  You can get around this difficulty by
explicitly giving a value to the 'C<number_of_histogram_bins>' parameter.

=back


You can choose the best values to use for the last three constructor parameters by
running a 10-fold cross-validation test on your training data through the class
C<EvalTrainingData> that comes with Versions 2.1 and higher of this module.  See the
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.

=over

=item B<get_training_data():>

After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
you must now read in the training data that is the file named in the call to the
constructor.  This you do by:

    $dt->get_training_data(); 


=item B<show_training_data():>

If you wish to see the training data that was just digested by the module,
call 

    $dt->show_training_data(); 

=item B<calculate_first_order_probabilities():>

=item B<calculate_class_priors():>

After the module has read the training data file, it needs to initialize the
probability cache.  This you do by invoking:

    $dt->calculate_first_order_probabilities()
    $dt->calculate_class_priors() 

=item B<construct_decision_tree_classifier():>

With the probability cache initialized, it is time to construct a decision tree
classifier.  This you do by

    my $root_node = $dt->construct_decision_tree_classifier();

This call returns an instance of type C<DTNode>.  The C<DTNode> class is defined
within the main package file.  So, don't forget, that C<$root_node> in the above
example call will be instantiated to an object of type C<DTNode>.

=item B<$root_nodeC<< -> >>display_decision_tree(" "):>

    $root_node->display_decision_tree("   ");

This will display the decision tree in your terminal window by using a recursively
determined offset for each node as the display routine descends down the tree.

I have intentionally left the syntax fragment C<$root_node> in the above call to
remind the reader that C<display_decision_tree()> is NOT called on the instance of
the C<DecisionTree> we constructed earlier, but on the C<DTNode> instance returned by
the call to C<construct_decision_tree_classifier()>.

=item B<classify($root_node, \@test_sample):>

Let's say you want to classify the following data record:

    my @test_sample  = qw /  g2=4.2

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

If you want to output a tabular display that shows for each node in the decision tree
all the training samples that fall in the portion of the feature space that belongs
to that node, call

    $introspector->display_training_samples_at_all_nodes_direct_influence_only();

If you want to output a tabular display that shows for each training sample a list of
all the nodes that are affected directly AND indirectly by that sample, call

    $introspector->display_training_training_samples_to_nodes_influence_propagation();

A training sample affects a node directly if the sample falls in the portion of the
features space assigned to that node. On the other hand, a training sample is
considered to affect a node indirectly if the node is a descendant of a node that is
affected directly by the sample.


=head1 BULK CLASSIFICATION OF DATA RECORDS

For large test datasets, you would obviously want to process an entire file of test
data at a time. The following scripts in the C<Examples> directory illustrate how you
can do that:

      classify_test_data_in_a_file.pl

This script requires three command-line arguments, the first argument names the
training datafile, the second the test datafile, and the third the file in which the
classification results are to be deposited.  

The other examples directories, C<ExamplesBagging>, C<ExamplesBoosting>, and
C<ExamplesRandomizedTrees>, also contain scripts that illustrate how to carry out
bulk classification of data records when you wish to take advantage of bagging,
boosting, or tree randomization.  In their respective directories, these scripts are
named:

    bagging_for_bulk_classification.pl
    boosting_for_bulk_classification.pl
    classify_database_records.pl


=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED

It depends on whether you apply the classifier at once to all the data samples in a
file, or whether you feed one data sample at a time into the classifier.

In general, the classifier returns soft classification for a test data vector.  What
that means is that, in general, the classifier will list all the classes to which a
given data vector could belong and the probability of each such class label for the
data vector. Run the examples scripts in the Examples directory to see how the output
of classification can be displayed.

With regard to the soft classifications returned by this classifier, if the
probability distributions for the different classes overlap in the underlying feature
space, you would want the classifier to return all of the applicable class labels for
a data vector along with the corresponding class probabilities.  Another reason for
why the decision tree classifier may associate significant probabilities with
multiple class labels is that you used inadequate number of training samples to
induce the decision tree.  The good thing is that the classifier does not lie to you
(unlike, say, a hard classification rule that would return a single class label
corresponding to the partitioning of the underlying feature space).  The decision
tree classifier give you the best classification that can be made given the training
data you fed into it.


=head1 USING BAGGING

Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
comes with the module to incorporate bagging in your decision tree based
classification.  Bagging means constructing multiple decision trees for different
(possibly overlapping) segments of the data extracted from your training dataset and
then aggregating the decisions made by the individual decision trees for the final
classification.  The aggregation of the classification decisions can average out the
noise and bias that may otherwise affect the classification decision obtained from
just one tree.

=over 4

=item B<Calling the bagging constructor::>

A typical call to the constructor for the C<DecisionTreeWithBagging> class looks
like:

    use Algorithm::DecisionTreeWithBagging;
    
    my $training_datafile = "stage3cancer.csv";
    
    my $dtbag = Algorithm::DecisionTreeWithBagging->new(
                                  training_datafile => $training_datafile,
                                  csv_class_column_index => 2,
                                  csv_columns_for_features => [3,4,5,6,7,8],
                                  entropy_threshold => 0.01,
                                  max_depth_desired => 8,
                                  symbolic_to_numeric_cardinality_threshold => 10,
                                  how_many_bags => 4,
                                  bag_overlap_fraction => 0.2,
                                  csv_cleanup_needed => 1,
                 );
    
Note in particular the following two constructor parameters:
                                                                                               
    how_many_bags

    bag_overlap_fraction

where, as the name implies, the parameter C<how_many_bags> controls how many bags
(and, therefore, how many decision trees) will be constructed from your training
dataset; and where the parameter C<bag_overlap_fraction> controls the degree of
overlap between the bags.  To understand what exactly is achieved by setting the
parameter C<bag_overlap_fraction> to 0.2 in the above example, let's say that the
non-overlapping partitioning of the training data between the bags results in 100
training samples per bag. With bag_overlap_fraction set to 0.2, additional 20 samples
drawn randomly from the other bags will be added to the data in each bag.

=back

=head2 B<Methods defined for C<DecisionTreeWithBagging> class>

=over 8

=item B<get_training_data_for_bagging():>

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

can inject continuities between the predictor and the predicted variables that may
not really exist in the real data.  Regression trees, on the other hand, give you a
piecewise linear relationship between the predictor and the predicted variables that
is freed from the constraints of superimposed continuities at the joins between the
different segments.  See the following tutorial for further information regarding the
standard linear regression approach and the regression that can be achieved with the
RegressionTree class in this module:
L<https://engineering.purdue.edu/kak/Tutorials/RegressionTree.pdf>

The RegressionTree class in the current version of the module assumes that all of
your data is numerical.  That is, unlike what is possible with the DecisionTree class
(and the other more closely related classes in this module) that allow your training
file to contain a mixture of numerical and symbolic data, the RegressionTree class
requires that ALL of your data be numerical.  I hope to relax this constraint in
future versions of this module.  Obviously, the dependent variable will always be
numerical for regression.

See the example scripts in the directory C<ExamplesRegression> if you wish to become
more familiar with the regression capabilities of the module.

=over 4

=item B<Calling the RegressionTree constructor:>

    my $training_datafile = "gendata5.csv";
    my $rt = Algorithm::RegressionTree->new(
                              training_datafile => $training_datafile,
                              dependent_variable_column => 2,
                              predictor_columns => [1],
                              mse_threshold => 0.01,
                              max_depth_desired => 2,
                              jacobian_choice => 0,
                              csv_cleanup_needed => 1,
             );

Note in particular the constructor parameters:

    dependent_variable
    predictor_columns
    mse_threshold
    jacobian_choice

The first of these parameters, C<dependent_variable>, is set to the column index in
the CSV file for the dependent variable.  The second constructor parameter,
C<predictor_columns>, tells the system as to which columns contain values for the
predictor variables.  The third parameter, C<mse_threshold>, is for deciding when to
partition the data at a node into two child nodes as a regression tree is being
constructed.  If the minmax of MSE (Mean Squared Error) that can be achieved by
partitioning any of the features at a node is smaller than C<mse_threshold>, that
node becomes a leaf node of the regression tree.

The last parameter, C<jacobian_choice>, must be set to either 0 or 1 or 2.  Its
default value is 0. When this parameter equals 0, the regression coefficients are
calculated using the linear least-squares method and no further "refinement" of the
coefficients is carried out using gradient descent.  This is the fastest way to
calculate the regression coefficients.  When C<jacobian_choice> is set to 1, you get
a weak version of gradient descent in which the Jacobian is set to the "design
matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
approximation to the Jacobian.  That, however, is at a cost of much longer
computation time.  B<NOTE:> For most cases, using 0 for C<jacobian_choice> is the
best choice.  See my tutorial "I<Linear Regression and Regression Trees>" for why
that is the case.

=back

=head2 B<Methods defined for C<RegressionTree> class>

=over 8

=item B<get_training_data_for_regression():>

Only CSV training datafiles are allowed. Additionally, the first record in the file
must list the names of the fields, and the first column must contain an integer ID
for each record.

=item B<construct_regression_tree():>

As the name implies, this is the method that construct a regression tree.

=item B<display_regression_tree("     "):>

Displays the regression tree, as the name implies.  The white-space string argument
specifies the offset to use in displaying the child nodes in relation to a parent
node.

=item B<prediction_for_single_data_point( $root_node, $test_sample ):>

You call this method after you have constructed a regression tree if you want to
calculate the prediction for one sample.  The parameter C<$root_node> is what is
returned by the call C<construct_regression_tree()>.  The formatting of the argument
bound to the C<$test_sample> parameter is important.  To elaborate, let's say you are
using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
this case, the C<$test_sample> parameter will be bound to a list that will look like

    ['xvar1 = 23.4', 'xvar2 = 12.9'] 

Arbitrary amount of white space, including none, on the two sides of the equality
symbol is allowed in the construct shown above.  A call to this method returns a
dictionary with two key-value pairs.  One of the keys is called C<solution_path> and
the other C<prediction>.  The value associated with key C<solution_path> is the path
in the regression tree to the leaf node that yielded the prediction.  And the value
associated with the key C<prediction> is the answer you are looking for.

=item B<predictions_for_all_data_used_for_regression_estimation( $root_node ):>

This call calculates the predictions for all of the predictor variables data in your
training file.  The parameter C<$root_node> is what is returned by the call to
C<construct_regression_tree()>.  The values for the dependent variable thus predicted
can be seen by calling C<display_all_plots()>, which is the method mentioned below.

=item B<display_all_plots():>

This method displays the results obtained by calling the prediction method of the
previous entry.  This method also creates a hardcopy of the plots and saves it as a
C<.png> disk file. The name of this output file is always C<regression_plots.png>.

=item B<mse_for_tree_regression_for_all_training_samples( $root_node ):>

This method carries out an error analysis of the predictions for the samples in your
training datafile.  It shows you the overall MSE (Mean Squared Error) with tree-based
regression, the MSE for the data samples at each of the leaf nodes of the regression



( run in 0.652 second using v1.01-cache-2.11-cpan-ceb78f64989 )