Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

package Algorithm::DecisionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of 
# multidimensional data.
# -------------------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;

our $VERSION = '3.43';

############################################   Constructor  ##############################################
sub new { 
    my ($class, %args, $eval_or_boosting_mode);
    if (@_ % 2 != 0) {
        ($class, %args) = @_;
    } else {
        $class = shift;
        $eval_or_boosting_mode = shift;
        die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
        die "Only one string arg allowed in eval and boosting modes" if @_;
    }
    unless ($eval_or_boosting_mode) {
        my @params = keys %args;
        croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                   =>    $args{training_datafile}, 
        _entropy_threshold                   =>    $args{entropy_threshold} || 0.01,
        _max_depth_desired                   =>    exists $args{max_depth_desired} ? 
                                                                       $args{max_depth_desired} : undef,
        _debug1                              =>    $args{debug1} || 0,
        _debug2                              =>    $args{debug2} || 0,
        _debug3                              =>    $args{debug3} || 0,
        _csv_class_column_index              =>    $args{csv_class_column_index} || undef,
        _csv_columns_for_features            =>    $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold
                                             =>    $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins            =>    $args{number_of_histogram_bins} || undef,
        _csv_cleanup_needed                  =>    $args{csv_cleanup_needed} || 0,
        _training_data                       =>    [],
        _root_node                           =>    undef,
        _probability_cache                   =>    {},
        _entropy_cache                       =>    {},
        _training_data_hash                  =>    {},
        _features_and_values_hash            =>    {},
        _samples_class_label_hash            =>    {},
        _class_names                         =>    [],
        _class_priors                        =>    [],
        _class_priors_hash                   =>    {},
        _feature_names                       =>    [],
        _numeric_features_valuerange_hash    =>    {},
        _sampling_points_for_numeric_feature_hash      =>      {},
        _feature_values_how_many_uniques_hash          =>      {},
        _prob_distribution_numeric_features_hash       =>      {},
        _histogram_delta_hash                          =>      {},
        _num_of_histogram_bins_hash                    =>      {},
    }, $class;

}

####################################  Classify with Decision Tree  #######################################

##  Classifies one test sample at a time using the decision tree constructed from
##  your training file.  The data record for the test sample must be supplied as
##  shown in the scripts in the `examples' subdirectory.  See the scripts
##  construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
sub classify {
    my $self = shift;
    my $root_node = shift;
    my $feature_and_values = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my @features_and_values = @$feature_and_values;
    @features_and_values = @{deep_copy_array(\@features_and_values)};
    die "\n\nError in the names you have used for features and/or values.  " .
        "Try using the csv_cleanup_needed option in the constructor call." 
                        unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        my ($feature, $value) = ($1, $2);
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
                                                                       $not_all_values_float == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %classification = %{$self->recursive_descent_for_classification($root_node, 
                                                                    \@features_and_values,\%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name  with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} =  \@outlist;
        }
    }
    return \%classification_for_display;
}

sub recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $features_and_values = shift;
    my $answer = shift;
    my @features_and_values = @$features_and_values;
    my %answer = %$answer;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n" 
        if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        $value_for_feature = $2 if $feature_tested_at_node eq $1;
    }
    # The following clause introduced in Version 3.20
    if (!defined $value_for_feature) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    if ($value_for_feature) {
        if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
            print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
            my $pattern1 = '(.+)<(.+)';
            my $pattern2 = '(.+)>(.+)';
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1]; 
                if ($last_feature_and_value_on_branch =~ /$pattern1/) {
                    my ($feature, $threshold) = ($1,$2); 
                    if ($value_for_feature <= $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                                             $features_and_values,\%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
                if ($last_feature_and_value_on_branch =~ /$pattern2/) {
                    my ($feature, $threshold) = ($1,$2); 
                    if ($value_for_feature > $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                                            $features_and_values,\%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
            }
            return \%answer if $path_found;
        } else {
            my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
            print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n" 
                if $self->{_debug3};
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1]; 
                if ($last_feature_and_value_on_branch eq $feature_value_combo) {
                    %answer = %{$self->recursive_descent_for_classification($child,
                                                                              $features_and_values,\%answer)};
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    $path_found = 1;
                    last;
                }
            }
            return \%answer if $path_found;
        }
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

##  If you want classification to be carried out by engaging a human user in a
##  question-answer session, this is the method to use for that purpose.  See, for
##  example, the script classify_by_asking_questions.pl in the `examples'
##  subdirectory for an illustration of how to do that.
sub classify_by_asking_questions {
    my $self = shift;
    my $root_node = shift;
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %scratchpad_for_numeric_answers = ();
    foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
        $scratchpad_for_numeric_answers{$feature_name} = undef;
    }
    my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
                                                       \%answer, \%scratchpad_for_numeric_answers)};
    @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} =  \@outlist;
        }
    }
    return \%classification_for_display;
}

sub interactive_recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $answer = shift;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            if ($branch_attribute =~ /$pattern2/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature > $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                     \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
        }
        return \%answer if $path_found;
    } else {
        my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
        while (1) {
            print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
            print "\nEnter a value from the list (@possible_values_for_feature): ";
            $user_value_for_feature = <STDIN>;
            $user_value_for_feature =~ s/\r?\n?$//;
            $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
            my $answer_found = 0;
            if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
                $answer_found = 1;
                last;
            }
            last if $answer_found;
            print("You entered illegal value. Let's try again");
        }
        $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute eq $feature_value_combo) {
                %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                     \%answer, \%scratchpad_for_numerics)};
                $path_found = 1;
                push @{$answer{'solution_path'}}, $node->get_serial_num();
                last;
            }
        }
        return \%answer if $path_found;
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

######################################    Decision Tree Construction  ####################################

##  At the root node, we find the best feature that yields the greatest reduction in
##  class entropy from the entropy based on just the class priors. The logic for
##  finding this feature is different for symbolic features and for numeric features.
##  That logic is built into the method shown later for best feature calculations.
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {        
        $self->determine_data_condition(); 
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) { 
        print "\nPrior class probabilities: @class_probabilities\n";
        print "\nClass names: @{$self->{_class_names}}\n";
    }
    my $entropy = $self->class_entropy_on_priors();
    print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
    my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
    $root_node->set_class_names(\@{$self->{_class_names}});
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node);
    return $root_node;
}

##  After the root node of the decision tree is calculated by the previous methods,
##  we invoke this method recursively to create the rest of the tree.  At each node,
##  we find the feature that achieves the largest entropy reduction with regard to
##  the partitioning of the training data samples that correspond to that node.
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
        if $self->{_debug3};
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my $existing_node_entropy = $node->get_node_entropy();
    if ($self->{_debug3}) { 
        print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
        print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
        print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
        my @class_probs = @{$node->get_class_probabilities()};
        print "\nRD4 Class probabilities: @class_probs\n";
    }
    if ($existing_node_entropy < $self->{_entropy_threshold}) { 
        print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
        return;
    }
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
                    $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug3};
    if (defined($self->{_max_depth_desired}) && 
               (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3}; 
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) { 
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} && 
              $self->{_feature_values_how_many_uniques_hash}->{$best_feature} > 
                                        $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;            # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
              \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                                                         \@class_probabilities_for_lessthan_child_node,
                              \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                                                         \@class_probabilities_for_greaterthan_child_node,
                            \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds = 
                                                          ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds = 
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds, 
                                           $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                   map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                               \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child = 
                      $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                                                         \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n"; 
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                          \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n" 
            if $self->{_debug3};
        return;
    }
}

##  This is the heart of the decision tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples that
##  correspond to the current node.  The search for the best feature is carried out
##  differently for symbolic features and for numeric features.  For a symbolic
##  feature, the method estimates the entropy for each value of the feature and then
##  averages out these entropies as a measure of the discriminatory power of that
##  features.  For a numeric feature, on the other hand, it estimates the entropy
##  reduction that can be achieved if were to partition the set of training samples
##  for each possible threshold.  For a numeric feature, all possible sampling points
##  relevant to the node in question are considered as candidates for thresholds.
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my $existing_node_entropy = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @all_symbolic_features = ();
    foreach my $feature_name (@{$self->{_feature_names}}) {
        push @all_symbolic_features, $feature_name 
            if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
    }
    my @symbolic_features_already_used = ();  
    foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
        push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
    }
    my @symbolic_features_not_yet_used;
    foreach my $x (@all_symbolic_features) {
        push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
    }
    my @true_numeric_types = ();
    my @symbolic_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@features_and_values_or_thresholds_on_branch) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern1/) {
            push @symbolic_types, $item;
            push @symbolic_types_feature_names, $1;
        } else {
            die "format error in the representation of feature and values or thresholds";
        }
    }
    my %seen = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
    %seen = ();
    @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
    my @bounded_intervals_numeric_types = 
                       @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature (@true_numeric_types_feature_names) {
        $upperbound{$feature} = undef;
        $lowerbound{$feature} = undef;
    }
    foreach my $item (@bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    my %entropy_values_for_different_features = ();
    my %partitioning_point_child_entropies_hash = ();
    my %partitioning_point_threshold = ();
    my %entropies_for_different_values_of_symbolic_feature = ();
    foreach my $feature (@{$self->{_feature_names}}) {
        $entropy_values_for_different_features{$feature} = [];
        $partitioning_point_child_entropies_hash{$feature} = {};
        $partitioning_point_threshold{$feature} = undef;
        $entropies_for_different_values_of_symbolic_feature{$feature} = [];
    }
    foreach my $i (0..@{$self->{_feature_names}}-1) {
        my $feature_name = $self->{_feature_names}->[$i];
        print "\n\nBFC1          FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
        if (contained_in($feature_name, @symbolic_features_already_used)) {
            next;
        } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
                 $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                      $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};      
            my @newvalues = ();
            if (contained_in($feature_name, @true_numeric_types_feature_names)) {
                if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                              $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
                    next;
                } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                                    $lowerbound{$feature_name} < $upperbound{$feature_name}) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($upperbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($lowerbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name};
                    }
                } else {
                    die "Error is bound specifications in best feature calculator";
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues == 0;
            my @partitioning_entropies = ();            
            foreach my $value (@newvalues) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
                                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
                                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $partitioning_entropy = $entropy1 * 
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
                                           $entropy2 *
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);

                push @partitioning_entropies, $partitioning_entropy;
                $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
            }
            my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
            if ($min_entropy < $existing_node_entropy) {
                $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
                $entropy_values_for_different_features{$feature_name} = $min_entropy;
            }
        } else {
            print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};
            print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
            my %seen;
            my @values = grep {$_ ne 'NA' && !$seen{$_}++} 
                                    @{$self->{_features_and_unique_values_hash}->{$feature_name}};
            @values = sort @values;
            print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};

            my $entropy = 0;
            foreach my $value (@values) {
                my $feature_value_string = "$feature_name" . '=' . "$value";
                print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
                my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                if (@features_and_values_or_thresholds_on_branch) {
                    push @extended_attributes, $feature_value_string;
                } else {
                    @extended_attributes = ($feature_value_string);
                }
                $entropy += 
           $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) * 
           $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
                print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
                      "is $entropy\n" if $self->{_debug3};
                push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
            }
            if ($entropy < $existing_node_entropy) {
                $entropy_values_for_different_features{$feature_name} = $entropy;
            }
        }
    }
    my $min_entropy_for_best_feature;
    my $best_feature_name;
    foreach my $feature_nom (keys %entropy_values_for_different_features) { 
        if (!defined($best_feature_name)) {
            $best_feature_name = $feature_nom;
            $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
        } else {
            if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
                $best_feature_name = $feature_nom;
                $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
            }
        }
    }
    my $threshold_for_best_feature;
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
    } else {
        $threshold_for_best_feature = undef;
    }
    my $best_feature_entropy = $min_entropy_for_best_feature;
    my @val_based_entropies_to_be_returned;
    my $decision_val_to_be_returned;
    if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} && 
          $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} > 
                                    $self->{_symbolic_to_numeric_cardinality_threshold}) {
        @val_based_entropies_to_be_returned = 
            @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
    } else {
        @val_based_entropies_to_be_returned = ();
    }
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
    } else {
        $decision_val_to_be_returned = undef;
    }
    print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
        "@val_based_entropies_to_be_returned\n"  if $self->{_debug3};
    return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned, 
                                                                      $decision_val_to_be_returned);
}

#########################################    Entropy Calculators     #####################################

sub class_entropy_on_priors {
    my $self = shift;
    return $self->{_entropy_cache}->{'priors'} 
        if exists $self->{_entropy_cache}->{"priors"};
    my @class_names = @{$self->{_class_names}};
    my $entropy;
    foreach my $class (@class_names) {
        my $prob = $self->prior_probability_for_class($class);
        my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
        $log_prob = 0 if $prob < 0.0001;           # since X.log(X)->0 as X->0
        $log_prob = 0 if $prob > 0.999;            # since log(1) = 0
        if (!defined $entropy) {
            $entropy = -1.0 * $prob * $log_prob; 
            next;
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    $self->{_entropy_cache}->{'priors'} = $entropy;
    return $entropy;
}

sub entropy_scanner_for_a_numeric_feature {
    local $| = 1;
    my $self = shift;
    my $feature = shift;
    my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
    my @entropies_for_less_than_thresholds = ();
    my @entropies_for_greater_than_thresholds = ();
    foreach my $point (@all_sampling_points) {
        print ". ";
        push @entropies_for_less_than_thresholds, 
                         $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
        push @entropies_for_greater_than_thresholds,
                      $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
    }
    print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ". 
                                                                "@entropies_for_less_than_thresholds\n";
    print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ". 
                                                             "@entropies_for_greater_than_thresholds\n";
}   

sub class_entropy_for_less_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '<' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence}  if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds = 
                                       @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    my $array_of_features_and_values_or_thresholds = shift;
    my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    return $self->{_entropy_cache}->{$sequence}  if exists $self->{_entropy_cache}->{$sequence};
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
                                             $class_name, \@array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy +=  -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}


#####################################   Probability Calculators   ########################################

sub prior_probability_for_class {
    my $self = shift;
    my $class = shift;
    my $class_name_in_cache = "prior" . '::' . $class;
    return $self->{_probability_cache}->{$class_name_in_cache}
        if exists $self->{_probability_cache}->{$class_name_in_cache};
    my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
    my @values = values %{$self->{_samples_class_label_hash}};
    foreach my $class_name (@{$self->{_class_names}}) {
        my @trues = grep {$_ eq $class_name} @values;
        my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples; 
        my $this_class_name_in_cache = "prior" . '::' . $class_name;
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    return $self->{_probability_cache}->{$class_name_in_cache};
}

sub calculate_class_priors {
    my $self = shift;
    return if scalar keys %{$self->{_class_priors_hash}} > 1;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $class_name_in_cache = "prior::$class_name";
        my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
        my @all_values = values %{$self->{_samples_class_label_hash}};
        my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
        my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
        $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
        my $this_class_name_in_cache = "prior::$class_name";
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    if ($self->{_debug1}) {
        foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
            print "$class  =>  $self->{_class_priors_hash}->{$class}\n";
        }
    }
}

sub calculate_first_order_probabilities {
    print "\nEstimating probabilities...\n";
    my $self = shift;
    foreach my $feature (@{$self->{_feature_names}}) {
        $self->probability_of_feature_value($feature, undef);   
        if ($self->{_debug2}) {
            if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
                print "\nPresenting probability distribution for a numeric feature:\n";
                foreach my $sampling_point (sort {$a <=> $b} keys 
                                   %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
                    my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
                    print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f", 
                          $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
                }
            } else {
                print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
                my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
                foreach my $value (sort @values_for_feature) {
                    my $prob = $self->probability_of_feature_value($feature,$value); 
                    print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
                }
            }
        }
    }
}

sub probability_of_feature_value {
    my $self = shift;
    my $feature_name = shift;
    my $value = shift;
    $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
    if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $value = closest_sampling_point($value, 
                                        $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_and_value;
    if (defined($value)) {
        $feature_and_value = "$feature_name=$value";
    }
    if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
        return $self->{_probability_cache}->{$feature_and_value};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} > 
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
                @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}}; 
                $diffrange = $valuerange[1] - $valuerange[0];
                my %seen = ();
                my @unique_values_for_feature =  sort {$a <=> $b}  grep {$_ if $_ ne 'NA' && !$seen{$_}++} 
                                         @{$self->{_features_and_values_hash}->{$feature_name}};
                my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] - 
                                    $unique_values_for_feature[$_-1]}  1..@unique_values_for_feature-1;
                my $median_diff = $diffs[int(@diffs/2) - 1];
                $histogram_delta =  $median_diff * 2;
                if ($histogram_delta < $diffrange / 500.0) {
                    if (defined $self->{_number_of_histogram_bins}) {
                        $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
                    } else {
                        $histogram_delta = $diffrange / 500.0;
                    }
                }
                $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
                $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
                $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                        $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @value_counts;
            die "PFV Something is wrong with your training file. It contains no training samples \
                         for feature named $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        }
    } else {
        # This section is only for purely symbolic features:  
        my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};        
        @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
        my @value_counts = (0) x @values_for_feature;
#        foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
            foreach my $i (0..@values_for_feature-1) {
                for my $current_value (@features_and_values) {
                    $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                }
            }
        }
        foreach my $i (0..@values_for_feature-1) {
            $self->{_probability_cache}->{$values_for_feature[$i]} = 
                $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
        }
        if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
            return $self->{_probability_cache}->{$feature_and_value};
        } else {
            return 0;
        }
    }
}

sub probability_of_feature_value_given_class {
    my $self = shift;
    my $feature_name = shift;
    my $feature_value = shift;
    my $class_name = shift;
    $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
    if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $feature_value = closest_sampling_point($feature_value, 
                                        $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_value_class;
    if (defined($feature_value)) {
        $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
    }
    if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
        print "\nNext answer returned by cache for feature $feature_name and " .
            "value $feature_value given class $class_name\n" if $self->{_debug2};
        return $self->{_probability_cache}->{$feature_value_class};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);

    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} > 
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
            $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
            @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
            $diffrange = $valuerange[1] - $valuerange[0];
        }
    }
    my @samples_for_class = ();
    # Accumulate all samples names for the given class:
    foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
        if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
            push @samples_for_class, $sample_name;
        }
    }
    if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} > 
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @sampling_points_for_feature = 
                              @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
            my @actual_feature_values_for_samples_in_class = ();
            foreach my $sample (@samples_for_class) {           
                foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
                    my $pattern = '(.+)=(.+)';
                    $feature_and_value =~ /$pattern/;
                    my ($feature, $value) = ($1, $2);
                    if (($feature eq $feature_name) && ($value ne 'NA')) {
                        push @actual_feature_values_for_samples_in_class, $value;
                    }
                }
            }
            foreach my $i (0..@sampling_points_for_feature-1) {
                foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
                    if (abs($sampling_points_for_feature[$i] - 
                            $actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
                        $counts_at_sampling_points[$i]++;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @counts_at_sampling_points;
            die "PFVC1 Something is wrong with your training file. It contains no training " .
                    "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
            my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"} 
                                                                     @sampling_points_for_feature;
            foreach my $i (0..@values_for_feature_and_class-1) {
                $self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
            }
            if (exists $self->{_probability_cache}->{$feature_value_class}) {
                return $self->{_probability_cache}->{$feature_value_class};
            } else {
                return 0;
            }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
        push @data_samples_for_class, $sample_name 
                  if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
    }
    my @actual_feature_values_for_samples_in_class = ();
    foreach my $sample (@data_samples_for_class) {
        foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
            my $pattern = '(.+)=(.+)';
            $feature_and_value =~ /$pattern/;
            my ($feature,$value) = ($1,$2);
            push @actual_feature_values_for_samples_in_class, $value
                                    if $feature eq $feature_name && $value ne 'NA';
        }
    }
    my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
    # The condition in the assignment that follows was a bug correction in Version 3.20
    my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
    $self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
    return $probability;
}

# This method requires that all truly numeric types only be expressed as '<' or '>'
# constructs in the array of branch features and thresholds
sub probability_of_a_sequence_of_features_and_values_or_thresholds {
    my $self = shift;
    my $arr = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    return if scalar @array_of_features_and_values_or_thresholds == 0;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
    my $probability = undef;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @true_numeric_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@array_of_features_and_values_or_thresholds) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } else {
            push @symbolic_types, $item;
            $item =~ /$pattern1/;
            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POS: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    foreach my $feature_name (@true_numeric_types_feature_names) {
        if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) && 
                          $upperbound{$feature_name} <= $lowerbound{$feature_name}) { 
            return 0;
        } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability = $self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                 $upperbound{$feature_name}) -
                   $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
            } else {
                $probability *= ($self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                   $upperbound{$feature_name}) -
                 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
            }
        } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
            if (! $probability) {
                $probability = $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                 $upperbound{$feature_name});
            } else {
                $probability *= $self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                  $upperbound{$feature_name});
            }
        } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                 $lowerbound{$feature_name});
            } else {
                $probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name, 
                                                                                $lowerbound{$feature_name}));
            }
        } else {
            die("Ill formatted call to 'probability_of_sequence' method");
        }
    }
    foreach my $feature_and_value (@symbolic_types) {
        if ($feature_and_value =~ /$pattern1/) {
            my ($feature,$value) = ($1,$2);
            if (! $probability) {        
                $probability = $self->probability_of_feature_value($feature, $value);
            } else {
                $probability *= $self->probability_of_feature_value($feature, $value);
            }
        }
    }
    $self->{_probability_cache}->{$sequence} = $probability;
    return $probability;
}

##  The following method requires that all truly numeric types only be expressed as
##  '<' or '>' constructs in the array of branch features and thresholds
sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
    my $self = shift;
    my $arr = shift;
    my $class_name = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    return if scalar @array_of_features_and_values_or_thresholds == 0;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    my $sequence_with_class = "$sequence" . "::" . $class_name;
    return $self->{_probability_cache}->{$sequence_with_class} 
                      if exists $self->{_probability_cache}->{$sequence_with_class};
    my $probability = undef;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @true_numeric_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@array_of_features_and_values_or_thresholds) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } else {
            push @symbolic_types, $item;
            $item =~ /$pattern1/;
            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POSC: Answer returned by find_bounded: ", 
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    foreach my $feature_name (@true_numeric_types_feature_names) {
        if ($lowerbound{$feature_name} && $upperbound{$feature_name} && 
                          $upperbound{$feature_name} <= $lowerbound{$feature_name}) { 
            return 0;
        } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
            if (! $probability) {

                $probability =   $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $upperbound{$feature_name}, $class_name) -
                                 $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $lowerbound{$feature_name}, $class_name);
            } else {
                $probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $upperbound{$feature_name}, $class_name) -
                                 $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $lowerbound{$feature_name}, $class_name))
            }
        } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
            if (! $probability) {
                $probability =   $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                               $upperbound{$feature_name}, $class_name);
            } else {
                $probability *=  $self->probability_of_feature_less_than_threshold_given_class($feature_name, 
                                                               $upperbound{$feature_name}, $class_name);
            }
        } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability =   1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                               $lowerbound{$feature_name}, $class_name);
            } else {
                $probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                               $lowerbound{$feature_name}, $class_name));
            }
        } else {
            die("Ill formatted call to 'probability of sequence given class' method");
        }
    }
    foreach my $feature_and_value (@symbolic_types) {
        if ($feature_and_value =~ /$pattern1/) {
            my ($feature,$value) = ($1,$2);
            if (! $probability) {        
                $probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
            } else {
                $probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    my $self = shift;
    my $class_name = shift;    
    my $arr = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    my $class_and_sequence = "$class_name" . "::" . $sequence;
    return $self->{_probability_cache}->{$class_and_sequence} 
                      if exists $self->{_probability_cache}->{$class_and_sequence};
    my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
    foreach my $i (0..@{$self->{_class_names}}-1) {
        my $class_name = $self->{_class_names}->[$i];
        my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
                                               \@array_of_features_and_values_or_thresholds, $class_name);
        if ($prob < 0.000001) {
            $array_of_class_probabilities[$i] = 0.0;
            next;
        }
        my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
                                                            \@array_of_features_and_values_or_thresholds);
#        die "PCS Something is wrong with your sequence of feature values and thresholds in " .
#                "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
#                if ! $prob_of_feature_sequence;
        my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
        if ($prob_of_feature_sequence) {
            $array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
        } else {
            $array_of_class_probabilities[$i] =  $prior;
        }
    }
    my $sum_probability;
    map {$sum_probability += $_} @array_of_class_probabilities;
    if ($sum_probability == 0) {
        @array_of_class_probabilities =  map {1.0 / (scalar @{$self->{_class_names}})}  
                                                               (0..@{$self->{_class_names}}-1);
    } else {
        @array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
    }
    foreach my $i (0..@{$self->{_class_names}}-1) {
        my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
        $self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
    }
    return $self->{_probability_cache}->{$class_and_sequence};
}

#######################################  Class Based Utilities  ##########################################

##  Given a list of branch attributes for the numeric features of the form, say,
##  ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
##  smallest list that is relevant for the purpose of calculating the probabilities.
##  To explain, the probability that the feature `g2' is less than 1 AND, at the same
##  time, less than 2, AND, at the same time, less than 3, is the same as the
##  probability that the feature less than 1. Similarly, the probability that 'age'
##  is greater than 34 and also greater than 37 is the same as `age' being greater
##  than 37.
sub find_bounded_intervals_for_numeric_features {
    my $self = shift;
    my $arr = shift;    
    my @arr = @$arr;
    my @features = @{$self->{_feature_names}};
    my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;   
    print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
    my @arr3 = ();
    foreach my $feature_name (@features) {
        my @temp = ();
        foreach my $x (@arr1) {
            push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
        }
        push @arr3, \@temp if @temp > 0;
    }
    print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
    # Sort each list so that '<' entries occur before '>' entries:
    my @arr4;
    foreach my $li (@arr3) {
        my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
        push @arr4, \@sorted;
    }
    print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
    my @arr5;
    foreach my $li (@arr4) {
        my @temp1 = ();
        my @temp2 = ();
        foreach my $inner (@$li) {
            if ($inner->[1] eq '<') {
                push @temp1, $inner;
            } else {
                push @temp2, $inner;
            }
        }
        if (@temp1 > 0 && @temp2 > 0) {
            push @arr5, [\@temp1, \@temp2];
        } elsif (@temp1 > 0) {
            push @arr5, [\@temp1];
        } else {
            push @arr5, [\@temp2];
        }
    }
    print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
    my @arr6 = ();
    foreach my $li (@arr5) {
        my @temp1 = ();
        foreach my $inner (@$li) {
            my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
            push @temp1, \@sorted;
        }
        push @arr6, \@temp1;
    }
    print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
    my @arr9 = ();
    foreach my $li (@arr6) {
        foreach my $alist (@$li) {
            my @newalist = ();
            if ($alist->[0][1] eq '<') {
                push @newalist, $alist->[0];
            } else {
                push @newalist, $alist->[-1];
            }
            if ($alist->[0][1] ne $alist->[-1][1]) {
                push @newalist, $alist->[-1];
            }
            push @arr9, \@newalist;
        }
    }
    print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
    return \@arr9;

}

##  This method is used to verify that you used legal feature names in the test
##  sample that you want to classify with the decision tree.
sub check_names_used {
    my $self = shift;
    my $features_and_values_test_data = shift;
    my @features_and_values_test_data = @$features_and_values_test_data;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values_test_data) {
        $feature_and_value =~ /$pattern/;
        my ($feature,$value) = ($1,$2);
        die "Your test data has formatting error" unless defined($feature) && defined($value);
        return 0 unless contained_in($feature, @{$self->{_feature_names}});
    }
    return 1;
}

#######################################  Data Condition Calculator  ######################################

##  This method estimates the worst-case fan-out of the decision tree taking into
##  account the number of values (and therefore the number of branches emanating from
##  a node) for the symbolic features.
sub determine_data_condition {
    my $self = shift;
    my $num_of_features = scalar @{$self->{_feature_names}};
    my @values = ();
    my @number_of_values;
    foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {  
        push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
            if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
        push @number_of_values, scalar @values;
    }
    return if ! @values;
    print "Number of features: $num_of_features\n";
    my @minmax = minmax(\@number_of_values);
    my $max_num_values = $minmax[1];
    print "Largest number of values for symbolic features is: $max_num_values\n";
    my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
    print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
          "nodes. The exact number of nodes created depends critically on " .
          "the entropy_threshold used for node expansion (the default value " .
          "for this threshold is 0.01) and on the value set for max_depth_desired " .
          "for the depth of the tree.\n";
    if ($estimated_number_of_nodes > 10000) {
        print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
              "large value for entropy_threshold and/or a small value for " .
              "for max_depth_desired to reduce the number of nodes created.\n";
        print "\nDo you wish to continue anyway? Enter 'y' for yes:  ";
        my $answer = <STDIN>;
        $answer =~ s/\r?\n?$//;
        while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
            print "\nAnswer not recognized.  Let's try again. Enter 'y' or 'n': ";
            $answer = <STDIN>;
            $answer =~ s/\r?\n?$//;
        }
        die unless $answer =~ /y(es)?/i;
    }
}


####################################  Read Training Data From File  ######################################


sub get_training_data {
    my $self = shift;
    die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
    my %class_names = ();
    my %all_record_ids_with_class_labels;
    my $firstline;
    my %data_hash;
    $|++;
    open FILEIN, $self->{_training_datafile} || die "unable to open $self->{_training_datafile}: $!";
    my $record_index = 0;
    my $firsetline;
    while (<FILEIN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
        if ($record_index == 0) {
            $firstline = $record;
            $record_index++;
            next;
        }
        my @parts = split /,/, $record;
        my $classname = $parts[$self->{_csv_class_column_index}];
        $class_names{$classname} = 1;
        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        $all_record_ids_with_class_labels{$record_label} = $classname;
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $|--;
    $self->{_how_many_total_training_samples} = $record_index - 1;  # must subtract 1 for the header record
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    my @all_feature_names =   split /,/, substr($firstline, index($firstline,','));
    my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
    my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
    my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
    my %class_for_sample_hash = map {"sample_" . $_  =>  "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};     
    my %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    my %features_and_unique_values_hash = ();
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++} 
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($self->{_debug1}) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";
        }
        print "\nclass label for each data sample:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)}  keys %class_for_sample_hash) {
            print "$sample => $class_for_sample_hash{$sample}\n";
        }
        print "\nFeatures used: @feature_names\n\n";
        print "\nfeatures and the values taken by them:\n";
        foreach my $feature (sort keys %features_and_values_hash) {
            print "$feature => @{$features_and_values_hash{$feature}}\n";
        }
        print "\nnumeric features and their ranges:\n";
        foreach  my $feature (sort keys %numeric_features_valuerange_hash) {
            print "$feature  =>  @{$numeric_features_valuerange_hash{$feature}}\n";
        }
        print "\nnumber of unique values in each feature:\n";
        foreach  my $feature (sort keys %feature_values_how_many_uniques_hash) {
            print "$feature  =>  $feature_values_how_many_uniques_hash{$feature}\n";
        }
    }
    $self->{_class_names} = \@all_class_names;
    $self->{_feature_names} = \@feature_names;
    $self->{_samples_class_label_hash}  =  \%class_for_sample_hash;
    $self->{_training_data_hash}  =  \%feature_values_for_samples_hash;
    $self->{_features_and_values_hash}  = \%features_and_values_hash;
    $self->{_features_and_unique_values_hash}  =  \%features_and_unique_values_hash;
    $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
    $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
}

sub show_training_data {
    my $self = shift;
    my @class_names = @{$self->{_class_names}};
    my %features_and_values_hash = %{$self->{_features_and_values_hash}};
    my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
    my %training_data_hash = %{$self->{_training_data_hash}};
    print "\n\nClass Names: @class_names\n";
    print "\n\nFeatures and Their Values:\n\n";
    while ( my ($k, $v) = each %features_and_values_hash ) {
        print "$k --->  @{$features_and_values_hash{$k}}\n";
    }
    print "\n\nSamples vs. Class Labels:\n\n";
    foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
        print "$kee =>  $samples_class_label_hash{$kee}\n";
    }
    print "\n\nTraining Samples:\n\n";
    foreach my $kee (sort {sample_index($a) <=> sample_index($b)} 
                                      keys %training_data_hash) {
        print "$kee =>  @{$training_data_hash{$kee}}\n";
    }
}    

sub get_class_names {
    my $self = shift;
    return @{$self->{_class_names}}
}

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN


# Returns the minimum value and its positional index in an array
sub minimum {
    my $arr = shift;
    my $min;
    my $index;
    foreach my $i (0..@{$arr}-1) {
        if ( (!defined $min) || ($arr->[$i] < $min) ) {
            $index = $i;
            $min = $arr->[$i];
        }
    }
    return ($min, $index);
}

# Returns an array of two values, the min and the max, of an array of floats
sub minmax {
    my $arr = shift;
    my ($min, $max);
    foreach my $i (0..@{$arr}-1) {
        if ( (!defined $min) || ($arr->[$i] < $min) ) {
            $min = $arr->[$i];
        }
        if ( (!defined $max) || ($arr->[$i] > $max) ) {
            $max = $arr->[$i];
        }
    }
    return ($min, $max);
}

# checks whether an element is in an array:
sub contained_in {
    my $ele = shift;
    my @array = @_;
    my $count = 0;
    map {$count++ if $ele eq $_} @array;
    return $count;
}

# Meant only for an array of strings (no nesting):
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out;
    return [] if scalar @$ref_in == 0;
    foreach my $i (0..@{$ref_in}-1) {
        $ref_out->[$i] = $ref_in->[$i];
    }
    return $ref_out;
}

sub check_for_illegal_params2 {
    my @params = @_;
    my @legal_params = qw / training_datafile
                            entropy_threshold
                            max_depth_desired
                            csv_class_column_index
                            csv_columns_for_features
                            symbolic_to_numeric_cardinality_threshold
                            number_of_histogram_bins
                            csv_cleanup_needed
                            debug1
                            debug2
                            debug3
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub print_array_with_msg {
    my $message = shift;
    my $arr = shift;
    print "\n$message: ";
    print_nested_array( $arr );
}

sub print_nested_array {
    my $arr = shift;
    my @arr = @$arr;
    print "[";
    foreach my $item (@arr) {
        if (ref $item) {
            print_nested_array($item);
        } else {
            print "$item";
        }
    }
    print "]";
}    

sub cleanup_csv {
    my $line = shift;
    $line =~ tr/\/:?()[]{}'/          /;
#    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
    for (@double_quoted) {
        my $item = $_;
        $item = substr($item, 1, -1);
        $item =~ s/^\s+|,|\s+$//g;
        $item = join '_',  split /\s+/, $item;
        substr($line, index($line, $_), length($_)) = $item;
    }
    my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
    for (@white_spaced) {
        my $item = $_;
        $item =~ s/\s+/_/g;
        $item =~ s/^\s*_|_\s*$//g;
        substr($line, index($line, $_), length($_)) = $item;
    }
    $line =~ s/,\s*(?=,|$)/,NA/g;
    return $line;
}

######################################### Class EvalTrainingData  ########################################

##  This subclass of the DecisionTree class is used to evaluate the quality of your
##  training data by running a 10-fold cross-validation test on it. This test divides
##  all of the training data into ten parts, with nine parts used for training a
##  decision tree and one part used for testing its ability to classify correctly.
##  This selection of nine parts for training and one part for testing is carried out
##  in all of the ten different possible ways.  This testing functionality can also
##  be used to find the best values to use for the constructor parameters
##  entropy_threshold, max_depth_desired, and
##  symbolic_to_numeric_cardinality_threshold.

##  Only the CSV training files can be evaluated in this manner (because only CSV
##  training are allowed to have numeric features --- which is the more interesting
##  case for evaluation analytics.

package EvalTrainingData;

@EvalTrainingData::ISA = ('Algorithm::DecisionTree');

sub new {
    my $class = shift;
    my $instance = Algorithm::DecisionTree->new(@_);
    bless $instance, $class;
}

sub evaluate_training_data {
    my $self = shift;
    my $evaldebug = 0;
    die "The data evaluation function in the module can only be used when your " .
        "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
    print "\nWill run a 10-fold cross-validation test on your training data to test its " .
          "class-discriminatory power:\n";
    my %all_training_data = %{$self->{_training_data_hash}};
    my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=> 
                                     Algorithm::DecisionTree::sample_index($b)}  keys %all_training_data;
    my $fold_size = int(0.1 * (scalar keys %all_training_data));
    print "fold size: $fold_size\n";
    my %confusion_matrix = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        foreach my $inner_class_name (@{$self->{_class_names}}) {
            $confusion_matrix{$class_name}->{$inner_class_name} = 0;
        }
    }
    foreach my $fold_index (0..9) {
        print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n"; 
        my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
        my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],  
                     @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
        my %testing_data = ();
        foreach my $x (@testing_samples) {
            $testing_data{$x} = $all_training_data{$x};
        }
        my %training_data = ();
        foreach my $x (@training_samples) {
            $training_data{$x} = $all_training_data{$x};
        }
        my $trainingDT = Algorithm::DecisionTree->new('evalmode');
        $trainingDT->{_training_data_hash} = \%training_data;
        $trainingDT->{_class_names} = $self->{_class_names};
        $trainingDT->{_feature_names} = $self->{_feature_names};
        $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
        $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
        $trainingDT->{_symbolic_to_numeric_cardinality_threshold} = 
                                                $self->{_symbolic_to_numeric_cardinality_threshold};
        foreach my $sample_name (@training_samples) {
            $trainingDT->{_samples_class_label_hash}->{$sample_name} = 
                                                $self->{_samples_class_label_hash}->{$sample_name};
        }
        foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
            $trainingDT->{_features_and_values_hash}->{$feature} = ();
        }
        my $pattern = '(\S+)\s*=\s*(\S+)';
        foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=> 
                                Algorithm::DecisionTree::sample_index($b)}  
                          keys %{$trainingDT->{_training_data_hash}}) {
            foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
                $feature_and_value =~ /$pattern/;
                my ($feature,$value) = ($1,$2);
                push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
            }
        }
        foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
            my %seen = ();
            my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} 
                                                @{$trainingDT->{_features_and_values_hash}->{$feature}}; 
            if (Algorithm::DecisionTree::contained_in($feature, 
                                                keys %{$self->{_numeric_features_valuerange_hash}})) {
                @unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
            } else {
                @unique_values_for_feature = sort @unique_values_for_feature;
            }
            $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
        }
        foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
            my @minmaxvalues = Algorithm::DecisionTree::minmax(
                                         \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
            $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
        }
        if ($evaldebug) {
            print "\n\nprinting samples in the testing set: @testing_samples\n";
            print "\n\nPrinting features and their values in the training set:\n";
            foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
                print "$item  => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
            }
            print "\n\nPrinting unique values for features:\n";
            foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
                print "$item  => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
            }
            print "\n\nPrinting unique value ranges for features:\n";
            foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
                print "$item  => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
            }
        }
        foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
            $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} = 
                scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
        }
        $trainingDT->{_debug2} = 1 if $evaldebug;
        $trainingDT->calculate_first_order_probabilities();
        $trainingDT->calculate_class_priors();
        my $root_node = $trainingDT->construct_decision_tree_classifier();
        $root_node->display_decision_tree("     ") if $evaldebug;
        foreach my $test_sample_name (@testing_samples) {
            my @test_sample_data = @{$all_training_data{$test_sample_name}};
            print "original data in test sample: @test_sample_data\n" if $evaldebug;
            @test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
            print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
            my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
            my @solution_path = @{$classification{'solution_path'}};
            delete $classification{'solution_path'};
            my @which_classes = keys %classification;
            @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
            my $most_likely_class_label = $which_classes[0];
            if ($evaldebug) {
                print "\nClassification:\n\n";
                print "     class                         probability\n";
                print "     ----------                    -----------\n";
                foreach my $which_class (@which_classes) {
                    my $classstring = sprintf("%-30s", $which_class);
                    my $valuestring = sprintf("%-30s", $classification{$which_class});
                    print "     $classstring $valuestring\n";
                }
                print "\nSolution path in the decision tree: @solution_path\n";
                print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
            }
            my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
            print "$test_sample_name:    true_class: $true_class_label_for_sample    " .
                     "estimated_class: $most_likely_class_label\n"  if $evaldebug;
            $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
        }
    }
    print "\n\n       DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
    my $matrix_header = " " x 30;
    foreach my $class_name (@{$self->{_class_names}}) {  
        $matrix_header .= sprintf("%-30s", $class_name);
    }
    print "\n" . $matrix_header . "\n\n";
    foreach my $row_class_name (sort keys %confusion_matrix) {
        my $row_display = sprintf("%-30s", $row_class_name);
        foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
            $row_display .= sprintf( "%-30u",  $confusion_matrix{$row_class_name}->{$col_class_name} );
        }
        print "$row_display\n\n";
    }
    print "\n\n";
    my ($diagonal_sum, $off_diagonal_sum) = (0,0);
    foreach my $row_class_name (sort keys %confusion_matrix) {
        foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
            if ($row_class_name eq $col_class_name) {
                $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
            } else {
                $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
            }
        }
    }
    my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
    print "\nTraining Data Quality Index: $data_quality_index    (out of a possible maximum of 100)\n";
    if ($data_quality_index <= 80) {
        print "\nYour training data does not possess much class discriminatory " .
              "information.  It could be that the classes are inherently not well " .
              "separable or that your constructor parameter choices are not appropriate.\n";
    } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
        print "\nYour training data possesses some class discriminatory information " .
              "but it may not be sufficient for real-world applications.  You might " .
              "try tweaking the constructor parameters to see if that improves the " .
              "class discriminations.\n";
    } elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
        print  "\nYour training data appears to possess good class discriminatory " .
               "information.  Whether or not it is acceptable would depend on your " .
               "application.\n";
    } elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
        print "\nYour training data is of excellent quality.\n";
    } else {
        print "\nYour training data is perfect.\n";
    }

}


#############################################  Class DTNode  #############################################

# The nodes of the decision tree are instances of this class:

package DTNode;

use strict; 
use Carp;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Decision Feature: $feature_at_node    Node Creation Entropy: " ,
              "$print_node_creation_entropy_at_node   Class Probs: @print_class_probabilities_with_class\n\n";
        $offset .= "   ";
        foreach my $child (@{$self->get_children()}) {
            $child->display_decision_tree($offset);
        }
    } else {
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node   " .
              "Class Probs: @print_class_probabilities_with_class\n\n";
    }
}


##############################  Generate Your Own Numeric Training Data  #################################
#############################      Class TrainingDataGeneratorNumeric     ################################

##  See the script generate_training_data_numeric.pl in the examples
##  directory on how to use this class for generating your own numeric training and
##  test data.  The training and test data are generated in accordance with the
##  specifications you place in the parameter file that is supplied as an argument to
##  the constructor of this class.

package TrainingDataGeneratorNumeric;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params3(@params) == 0;   
    bless {
        _output_training_csv_file          =>   $args{'output_training_csv_file'} 
                                                   || croak("name for output_training_csv_file required"),
        _output_test_csv_file              =>   $args{'output_test_csv_file'} 
                                                   || croak("name for output_test_csv_file required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                         || croak("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                         || croak("number_of_samples_for_training"),
        _number_of_samples_for_testing     =>   $args{'number_of_samples_for_testing'} 
                                                         || croak("number_of_samples_for_testing"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_names_and_priors            =>    {},
        _features_with_value_range         =>    {},
        _features_ordered                  =>    [],
        _classes_and_their_param_values    =>    {},
    }, $class;
}

sub check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_training_csv_file
                            output_test_csv_file
                            parameter_file
                            number_of_samples_for_training
                            number_of_samples_for_testing
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  The training data generated by an instance of the class
##  TrainingDataGeneratorNumeric is based on the specs you place in a parameter that
##  you supply to the class constructor through a constructor variable called
##  `parameter_file'.  This method is for parsing the parameter file in order to
##  order to determine the names to be used for the different data classes, their
##  means, and their variances.
sub read_parameter_file_numeric {
    my $self = shift;
    my @class_names = ();
    my %class_names_and_priors = ();
    my %features_with_value_range = ();
    my %classes_and_their_param_values = ();
#   my $regex8 =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
    my @params = <FILE>;
    my $params = join "", @params;
    my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
    $params =~ /$regex/si;
    my ($class_names, $class_priors) = ($1, $2);
    @class_names = split ' ', $class_names; 
    my @class_priors = split ' ', $class_priors;
    foreach my $i (0..@class_names-1) {
        $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
    }
    if ($self->{_debug}) {
        foreach my $cname (keys %class_names_and_priors) {
            print "$cname  =>   $class_names_and_priors{$cname}\n";
        }
    }
    $regex = 'feature name: \w*.*?value range: [\d\. -]+';
    my @features = $params =~ /$regex/gsi;
    my @features_ordered;
    $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
    foreach my $feature (@features) {
        $feature =~ /$regex/i;
        my $feature_name = $1;
        push @features_ordered, $feature_name;
        my @value_range = split ' ', $2;
        $features_with_value_range{$feature_name} = \@value_range;
    }
    if ($self->{_debug}) {
        foreach my $fname (keys %features_with_value_range) {
            print "$fname  =>   @{$features_with_value_range{$fname}}\n";
        }
    }
    foreach my $i (0..@class_names-1) {
        $classes_and_their_param_values{$class_names[$i]} = {};
    }
    $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
    my @class_params = $params =~ /$regex/gsi;
    $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
    foreach my $class_param (@class_params) {
        $class_param =~ /$regex/gsi;
        my $class_name = $1;
        my @class_mean = split ' ', $2;
        $classes_and_their_param_values{$class_name}->{'mean'} =  \@class_mean;
        my $class_param_string = $3;
        my @covar_rows = split '\n', $class_param_string;
        my @covar_matrix;
        foreach my $row (@covar_rows) {
            my @row = split ' ', $row;
            push @covar_matrix, \@row;
        }
        $classes_and_their_param_values{$class_name}->{'covariance'} =  \@covar_matrix;
    }
    if ($self->{_debug}) {
        print "\nThe class parameters are:\n\n";
        foreach my $cname (keys %classes_and_their_param_values) {
            print "\nFor class name $cname:\n";
            my %params_hash = %{$classes_and_their_param_values{$cname}};
            foreach my $x (keys %params_hash) {
                if ($x eq 'mean') {
                    print "    $x   =>   @{$params_hash{$x}}\n";
                } else {
                    if ($x eq 'covariance') {
                        print "    The covariance matrix:\n";
                        my @matrix = @{$params_hash{'covariance'}};
                        foreach my $row (@matrix) {
                            print "        @$row\n";
                        }
                    }
                }
            }
        }
    }
    $self->{_class_names}        =   \@class_names;
    $self->{_class_names_and_priors}   = \%class_names_and_priors;
    $self->{_features_with_value_range}   = \%features_with_value_range;
    $self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
    $self->{_features_ordered} = \@features_ordered;
}

##  After the parameter file is parsed by the previous method, this method calls on
##  Math::Random::random_multivariate_normal() to generate the training and test data
##  samples. Your training and test data can be of any number of of dimensions, can
##  have any mean, and any covariance.  The training and test data must obviously be
##  drawn from the same distribution.
sub gen_numeric_training_and_test_data_and_write_to_csv {
    use Math::Random;
    my $self = shift;
    my %training_samples_for_class;
    my %test_samples_for_class;
    foreach my $class_name (@{$self->{_class_names}}) {
        $training_samples_for_class{$class_name} = [];
        $test_samples_for_class{$class_name} = [];
    }
    foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
        my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
        my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
        my @new_training_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        my @new_test_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        if ($self->{_debug}) {
            print "training data for class $class_name:\n";
            foreach my $x (@new_training_data) {print "@$x\n";}
            print "\n\ntest data for class $class_name:\n";
            foreach my $x (@new_test_data) {print "@$x\n";}
        }
        $training_samples_for_class{$class_name} = \@new_training_data;
        $test_samples_for_class{$class_name} = \@new_test_data;
    }
    my @training_data_records = ();
    my @test_data_records = ();
    foreach my $class_name (keys %training_samples_for_class) {
        my $num_of_samples_for_training = $self->{_number_of_samples_for_training} * 
                                         $self->{_class_names_and_priors}->{$class_name};
        my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} * 
                                         $self->{_class_names_and_priors}->{$class_name};
        foreach my $sample_index (0..$num_of_samples_for_training-1) {
            my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
            @training_vector = map {sprintf("%.3f", $_)} @training_vector;
            my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
            push @training_data_records, $training_data_record;
        }
        foreach my $sample_index (0..$num_of_samples_for_testing-1) {
            my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
            @test_vector = map {sprintf("%.3f", $_)} @test_vector;
            my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
            push @test_data_records, $test_data_record;
        }
    }
    fisher_yates_shuffle(\@training_data_records);
    fisher_yates_shuffle(\@test_data_records);
    if ($self->{_debug}) {
        foreach my $record (@training_data_records) {
            print "$record";
        }
        foreach my $record (@test_data_records) {
            print "$record";
        }
    }
    open OUTPUT, ">$self->{_output_training_csv_file}";
    my @feature_names_training = @{$self->{_features_ordered}};
    my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
    my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
    print OUTPUT "$first_row_training\n";
    foreach my $i (0..@training_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$training_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
    open OUTPUT, ">$self->{_output_test_csv_file}";
    my @feature_names_testing = keys %{$self->{_features_with_value_range}};
    my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
    my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
    print OUTPUT "$first_row_testing\n";
    foreach my $i (0..@test_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$test_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
}

# from perl docs:                                                                         
sub fisher_yates_shuffle {
    my $arr =  shift;
    my $i = @$arr;
    while (--$i) {
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}

###########################  Generate Your Own Symbolic Training Data  ###############################
###########################     Class TrainingDataGeneratorSymbolic      #############################

##  See the sample script generate_training_and_test_data_symbolic.pl for how to use
##  this class for generating purely symbolic training and test data.  The data is
##  generated according to the specifications you place in a parameter file whose
##  name you supply as one of constructor arguments.
package TrainingDataGeneratorSymbolic;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params4(@params) == 0;   
    bless {
        _output_training_datafile          =>   $args{'output_training_datafile'} 
                                                   || die("name for output_training_datafile required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                   || die("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                   || die("number_of_samples_for_training required"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_priors                      =>    [],
        _features_and_values_hash          =>    {},
        _bias_hash                         =>    {},
        _training_sample_records           =>    {},
    }, $class;
}

sub check_for_illegal_params4 {
    my @params = @_;
    my @legal_params = qw / output_training_datafile
                            parameter_file
                            number_of_samples_for_training
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  Read a parameter file for generating symbolic training data. See the script
##  generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
##  pass the name of the parameter file to the constructor of the
##  TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
    my $self = shift;
    my $debug = $self->{_debug};
    my $number_of_training_samples = $self->{_number_of_samples_for_training};
    my $input_parameter_file = $self->{_parameter_file};
    croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
    my $output_file_training = $self->{_output_training_datafile};
    my $output_file_testing = $self->{_output_test_datafile};
    my @all_params;
    my $param_string;
    open INPUT, $input_parameter_file || "unable to open parameter file: $!";
    @all_params = <INPUT>;
    @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
    @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
    $param_string = join ' ', @all_params;
    my ($class_names, $class_priors, $rest_param) = 
              $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
    my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
    push @{$self->{_class_names}}, @class_names;
    my @class_priors =   grep {defined($_) && length($_) > 0} split /\s+/, $2;
    push @{$self->{_class_priors}}, @class_priors;    
    my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
    my %features_and_values_hash;
    my @features = split /(feature[:])/, $feature_string;
    @features = grep {defined($_) && length($_) > 0} @features;
    foreach my $item (@features) {
        next if $item =~ /feature/;
        my @splits = split / /, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $features_and_values_hash{$splits[0]} = [];
            } else {
                next if $splits[$i] =~ /values/;
                push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
            }
        }
    }
    $self->{_features_and_values_hash} = \%features_and_values_hash;
    my %bias_hash = %{$self->{_bias_hash}};
    my @biases = split /(bias[:]\s*class[:])/, $bias_string;
    @biases = grep {defined($_) && length($_) > 0} @biases;
    foreach my $item (@biases) {
        next if $item =~ /bias/;
        my @splits = split /\s+/, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        my $feature_name;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $bias_hash{$splits[0]} = {};
            } elsif ($splits[$i] =~ /(^.+)[:]$/) {
                $feature_name = $1;
                $bias_hash{$splits[0]}->{$feature_name} = [];
            } else {
                next if !defined $feature_name;
                push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
                        if defined $feature_name;
            }
        }
    }
    $self->{_bias_hash} = \%bias_hash;
    if ($debug) {
        print "\n\nClass names: @class_names\n";
        my $num_of_classes = @class_names;
        print "Class priors: @class_priors\n";
        print "Number of classes: $num_of_classes\n";
        print "\nHere are the features and their possible values:\n";
        while ( my ($k, $v) = each %features_and_values_hash ) {
            print "$k ===>  @$v\n";
        }
        print "\nHere is the biasing for each class:\n";
        while ( my ($k, $v) = each %bias_hash ) {
            print "$k:\n";
            while ( my ($k1, $v1) = each %$v ) {
                print "       $k1 ===>  @$v1\n";
            }
        }
    }
}

##  This method generates training data according to the specifications placed in a
##  parameter file that is read by the previous method.
sub gen_symbolic_training_data {
    my $self = shift;
    my @class_names = @{$self->{_class_names}};
    my @class_priors = @{$self->{_class_priors}};
    my %training_sample_records;
    my %features_and_values_hash = %{$self->{_features_and_values_hash}};
    my %bias_hash  = %{$self->{_bias_hash}};
    my $how_many_training_samples = $self->{_number_of_samples_for_training};
    my $how_many_test_samples = $self->{_number_of_samples_for_testing};
    my %class_priors_to_unit_interval_map;
    my $accumulated_interval = 0;
    foreach my $i (0..@class_names-1) {
        $class_priors_to_unit_interval_map{$class_names[$i]} 
         = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
        $accumulated_interval += $class_priors[$i];
    }
    if ($self->{_debug}) {
        print "Mapping of class priors to unit interval: \n";
        while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
            print "$k =>  @$v\n";
        }
        print "\n\n";
    }
    my $ele_index = 0;
    while ($ele_index < $how_many_training_samples) {
        my $sample_name = "sample" . "_$ele_index";
        $training_sample_records{$sample_name} = [];
        # Generate class label for this training sample:                
        my $roll_the_dice = rand(1.0);
        my $class_label;
        foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
            my $v = $class_priors_to_unit_interval_map{$class_name};
            if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
                push @{$training_sample_records{$sample_name}}, 
                                    "class=" . $class_name;
                $class_label = $class_name;
                last;
            }
        }
        foreach my $feature (keys %features_and_values_hash) {
            my @values = @{$features_and_values_hash{$feature}};
            my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
            my $no_bias = 1.0 / @values;
            $bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
            my %value_priors_to_unit_interval_map;
            my @splits = split /\s*=\s*/, $bias_string;
            my $chosen_for_bias_value = $splits[0];
            my $chosen_bias = $splits[1];
            my $remaining_bias = 1 - $chosen_bias;
            my $remaining_portion_bias = $remaining_bias / (@values -1);
            @splits = grep {defined($_) && length($_) > 0} @splits;
            my $accumulated = 0;
            foreach my $i (0..@values-1) {
                if ($values[$i] eq $chosen_for_bias_value) {
                    $value_priors_to_unit_interval_map{$values[$i]} 
                        = [$accumulated, $accumulated + $chosen_bias];
                    $accumulated += $chosen_bias;
                } else {
                    $value_priors_to_unit_interval_map{$values[$i]} 
                      = [$accumulated, $accumulated + $remaining_portion_bias];
                    $accumulated += $remaining_portion_bias;           
                }
            }
            my $roll_the_dice = rand(1.0);
            my $value_label;
            foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
                my $v = $value_priors_to_unit_interval_map{$value_name};
                if ( ($roll_the_dice >= $v->[0]) 
                             && ($roll_the_dice <= $v->[1]) ) {
                    push @{$training_sample_records{$sample_name}}, 
                                            $feature . "=" . $value_name;
                    $value_label = $value_name;
                    last;
                }
            }
            if ($self->{_debug}) {
                print "mapping feature value priors for '$feature' " .
                                          "to unit interval: \n";
                while ( my ($k, $v) = 
                        each %value_priors_to_unit_interval_map ) {
                    print "$k =>  @$v\n";
                }
                print "\n\n";
            }
        }
        $ele_index++;
    }
    $self->{_training_sample_records} = \%training_sample_records;
    if ($self->{_debug}) {
        print "\n\nPRINTING TRAINING RECORDS:\n\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
            print "$kee =>  @{$training_sample_records{$kee}}\n\n";
        }
    }
    my $output_training_file = $self->{_output_training_datafile};
    print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
    open FILEHANDLE, ">$output_training_file";
    my @features = sort keys %features_and_values_hash;
    my $title_string = ',class';
    foreach my $feature_name (@features) {
        $title_string .= ',' . $feature_name;
    }
    print FILEHANDLE "$title_string\n";
    my @sample_names = sort {$a <=> $b}  map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
    my $record_string = '';
    foreach my $sample_name (@sample_names) {
        $record_string .= "$sample_name,";
        my @record = @{$training_sample_records{"sample_$sample_name"}};
        my %item_parts_hash;
        foreach my $item (@record) {
            my @splits = grep $_, split /=/, $item;
            $item_parts_hash{$splits[0]} = $splits[1];
        }
        $record_string .= $item_parts_hash{"class"};
        delete $item_parts_hash{"class"};
        my @kees = sort keys %item_parts_hash;
        foreach my $kee (@kees) {
            $record_string .= ",$item_parts_hash{$kee}";
        }
        print FILEHANDLE "$record_string\n";
        $record_string = '';
    }
    close FILEHANDLE;
}    

sub sample_index {
    my $arg = shift;
    $arg =~ /_(.+)$/;
    return $1;
}    

#################################   Decision Tree Introspection   #######################################
#################################      Class DTIntrospection      #######################################

package DTIntrospection;

##  Instances constructed from this class can provide explanations for the
##  classification decisions at the nodes of a decision tree.  
##  
##  When used in the interactive mode, the decision-tree introspection made possible
##  by this class provides answers to the following three questions: (1) List of the
##  training samples that fall in the portion of the feature space that corresponds
##  to a node of the decision tree; (2) The probabilities associated with the last
##  feature test that led to the node; and (3) The class probabilities predicated on
##  just the last feature test on the path to that node.
##  
##  CAVEAT: It is possible for a node to exist even when there are no training
##  samples in the portion of the feature space that corresponds to the node.  That
##  is because a decision tree is based on the probability densities estimated from
##  the training data. When training data is non-uniformly distributed, it is
##  possible for the probability associated with a point in the feature space to be
##  non-zero even when there are no training samples at or in the vicinity of that
##  point.
##  
##  For a node to exist even where there are no training samples in the portion of
##  the feature space that belongs to the node is an indication of the generalization
##  ability of decision-tree based classification.
##  
##  When used in a non-interactive mode, an instance of this class can be used to
##  create a tabular display that shows what training samples belong directly to the
##  portion of the feature space that corresponds to each node of the decision tree.
##  An instance of this class can also construct a tabular display that shows how the
##  influence of each training sample propagates in the decision tree.  For each
##  training sample, this display first shows the list of nodes that came into
##  existence through feature test(s) that used the data provided by that sample.
##  This list for each training sample is followed by a subtree of the nodes that owe
##  their existence indirectly to the training sample. A training sample influences a
##  node indirectly if the node is a descendant of another node that is affected
##  directly by the training sample.

use strict; 
use Carp;

sub new {                                                           
    my ($class, $dt) = @_; 
    croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
        unless ref($dt) eq "Algorithm::DecisionTree";
    bless {                                                         
        _dt                                 => $dt,
        _root_dtnode                        => $dt->{_root_node},
        _samples_at_nodes_hash              => {},
        _branch_features_to_nodes_hash      => {},
        _sample_to_node_mapping_direct_hash => {},
        _node_serial_num_to_node_hash       => {}, 
        _awareness_raising_msg_shown        => 0,
        _debug                              => 0,
    }, $class;                                                     
}

sub initialize {
    my $self = shift;
    croak "You must first construct the decision tree before using the DTIntrospection class"
        unless $self->{_root_dtnode};
    $self->recursive_descent($self->{_root_dtnode});
}

sub recursive_descent {
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    print "\nAt node $node_serial_number:  the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
    $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
    $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
    my @samples_at_node = ();
    foreach my $item (@$branch_features_and_values_or_thresholds) {
        my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
        unless (@samples_at_node) {
            @samples_at_node =  @$samples_for_feature_value_combo;
        } else {
            my @accum;
            foreach my $sample (@samples_at_node) {
                push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);  
            }
            @samples_at_node =  @accum;
        }
        last unless @samples_at_node;
    }
    @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node; 
    print "Node: $node_serial_number    the samples are: [@samples_at_node]\n"  if ($self->{_debug});
    $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
    if (@samples_at_node) {
        foreach my $sample (@samples_at_node) {
            if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
                $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number]; 
            } else {
                push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
            }
        }
    }
    my $children = $node->get_children();
    foreach my $child (@$children) {
        $self->recursive_descent($child);
    }
}

sub display_training_samples_at_all_nodes_direct_influence_only {
    my $self = shift;
    croak "You must first construct the decision tree before using the DT Introspection class." 
        unless $self->{_root_dtnode};
    $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
}

sub recursive_descent_for_showing_samples_at_a_node{
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
        print "\nAt node $node_serial_number:  the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n"  if $self->{_debug};
        print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
    }
    map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};            
}

sub display_training_samples_to_nodes_influence_propagation {
    my $self = shift;
    foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)}  keys %{$self->{_dt}->{_training_data_hash}}) {
        if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
            my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
            print "\n$sample:\n    nodes affected directly: [@{$nodes_directly_affected}]\n";
            print "    nodes affected through probabilistic generalization:\n";
            map  $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, "    "), @$nodes_directly_affected;
        }
    }
}

sub recursive_descent_for_sample_to_node_influence {
    my $self = shift;
    my $node_serial_num = shift;
    my $nodes_already_accounted_for = shift;
    my $offset = shift;
    $offset .= "    ";
    my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
    my @children =  map $_->get_serial_num(), @{$node->get_children()};
    my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
    if (@children_affected) {
        print "$offset $node_serial_num => [@children_affected]\n";
    }
    map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
}

sub get_samples_for_feature_value_combo {
    my $self = shift;
    my $feature_value_combo = shift;
    my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
    my @samples = ();
    if ($op eq '=') {
        @samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
    } elsif ($op eq '<') {
        foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
            foreach my $item (@features_and_values) {
                my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
                if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
                    push @samples, $sample;
                    last;
                }
            }
        }
    } elsif ($op eq '>') {
        foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
            foreach my $item (@features_and_values) {
                my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
                if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
                    push @samples, $sample;
                    last;
                }
            }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

boosted decision tree classifier.  As mentioned above, a separate decision tree is
constructed for each stage of boosting using a set of training samples that are drawn
through a probability distribution maintained over the entire training dataset.

=back

=head2 B<Methods defined for C<BoostedDecisionTree> class>

=over 8

=item B<get_training_data_for_base_tree():>

This method reads your training datafile, creates the data structures from the data
ingested for constructing the base decision tree.

=item B<show_training_data_for_base_tree():>

Writes to the standard output the training data samples and also some relevant
properties of the features used in the training dataset.

=item B<calculate_first_order_probabilities_and_class_priors():>

Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
first-order probabilities and the class priors.

=item B<construct_base_decision_tree():>

Calls on the appropriate method of the main C<DecisionTree> class to construct the
base decision tree.

=item B<display_base_decision_tree():>

Displays the base decision tree in your terminal window. (The textual form of the
decision tree is written out to the standard output.)

=item B<construct_cascade_of_trees():>

Uses the AdaBoost algorithm to construct a cascade of decision trees.  As mentioned
earlier, the training samples for each tree in the cascade are drawn using a
probability distribution over the entire training dataset. This probability
distribution for any given tree in the cascade is heavily influenced by which
training samples are misclassified by the previous tree.

=item B<display_decision_trees_for_different_stages():>

Displays separately in your terminal window the decision tree constructed for each
stage of the cascade. (The textual form of the trees is written out to the standard
output.)

=item B<classify_with_boosting( $test_sample ):>

Calls on each decision tree in the cascade to classify the argument C<$test_sample>.

=item B<display_classification_results_for_each_stage():>

You can call this method to display in your terminal window the classification
decision made by each decision tree in the cascade.  The method also prints out the
trust factor associated with each decision tree.  It is important to look
simultaneously at the classification decision and the trust factor for each tree ---
since a classification decision made by a specific tree may appear bizarre for a
given test sample.  This method is useful primarily for debugging purposes.

=item B<show_class_labels_for_misclassified_samples_in_stage( $stage_index ):>

As with the previous method, this method is useful mostly for debugging. It returns
class labels for the samples misclassified by the stage whose integer index is
supplied as an argument to the method.  Say you have 10 stages in your cascade.  The
value of the argument C<stage_index> would go from 0 to 9, with 0 corresponding to
the base tree.

=item B<trust_weighted_majority_vote_classifier():>

Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
classification decisions made by the individual trees while taking into account the
trust factors associated with the trees.  As mentioned earlier, we associate with
each tree of the cascade a trust factor that depends on the overall misclassification
rate associated with that tree.

=back

See the example scripts in the C<ExamplesBoosting> subdirectory for how to call the
methods listed above for classifying individual data samples with boosting and for
bulk classification when you place all your test samples in a single file.


=head1 USING RANDOMIZED DECISION TREES

Consider the following two situations that call for using randomized decision trees,
meaning multiple decision trees that are trained using data extracted randomly from a
large database of training samples: 

(1) Consider a two-class problem for which the training database is grossly
imbalanced in how many majority-class samples it contains vis-a-vis the number of
minority class samples.  Let's assume for a moment that the ratio of majority class
samples to minority class samples is 1000 to 1.  Let's also assume that you have a
test dataset that is drawn randomly from the same population mixture from which the
training database was created.  Now consider a stupid data classification program
that classifies everything as belonging to the majority class.  If you measure the
classification accuracy rate as the ratio of the number of samples correctly
classified to the total number of test samples selected randomly from the population,
this classifier would work with an accuracy of 99.99%. 

(2) Let's now consider another situation in which we are faced with a huge training
database but in which every class is equally well represented.  Feeding all the data
into a single decision tree would be akin to polling all of the population of the
United States for measuring the Coke-versus-Pepsi preference in the country.  You are
likely to get better results if you construct multiple decision trees, each trained
with a collection of training samples drawn randomly from the training database.
After you have created all the decision trees, your final classification decision
could then be based on, say, majority voting by the trees.

In summary, the C<RandomizedTreesForBigData> class allows you to solve the following
two problems: (1) Data classification using the needle-in-a-haystack metaphor, that
is, when a vast majority of your training samples belong to just one class.  And (2)
You have access to a very large database of training samples and you wish to
construct an ensemble of decision trees for classification.

=over 4

=item B<Calling the RandomizedTreesForBigData constructor:>

Here is how you'd call the C<RandomizedTreesForBigData> constructor for
needle-in-a-haystack classification:

    use Algorithm::RandomizedTreesForBigData;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                              csv_cleanup_needed => 1,
             );

Note in particular the constructor parameters:

    looking_for_needles_in_haystack
    how_many_trees

The first of these parameters, C<looking_for_needles_in_haystack>, invokes the logic for
constructing an ensemble of decision trees, each based on a training dataset that
uses all of the minority class samples, and a random drawing from the majority class
samples.

Here is how you'd call the C<RandomizedTreesForBigData> constructor for a more
general attempt at constructing an ensemble of decision trees, with each tree trained
with randomly drawn samples from a large database of training data (without paying
attention to the differences in the sizes of the populations for the different
classes):

    use Algorithm::RandomizedTreesForBigData;
    my $training_datafile = "your_database.csv";
    my $rt = Algorithm::RandomizedTreesForBigData->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 2,
                              csv_columns_for_features => [3,4,5,6,7,8],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              how_many_trees => 3,
                              how_many_training_samples_per_tree => 50,
                              csv_cleanup_needed => 1,
             );

Note in particular the constructor parameters:

    how_many_training_samples_per_tree
    how_many_trees

When you set the C<how_many_training_samples_per_tree> parameter, you are not allowed
to also set the C<looking_for_needles_in_haystack> parameter, and vice versa.

=back

=head2 B<Methods defined for C<RandomizedTreesForBigData> class>

=over 8

=item B<get_training_data_for_N_trees():>

What this method does depends on which of the two constructor parameters ---
C<looking_for_needles_in_haystack> or C<how_many_training_samples_per_tree> --- is
set.  When the former is set, it creates a collection of training datasets for
C<how_many_trees> number of decision trees, with each dataset being a mixture of the
minority class and sample drawn randomly from the majority class.  However, when the
latter option is set, all the datasets are drawn randomly from the training database
with no particular attention given to the relative populations of the two classes.

=item B<show_training_data_for_all_trees():>

As the name implies, this method shows the training data being used for all the
decision trees.  This method is useful for debugging purposes using small datasets.

=item B<calculate_first_order_probabilities():>

Calls on the appropriate method of the main C<DecisionTree class> to estimate the
first-order probabilities for the training dataset to be used for each decision tree.

=item B<calculate_class_priors():>

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
class priors for the training dataset to be used for each decision tree.

=item B<construct_all_decision_trees():>

Calls on the appropriate method of the main C<DecisionTree> class to construct the
decision trees.

=item B<display_all_decision_trees():>

Displays all the decision trees in your terminal window. (The textual form of the
decision trees is written out to the standard output.)

=item B<classify_with_all_trees( $test_sample ):>

The test_sample is sent to each decision tree for classification.

=item B<display_classification_results_for_all_trees():>

The classification decisions returned by the individual decision trees are written
out to the standard output.

=item B<get_majority_vote_classification()>

This method aggregates the classification results returned by the individual decision
trees and returns the majority decision.

=back

=head1 CONSTRUCTING REGRESSION TREES:

Decision tree based modeling requires that the class labels be distinct.  That is,
the training dataset must contain a relatively small number of discrete class labels
for all of your data records if you want to model the data with one or more decision
trees.  However, when one is trying to understand all of the associational
relationships that exist in a large database, one often runs into situations where,
instead of discrete class labels, you have a continuously valued variable as a
dependent variable whose values are predicated on a set of feature values.  It is for
such situations that you will find useful the new class C<RegressionTree> that is now
a part of the C<DecisionTree> module.  The C<RegressionTree> class has been
programmed as a subclass of the main C<DecisionTree> class.

You can think of regression with a regression tree as a powerful generalization of
the very commonly used Linear Regression algorithms.  Although you can certainly
carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
modeling nonlinearities between the predictor variables and the dependent variable,
specifying the degree of the polynomial is often tricky. Additionally, a polynomial
can inject continuities between the predictor and the predicted variables that may
not really exist in the real data.  Regression trees, on the other hand, give you a
piecewise linear relationship between the predictor and the predicted variables that
is freed from the constraints of superimposed continuities at the joins between the
different segments.  See the following tutorial for further information regarding the



( run in 0.652 second using v1.01-cache-2.11-cpan-0bd6704ced7 )