Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                                                                    \@features_and_values,\%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name  with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} =  \@outlist;
        }
    }
    return \%classification_for_display;
}

sub recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $features_and_values = shift;
    my $answer = shift;
    my @features_and_values = @$features_and_values;
    my %answer = %$answer;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n" 
        if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        $value_for_feature = $2 if $feature_tested_at_node eq $1;
    }
    # The following clause introduced in Version 3.20
    if (!defined $value_for_feature) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    if ($value_for_feature) {
        if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
            print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
            my $pattern1 = '(.+)<(.+)';
            my $pattern2 = '(.+)>(.+)';
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1]; 
                if ($last_feature_and_value_on_branch =~ /$pattern1/) {
                    my ($feature, $threshold) = ($1,$2); 
                    if ($value_for_feature <= $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                                             $features_and_values,\%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
                if ($last_feature_and_value_on_branch =~ /$pattern2/) {
                    my ($feature, $threshold) = ($1,$2); 
                    if ($value_for_feature > $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                                            $features_and_values,\%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
            }
            return \%answer if $path_found;
        } else {
            my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
            print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n" 
                if $self->{_debug3};
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1]; 
                if ($last_feature_and_value_on_branch eq $feature_value_combo) {
                    %answer = %{$self->recursive_descent_for_classification($child,
                                                                              $features_and_values,\%answer)};
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    $path_found = 1;
                    last;
                }
            }
            return \%answer if $path_found;
        }
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

##  If you want classification to be carried out by engaging a human user in a
##  question-answer session, this is the method to use for that purpose.  See, for
##  example, the script classify_by_asking_questions.pl in the `examples'
##  subdirectory for an illustration of how to do that.
sub classify_by_asking_questions {
    my $self = shift;
    my $root_node = shift;
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %scratchpad_for_numeric_answers = ();
    foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
        $scratchpad_for_numeric_answers{$feature_name} = undef;
    }
    my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
                                                       \%answer, \%scratchpad_for_numeric_answers)};
    @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} =  \@outlist;
        }
    }
    return \%classification_for_display;
}

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

               (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3}; 
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) { 
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} && 
              $self->{_feature_values_how_many_uniques_hash}->{$best_feature} > 
                                        $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;            # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  = 
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)}; 
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child, 
                                                                  $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child, 
                                                               $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node = 
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
              \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                                                         \@class_probabilities_for_lessthan_child_node,
                              \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                                                         \@class_probabilities_for_greaterthan_child_node,
                            \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds = 
                                                          ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds = 
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds, 
                                           $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                   map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                               \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child = 
                      $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                                                         \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n"; 
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                          \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n" 
            if $self->{_debug3};
        return;
    }
}

##  This is the heart of the decision tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples that
##  correspond to the current node.  The search for the best feature is carried out
##  differently for symbolic features and for numeric features.  For a symbolic
##  feature, the method estimates the entropy for each value of the feature and then
##  averages out these entropies as a measure of the discriminatory power of that
##  features.  For a numeric feature, on the other hand, it estimates the entropy
##  reduction that can be achieved if were to partition the set of training samples
##  for each possible threshold.  For a numeric feature, all possible sampling points
##  relevant to the node in question are considered as candidates for thresholds.
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my $existing_node_entropy = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};      
            my @newvalues = ();
            if (contained_in($feature_name, @true_numeric_types_feature_names)) {
                if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                              $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
                    next;
                } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
                                    $lowerbound{$feature_name} < $upperbound{$feature_name}) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($upperbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x <= $upperbound{$feature_name};
                    }
                } elsif (defined($lowerbound{$feature_name})) {
                    foreach my $x (@values) {
                        push @newvalues, $x if $x > $lowerbound{$feature_name};
                    }
                } else {
                    die "Error is bound specifications in best feature calculator";
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues == 0;
            my @partitioning_entropies = ();            
            foreach my $value (@newvalues) {
                my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
                                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
                                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $partitioning_entropy = $entropy1 * 
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
                                           $entropy2 *
                     $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);

                push @partitioning_entropies, $partitioning_entropy;
                $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
            }
            my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
            if ($min_entropy < $existing_node_entropy) {
                $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
                $entropy_values_for_different_features{$feature_name} = $min_entropy;
            }
        } else {
            print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};
            print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
            my %seen;
            my @values = grep {$_ ne 'NA' && !$seen{$_}++} 
                                    @{$self->{_features_and_unique_values_hash}->{$feature_name}};
            @values = sort @values;
            print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};

            my $entropy = 0;
            foreach my $value (@values) {
                my $feature_value_string = "$feature_name" . '=' . "$value";
                print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
                my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                if (@features_and_values_or_thresholds_on_branch) {
                    push @extended_attributes, $feature_value_string;
                } else {
                    @extended_attributes = ($feature_value_string);
                }
                $entropy += 
           $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) * 
           $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
                print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
                      "is $entropy\n" if $self->{_debug3};
                push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
            }
            if ($entropy < $existing_node_entropy) {
                $entropy_values_for_different_features{$feature_name} = $entropy;
            }
        }
    }
    my $min_entropy_for_best_feature;
    my $best_feature_name;
    foreach my $feature_nom (keys %entropy_values_for_different_features) { 
        if (!defined($best_feature_name)) {
            $best_feature_name = $feature_nom;
            $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
        } else {
            if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
                $best_feature_name = $feature_nom;
                $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
            }
        }
    }
    my $threshold_for_best_feature;
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
    } else {
        $threshold_for_best_feature = undef;
    }
    my $best_feature_entropy = $min_entropy_for_best_feature;
    my @val_based_entropies_to_be_returned;
    my $decision_val_to_be_returned;
    if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} && 
          $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} > 
                                    $self->{_symbolic_to_numeric_cardinality_threshold}) {
        @val_based_entropies_to_be_returned = 
            @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
    } else {
        @val_based_entropies_to_be_returned = ();
    }
    if (exists $partitioning_point_threshold{$best_feature_name}) {

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

            my @sampling_points_for_feature = 
                               @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
            my @actual_values_for_feature = grep {$_ ne 'NA'} 
                                              @{$self->{_features_and_values_hash}->{$feature_name}};
            foreach my $i (0..@sampling_points_for_feature-1) {
                foreach my $j (0..@actual_values_for_feature-1) {
                    if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
                        $counts_at_sampling_points[$i]++
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @counts_at_sampling_points;
            my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
            my %bin_prob_hash = ();
            foreach my $i (0..@sampling_points_for_feature-1) {
                $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
            }
            $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
            my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)} 
                                                                             @sampling_points_for_feature;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        } else {
            my %seen = ();
            my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} 
                                                 @{$self->{_features_and_values_hash}->{$feature_name}};
            @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
            my @value_counts = (0) x @values_for_feature;
#            foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
            foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
                my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
                foreach my $i (0..@values_for_feature-1) {
                    foreach my $current_value (@features_and_values) {
                        $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @value_counts;
            die "PFV Something is wrong with your training file. It contains no training samples \
                         for feature named $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        }
    } else {
        # This section is only for purely symbolic features:  
        my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};        
        @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
        my @value_counts = (0) x @values_for_feature;
#        foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
            foreach my $i (0..@values_for_feature-1) {
                for my $current_value (@features_and_values) {
                    $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                }
            }
        }
        foreach my $i (0..@values_for_feature-1) {
            $self->{_probability_cache}->{$values_for_feature[$i]} = 
                $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
        }
        if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
            return $self->{_probability_cache}->{$feature_and_value};
        } else {
            return 0;
        }
    }
}

sub probability_of_feature_value_given_class {
    my $self = shift;
    my $feature_name = shift;
    my $feature_value = shift;
    my $class_name = shift;
    $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
    if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $feature_value = closest_sampling_point($feature_value, 
                                        $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_value_class;
    if (defined($feature_value)) {
        $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
    }
    if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
        print "\nNext answer returned by cache for feature $feature_name and " .
            "value $feature_value given class $class_name\n" if $self->{_debug2};
        return $self->{_probability_cache}->{$feature_value_class};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);

    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} > 
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
            $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
            @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
            $diffrange = $valuerange[1] - $valuerange[0];
        }
    }
    my @samples_for_class = ();
    # Accumulate all samples names for the given class:
    foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
        if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
            push @samples_for_class, $sample_name;
        }
    }
    if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} > 
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @sampling_points_for_feature = 
                              @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
            my @actual_feature_values_for_samples_in_class = ();
            foreach my $sample (@samples_for_class) {           
                foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
                    my $pattern = '(.+)=(.+)';
                    $feature_and_value =~ /$pattern/;
                    my ($feature, $value) = ($1, $2);
                    if (($feature eq $feature_name) && ($value ne 'NA')) {
                        push @actual_feature_values_for_samples_in_class, $value;
                    }
                }
            }
            foreach my $i (0..@sampling_points_for_feature-1) {
                foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
                    if (abs($sampling_points_for_feature[$i] - 
                            $actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
                        $counts_at_sampling_points[$i]++;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @counts_at_sampling_points;
            die "PFVC1 Something is wrong with your training file. It contains no training " .
                    "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
            my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"} 
                                                                     @sampling_points_for_feature;
            foreach my $i (0..@values_for_feature_and_class-1) {
                $self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
            }
            if (exists $self->{_probability_cache}->{$feature_value_class}) {
                return $self->{_probability_cache}->{$feature_value_class};
            } else {
                return 0;
            }
        } else {
            # This section is for numeric features that will be treated symbolically
            my %seen = ();
            my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} 
                                                 @{$self->{_features_and_values_hash}->{$feature_name}};
            @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
            my @value_counts = (0) x @values_for_feature;
            foreach my $sample (@samples_for_class) {
                my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
                foreach my $i (0..@values_for_feature-1) {
                    foreach my $current_value (@features_and_values) {
                        $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @value_counts;
            die "PFVC2 Something is wrong with your training file. It contains no training " .
                "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
            # We normalize by total_count because the probabilities are conditioned on a given class
            foreach my $i (0..@values_for_feature-1) {
                my $feature_and_value_and_class =  "$values_for_feature[$i]" . "::" . "$class_name";
                $self->{_probability_cache}->{$feature_and_value_and_class} = 
                                                           $value_counts[$i] / (1.0 * $total_counts);
            }
            if (exists $self->{_probability_cache}->{$feature_value_class}) {
                return $self->{_probability_cache}->{$feature_value_class};
            } else {
                return 0;
            }
        }
    } else {
        # This section is for purely symbolic features
        my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
        my %seen = ();
        @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} 
                                             @{$self->{_features_and_values_hash}->{$feature_name}};
        @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
        my @value_counts = (0) x @values_for_feature;
        foreach my $sample (@samples_for_class) {
            my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
            foreach my $i (0..@values_for_feature-1) {
                foreach my $current_value (@features_and_values) {
                    $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                }
            }
        }
        my $total_counts = 0;
        map {$total_counts += $_} @value_counts;
        die "PFVC2 Something is wrong with your training file. It contains no training " .
            "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
        # We normalize by total_count because the probabilities are conditioned on a given class
        foreach my $i (0..@values_for_feature-1) {
            my $feature_and_value_and_class =  "$values_for_feature[$i]" . "::" . "$class_name";
            $self->{_probability_cache}->{$feature_and_value_and_class} = 
                                                       $value_counts[$i] / (1.0 * $total_counts);
        }
        if (exists $self->{_probability_cache}->{$feature_value_class}) {
            return $self->{_probability_cache}->{$feature_value_class};
        } else {
            return 0;
        }
    }
}

sub probability_of_feature_less_than_threshold {
    my $self = shift;
    my $feature_name = shift;
    my $threshold = shift;
    my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
    return $self->{_probability_cache}->{$feature_threshold_combo}
                     if (exists $self->{_probability_cache}->{$feature_threshold_combo});
    my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
    my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
    my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
    $self->{_probability_cache}->{$feature_threshold_combo} = $probability;
    return $probability;
}

sub probability_of_feature_less_than_threshold_given_class {
    my $self = shift;
    my $feature_name = shift;
    my $threshold = shift;
    my $class_name = shift;
    my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
    return $self->{_probability_cache}->{$feature_threshold_class_combo}
                     if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
    my @data_samples_for_class = ();
    # Accumulate all samples names for the given class:
    foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
        push @data_samples_for_class, $sample_name 
                  if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
    }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    return $self->{ _class_probabilities};                    
}

sub get_branch_features_and_values_or_thresholds {
    my $self = shift; 
    return $self->{_branch_features_and_values_or_thresholds};
}

sub add_to_branch_features_and_values {
    my $self = shift;                   
    my $feature_and_value = shift;
    push @{$self->{ _branch_features_and_values }}, $feature_and_value;
}

sub get_children {       
    my $self = shift;                   
    return $self->{_linked_to};
}

sub add_child_link {         
    my ($self, $new_node, ) = @_;                            
    push @{$self->{_linked_to}}, $new_node;                  
}

sub delete_all_links {                  
    my $self = shift;                   
    $self->{_linked_to} = undef;        
}

sub display_node {
    my $self = shift; 
    my $feature_at_node = $self->get_feature() || " ";
    my $node_creation_entropy_at_node = $self->get_node_entropy();
    my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
    my @class_probabilities = @{$self->get_class_probabilities()};
    my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
    my $serial_num = $self->get_serial_num();
    my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
    print "\n\nNODE $serial_num" .
          ":\n   Branch features and values to this node: @branch_features_and_values_or_thresholds" .
          "\n   Class probabilities at current node: @class_probabilities_for_display" .
          "\n   Entropy at current node: $print_node_creation_entropy_at_node" .
          "\n   Best feature test at current node: $feature_at_node\n\n";
}

sub display_decision_tree {
    my $self = shift;
    my $offset = shift;
    my $serial_num = $self->get_serial_num();
    if (@{$self->get_children()} > 0) {
        my $feature_at_node = $self->get_feature() || " ";
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Decision Feature: $feature_at_node    Node Creation Entropy: " ,
              "$print_node_creation_entropy_at_node   Class Probs: @print_class_probabilities_with_class\n\n";
        $offset .= "   ";
        foreach my $child (@{$self->get_children()}) {
            $child->display_decision_tree($offset);
        }
    } else {
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node   " .
              "Class Probs: @print_class_probabilities_with_class\n\n";
    }
}


##############################  Generate Your Own Numeric Training Data  #################################
#############################      Class TrainingDataGeneratorNumeric     ################################

##  See the script generate_training_data_numeric.pl in the examples
##  directory on how to use this class for generating your own numeric training and
##  test data.  The training and test data are generated in accordance with the
##  specifications you place in the parameter file that is supplied as an argument to
##  the constructor of this class.

package TrainingDataGeneratorNumeric;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params3(@params) == 0;   
    bless {
        _output_training_csv_file          =>   $args{'output_training_csv_file'} 
                                                   || croak("name for output_training_csv_file required"),
        _output_test_csv_file              =>   $args{'output_test_csv_file'} 
                                                   || croak("name for output_test_csv_file required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                         || croak("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'} 
                                                         || croak("number_of_samples_for_training"),
        _number_of_samples_for_testing     =>   $args{'number_of_samples_for_testing'} 
                                                         || croak("number_of_samples_for_testing"),
        _debug                             =>    $args{debug} || 0,
        _class_names                       =>    [],
        _class_names_and_priors            =>    {},
        _features_with_value_range         =>    {},
        _features_ordered                  =>    [],
        _classes_and_their_param_values    =>    {},
    }, $class;
}

sub check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_training_csv_file
                            output_test_csv_file
                            parameter_file
                            number_of_samples_for_training
                            number_of_samples_for_testing
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

regression trees.  I have fixed a couple of bugs in the calculation of the regression
coefficients. Additionally, the C<RegressionTree> class now comes with a new
constructor parameter named C<jacobian_choice>.  For most cases, you'd set this
parameter to 0, which causes the regression coefficients to be estimated through
linear least-squares minimization.

B<Version 3.40:> In addition to constructing decision trees, this version of the
module also allows you to construct regression trees. The regression tree capability
has been packed into a separate subclass, named C<RegressionTree>, of the main
C<DecisionTree> class.  The subdirectory C<ExamplesRegression> in the main
installation directory illustrates how you can use this new functionality of the
module.

B<Version 3.30:> This version incorporates four very significant upgrades/changes to
the C<DecisionTree> module: B<(1)> The CSV cleanup is now the default. So you do not
have to set any special parameters in the constructor calls to initiate CSV
cleanup. B<(2)> In the form of a new Perl class named C<RandomizedTreesForBigData>,
this module provides you with an easy-to-use programming interface for attempting
needle-in-a-haystack solutions for the case when your training data is overwhelmingly
dominated by a single class.  You need to set the constructor parameter
C<looking_for_needles_in_haystack> to invoke the logic that constructs multiple
decision trees, each using the minority class samples along with samples drawn
randomly from the majority class.  The final classification is made through a
majority vote from all the decision trees.  B<(3)> Assuming you are faced with a
big-data problem --- in the sense that you have been given a training database with a
very large number of training records --- the class C<RandomizedTreesForBigData> will
also let you construct multiple decision trees by pulling training data randomly from
your training database (without paying attention to the relative populations of the
classes).  The final classification decision for a test sample is based on a majority
vote from all the decision trees thus constructed.  See the C<ExamplesRandomizedTrees>
directory for how to use these new features of the module. And, finally, B<(4)>
Support for the old-style '.dat' training files has been dropped in this version.

B<Version 3.21:> This version makes it easier to use a CSV training file that
violates the assumption that a comma be used only to separate the different field
values in a line record.  Some large econometrics databases use double-quoted values
for fields, and these values may contain commas (presumably for better readability).
This version also allows you to specify the leftmost entry in the first CSV record
that names all the fields. Previously, this entry was required to be an empty
double-quoted string.  I have also made some minor changes to the
'C<get_training_data_from_csv()>' method to make it more user friendly for large
training files that may contain tens of thousands of records.  When pulling training
data from such files, this method prints out a dot on the terminal screen for every
10000 records it has processed. 

B<Version 3.20:> This version brings the boosting capability to the C<DecisionTree>
module.

B<Version 3.0:> This version adds bagging to the C<DecisionTree> module. If your
training dataset is large enough, you can ask the module to construct multiple
decision trees using data bags extracted from your dataset.  The module can show you
the results returned by the individual decision trees and also the results obtained
by taking a majority vote of the classification decisions made by the individual
trees.  You can specify any arbitrary extent of overlap between the data bags.

B<Version 2.31:> The introspection capability in this version packs more of a punch.
For each training data sample, you can now figure out not only the decision-tree
nodes that are affected directly by that sample, but also those nodes that are
affected indirectly through the generalization achieved by the probabilistic modeling
of the data.  The 'examples' directory of this version includes additional scripts
that illustrate these enhancements to the introspection capability.  See the section
"The Introspection API" for a declaration of the introspection related methods, old
and new.

B<Version 2.30:> In response to requests from several users, this version includes a new
capability: You can now ask the module to introspect about the classification
decisions returned by the decision tree.  Toward that end, the module includes a new
class named C<DTIntrospection>.  Perhaps the most important bit of information you
are likely to seek through DT introspection is the list of the training samples that
fall directly in the portion of the feature space that is assigned to a node.
B<CAVEAT:> When training samples are non-uniformly distributed in the underlying
feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE.  B<(This is an important
part of the generalization achieved by probabilistic modeling of the training data.)>
For additional information related to DT introspection, see the section titled
"DECISION TREE INTROSPECTION" in this documentation page.

B<Version 2.26> fixes a bug in the part of the module that some folks use for generating
synthetic data for experimenting with decision tree construction and classification.
In the class C<TrainingDataGeneratorNumeric> that is a part of the module, there
was a problem with the order in which the features were recorded from the
user-supplied parameter file.  The basic code for decision tree construction and
classification remains unchanged.

B<Version 2.25> further downshifts the required version of Perl for this module.  This
was a result of testing the module with Version 5.10.1 of Perl.  Only one statement
in the module code needed to be changed for the module to work with the older version
of Perl.

B<Version 2.24> fixes the C<Makefile.PL> restriction on the required Perl version.  This
version should work with Perl versions 5.14.0 and higher.

B<Version 2.23> changes the required version of Perl from 5.18.0 to 5.14.0.  Everything
else remains the same.

B<Version 2.22> should prove more robust when the probability distribution for the
values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
observations can occur with significant probabilities.  A new option in the
DecisionTree constructor lets the user specify the precision with which the
probability distributions are estimated for such features.

B<Version 2.21> fixes a bug that was caused by the explicitly set zero values for
numerical features being misconstrued as "false" in the conditional statements in
some of the method definitions.

B<Version 2.2> makes it easier to write code for classifying in one go all of your test
data samples in a CSV file.  The bulk classifications obtained can be written out to
either a CSV file or to a regular text file.  See the script
C<classify_test_data_in_a_file_numeric.pl> in the C<Examples> directory for how to
classify all of your test data records in a CSV file.  This version also includes
improved code for generating synthetic numeric/symbolic training and test data
records for experimenting with the decision tree classifier.

B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
cross-validation test on the data.  This test divides all of the training data into
ten parts, with nine parts used for training a decision tree and one part used for
testing its ability to classify correctly. This selection of nine parts for training
and one part for testing is carried out in all of the ten different ways that are
possible.  This testing functionality in Version 2.1 can also be used to find the
best values to use for the constructor parameters C<entropy_threshold>,
C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.

B<Version 2.0 is a major rewrite of this module.> Now you can use both numeric and
symbolic features for constructing a decision tree. A feature is numeric if it can
take any floating-point value over an interval.

B<Version 1.71> fixes a bug in the code that was triggered by 0 being declared as one of
the features values in the training datafile. Version 1.71 also include an additional
safety feature that is useful for training datafiles that contain a very large number
of features.  The new version makes sure that the number of values you declare for
each sample record matches the number of features declared at the beginning of the
training datafile.

B<Version 1.7> includes safety checks on the consistency of the data you place in your
training datafile. When a training file contains thousands of samples, it is

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

script 

    construct_dt_for_heavytailed.pl 

in the C<Examples> directory shows an example of how to call the constructor of the
module with the C<number_of_histogram_bins> option.


=head1 TESTING THE QUALITY OF YOUR TRAINING DATA

Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
training data to test its ability to discriminate between the classes mentioned in
the training file.

The 10-fold cross-validation test divides all of the training data into ten parts,
with nine parts used for training a decision tree and one part used for testing its
ability to classify correctly. This selection of nine parts for training and one part
for testing is carried out in all of the ten different possible ways.

The following code fragment illustrates how you invoke the testing function of the
EvalTrainingData class:

    my $training_datafile = "training.csv";                                         
    my $eval_data = EvalTrainingData->new(
                                  training_datafile => $training_datafile,
                                  csv_class_column_index => 1,
                                  csv_columns_for_features => [2,3],
                                  entropy_threshold => 0.01,
                                  max_depth_desired => 3,
                                  symbolic_to_numeric_cardinality_threshold => 10,
                                  csv_cleanup_needed => 1,
                    );
    $eval_data->get_training_data();
    $eval_data->evaluate_training_data()

The last statement above prints out a Confusion Matrix and the value of Training Data
Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
cross-validation test.

This testing functionality can also be used to find the best values to use for the
constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.

The following two scripts in the C<Examples> directory illustrate the use of the
C<EvalTrainingData> class for testing the quality of your data:

    evaluate_training_data1.pl
    evaluate_training_data2.pl


=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS

Assuming your training data is good, the quality of the results you get from a
decision tree would depend on the choices you make for the constructor parameters
C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.  You can optimize your choices for
these parameters by running the 10-fold cross-validation test that is made available
in Versions 2.2 and higher through the new class C<EvalTrainingData> that is included
in the module file.  A description of how to run this test is in the previous section
of this document.


=head1 DECISION TREE INTROSPECTION

Starting with Version 2.30, you can ask the C<DTIntrospection> class of the module to
explain the classification decisions made at the different nodes of the decision
tree.

Perhaps the most important bit of information you are likely to seek through DT
introspection is the list of the training samples that fall directly in the portion
of the feature space that is assigned to a node.

However, note that, when training samples are non-uniformly distributed in the
underlying feature space, it is possible for a node to exist even when there are no
training samples in the portion of the feature space assigned to the node.  That is
because the decision tree is constructed from the probability densities estimated
from the training data.  When the training samples are non-uniformly distributed, it
is entirely possible for the estimated probability densities to be non-zero in a
small region around a point even when there are no training samples specifically in
that region.  (After you have created a statistical model for, say, the height
distribution of people in a community, the model may return a non-zero probability
for the height values in a small interval even if the community does not include a
single individual whose height falls in that interval.)

That a decision-tree node can exist even when there are no training samples in that
portion of the feature space that belongs to the node is an important indication of
the generalization ability of a decision-tree-based classifier.

In light of the explanation provided above, before the DTIntrospection class supplies
any answers at all, it asks you to accept the fact that features can take on non-zero
probabilities at a point in the feature space even though there are zero training
samples at that point (or in a small region around that point).  If you do not accept
this rudimentary fact, the introspection class will not yield any answers (since you
are not going to believe the answers anyway).

The point made above implies that the path leading to a node in the decision tree may
test a feature for a certain value or threshold despite the fact that the portion of
the feature space assigned to that node is devoid of any training data.

See the following three scripts in the Examples directory for how to carry out DT
introspection:

    introspection_in_a_loop_interactive.pl

    introspection_show_training_samples_at_all_nodes_direct_influence.pl

    introspection_show_training_samples_to_nodes_influence_propagation.pl

The first script places you in an interactive session in which you will first be
asked for the node number you are interested in.  Subsequently, you will be asked for
whether or not you are interested in specific questions that the introspection can
provide answers for. The second script descends down the decision tree and shows for
each node the training samples that fall directly in the portion of the feature space
assigned to that node.  The third script shows for each training sample how it
affects the decision-tree nodes either directly or indirectly through the
generalization achieved by the probabilistic modeling of the data.

The output of the script
C<introspection_show_training_samples_at_all_nodes_direct_influence.pl> looks like:

    Node 0: the samples are: None
    Node 1: the samples are: [sample_46 sample_58]
    Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
    Node 3: the samples are: []
    Node 4: the samples are: []
    ...
    ...            

The nodes for which no samples are listed come into existence through
the generalization achieved by the probabilistic modeling of the data.

The output produced by the script
C<introspection_show_training_samples_to_nodes_influence_propagation.pl> looks like

    sample_1:                                                                 
       nodes affected directly: [2 5 19 23]                                
       nodes affected through probabilistic generalization:                   
            2=> [3 4 25]                                                    
                25=> [26]                                                     
            5=> [6]                                                           
                6=> [7 13]                                                   
                    7=> [8 11]                                               
                        8=> [9 10]                                           
                        11=> [12]                                             
                    13=> [14 18]                                             
                        14=> [15 16]                                         
                            16=> [17]                                         
            19=> [20]                                                         
                20=> [21 22]                                                 
            23=> [24]                                                         
                             
    sample_4:                                                                 
       nodes affected directly: [2 5 6 7 11]                              
       nodes affected through probabilistic generalization:                   
            2=> [3 4 25]                                                    
                25=> [26]                                                     
            5=> [19]                                                          
                19=> [20 23]                                                 
                    20=> [21 22]                                             
                    23=> [24]                                                 
            6=> [13]                                                          
                13=> [14 18]                                                 
                    14=> [15 16]                                             
                        16=> [17]                                             
            7=> [8]                                                           
                8=> [9 10]                                                   
            11=> [12]                                                         
                                                                              
    ...                                                                       
    ...  
    ...

For each training sample, the display shown above first presents the list of nodes
that are directly affected by the sample.  A node is affected directly by a sample if
the latter falls in the portion of the feature space that belongs to the former.
Subsequently, for each training sample, the display shows a subtree of the nodes that
are affected indirectly by the sample through the generalization achieved by the
probabilistic modeling of the data.  In general, a node is affected indirectly by a
sample if it is a descendant of another node that is affected directly.

Also see the section titled B<The Introspection API> regarding how to invoke the
introspection capabilities of the module in your own code.

=head1 METHODS

The module provides the following methods for constructing a decision tree from
training data in a disk file and for classifying new data records with the decision
tree thus constructed:

=over 4

=item B<new():>

    my $dt = Algorithm::DecisionTree->new( 
                              training_datafile => $training_datafile,
                              csv_class_column_index => 2,
                              csv_columns_for_features => [3,4,5,6,7,8],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              csv_cleanup_needed => 1,
            );

A call to C<new()> constructs a new instance of the C<Algorithm::DecisionTree> class.
For this call to make sense, the training data in the training datafile must be
in the CSV format.  

=back

=head2 The Constructor Parameters

=over 8

=item C<training_datafile>:

This parameter supplies the name of the file that contains the training data.

=item C<csv_class_column_index>:

When using a CSV file for your training data, this parameter supplies the zero-based
column index for the column that contains the class label for each data record in the
training file.


=item C<csv_cleanup_needed>:

You need to set this parameter to 1 if your CSV file has double quoted strings (which
may include commas) as values for the fields and if such values are allowed to
include commas for, presumably, better readability.

=item C<csv_columns_for_features>:

When using a CSV file for your training data, this parameter supplies a list of
columns corresponding to the features you wish to use for decision tree construction.
Each column is specified by its zero-based index.

=item C<entropy_threshold>:

This parameter sets the granularity with which the entropies are sampled by the
module.  For example, a feature test at a node in the decision tree is acceptable if
the entropy gain achieved by the test exceeds this threshold.  The larger the value
you choose for this parameter, the smaller the tree.  Its default value is 0.001.

=item C<max_depth_desired>:

This parameter sets the maximum depth of the decision tree.  For obvious reasons, the
smaller the value you choose for this parameter, the smaller the tree.

=item C<symbolic_to_numeric_cardinality_threshold>:

This parameter allows the module to treat an otherwise numeric feature symbolically
if the number of different values the feature takes in the training data file does
not exceed the value of this parameter.

=item C<number_of_histogram_bins>:

This parameter gives the user the option to set the number of points at which the
value range for a feature should be sampled for estimating the probabilities.  This
parameter is effective only for those features that occupy a large value range and
whose probability distributions are heavy tailed.  B<This parameter is also important
when you have a very large training dataset:> In general, the larger the dataset, the
smaller the smallest difference between any two values for a numeric feature in
relation to the overall range of values for that feature. In such cases, the module
may use too large a number of bins for estimating the probabilities and that may slow
down the calculation of the decision tree.  You can get around this difficulty by
explicitly giving a value to the 'C<number_of_histogram_bins>' parameter.

=back


You can choose the best values to use for the last three constructor parameters by
running a 10-fold cross-validation test on your training data through the class
C<EvalTrainingData> that comes with Versions 2.1 and higher of this module.  See the
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.

=over

=item B<get_training_data():>

After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
you must now read in the training data that is the file named in the call to the
constructor.  This you do by:

    $dt->get_training_data(); 


=item B<show_training_data():>

If you wish to see the training data that was just digested by the module,
call 

    $dt->show_training_data(); 

=item B<calculate_first_order_probabilities():>

=item B<calculate_class_priors():>

After the module has read the training data file, it needs to initialize the
probability cache.  This you do by invoking:

    $dt->calculate_first_order_probabilities()
    $dt->calculate_class_priors() 

=item B<construct_decision_tree_classifier():>

With the probability cache initialized, it is time to construct a decision tree
classifier.  This you do by

    my $root_node = $dt->construct_decision_tree_classifier();

This call returns an instance of type C<DTNode>.  The C<DTNode> class is defined
within the main package file.  So, don't forget, that C<$root_node> in the above
example call will be instantiated to an object of type C<DTNode>.

=item B<$root_nodeC<< -> >>display_decision_tree(" "):>

    $root_node->display_decision_tree("   ");

This will display the decision tree in your terminal window by using a recursively
determined offset for each node as the display routine descends down the tree.

I have intentionally left the syntax fragment C<$root_node> in the above call to
remind the reader that C<display_decision_tree()> is NOT called on the instance of
the C<DecisionTree> we constructed earlier, but on the C<DTNode> instance returned by
the call to C<construct_decision_tree_classifier()>.

=item B<classify($root_node, \@test_sample):>

Let's say you want to classify the following data record:

    my @test_sample  = qw /  g2=4.2
                             grade=2.3
                             gleason=4
                             eet=1.7

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

This method allows you to use a decision-tree based classifier in an interactive
mode.  In this mode, a user is prompted for answers to the questions pertaining to
the feature tests at the nodes of the tree.  The syntax for invoking this method is:

    my $classification = $dt->classify_by_asking_questions($root_node);

where C<$dt> is an instance of the C<Algorithm::DecisionTree> class returned by a
call to C<new()> and C<$root_node> the root node of the decision tree returned by a
call to C<construct_decision_tree_classifier()>.

=back


=head1 THE INTROSPECTION API

To construct an instance of C<DTIntrospection>, you call

    my $introspector = DTIntrospection->new($dt);

where you supply the instance of the C<DecisionTree> class you used for constructing
the decision tree through the parameter C<$dt>.  After you have constructed an
instance of the introspection class, you must initialize it by

    $introspector->initialize();

Subsequently, you can invoke either of the following methods:

    $introspector->explain_classification_at_one_node($node);

    $introspector->explain_classifications_at_multiple_nodes_interactively();

depending on whether you want introspection at a single specified node or inside an
infinite loop for an arbitrary number of nodes.

If you want to output a tabular display that shows for each node in the decision tree
all the training samples that fall in the portion of the feature space that belongs
to that node, call

    $introspector->display_training_samples_at_all_nodes_direct_influence_only();

If you want to output a tabular display that shows for each training sample a list of
all the nodes that are affected directly AND indirectly by that sample, call

    $introspector->display_training_training_samples_to_nodes_influence_propagation();

A training sample affects a node directly if the sample falls in the portion of the
features space assigned to that node. On the other hand, a training sample is
considered to affect a node indirectly if the node is a descendant of a node that is
affected directly by the sample.


=head1 BULK CLASSIFICATION OF DATA RECORDS

For large test datasets, you would obviously want to process an entire file of test
data at a time. The following scripts in the C<Examples> directory illustrate how you
can do that:

      classify_test_data_in_a_file.pl

This script requires three command-line arguments, the first argument names the
training datafile, the second the test datafile, and the third the file in which the
classification results are to be deposited.  

The other examples directories, C<ExamplesBagging>, C<ExamplesBoosting>, and
C<ExamplesRandomizedTrees>, also contain scripts that illustrate how to carry out
bulk classification of data records when you wish to take advantage of bagging,
boosting, or tree randomization.  In their respective directories, these scripts are
named:

    bagging_for_bulk_classification.pl
    boosting_for_bulk_classification.pl
    classify_database_records.pl


=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED

It depends on whether you apply the classifier at once to all the data samples in a
file, or whether you feed one data sample at a time into the classifier.

In general, the classifier returns soft classification for a test data vector.  What
that means is that, in general, the classifier will list all the classes to which a
given data vector could belong and the probability of each such class label for the
data vector. Run the examples scripts in the Examples directory to see how the output
of classification can be displayed.

With regard to the soft classifications returned by this classifier, if the
probability distributions for the different classes overlap in the underlying feature
space, you would want the classifier to return all of the applicable class labels for
a data vector along with the corresponding class probabilities.  Another reason for
why the decision tree classifier may associate significant probabilities with
multiple class labels is that you used inadequate number of training samples to
induce the decision tree.  The good thing is that the classifier does not lie to you
(unlike, say, a hard classification rule that would return a single class label
corresponding to the partitioning of the underlying feature space).  The decision
tree classifier give you the best classification that can be made given the training
data you fed into it.


=head1 USING BAGGING

Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
comes with the module to incorporate bagging in your decision tree based
classification.  Bagging means constructing multiple decision trees for different
(possibly overlapping) segments of the data extracted from your training dataset and
then aggregating the decisions made by the individual decision trees for the final
classification.  The aggregation of the classification decisions can average out the
noise and bias that may otherwise affect the classification decision obtained from
just one tree.

=over 4

=item B<Calling the bagging constructor::>

A typical call to the constructor for the C<DecisionTreeWithBagging> class looks
like:

    use Algorithm::DecisionTreeWithBagging;
    
    my $training_datafile = "stage3cancer.csv";
    
    my $dtbag = Algorithm::DecisionTreeWithBagging->new(

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN


Decision tree based modeling requires that the class labels be distinct.  That is,
the training dataset must contain a relatively small number of discrete class labels
for all of your data records if you want to model the data with one or more decision
trees.  However, when one is trying to understand all of the associational
relationships that exist in a large database, one often runs into situations where,
instead of discrete class labels, you have a continuously valued variable as a
dependent variable whose values are predicated on a set of feature values.  It is for
such situations that you will find useful the new class C<RegressionTree> that is now
a part of the C<DecisionTree> module.  The C<RegressionTree> class has been
programmed as a subclass of the main C<DecisionTree> class.

You can think of regression with a regression tree as a powerful generalization of
the very commonly used Linear Regression algorithms.  Although you can certainly
carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
modeling nonlinearities between the predictor variables and the dependent variable,
specifying the degree of the polynomial is often tricky. Additionally, a polynomial
can inject continuities between the predictor and the predicted variables that may
not really exist in the real data.  Regression trees, on the other hand, give you a
piecewise linear relationship between the predictor and the predicted variables that
is freed from the constraints of superimposed continuities at the joins between the
different segments.  See the following tutorial for further information regarding the
standard linear regression approach and the regression that can be achieved with the
RegressionTree class in this module:
L<https://engineering.purdue.edu/kak/Tutorials/RegressionTree.pdf>

The RegressionTree class in the current version of the module assumes that all of
your data is numerical.  That is, unlike what is possible with the DecisionTree class
(and the other more closely related classes in this module) that allow your training
file to contain a mixture of numerical and symbolic data, the RegressionTree class
requires that ALL of your data be numerical.  I hope to relax this constraint in
future versions of this module.  Obviously, the dependent variable will always be
numerical for regression.

See the example scripts in the directory C<ExamplesRegression> if you wish to become
more familiar with the regression capabilities of the module.

=over 4

=item B<Calling the RegressionTree constructor:>

    my $training_datafile = "gendata5.csv";
    my $rt = Algorithm::RegressionTree->new(
                              training_datafile => $training_datafile,
                              dependent_variable_column => 2,
                              predictor_columns => [1],
                              mse_threshold => 0.01,
                              max_depth_desired => 2,
                              jacobian_choice => 0,
                              csv_cleanup_needed => 1,
             );

Note in particular the constructor parameters:

    dependent_variable
    predictor_columns
    mse_threshold
    jacobian_choice

The first of these parameters, C<dependent_variable>, is set to the column index in
the CSV file for the dependent variable.  The second constructor parameter,
C<predictor_columns>, tells the system as to which columns contain values for the
predictor variables.  The third parameter, C<mse_threshold>, is for deciding when to
partition the data at a node into two child nodes as a regression tree is being
constructed.  If the minmax of MSE (Mean Squared Error) that can be achieved by
partitioning any of the features at a node is smaller than C<mse_threshold>, that
node becomes a leaf node of the regression tree.

The last parameter, C<jacobian_choice>, must be set to either 0 or 1 or 2.  Its
default value is 0. When this parameter equals 0, the regression coefficients are
calculated using the linear least-squares method and no further "refinement" of the
coefficients is carried out using gradient descent.  This is the fastest way to
calculate the regression coefficients.  When C<jacobian_choice> is set to 1, you get
a weak version of gradient descent in which the Jacobian is set to the "design
matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
approximation to the Jacobian.  That, however, is at a cost of much longer
computation time.  B<NOTE:> For most cases, using 0 for C<jacobian_choice> is the
best choice.  See my tutorial "I<Linear Regression and Regression Trees>" for why
that is the case.

=back

=head2 B<Methods defined for C<RegressionTree> class>

=over 8

=item B<get_training_data_for_regression():>

Only CSV training datafiles are allowed. Additionally, the first record in the file
must list the names of the fields, and the first column must contain an integer ID
for each record.

=item B<construct_regression_tree():>

As the name implies, this is the method that construct a regression tree.

=item B<display_regression_tree("     "):>

Displays the regression tree, as the name implies.  The white-space string argument
specifies the offset to use in displaying the child nodes in relation to a parent
node.

=item B<prediction_for_single_data_point( $root_node, $test_sample ):>

You call this method after you have constructed a regression tree if you want to
calculate the prediction for one sample.  The parameter C<$root_node> is what is
returned by the call C<construct_regression_tree()>.  The formatting of the argument
bound to the C<$test_sample> parameter is important.  To elaborate, let's say you are
using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
this case, the C<$test_sample> parameter will be bound to a list that will look like

    ['xvar1 = 23.4', 'xvar2 = 12.9'] 

Arbitrary amount of white space, including none, on the two sides of the equality
symbol is allowed in the construct shown above.  A call to this method returns a
dictionary with two key-value pairs.  One of the keys is called C<solution_path> and
the other C<prediction>.  The value associated with key C<solution_path> is the path
in the regression tree to the leaf node that yielded the prediction.  And the value
associated with the key C<prediction> is the answer you are looking for.

=item B<predictions_for_all_data_used_for_regression_estimation( $root_node ):>

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

C<construct_regression_tree()>.  The values for the dependent variable thus predicted
can be seen by calling C<display_all_plots()>, which is the method mentioned below.

=item B<display_all_plots():>

This method displays the results obtained by calling the prediction method of the
previous entry.  This method also creates a hardcopy of the plots and saves it as a
C<.png> disk file. The name of this output file is always C<regression_plots.png>.

=item B<mse_for_tree_regression_for_all_training_samples( $root_node ):>

This method carries out an error analysis of the predictions for the samples in your
training datafile.  It shows you the overall MSE (Mean Squared Error) with tree-based
regression, the MSE for the data samples at each of the leaf nodes of the regression
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
The parameter C<$root_node> in the call syntax is what is returned by the call to
C<construct_regression_tree()>.

=item B<bulk_predictions_for_data_in_a_csv_file( $root_node, $filename, $columns ):>

Call this method if you want to apply the regression tree to all your test data in a
disk file.  The predictions for all of the test samples in the disk file are written
out to another file whose name is the same as that of the test file except for the
addition of C<_output> in the name of the file.  The parameter C<$filename> is the
name of the disk file that contains the test data. And the parameter C<$columns> is a
list of the column indices for the predictor variables in the test file.

=back

=head1 GENERATING SYNTHETIC TRAINING DATA

The module file contains the following additional classes: (1)
C<TrainingDataGeneratorNumeric>, and (2) C<TrainingDataGeneratorSymbolic> for
generating synthetic training data.

The class C<TrainingDataGeneratorNumeric> outputs one CSV file for the
training data and another one for the test data for experimenting with numeric
features.  The numeric values are generated using a multivariate Gaussian
distribution whose mean and covariance are specified in a parameter file. See the
file C<param_numeric.txt> in the C<Examples> directory for an example of such a
parameter file.  Note that the dimensionality of the data is inferred from the
information you place in the parameter file.

The class C<TrainingDataGeneratorSymbolic> generates synthetic training for the
purely symbolic case.  The relative frequencies of the different possible values for
the features is controlled by the biasing information you place in a parameter file.
See C<param_symbolic.txt> for an example of such a file.


=head1 THE C<Examples> DIRECTORY

See the C<Examples> directory in the distribution for how to construct a decision
tree, and how to then classify new data using the decision tree.  To become more
familiar with the module, run the scripts

    construct_dt_and_classify_one_sample_case1.pl
    construct_dt_and_classify_one_sample_case2.pl
    construct_dt_and_classify_one_sample_case3.pl
    construct_dt_and_classify_one_sample_case4.pl

The first script is for the purely symbolic case, the second for the case that
involves both numeric and symbolic features, the third for the case of purely numeric
features, and the last for the case when the training data is synthetically generated
by the script C<generate_training_data_numeric.pl>.

Next run the following script as it is for bulk classification of data records placed
in a CSV file:

    classify_test_data_in_a_file.pl   training4.csv   test4.csv   out4.csv

The script first constructs a decision tree using the training data in the training
file supplied by the first argument file C<training4.csv>.  The script then
calculates the class label for each data record in the test data file supplied
through the second argument file, C<test4.csv>.  The estimated class labels are
written out to the output file which in the call shown above is C<out4.csv>.  An
important thing to note here is that your test file --- in this case C<test4.csv> ---
must have a column for class labels.  Obviously, in real-life situations, there will
be no class labels in this column.  What that is the case, you can place an empty
string C<""> there for each data record. This is demonstrated by the following call:

    classify_test_data_in_a_file.pl   training4.csv   test4_no_class_labels.csv   out4.csv

The following script in the C<Examples> directory

    classify_by_asking_questions.pl

shows how you can use a decision-tree classifier interactively.  In this mode, you
first construct the decision tree from the training data and then the user is
prompted for answers to the feature tests at the nodes of the tree.

If your training data has a feature whose values span a large range and, at the same
time, are characterized by a heavy-tail distribution, you should look at the script

    construct_dt_for_heavytailed.pl                                                     

to see how to use the option C<number_of_histogram_bins> in the call to the
constructor.  This option was introduced in Version 2.22 for dealing with such
features.  If you do not set this option, the module will use the default value of
500 for the number of points at which to sample the value range for such a feature.

The C<Examples> directory also contains the following scripts:

    generate_training_data_numeric.pl
    generate_training_data_symbolic.pl

that show how you can use the module to generate synthetic training.  Synthetic
training is generated according to the specifications laid out in a parameter file.
There are constraints on how the information is laid out in a parameter file.  See
the files C<param_numeric.txt> and C<param_symbolic.txt> in the C<Examples> directory
for how to structure these files.

The C<Examples> directory of Versions 2.1 and higher of the module also contains the
following two scripts:

    evaluate_training_data1.pl
    evaluate_training_data2.pl

that illustrate how the Perl class C<EvalTrainingData> can be used to evaluate the
quality of your training data (as long as it resides in a `C<.csv>' file.)  This new
class is a subclass of the C<DecisionTree> class in the module file.  See the README
in the C<Examples> directory for further information regarding these two scripts.

The C<Examples> directory of Versions 2.31 and higher of the module contains the
following three scripts:

    introspection_in_a_loop_interactive.pl

    introspection_show_training_samples_at_all_nodes_direct_influence.pl

    introspection_show_training_samples_to_nodes_influence_propagation.pl

The first script illustrates how to use the C<DTIntrospection> class of the module
interactively for generating explanations for the classification decisions made at
the nodes of the decision tree.  In the interactive session you are first asked for
the node number you are interested in.  Subsequently, you are asked for whether or
not you are interested in specific questions that the introspector can provide
answers for. The second script generates a tabular display that shows for each node
of the decision tree a list of the training samples that fall directly in the portion
of the feature space assigned that node.  (As mentioned elsewhere in this
documentation, when this list is empty for a node, that means the node is a result of
the generalization achieved by probabilistic modeling of the data.  Note that this
module constructs a decision tree NOT by partitioning the set of training samples,
BUT by partitioning the domains of the probability density functions.)  The third
script listed above also generates a tabular display, but one that shows how the
influence of each training sample propagates in the tree.  This display first shows
the list of nodes that are affected directly by the data in a training sample. This
list is followed by an indented display of the nodes that are affected indirectly by
the training sample.  A training sample affects a node indirectly if the node is a
descendant of one of the nodes affected directly.

The latest addition to the Examples directory is the script:

    get_indexes_associated_with_fields.py

As to why you may find this script useful, note that large database files may have
hundreds of fields and it is not always easy to figure out what numerical index is
associated with a given field.  At the same time, the constructor of the DecisionTree
module requires that the field that holds the class label and the fields that contain
the feature values be specified by their numerical zero-based indexes.  If you have a
large database and you are faced with this problem, you can run this script to see
the zero-based numerical index values associated with the different columns of your
CSV file.


=head1 THE C<ExamplesBagging> DIRECTORY

The C<ExamplesBagging> directory contains the following scripts:

    bagging_for_classifying_one_test_sample.pl
                                                                                               
    bagging_for_bulk_classification.pl

As the names of the scripts imply, the first shows how to call the different methods
of the C<DecisionTreeWithBagging> class for classifying a single test sample.  When
you are classifying a single test sample, you can also see how each bag is
classifying the test sample.  You can, for example, display the training data used in
each bag, the decision tree constructed for each bag, etc.

The second script is for the case when you place all of the test samples in a single
file.  The demonstration script displays for each test sample a single aggregate
classification decision that is obtained through majority voting by all the decision
trees.


=head1 THE C<ExamplesBoosting> DIRECTORY

The C<ExamplesBoosting> subdirectory in the main installation directory contains the
following three scripts:

    boosting_for_classifying_one_test_sample_1.pl

    boosting_for_classifying_one_test_sample_2.pl

    boosting_for_bulk_classification.pl

As the names of the first two scripts imply, these show how to call the different
methods of the C<BoostedDecisionTree> class for classifying a single test sample.
When you are classifying a single test sample, you can see how each stage of the
cascade of decision trees is classifying the test sample.  You can also view each
decision tree separately and also see the trust factor associated with the tree.

The third script is for the case when you place all of the test samples in a single
file.  The demonstration script outputs for each test sample a single aggregate
classification decision that is obtained through trust-factor weighted majority
voting by all the decision trees.

=head1 THE C<ExamplesRandomizedTrees> DIRECTORY

The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
become more familiar with the C<RandomizedTreesForBigData> class for solving
needle-in-a-haystack and big-data data classification problems. These scripts are:

    randomized_trees_for_classifying_one_test_sample_1.pl

    randomized_trees_for_classifying_one_test_sample_2.pl

    classify_database_records.pl

The first script shows the constructor options to use for solving a
needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
training data belongs to just one class.  The second script shows the constructor
options for using randomized decision trees for the case when you have access to a
very large database of training samples and you'd like to construct an ensemble of
decision trees using training samples pulled randomly from the training database.
The last script illustrates how you can evaluate the classification power of an
ensemble of decision trees as constructed by C<RandomizedTreesForBigData> by classifying
a large number of test samples extracted randomly from the training database.


=head1 THE C<ExamplesRegression> DIRECTORY

The C<ExamplesRegression> subdirectory in the main installation directory shows
example scripts that you can use to become familiar with regression trees and how
they can be used for nonlinear regression.  If you are new to the concept of
regression trees, start by executing the following scripts without changing them and
see what sort of output is produced by them:

    regression4.pl

    regression5.pl

    regression6.pl

    regression8.pl

The C<regression4.pl> script involves only one predictor variable and one dependent
variable. The training data for this exercise is drawn from the file C<gendata4.csv>.
This data file contains strongly nonlinear data.  When you run the script
C<regression4.pl>, you will see how much better the result from tree regression is
compared to what you can get with linear regression.

The C<regression5.pl> script is essentially the same as the previous script except
for the fact that the training datafile used in this case, C<gendata5.csv>, consists
of three noisy segments, as opposed to just two in the previous case.

The script C<regression6.pl> deals with the case when we have two predictor variables
and one dependent variable.  You can think of the data as consisting of noisy height
values over an C<(x1,x2)> plane.  The data used in this script is drawn from the csv
file C<gen3Ddata1.csv>.

Finally, the script C<regression8.pl> shows how you can carry out bulk prediction for
all your test data records in a disk file.  The script writes all the calculated
predictions into another disk file whose name is derived from the name of the test
data file.


=head1 EXPORT

None by design.

=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'DecisionTree' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-DecisionTree-3.43.tar.gz



( run in 0.943 second using v1.01-cache-2.11-cpan-39bf76dae61 )