Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                           if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                   =>    $args{training_datafile}, 
        _entropy_threshold                   =>    $args{entropy_threshold} || 0.01,
        _max_depth_desired                   =>    exists $args{max_depth_desired} ? 
                                                                       $args{max_depth_desired} : undef,
        _debug1                              =>    $args{debug1} || 0,
        _debug2                              =>    $args{debug2} || 0,
        _debug3                              =>    $args{debug3} || 0,
        _csv_class_column_index              =>    $args{csv_class_column_index} || undef,
        _csv_columns_for_features            =>    $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold
                                             =>    $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins            =>    $args{number_of_histogram_bins} || undef,
        _csv_cleanup_needed                  =>    $args{csv_cleanup_needed} || 0,
        _training_data                       =>    [],
        _root_node                           =>    undef,
        _probability_cache                   =>    {},
        _entropy_cache                       =>    {},
        _training_data_hash                  =>    {},
        _features_and_values_hash            =>    {},
        _samples_class_label_hash            =>    {},
        _class_names                         =>    [],
        _class_priors                        =>    [],
        _class_priors_hash                   =>    {},
        _feature_names                       =>    [],
        _numeric_features_valuerange_hash    =>    {},
        _sampling_points_for_numeric_feature_hash      =>      {},
        _feature_values_how_many_uniques_hash          =>      {},
        _prob_distribution_numeric_features_hash       =>      {},
        _histogram_delta_hash                          =>      {},
        _num_of_histogram_bins_hash                    =>      {},
    }, $class;

}

####################################  Classify with Decision Tree  #######################################

##  Classifies one test sample at a time using the decision tree constructed from
##  your training file.  The data record for the test sample must be supplied as
##  shown in the scripts in the `examples' subdirectory.  See the scripts
##  construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
sub classify {
    my $self = shift;
    my $root_node = shift;
    my $feature_and_values = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my @features_and_values = @$feature_and_values;
    @features_and_values = @{deep_copy_array(\@features_and_values)};
    die "\n\nError in the names you have used for features and/or values.  " .
        "Try using the csv_cleanup_needed option in the constructor call." 
                        unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        my ($feature, $value) = ($1, $2);
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
                                                                       $not_all_values_float == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %classification = %{$self->recursive_descent_for_classification($root_node, 
                                                                    \@features_and_values,\%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name  with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} =  \@outlist;
        }
    }
    return \%classification_for_display;
}

sub recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $features_and_values = shift;
    my $answer = shift;
    my @features_and_values = @$features_and_values;
    my %answer = %$answer;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n" 
        if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        $value_for_feature = $2 if $feature_tested_at_node eq $1;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

                } else {
                    @extended_branch_features_and_values_or_thresholds = 
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds, 
                                           $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                   map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                               \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child = 
                      $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                                                         \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n"; 
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                          \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n" 
            if $self->{_debug3};
        return;
    }
}

##  This is the heart of the decision tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples that
##  correspond to the current node.  The search for the best feature is carried out
##  differently for symbolic features and for numeric features.  For a symbolic
##  feature, the method estimates the entropy for each value of the feature and then
##  averages out these entropies as a measure of the discriminatory power of that
##  features.  For a numeric feature, on the other hand, it estimates the entropy
##  reduction that can be achieved if were to partition the set of training samples
##  for each possible threshold.  For a numeric feature, all possible sampling points
##  relevant to the node in question are considered as candidates for thresholds.
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my $existing_node_entropy = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @all_symbolic_features = ();
    foreach my $feature_name (@{$self->{_feature_names}}) {
        push @all_symbolic_features, $feature_name 
            if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
    }
    my @symbolic_features_already_used = ();  
    foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
        push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
    }
    my @symbolic_features_not_yet_used;
    foreach my $x (@all_symbolic_features) {
        push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
    }
    my @true_numeric_types = ();
    my @symbolic_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@features_and_values_or_thresholds_on_branch) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern1/) {
            push @symbolic_types, $item;
            push @symbolic_types_feature_names, $1;
        } else {
            die "format error in the representation of feature and values or thresholds";
        }
    }
    my %seen = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
    %seen = ();
    @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
    my @bounded_intervals_numeric_types = 
                       @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature (@true_numeric_types_feature_names) {
        $upperbound{$feature} = undef;
        $lowerbound{$feature} = undef;
    }
    foreach my $item (@bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    my %entropy_values_for_different_features = ();
    my %partitioning_point_child_entropies_hash = ();
    my %partitioning_point_threshold = ();
    my %entropies_for_different_values_of_symbolic_feature = ();
    foreach my $feature (@{$self->{_feature_names}}) {
        $entropy_values_for_different_features{$feature} = [];
        $partitioning_point_child_entropies_hash{$feature} = {};
        $partitioning_point_threshold{$feature} = undef;
        $entropies_for_different_values_of_symbolic_feature{$feature} = [];
    }
    foreach my $i (0..@{$self->{_feature_names}}-1) {
        my $feature_name = $self->{_feature_names}->[$i];
        print "\n\nBFC1          FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
        if (contained_in($feature_name, @symbolic_features_already_used)) {
            next;
        } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
                 $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                      $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        }
        die unless $answer =~ /y(es)?/i;
    }
}


####################################  Read Training Data From File  ######################################


sub get_training_data {
    my $self = shift;
    die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
    my %class_names = ();
    my %all_record_ids_with_class_labels;
    my $firstline;
    my %data_hash;
    $|++;
    open FILEIN, $self->{_training_datafile} || die "unable to open $self->{_training_datafile}: $!";
    my $record_index = 0;
    my $firsetline;
    while (<FILEIN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
        if ($record_index == 0) {
            $firstline = $record;
            $record_index++;
            next;
        }
        my @parts = split /,/, $record;
        my $classname = $parts[$self->{_csv_class_column_index}];
        $class_names{$classname} = 1;
        my $record_label = shift @parts;
        $record_label  =~ s/^\s*\"|\"\s*$//g;
        $data_hash{$record_label} = \@parts;
        $all_record_ids_with_class_labels{$record_label} = $classname;
        print "." if $record_index % 10000 == 0;
        $record_index++;
    }
    close FILEIN;    
    $|--;
    $self->{_how_many_total_training_samples} = $record_index - 1;  # must subtract 1 for the header record
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    my @all_feature_names =   split /,/, substr($firstline, index($firstline,','));
    my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
    my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
    my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
    my %class_for_sample_hash = map {"sample_" . $_  =>  "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
    my @sample_names = map {"sample_$_"} keys %data_hash;
    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};     
    my %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    my %features_and_unique_values_hash = ();
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++} 
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($self->{_debug1}) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";
        }
        print "\nclass label for each data sample:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)}  keys %class_for_sample_hash) {
            print "$sample => $class_for_sample_hash{$sample}\n";
        }
        print "\nFeatures used: @feature_names\n\n";
        print "\nfeatures and the values taken by them:\n";
        foreach my $feature (sort keys %features_and_values_hash) {
            print "$feature => @{$features_and_values_hash{$feature}}\n";
        }
        print "\nnumeric features and their ranges:\n";
        foreach  my $feature (sort keys %numeric_features_valuerange_hash) {
            print "$feature  =>  @{$numeric_features_valuerange_hash{$feature}}\n";
        }
        print "\nnumber of unique values in each feature:\n";
        foreach  my $feature (sort keys %feature_values_how_many_uniques_hash) {
            print "$feature  =>  $feature_values_how_many_uniques_hash{$feature}\n";
        }
    }
    $self->{_class_names} = \@all_class_names;
    $self->{_feature_names} = \@feature_names;
    $self->{_samples_class_label_hash}  =  \%class_for_sample_hash;
    $self->{_training_data_hash}  =  \%feature_values_for_samples_hash;
    $self->{_features_and_values_hash}  = \%features_and_values_hash;
    $self->{_features_and_unique_values_hash}  =  \%features_and_unique_values_hash;
    $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
    $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
}

sub show_training_data {
    my $self = shift;
    my @class_names = @{$self->{_class_names}};
    my %features_and_values_hash = %{$self->{_features_and_values_hash}};
    my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
    my %training_data_hash = %{$self->{_training_data_hash}};
    print "\n\nClass Names: @class_names\n";
    print "\n\nFeatures and Their Values:\n\n";
    while ( my ($k, $v) = each %features_and_values_hash ) {
        print "$k --->  @{$features_and_values_hash{$k}}\n";
    }
    print "\n\nSamples vs. Class Labels:\n\n";
    foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
        print "$kee =>  $samples_class_label_hash{$kee}\n";
    }
    print "\n\nTraining Samples:\n\n";
    foreach my $kee (sort {sample_index($a) <=> sample_index($b)} 
                                      keys %training_data_hash) {
        print "$kee =>  @{$training_data_hash{$kee}}\n";
    }
}    



( run in 0.464 second using v1.01-cache-2.11-cpan-5a3173703d6 )