Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

Examples/classify_test_data_in_a_file.pl  view on Meta::CPAN

170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# Collect the distinct class labels.  %seen is not declared here, so labels
# already recorded in it by earlier code are not listed again.  The filter
# tests definedness and length rather than truthiness so that a label of "0"
# is kept.
@all_class_names = grep { defined($_) && length($_) && !$seen{$_}++ }
                   values %class_for_sample_hash;
print "\n All class names: @all_class_names\n" if $debug;

# Rebuild the per-feature summaries from %features_and_values_hash:
#   %features_and_unique_values_hash       feature => ref to its distinct values
#   %feature_values_how_many_uniques_hash  feature => number of distinct values
#   %numeric_features_valuerange_hash      feature => ref to the list returned
#                                          by minmax() (all-float features only)
%numeric_features_valuerange_hash = ();
my %feature_values_how_many_uniques_hash = ();
%features_and_unique_values_hash = ();
foreach my $feature (keys %features_and_values_hash) {
    my %seen1 = ();
    # Distinct values, excluding 'NA' and empty entries, in default (string)
    # sort order.  A value of "0" is retained.
    my @unique_values_for_feature =
        sort grep { defined($_) && length($_) && $_ ne 'NA' && !$seen1{$_}++ }
             @{$features_and_values_hash{$feature}};
    $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;

    # A feature counts as numeric only when every distinct value is an
    # unsigned decimal with a fractional part; signed values and exponent
    # notation do not match this pattern.
    my $not_all_values_float = 0;
    foreach my $value (@unique_values_for_feature) {
        next if $value =~ /^\d*\.\d+$/;
        $not_all_values_float = 1;
        last;                       # one non-float value settles the question
    }
    if ($not_all_values_float == 0) {
        my @minmaxvalues = minmax(\@unique_values_for_feature);
        $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
    }
    $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($debug) {
    print "\nAll class names: @all_class_names\n";
    print "\nEach sample data record:\n";
    foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
        print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

ExamplesBagging/bagging_for_bulk_classification.pl  view on Meta::CPAN

150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# Collect the distinct class labels.  %seen is not declared here, so labels
# already recorded in it by earlier code are not listed again.  The filter
# tests definedness and length rather than truthiness so that a label of "0"
# is kept.
@all_class_names = grep { defined($_) && length($_) && !$seen{$_}++ }
                   values %class_for_sample_hash;
print "\n All class names: @all_class_names\n" if $debug;

# Rebuild the per-feature summaries from %features_and_values_hash:
#   %features_and_unique_values_hash       feature => ref to its distinct values
#   %feature_values_how_many_uniques_hash  feature => number of distinct values
#   %numeric_features_valuerange_hash      feature => ref to the list returned
#                                          by minmax() (all-float features only)
%numeric_features_valuerange_hash = ();
my %feature_values_how_many_uniques_hash = ();
%features_and_unique_values_hash = ();
foreach my $feature (keys %features_and_values_hash) {
    my %seen1 = ();
    # Distinct values, excluding 'NA' and empty entries, in default (string)
    # sort order.  A value of "0" is retained.
    my @unique_values_for_feature =
        sort grep { defined($_) && length($_) && $_ ne 'NA' && !$seen1{$_}++ }
             @{$features_and_values_hash{$feature}};
    $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;

    # A feature counts as numeric only when every distinct value is an
    # unsigned decimal with a fractional part; signed values and exponent
    # notation do not match this pattern.
    my $not_all_values_float = 0;
    foreach my $value (@unique_values_for_feature) {
        next if $value =~ /^\d*\.\d+$/;
        $not_all_values_float = 1;
        last;                       # one non-float value settles the question
    }
    if ($not_all_values_float == 0) {
        my @minmaxvalues = minmax(\@unique_values_for_feature);
        $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
    }
    $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($debug) {
    print "\nAll class names: @all_class_names\n";
    print "\nEach sample data record:\n";
    foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
        print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

ExamplesBoosting/boosting_for_bulk_classification.pl  view on Meta::CPAN

157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Collect the distinct class labels.  %seen is not declared here, so labels
# already recorded in it by earlier code are not listed again.  The filter
# tests definedness and length rather than truthiness so that a label of "0"
# is kept.
@all_class_names = grep { defined($_) && length($_) && !$seen{$_}++ }
                   values %class_for_sample_hash;
print "\n All class names: @all_class_names\n" if $debug;

# Rebuild the per-feature summaries from %features_and_values_hash:
#   %features_and_unique_values_hash       feature => ref to its distinct values
#   %feature_values_how_many_uniques_hash  feature => number of distinct values
#   %numeric_features_valuerange_hash      feature => ref to the list returned
#                                          by minmax() (all-float features only)
%numeric_features_valuerange_hash = ();
my %feature_values_how_many_uniques_hash = ();
%features_and_unique_values_hash = ();
foreach my $feature (keys %features_and_values_hash) {
    my %seen1 = ();
    # Distinct values, excluding 'NA' and empty entries, in default (string)
    # sort order.  A value of "0" is retained.
    my @unique_values_for_feature =
        sort grep { defined($_) && length($_) && $_ ne 'NA' && !$seen1{$_}++ }
             @{$features_and_values_hash{$feature}};
    $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;

    # A feature counts as numeric only when every distinct value is an
    # unsigned decimal with a fractional part; signed values and exponent
    # notation do not match this pattern.
    my $not_all_values_float = 0;
    foreach my $value (@unique_values_for_feature) {
        next if $value =~ /^\d*\.\d+$/;
        $not_all_values_float = 1;
        last;                       # one non-float value settles the question
    }
    if ($not_all_values_float == 0) {
        my @minmaxvalues = minmax(\@unique_values_for_feature);
        $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
    }
    $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($debug) {
    print "\nAll class names: @all_class_names\n";
    print "\nEach sample data record:\n";
    foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
        print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

lib/Algorithm/BoostedDecisionTree.pm  view on Meta::CPAN

96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
    }
}
# Per-feature summaries derived from %features_and_values_hash:
#   %features_and_unique_values_hash       feature => ref to its distinct values
#   %feature_values_how_many_uniques_hash  feature => number of distinct values
#   %numeric_features_valuerange_hash      feature => ref to the list returned
#                                          by minmax() (numeric features only)
my %features_and_unique_values_hash = ();
my %feature_values_how_many_uniques_hash  =  ();
my %numeric_features_valuerange_hash   =   ();
# Numeric literal: optional sign, digits with an optional fraction (or a bare
# fraction), and an optional exponent.
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $feature (keys %features_and_values_hash) {
    my %seen = ();
    # Distinct values in first-seen order, excluding 'NA' and empty entries.
    # Definedness/length are tested instead of truthiness so "0" is retained.
    my @unique_values_for_feature =
        grep { defined($_) && length($_) && $_ ne 'NA' && !$seen{$_}++ }
        @{$features_and_values_hash{$feature}};
    $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;

    # The feature is numeric only if every distinct value matches $numregex.
    my $not_all_values_float = 0;
    foreach my $value (@unique_values_for_feature) {
        next if $value =~ /^$numregex$/;
        $not_all_values_float = 1;
        last;                       # one non-numeric value settles the question
    }
    if ($not_all_values_float == 0) {
        my @minmaxvalues = minmax(\@unique_values_for_feature);
        $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
    }
    $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
# Store the parsed training data on the tree kept under index 0 of _all_trees.
$self->{_all_trees}->{0}->{_class_names} = \@all_class_names;
$self->{_all_trees}->{0}->{_feature_names} = \@feature_names;
$self->{_all_trees}->{0}->{_samples_class_label_hash} = \%class_for_sample_hash;
$self->{_all_trees}->{0}->{_training_data_hash}  =  \%feature_values_for_samples_hash;
$self->{_all_trees}->{0}->{_features_and_values_hash}    =  \%features_and_values_hash;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
die "\n\nError in the names you have used for features and/or values.  " .
    "Try using the csv_cleanup_needed option in the constructor call."
                    unless $self->check_names_used(\@features_and_values);

# Rewrite every "feature=value" pair.  For a feature whose known values are
# all numeric and which has no entry in _prob_distribution_numeric_features_hash,
# the value is replaced by closest_sampling_point() over the known values.
my @new_features_and_values = ();
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $feature_and_value (@features_and_values) {
    # Capture in list context so a failed match cannot silently reuse the
    # $1/$2 left over from the previous iteration.
    my ($feature, $value) = $feature_and_value =~ /$pattern/
        or die "\n\nUnable to parse '$feature_and_value' as a feature=value pair.";
    my $newvalue = $value;
    my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};

    # The feature is numeric only if every known value matches $numregex.
    my $not_all_values_float = 0;
    foreach my $unique_value (@unique_values_for_feature) {
        next if $unique_value =~ /^$numregex$/;
        $not_all_values_float = 1;
        last;                       # one non-numeric value settles the question
    }
    if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
                                                                   $not_all_values_float == 0) {
        $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
    }
    push @new_features_and_values, "$feature=$newvalue";
}
@features_and_values = @new_features_and_values;
print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};

# One slot per class name, each initially undefined.
my %answer = map { $_ => undef } @{$self->{_class_names}};

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
my $pattern3 = '(.+)>(.+)';

# Features with no entry in _prob_distribution_numeric_features_hash are
# treated as symbolic.
my @all_symbolic_features =
    grep { ! exists $self->{_prob_distribution_numeric_features_hash}->{$_} }
    @{$self->{_feature_names}};

# First capture of $pattern1 for each branch entry that matches it.
my @symbolic_features_already_used =
    map { /$pattern1/ ? $1 : () } @features_and_values_or_thresholds_on_branch;

# Symbolic features that do not yet appear on this branch.
my @symbolic_features_not_yet_used =
    grep { my $name = $_; ! contained_in($name, @symbolic_features_already_used) }
    @all_symbolic_features;

# Accumulators for the classification of branch entries that follows.
my @true_numeric_types = ();
my @symbolic_types = ();
my @true_numeric_types_feature_names = ();
my @symbolic_types_feature_names = ();
foreach my $item (@features_and_values_or_thresholds_on_branch) {
    if ($item =~ /$pattern2/) {
        push @true_numeric_types, $item;
        push @true_numeric_types_feature_names, $1;
    } elsif ($item =~ /$pattern3/) {

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
# feature name => ref to that feature's values across all records; values
# made up only of digits are reformatted with one decimal place.
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};    
# Per-feature summaries derived from %features_and_values_hash:
#   %numeric_features_valuerange_hash      feature => ref to the list returned
#                                          by minmax() (numeric features only)
#   %feature_values_how_many_uniques_hash  feature => number of distinct values
#   %features_and_unique_values_hash       feature => ref to its distinct values
my %numeric_features_valuerange_hash = ();
my %feature_values_how_many_uniques_hash = ();
my %features_and_unique_values_hash = ();
# Numeric literal: optional sign, digits with an optional fraction (or a bare
# fraction), and an optional exponent.
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $feature (keys %features_and_values_hash) {
    my %seen1 = ();
    # Distinct values, excluding 'NA' and empty entries, in default (string)
    # sort order.  Definedness/length are tested instead of truthiness so a
    # value of "0" is retained.
    my @unique_values_for_feature =
        sort grep { defined($_) && length($_) && $_ ne 'NA' && !$seen1{$_}++ }
             @{$features_and_values_hash{$feature}};
    $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;

    # The feature is numeric only if every distinct value matches $numregex.
    my $not_all_values_float = 0;
    foreach my $value (@unique_values_for_feature) {
        next if $value =~ /^$numregex$/;
        $not_all_values_float = 1;
        last;                       # one non-numeric value settles the question
    }
    if ($not_all_values_float == 0) {
        my @minmaxvalues = minmax(\@unique_values_for_feature);
        $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
    }
    $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($self->{_debug1}) {
    print "\nAll class names: @all_class_names\n";
    print "\nEach sample data record:\n";
    foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
        print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

lib/Algorithm/DecisionTreeWithBagging.pm  view on Meta::CPAN

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
print "all class names: @all_class_names\n" if $self->{_debug2};
# Per-bag summaries, each keyed by bag index and then by feature name:
#   %numeric_features_valuerange_hash_bags      ref to the list returned by
#                                               minmax() (numeric features only)
#   %feature_values_how_many_uniques_hash_bags  number of distinct values
#   %features_and_unique_values_hash_bags       ref to the distinct values
my %numeric_features_valuerange_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
my %feature_values_how_many_uniques_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
my %features_and_unique_values_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
# Numeric literal: optional sign, digits with an optional fraction (or a bare
# fraction), and an optional exponent.
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $i (0 .. $self->{_how_many_bags} - 1) {
    foreach my $feature (keys %{$features_and_values_hash_bags{$i}}) {
        my %seen = ();
        # Distinct values in first-seen order, excluding 'NA' and empty
        # entries.  Definedness/length are tested instead of truthiness so a
        # value of "0" is retained.
        my @unique_values_for_feature_in_bag =
            grep { defined($_) && length($_) && $_ ne 'NA' && !$seen{$_}++ }
            @{$features_and_values_hash_bags{$i}{$feature}};
        $feature_values_how_many_uniques_hash_bags{$i}->{$feature} = scalar @unique_values_for_feature_in_bag;

        # The feature is numeric only if every distinct value matches $numregex.
        my $not_all_values_float = 0;
        foreach my $value (@unique_values_for_feature_in_bag) {
            next if $value =~ /^$numregex$/;
            $not_all_values_float = 1;
            last;                   # one non-numeric value settles the question
        }
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature_in_bag);
            $numeric_features_valuerange_hash_bags{$i}->{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_hash_bags{$i}->{$feature} = \@unique_values_for_feature_in_bag;
    }
}
if ($self->{_debug2}) {
    print "\nDisplaying value ranges for numeric features in each bag:\n\n";
    foreach my $bag_index (keys  %numeric_features_valuerange_hash_bags) {       
        my %keyval = %{$numeric_features_valuerange_hash_bags{$bag_index}};

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
}
# Per-tree summaries, each a hash ref keyed by tree index and then by feature:
#   $numeric_features_valuerange_all_trees      ref to the list returned by
#                                               minmax() (numeric features only)
#   $feature_values_how_many_uniques_all_trees  number of distinct values
#   $features_and_unique_values_all_trees       ref to the distinct values
my $numeric_features_valuerange_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
my $feature_values_how_many_uniques_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
my $features_and_unique_values_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
# Numeric literal: optional sign, digits with an optional fraction (or a bare
# fraction), and an optional exponent.
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $t (0 .. $self->{_how_many_trees} - 1) {   
    foreach my $feature (sort keys %{$features_and_values_all_trees->{$t}}) {
        # De-duplicate through hash keys, then discard the 'NA' entry.
        my %all_values_for_feature = map {$_ => 1} @{$features_and_values_all_trees->{$t}->{$feature}};
        my @unique_values_for_feature = grep {$_ ne 'NA'} keys %all_values_for_feature;
        $feature_values_how_many_uniques_all_trees->{$t}->{$feature} = scalar @unique_values_for_feature;

        # The feature is numeric only if every distinct value matches $numregex.
        my $not_all_values_float = 0;
        foreach my $value (@unique_values_for_feature) {
            next if $value =~ /^$numregex$/;
            $not_all_values_float = 1;
            last;                   # one non-numeric value settles the question
        }
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_all_trees->{$t}->{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_all_trees->{$t}->{$feature} = \@unique_values_for_feature;           
    }
}
if ($self->{_debug1}) {
    print "\nDisplaying value ranges for numeric features for all trees:\n\n";
    foreach my $tree_index (keys  %{$numeric_features_valuerange_all_trees}) {       
        my %keyval = %{$numeric_features_valuerange_all_trees->{$tree_index}};

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_-1]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : ...
# feature name => ref to that feature's values across all records; values
# made up only of digits are reformatted with one decimal place.
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a-1] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_predictor_columns}};    
# Per-feature summaries derived from %features_and_values_hash:
#   %numeric_features_valuerange_hash      feature => ref to the list returned
#                                          by minmax() (numeric features only)
#   %feature_values_how_many_uniques_hash  feature => number of distinct values
#   %features_and_unique_values_hash       feature => ref to its distinct values
my %numeric_features_valuerange_hash   =   ();
my %feature_values_how_many_uniques_hash  =  ();
my %features_and_unique_values_hash = ();
# Numeric literal: optional sign, digits with an optional fraction (or a bare
# fraction), and an optional exponent.
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $feature (keys %features_and_values_hash) {
    my %seen = ();
    # Distinct values in first-seen order, excluding 'NA' and empty entries.
    # Definedness/length are tested instead of truthiness so "0" is retained.
    my @unique_values_for_feature =
        grep { defined($_) && length($_) && $_ ne 'NA' && !$seen{$_}++ }
        @{$features_and_values_hash{$feature}};
    $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;

    # The feature is numeric only if every distinct value matches $numregex.
    my $not_all_values_float = 0;
    foreach my $value (@unique_values_for_feature) {
        next if $value =~ /^$numregex$/;
        $not_all_values_float = 1;
        last;                       # one non-numeric value settles the question
    }
    if ($not_all_values_float == 0) {
        my @minmaxvalues = minmax(\@unique_values_for_feature);
        $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
    }
    $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($self->{_debug1_r}) {
    print "\nDependent var values: @dependent_var_values\n";
    print "\nEach sample data record:\n";
    foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
        print "$kee    =>   @{$feature_values_for_samples_hash{$kee}}\n";



( run in 0.392 second using v1.01-cache-2.11-cpan-95122f20152 )