Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/DecisionTree.pm view on Meta::CPAN
push @arr3, \@temp if @temp > 0;
}
print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
# Sort each list so that '<' entries occur before '>' entries:
my @arr4;
foreach my $li (@arr3) {
my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
push @arr4, \@sorted;
}
print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
my @arr5;
foreach my $li (@arr4) {
my @temp1 = ();
my @temp2 = ();
foreach my $inner (@$li) {
if ($inner->[1] eq '<') {
push @temp1, $inner;
} else {
push @temp2, $inner;
}
}
if (@temp1 > 0 && @temp2 > 0) {
push @arr5, [\@temp1, \@temp2];
} elsif (@temp1 > 0) {
push @arr5, [\@temp1];
} else {
push @arr5, [\@temp2];
}
}
print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
my @arr6 = ();
foreach my $li (@arr5) {
my @temp1 = ();
foreach my $inner (@$li) {
my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
push @temp1, \@sorted;
}
push @arr6, \@temp1;
}
print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
my @arr9 = ();
foreach my $li (@arr6) {
foreach my $alist (@$li) {
my @newalist = ();
if ($alist->[0][1] eq '<') {
push @newalist, $alist->[0];
} else {
push @newalist, $alist->[-1];
}
if ($alist->[0][1] ne $alist->[-1][1]) {
push @newalist, $alist->[-1];
}
push @arr9, \@newalist;
}
}
print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
return \@arr9;
}
## This method is used to verify that you used legal feature names in the test
## sample that you want to classify with the decision tree.
##
## Parameter:
##   $features_and_values_test_data -- reference to an array of strings, each
##       expected to look like 'feature=value' (whitespace around '=' is allowed).
## Returns 1 when every feature name is found in $self->{_feature_names}, and 0
## as soon as a feature name that is not in that list is encountered.
## Dies with "Your test data has formatting error" when an entry does not have
## the 'feature=value' form.
sub check_names_used {
    my $self = shift;
    my $features_and_values_test_data = shift;
    my $pattern = qr/(\S+)\s*=\s*(\S+)/;
    # Build the set of legal names once instead of rescanning the list per entry.
    my %is_legal_name = map { $_ => 1 } @{$self->{_feature_names}};
    foreach my $feature_and_value (@$features_and_values_test_data) {
        # Take the captures from the match's own return list.  Reading $1/$2
        # after an unchecked match is unsafe: a failed match leaves them holding
        # the captures of the previous successful match, which would let a
        # malformed entry slip through as if it were the preceding one.
        my ($feature, $value) = $feature_and_value =~ $pattern;
        die "Your test data has formatting error" unless defined($feature) && defined($value);
        return 0 unless $is_legal_name{$feature};
    }
    return 1;
}
####################################### Data Condition Calculator ######################################
## This method estimates the worst-case fan-out of the decision tree taking into
## account the number of values (and therefore the number of branches emanating from
## a node) for the symbolic features.
##
## A feature is treated as symbolic when it is a key of
## $self->{_features_and_unique_values_hash} but not a key of
## $self->{_numeric_features_valuerange_hash}.  The estimate printed is
## (largest number of values of any one symbolic feature) ** (number of entries
## in $self->{_feature_names}).
##
## Returns nothing.  Returns silently, without printing, when no symbolic
## feature has any values.  When the estimate exceeds 10000 the user is asked on
## STDIN whether to continue; the method dies unless the answer starts with 'y'
## (case-insensitive).  End-of-input on STDIN counts as declining.
sub determine_data_condition {
    my $self = shift;
    my $num_of_features = scalar @{$self->{_feature_names}};
    my $numeric_features = $self->{_numeric_features_valuerange_hash} || {};
    # Track the largest value count of any single symbolic feature.  The count
    # must be taken per feature; a running total over all features would
    # overstate the branching factor.
    my $max_num_values = 0;
    foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
        next if exists $numeric_features->{$feature};
        my $num_values = scalar @{$self->{_features_and_unique_values_hash}->{$feature}};
        $max_num_values = $num_values if $num_values > $max_num_values;
    }
    return if $max_num_values == 0;
    print "Number of features: $num_of_features\n";
    print "Largest number of values for symbolic features is: $max_num_values\n";
    my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
    print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
          "nodes. The exact number of nodes created depends critically on " .
          "the entropy_threshold used for node expansion (the default value " .
          "for this threshold is 0.01) and on the value set for max_depth_desired " .
          "for the depth of the tree.\n";
    if ($estimated_number_of_nodes > 10000) {
        print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
              "large value for entropy_threshold and/or a small value for " .
              "max_depth_desired to reduce the number of nodes created.\n";
        print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
        my $answer = <STDIN>;
        # Classify the reply by its first non-blank character so that a 'y' or
        # 'n' buried inside some other word is not mistaken for an answer.
        # An undefined $answer means STDIN hit end-of-input; stop prompting.
        while (defined($answer) && $answer !~ /\A\s*[yn]/i) {
            print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
            $answer = <STDIN>;
        }
        die "Decision tree construction abandoned: continuing was not confirmed"
            unless defined($answer) && $answer =~ /\A\s*y/i;
    }
    return;
}
#################################### Read Training Data From File ######################################
( run in 1.820 second using v1.01-cache-2.11-cpan-98e64b0badf )