Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

        push @arr3, \@temp if @temp > 0;
    }
    print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
    # Sort each list so that '<' entries occur before '>' entries:
    my @arr4;
    foreach my $li (@arr3) {
        my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
        push @arr4, \@sorted;
    }
    print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
    my @arr5;
    foreach my $li (@arr4) {
        my @temp1 = ();
        my @temp2 = ();
        foreach my $inner (@$li) {
            if ($inner->[1] eq '<') {
                push @temp1, $inner;
            } else {
                push @temp2, $inner;
            }
        }
        if (@temp1 > 0 && @temp2 > 0) {
            push @arr5, [\@temp1, \@temp2];
        } elsif (@temp1 > 0) {
            push @arr5, [\@temp1];
        } else {
            push @arr5, [\@temp2];
        }
    }
    print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
    my @arr6 = ();
    foreach my $li (@arr5) {
        my @temp1 = ();
        foreach my $inner (@$li) {
            my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
            push @temp1, \@sorted;
        }
        push @arr6, \@temp1;
    }
    print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
    my @arr9 = ();
    foreach my $li (@arr6) {
        foreach my $alist (@$li) {
            my @newalist = ();
            if ($alist->[0][1] eq '<') {
                push @newalist, $alist->[0];
            } else {
                push @newalist, $alist->[-1];
            }
            if ($alist->[0][1] ne $alist->[-1][1]) {
                push @newalist, $alist->[-1];
            }
            push @arr9, \@newalist;
        }
    }
    print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
    return \@arr9;

}

##  This method is used to verify that you used legal feature names in the test
##  sample that you want to classify with the decision tree.
sub check_names_used {
    my $self = shift;
    my $features_and_values_test_data = shift;
    my @features_and_values_test_data = @$features_and_values_test_data;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values_test_data) {
        $feature_and_value =~ /$pattern/;
        my ($feature,$value) = ($1,$2);
        die "Your test data has formatting error" unless defined($feature) && defined($value);
        return 0 unless contained_in($feature, @{$self->{_feature_names}});
    }
    return 1;
}

#######################################  Data Condition Calculator  ######################################

##  This method estimates the worst-case fan-out of the decision tree taking into
##  account the number of values (and therefore the number of branches emanating from
##  a node) for the symbolic features.
sub determine_data_condition {
    my $self = shift;
    my $num_of_features = scalar @{$self->{_feature_names}};
    my @values = ();
    my @number_of_values;
    foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {  
        push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
            if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
        push @number_of_values, scalar @values;
    }
    return if ! @values;
    print "Number of features: $num_of_features\n";
    my @minmax = minmax(\@number_of_values);
    my $max_num_values = $minmax[1];
    print "Largest number of values for symbolic features is: $max_num_values\n";
    my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
    print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
          "nodes. The exact number of nodes created depends critically on " .
          "the entropy_threshold used for node expansion (the default value " .
          "for this threshold is 0.01) and on the value set for max_depth_desired " .
          "for the depth of the tree.\n";
    if ($estimated_number_of_nodes > 10000) {
        print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
              "large value for entropy_threshold and/or a small value for " .
              "for max_depth_desired to reduce the number of nodes created.\n";
        print "\nDo you wish to continue anyway? Enter 'y' for yes:  ";
        my $answer = <STDIN>;
        $answer =~ s/\r?\n?$//;
        while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
            print "\nAnswer not recognized.  Let's try again. Enter 'y' or 'n': ";
            $answer = <STDIN>;
            $answer =~ s/\r?\n?$//;
        }
        die unless $answer =~ /y(es)?/i;
    }
}


####################################  Read Training Data From File  ######################################



( run in 1.820 second using v1.01-cache-2.11-cpan-98e64b0badf )