Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

Examples/classify_test_data_in_a_file.pl  view on Meta::CPAN


my $debug = 0;

### When the following variable is set to 1, only the most probable class for each
### data record is written out to the output file.  This works only for the case
### when the output is sent to a `.txt' file.  If the output is sent to a `.csv' 
### file, you'll see all the class names and their probabilities for each data sample
### in your test datafile.
my $show_hard_classifications = 1;

### Validate the command line before unpacking it.  This mirrors the argument
### check carried out by the companion bagging and boosting example scripts.
die "This script must be called with exactly three command-line arguments:\n" .
    "     1st arg: name of the training datafile\n" .
    "     2nd arg: name of the test data file\n" .
    "     3rd arg: the name of the output file to which class labels will be written\n"
    unless @ARGV == 3;

my ($training_datafile, $test_datafile, $outputfile) = @ARGV;

### Construct the decision-tree engine.  The column indexes passed through
### csv_class_column_index / csv_columns_for_features refer to the training CSV.
my $dt = Algorithm::DecisionTree->new( 
                 training_datafile => $training_datafile,
                 csv_class_column_index => 1,        # col indexing is 0 based
                 csv_columns_for_features => [2,3],
                 entropy_threshold => 0.01,
                 max_depth_desired => 3,
                 symbolic_to_numeric_cardinality_threshold => 10,
                 csv_cleanup_needed => 1,
        );

Examples/classify_test_data_in_a_file.pl  view on Meta::CPAN

### UNCOMMENT THE NEXT STATEMENT if you would like to see
### the decision tree displayed in your terminal window:
#$root_node->display_decision_tree("   ");

# NOW YOU ARE READY TO CLASSIFY THE FILE BASED TEST DATA:
### File-scoped containers filled in by get_test_data_from_csv() and consumed
### by the classification loop further down in this script.
my (@all_class_names, @feature_names, %class_for_sample_hash, %feature_values_for_samples_hash,
    %features_and_values_hash, %features_and_unique_values_hash, 
    %numeric_features_valuerange_hash, %feature_values_how_many_uniques_hash);

get_test_data_from_csv();
### Three-arg open so that mode characters (a leading '>', '|', etc.) embedded
### in $outputfile can never be interpreted as an open mode.  The bareword
### handle OUTPUTHANDLE is deliberately kept: later code in this script still
### prints to it by that name.
open OUTPUTHANDLE, '>', $outputfile
    or die "Unable to open the file $outputfile for writing out the classification results: $!";
### For `.txt' output, emit a one-line banner describing the report format;
### `.csv' output gets a machine-readable header instead (written later).
if ($show_hard_classifications && ($outputfile !~ /\.csv$/i)) {
    print OUTPUTHANDLE "\nOnly the most probable class shown for each test sample\n\n";
} elsif (!$show_hard_classifications && ($outputfile !~ /\.csv$/i)) {
    print OUTPUTHANDLE "\nThe classification result for each sample ordered in decreasing order of probability\n\n";
}
if ($outputfile =~ /\.csv$/i) {
    my $class_names_csv = join ',', sort @{$dt->{_class_names}};
    my $output_string = "sample_index,$class_names_csv\n";
    print OUTPUTHANDLE "$output_string";
    foreach my $sample (sort {sample_index($a) <=> sample_index($b)} 
                                       keys %feature_values_for_samples_hash) {
        my @test_sample =  @{$feature_values_for_samples_hash{$sample}};
        my %classification = %{$dt->classify($root_node, \@test_sample)};
        my $sample_index = sample_index($sample);
        my @solution_path = @{$classification{'solution_path'}};
        delete $classification{'solution_path'};

ExamplesBagging/bagging_for_bulk_classification.pl  view on Meta::CPAN

use Algorithm::DecisionTreeWithBagging;

# Refuse to run unless the user supplied all three required file names.
die "This script must be called with exactly three command-line arguments:\n" .
    "     1st arg: name of the training datafile\n" .
    "     2nd arg: name of the test data file\n" .     
    "     3rd arg: the name of the output file to which class labels will be written\n" 
    unless @ARGV == 3;

# Set to 1 for extra diagnostic output (not referenced in this excerpt).
my $debug = 0;

my ($training_datafile, $test_datafile, $outputfile) = @ARGV;

# Column holding the class label in the training CSV.  NOTE: this value is
# 1-based here -- the support routine subtracts 1 before indexing.
my $training_file_class_name_in_column       = 1;
# Columns holding the feature values in the training CSV.
my $training_file_columns_for_feature_values = [2,3];
# Number of bootstrap bags to train, and the fraction of samples shared
# between consecutive bags.
my $how_many_bags                            = 4;
my $bag_overlap_fraction                     = 0.2;

# File-scoped containers filled in by get_test_data_from_csv() and read by the
# classification loop below.
my (@all_class_names, @feature_names, %class_for_sample_hash, %feature_values_for_samples_hash,
    %features_and_values_hash, %features_and_unique_values_hash,
    %numeric_features_valuerange_hash, %feature_values_how_many_uniques_hash);

ExamplesBagging/bagging_for_bulk_classification.pl  view on Meta::CPAN

$dtbag->calculate_class_priors();

$dtbag->construct_decision_trees_for_bags();

##  COMMENT OUT the following statement if you do not want to see the decision
##  trees constructed for each bag displayed in your terminal window:
$dtbag->display_decision_trees_for_bags();

### NOW YOU ARE READY TO CLASSIFY THE FILE-BASED TEST DATA:
get_test_data_from_csv();

# Three-arg open with a lexical handle: the output filename can no longer
# inject a mode character, and the handle cannot collide with any bareword
# handle used elsewhere in the file.
open my $fileout, '>', $outputfile
    or die "Unable to open file $outputfile for writing out classification results: $!";

# CSV header row: sample_index followed by the class names in sorted order.
my $class_names = join ",", sort @{$dtbag->get_all_class_names()};
print {$fileout} "sample_index,$class_names\n";

# One output line per test sample, in increasing order of numeric sample index.
foreach my $item (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
    my $test_sample = $feature_values_for_samples_hash{$item};
    $dtbag->classify_with_bagging($test_sample);
    my $classification = $dtbag->get_majority_vote_classification();
    print {$fileout} sample_index($item) . ",$classification\n";
}

# Buffered write errors only surface at close time, so the close is checked.
close $fileout
    or die "Unable to close $outputfile after writing classification results: $!";

print "Majority vote classifications from the bags written out to $outputfile\n";

########################################################################################
###############################    Support Routines     ################################

sub get_test_data_from_csv {
    open FILEIN, $test_datafile or die "Unable to open $test_datafile: $!";
    die("Aborted. get_test_data_csv() is only for CSV files") 
                                           unless $test_datafile =~ /\.csv$/;
    my $class_name_in_column = $training_file_class_name_in_column - 1;
    my @all_data =  <FILEIN>;

ExamplesBoosting/boosting_for_bulk_classification.pl  view on Meta::CPAN

use Algorithm::BoostedDecisionTree;

# Refuse to run unless the user supplied all three required file names.
die "This script must be called with exactly three command-line arguments:\n" .
    "     1st arg: name of the training datafile\n" .
    "     2nd arg: name of the test data file\n" .     
    "     3rd arg: the name of the output file to which class labels will be written\n" 
    unless @ARGV == 3;

# Set to 1 for extra diagnostic output (not referenced in this excerpt).
my $debug = 0;

my ($training_datafile, $test_datafile, $outputfile) = @ARGV;

# Column holding the class label in the training CSV.  NOTE: this value is
# 1-based here -- the support routine subtracts 1 before indexing.
my $training_file_class_name_in_column       = 1;
# Columns holding the feature values in the training CSV.
my $training_file_columns_for_feature_values = [2,3];
# Number of cascaded stages used by the boosted classifier.
my $how_many_stages                          = 4;

# File-scoped containers filled in by get_test_data_from_csv() and read by the
# classification loop below.
my (@all_class_names, @feature_names, %class_for_sample_hash, %feature_values_for_samples_hash,
    %features_and_values_hash, %features_and_unique_values_hash,
    %numeric_features_valuerange_hash, %feature_values_how_many_uniques_hash);

my $boosted = Algorithm::BoostedDecisionTree->new(

ExamplesBoosting/boosting_for_bulk_classification.pl  view on Meta::CPAN



##  COMMENT OUT the next two statements if you do not want to see the decision
##  trees constructed for each stage of the cascade:
print "\nDisplaying the decision trees for all stages:\n\n";
$boosted->display_decision_trees_for_different_stages();

### NOW YOU ARE READY TO CLASSIFY THE FILE-BASED TEST DATA:
get_test_data_from_csv();

# Three-arg open with a lexical handle: the output filename can no longer
# inject a mode character, and the handle cannot collide with any bareword
# handle used elsewhere in the file.
open my $fileout, '>', $outputfile
    or die "Unable to open file $outputfile for writing out classification results: $!";

# CSV header row: sample_index followed by the class names in sorted order.
my $class_names = join ",", sort @{$boosted->get_all_class_names()};
print {$fileout} "sample_index,$class_names\n";

# One output line per test sample, in increasing order of numeric sample index.
foreach my $item (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
    my $test_sample = $feature_values_for_samples_hash{$item};
    $boosted->classify_with_boosting($test_sample);
    my $classification = $boosted->trust_weighted_majority_vote_classifier();
    print {$fileout} sample_index($item) . ",$classification\n";
}

# Buffered write errors only surface at close time, so the close is checked.
close $fileout
    or die "Unable to close $outputfile after writing classification results: $!";

print "Majority vote classifications using boosting written out to $outputfile\n";

############################  Utility Routines #################################

sub get_test_data_from_csv {
    open FILEIN, $test_datafile or die "Unable to open $test_datafile: $!";
    die("Aborted. get_test_data_csv() is only for CSV files") 
                                           unless $test_datafile =~ /\.csv$/;
    my $class_name_in_column = $training_file_class_name_in_column - 1;
    my @all_data =  <FILEIN>;
    my %data_hash = ();



( run in 0.239 second using v1.01-cache-2.11-cpan-4d50c553e7e )