Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/BoostedDecisionTree.pm view on Meta::CPAN
package Algorithm::BoostedDecisionTree;
#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::BoostedDecisionTree is a Perl module for boosted decision-tree based
# classification of multidimensional data.
# -------------------------------------------------------------------------------------
#use lib 'blib/lib', 'blib/arch';
#use 5.10.0;
use strict;
use warnings;
# Carp supplies croak(), which reports errors from the caller's point of view.
use Carp;
# Load the base class; the version argument makes loading fail unless
# Algorithm::DecisionTree is at least version 3.43.
use Algorithm::DecisionTree 3.43;
# Import the list helpers reduce, min and max into this package.
use List::Util qw(reduce min max);
our $VERSION = '3.43';
# Declare Algorithm::DecisionTree as the parent class, so method lookups that
# are not satisfied in this package fall through to it.
@Algorithm::BoostedDecisionTree::ISA = ('Algorithm::DecisionTree');
############################################ Constructor ##############################################
# Constructor.
#
# Arguments: the class name followed by a flat list of keyword => value pairs.
#   how_many_stages - index of the last boosting stage.  Per-stage slots are
#                     created for indices 0 .. how_many_stages inclusive.  When
#                     omitted (or false) only slot 0 is created and
#                     _how_many_stages is stored as undef.
#   stagedebug      - stored as _stagedebug (defaults to 0).
#   anything else   - forwarded untouched to Algorithm::DecisionTree->new().
#
# Returns: the new object, blessed into $class.
#
# Croaks when check_for_illegal_params() reports an unrecognised keyword.
#
# NOTE(review): only how_many_stages is removed from the arguments handed to the
# base-class constructor; stagedebug, if supplied, is forwarded as well.  Confirm
# that Algorithm::DecisionTree->new() tolerates that keyword.
sub new {
    my ($class, %args) = @_;

    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
        if check_for_illegal_params(keys %args) == 0;

    # The base class does not receive how_many_stages.
    my %dtargs = %args;
    delete $dtargs{how_many_stages};

    my $instance = Algorithm::DecisionTree->new(%dtargs);
    bless $instance, $class;

    # Compute the stage indices once.  Falling back to 0 keeps the historical
    # "single slot 0" layout when how_many_stages is not supplied, without the
    # "uninitialized value" warning that an undef range endpoint would raise.
    my $last_stage = $args{how_many_stages} || 0;
    my @stages     = 0 .. $last_stage;

    $instance->{_how_many_stages}        = $args{how_many_stages} || undef;
    $instance->{_stagedebug}             = $args{stagedebug} || 0;

    # Per-stage bookkeeping, each keyed by stage index.
    $instance->{_training_samples}       = { map { $_ => [] } @stages };
    # Every stage gets its own decision tree built from the same base arguments.
    $instance->{_all_trees}              = { map { $_ => Algorithm::DecisionTree->new(%dtargs) } @stages };
    $instance->{_root_nodes}             = { map { $_ => undef } @stages };
    $instance->{_sample_selection_probs} = { map { $_ => {} } @stages };
    $instance->{_trust_factors}          = { map { $_ => undef } @stages };
    $instance->{_misclassified_samples}  = { map { $_ => [] } @stages };

    # Filled in later by other methods; start out unset.
    $instance->{_classifications}                 = undef;
    $instance->{_trust_weighted_decision_classes} = undef;

    return $instance;
}
############################################## Methods #################################################
sub get_training_data_for_base_tree {
my $self = shift;
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
my %class_names = ();
my %all_record_ids_with_class_labels;
my $firstline;
my %data_hash;
$|++;
open FILEIN, $self->{_training_datafile};
my $record_index = 0;
my $firsetline;
while (<FILEIN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
if ($record_index == 0) {
$firstline = $record;
$record_index++;
next;
}
my @parts = split /,/, $record;
my $classname = $parts[$self->{_csv_class_column_index}];
$class_names{$classname} = 1;
my $record_label = shift @parts;
$record_label =~ s/^\s*\"|\"\s*$//g;
$data_hash{$record_label} = \@parts;
$all_record_ids_with_class_labels{$record_label} = $classname;
print "." if $record_index % 10000 == 0;
$record_index++;
}
close FILEIN;
$|--;
$self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
my @sample_names = map {"sample_$_"} keys %data_hash;
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $d...
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
my @all_class_names = sort keys %{ {map {$_ => 1} values %class_for_sample_hash } };
$self->{_number_of_training_samples} = scalar @sample_names;
if ($self->{_debug2}) {
print "\nDisplaying features and their values for entire training data:\n\n";
foreach my $fname (keys %features_and_values_hash) {
print " $fname => @{$features_and_values_hash{$fname}}\n";
}
}
my %features_and_unique_values_hash = ();
my %feature_values_how_many_uniques_hash = ();
my %numeric_features_valuerange_hash = ();
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $feature (keys %features_and_values_hash) {
my %seen = ();
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
( run in 1.787 second using v1.01-cache-2.11-cpan-f56aa216473 )