Algorithm-DecisionTree
view release on metacpan or search on metacpan
lib/Algorithm/RandomizedTreesForBigData.pm view on Meta::CPAN
package Algorithm::RandomizedTreesForBigData;
#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::RandomizedTreesForBigData is a Perl module for inducing multiple decision
# trees using randomized selection of samples from a large training data file.
# -------------------------------------------------------------------------------------
#use lib 'blib/lib', 'blib/arch';
#use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(pairmap);
use Algorithm::DecisionTree 3.43;
our $VERSION = '3.43';
############################################ Constructor ##############################################
# Constructor.
#
# Accepts keyword arguments as a flat list of name => value pairs.  The three
# arguments that belong to this module only (how_many_trees,
# how_many_training_samples_per_tree, looking_for_needles_in_haystack) are
# stripped out; everything else is forwarded unchanged to
# Algorithm::DecisionTree->new() once per tree.
#
# Croaks if check_for_illegal_params() rejects any argument name, or if
# how_many_trees is missing or false.
#
# Returns a blessed hash reference.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;

    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
        if check_for_illegal_params(@params) == 0;

    # Validate the tree count BEFORE building the object.  The original code
    # wrote `$args{how_many_trees} || die "...",` inside the hash constructor;
    # because die is a list operator it swallowed every key/value pair that
    # followed, so those attributes (including _debug1) were never stored.
    # Checking first also avoids evaluating the 0..N-1 ranges with an
    # undefined N.
    croak "must specify number of trees" unless $args{how_many_trees};

    # Arguments meant for the individual decision trees: everything except
    # the options that only this wrapper class understands.
    my %dtargs = %args;
    delete @dtargs{qw(
        how_many_trees
        how_many_training_samples_per_tree
        looking_for_needles_in_haystack
    )};

    my $last_tree_index = $args{how_many_trees} - 1;

    my $self = {
        # One independently constructed decision tree per index 0..N-1.
        _all_trees                          => { map { $_ => Algorithm::DecisionTree->new(%dtargs) }
                                                     0 .. $last_tree_index },
        _csv_cleanup_needed                 => $args{csv_cleanup_needed} || 0,
        _looking_for_needles_in_haystack    => $args{looking_for_needles_in_haystack},
        _how_many_training_samples_per_tree => $args{how_many_training_samples_per_tree},
        _training_datafile                  => $args{training_datafile},
        # NOTE(review): `|| undef` maps any false value (including 0) to
        # undef; kept as in the original -- confirm whether a column index
        # of 0 is meant to be usable here.
        _csv_class_column_index             => $args{csv_class_column_index} || undef,
        _csv_columns_for_features           => $args{csv_columns_for_features} || undef,
        _how_many_trees                     => $args{how_many_trees},
        _root_nodes                         => [],
        # One (initially empty) list of training records per tree index.
        _training_data_for_trees            => { map { $_ => [] } 0 .. $last_tree_index },
        _all_record_ids                     => [],
        _training_data_record_indexes       => {},
        _classifications                    => undef,
        _debug1                             => $args{debug1},
    };

    return bless $self, $class;
}
############################################## Methods ################################################
sub get_training_data_for_N_trees {
my $self = shift;
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
my @all_record_ids;
open FILEIN, $self->{_training_datafile} or die "Unable to open $self->{_training_datafile} $!";
my $record_index = 0;
while (<FILEIN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
push @{$self->{_all_record_ids}}, substr($record, 0, index($record, ','));
$record_index++;
}
close FILEIN;
$self->{_how_many_total_training_samples} = $record_index - 1;
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
print "\n\nAll record labels: @{$self->{_all_record_ids}}\n" if $self->{_debug1};
if ($self->{_looking_for_needles_in_haystack}) {
( run in 0.495 second using v1.01-cache-2.11-cpan-f0fbb3f571b )