Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

package Algorithm::RandomizedTreesForBigData;

#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::RandomizedTreesForBigData is a Perl module for inducing multiple decision
# trees using randomized selection of samples from a large training data file.
# -------------------------------------------------------------------------------------

#use lib 'blib/lib', 'blib/arch';

#use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(pairmap);
use Algorithm::DecisionTree 3.43;

our $VERSION = '3.43';

############################################   Constructor  ##############################################
# Constructs an Algorithm::RandomizedTreesForBigData object.
#
# Called as a class method with keyword arguments (key => value pairs).  The keys
# 'how_many_trees', 'how_many_training_samples_per_tree' and
# 'looking_for_needles_in_haystack' are consumed here only; every other key/value
# pair is also forwarded unchanged to Algorithm::DecisionTree->new(), which is
# called once per tree.
#
# Croaks if check_for_illegal_params() returns 0 for the supplied keyword names,
# or if 'how_many_trees' is missing or false.
#
# Returns the blessed hash reference.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
                           if check_for_illegal_params(@params) == 0;

    # Validate the tree count BEFORE it is used to size the per-tree hashes.  This
    # check must not be written as `VALUE || die "...", next_key => ...` inside the
    # hash constructor: die is a list operator and would swallow every key/value
    # pair that follows it, leaving those keys out of the object.
    croak "must specify number of trees" unless $args{how_many_trees};
    my $how_many_trees = $args{how_many_trees};

    # Arguments meant only for this class are stripped before the remainder is
    # handed to the Algorithm::DecisionTree constructor.
    my %dtargs = %args;
    delete @dtargs{qw(how_many_trees
                      how_many_training_samples_per_tree
                      looking_for_needles_in_haystack)};

    my @tree_indexes = 0 .. $how_many_trees - 1;

    my $self = {
        # One independent Algorithm::DecisionTree instance per tree, keyed by tree index.
        _all_trees                             =>  {map {$_ => Algorithm::DecisionTree->new(%dtargs)} @tree_indexes},
        _csv_cleanup_needed                    =>  $args{csv_cleanup_needed} || 0,
        _looking_for_needles_in_haystack       =>  $args{looking_for_needles_in_haystack},
        _how_many_training_samples_per_tree    =>  $args{how_many_training_samples_per_tree},
        _training_datafile                     =>  $args{training_datafile},
        # NOTE: with `|| undef` any false value (including 0) is stored as undef.
        _csv_class_column_index                =>  $args{csv_class_column_index} || undef,
        _csv_columns_for_features              =>  $args{csv_columns_for_features} || undef,
        _how_many_trees                        =>  $how_many_trees,
        _root_nodes                            =>  [],
        # One (initially empty) list per tree, keyed by tree index.
        _training_data_for_trees               =>  {map {$_ => []} @tree_indexes},
        _all_record_ids                        =>  [],
        _training_data_record_indexes          =>  {},
        _classifications                       =>  undef,
        _debug1                                =>  $args{debug1},
    };
    return bless $self, $class;
}

##############################################   Methods  ################################################
sub get_training_data_for_N_trees {
    my $self = shift;
    die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
    my @all_record_ids;
    open FILEIN, $self->{_training_datafile} or die "Unable to open $self->{_training_datafile} $!";
    my $record_index = 0;
    while (<FILEIN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
        push @{$self->{_all_record_ids}}, substr($record, 0, index($record, ','));
        $record_index++;
    }
    close FILEIN;
    $self->{_how_many_total_training_samples} = $record_index - 1;
    print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
    print "\n\nAll record labels: @{$self->{_all_record_ids}}\n" if $self->{_debug1};
    if ($self->{_looking_for_needles_in_haystack}) {



( run in 0.495 second using v1.01-cache-2.11-cpan-f0fbb3f571b )