Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

Examples/classify_test_data_in_a_file.pl  view on Meta::CPAN

    # Distinct class labels.  The filter tests length() rather than plain
    # truthiness so that a label of "0" is kept; empty labels are still dropped.
    @all_class_names = grep { !$seen{$_}++ && length($_) }  values %class_for_sample_hash;
    print "\n All class names: @all_class_names\n" if $debug;
    %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    %features_and_unique_values_hash = ();
    # For every feature, record its distinct values, how many there are, and --
    # when every value looks like a decimal -- whatever minmax() returns for them.
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        # String-sorted distinct values, ignoring 'NA' and empty entries.  A
        # literal "0" is kept (a `$_ if ...` filter would have discarded it).
        my @unique_values_for_feature = sort grep { $_ ne 'NA' && !$seen1{$_}++ && length($_) }
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        # Number of values that are not of the form "<digits>.<digits>".
        # NOTE(review): the library modules use a broader numeric pattern (sign,
        # integers, exponent); confirm that the narrower test here is intended.
        my $non_float_count = grep { $_ !~ /^\d*\.\d+$/ } @unique_values_for_feature;
        if ($non_float_count == 0) {
            # presumably the (min, max) pair of the values -- minmax() is defined elsewhere
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($debug) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

ExamplesBagging/bagging_for_bulk_classification.pl  view on Meta::CPAN

    # Distinct class labels.  The filter tests length() rather than plain
    # truthiness so that a label of "0" is kept; empty labels are still dropped.
    @all_class_names = grep { !$seen{$_}++ && length($_) }  values %class_for_sample_hash;
    print "\n All class names: @all_class_names\n" if $debug;
    %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    %features_and_unique_values_hash = ();
    # For every feature, record its distinct values, how many there are, and --
    # when every value looks like a decimal -- whatever minmax() returns for them.
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        # String-sorted distinct values, ignoring 'NA' and empty entries.  A
        # literal "0" is kept (a `$_ if ...` filter would have discarded it).
        my @unique_values_for_feature = sort grep { $_ ne 'NA' && !$seen1{$_}++ && length($_) }
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        # Number of values that are not of the form "<digits>.<digits>".
        # NOTE(review): the library modules use a broader numeric pattern (sign,
        # integers, exponent); confirm that the narrower test here is intended.
        my $non_float_count = grep { $_ !~ /^\d*\.\d+$/ } @unique_values_for_feature;
        if ($non_float_count == 0) {
            # presumably the (min, max) pair of the values -- minmax() is defined elsewhere
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($debug) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

ExamplesBoosting/boosting_for_bulk_classification.pl  view on Meta::CPAN

    # Distinct class labels.  The filter tests length() rather than plain
    # truthiness so that a label of "0" is kept; empty labels are still dropped.
    @all_class_names = grep { !$seen{$_}++ && length($_) }  values %class_for_sample_hash;
    print "\n All class names: @all_class_names\n" if $debug;
    %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    %features_and_unique_values_hash = ();
    # For every feature, record its distinct values, how many there are, and --
    # when every value looks like a decimal -- whatever minmax() returns for them.
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        # String-sorted distinct values, ignoring 'NA' and empty entries.  A
        # literal "0" is kept (a `$_ if ...` filter would have discarded it).
        my @unique_values_for_feature = sort grep { $_ ne 'NA' && !$seen1{$_}++ && length($_) }
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        # Number of values that are not of the form "<digits>.<digits>".
        # NOTE(review): the library modules use a broader numeric pattern (sign,
        # integers, exponent); confirm that the narrower test here is intended.
        my $non_float_count = grep { $_ !~ /^\d*\.\d+$/ } @unique_values_for_feature;
        if ($non_float_count == 0) {
            # presumably the (min, max) pair of the values -- minmax() is defined elsewhere
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($debug) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

lib/Algorithm/BoostedDecisionTree.pm  view on Meta::CPAN

        }
    }
    my %features_and_unique_values_hash = ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %numeric_features_valuerange_hash   =   ();
    # Pattern for a number: optional sign, integer or decimal mantissa, optional exponent.
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    # For every feature, record its distinct values, how many there are, and --
    # when every value matches $numregex -- whatever minmax() returns for them.
    foreach my $feature (keys %features_and_values_hash) {
        my %seen = ();
        # Distinct values in first-seen order, ignoring 'NA' and empty entries.
        # A literal "0" is kept (a `$_ if ...` filter would have discarded it).
        my @unique_values_for_feature =
            grep { $_ ne 'NA' && !$seen{$_}++ && length($_) } @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        # Number of values that do not look numeric; zero means a numeric feature.
        my $non_numeric_count = grep { $_ !~ /^$numregex$/ } @unique_values_for_feature;
        if ($non_numeric_count == 0) {
            # presumably the (min, max) pair of the values -- minmax() is defined elsewhere
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    # Store references to the class names, feature names, per-sample class
    # labels, per-sample feature values and per-feature value lists on the
    # entry keyed 0 of $self->{_all_trees}.  References are stored, not copies.
    $self->{_all_trees}->{0}->{_class_names} = \@all_class_names;
    $self->{_all_trees}->{0}->{_feature_names} = \@feature_names;
    $self->{_all_trees}->{0}->{_samples_class_label_hash} = \%class_for_sample_hash;
    $self->{_all_trees}->{0}->{_training_data_hash}  =  \%feature_values_for_samples_hash;
    $self->{_all_trees}->{0}->{_features_and_values_hash}    =  \%features_and_values_hash;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    # Refuse to continue when check_names_used() rejects the supplied
    # "feature=value" strings.
    die "\n\nError in the names you have used for features and/or values.  " .
        "Try using the csv_cleanup_needed option in the constructor call."
                        unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    # Rewrite every "feature=value" item.  For a feature whose known values are
    # all numeric but which has no entry in the numeric-feature distribution
    # hash, the value is replaced by closest_sampling_point() of the known values.
    foreach my $feature_and_value (@features_and_values) {
        # Capture in list context and verify the match, so that stale $1/$2
        # from an earlier match can never be picked up.
        my ($feature, $value) = $feature_and_value =~ /$pattern/
            or die "\n\nCannot parse '$feature_and_value' as a feature=value pair";
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        # Number of known values that do not look numeric.
        my $non_numeric_count = grep { $_ !~ /^$numregex$/ } @unique_values_for_feature;
        # Direct hash lookup instead of scanning the list of keys.
        if (!exists($self->{_prob_distribution_numeric_features_hash}->{$feature}) &&
                                                                       $non_numeric_count == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    # One entry per class name, initially undefined.
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    my $pattern3 = '(.+)>(.+)';
    # Symbolic features are the feature names that have no entry in the
    # numeric-feature distribution hash.
    my @all_symbolic_features =
        grep { ! exists $self->{_prob_distribution_numeric_features_hash}->{$_} }
             @{$self->{_feature_names}};
    # First capture of $pattern1 for every branch item that matches it.
    my @symbolic_features_already_used =
        map { /$pattern1/ ? ($1) : () } @features_and_values_or_thresholds_on_branch;
    # Symbolic features that have not appeared on the branch so far.
    my @symbolic_features_not_yet_used =
        grep { my $candidate = $_; ! contained_in($candidate, @symbolic_features_already_used) }
             @all_symbolic_features;
    # Accumulators that start out empty.
    my @true_numeric_types = ();
    my @symbolic_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@features_and_values_or_thresholds_on_branch) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern3/) {

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    # Build feature-name => [values] pairs, one per column index listed in
    # _csv_columns_for_features.  Values that consist only of digits are
    # reformatted with one decimal place ("%.1f"); all others are kept as is.
    # NOTE(review): the name is read at index $a while the value is read at
    # index $a-1; confirm that the two arrays are offset by one.
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
    my %numeric_features_valuerange_hash = ();
    my %feature_values_how_many_uniques_hash = ();
    my %features_and_unique_values_hash = ();
    # Pattern for a number: optional sign, integer or decimal mantissa, optional exponent.
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    # For every feature, record its distinct values, how many there are, and --
    # when every value matches $numregex -- whatever minmax() returns for them.
    foreach my $feature (keys %features_and_values_hash) {
        my %seen1 = ();
        # String-sorted distinct values, ignoring 'NA' and empty entries.  A
        # literal "0" is kept (a `$_ if ...` filter would have discarded it).
        my @unique_values_for_feature = sort grep { $_ ne 'NA' && !$seen1{$_}++ && length($_) }
                                                   @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        # Number of values that do not look numeric; zero means a numeric feature.
        my $non_numeric_count = grep { $_ !~ /^$numregex$/ } @unique_values_for_feature;
        if ($non_numeric_count == 0) {
            # presumably the (min, max) pair of the values -- minmax() is defined elsewhere
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($self->{_debug1}) {
        print "\nAll class names: @all_class_names\n";
        print "\nEach sample data record:\n";
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";

lib/Algorithm/DecisionTreeWithBagging.pm  view on Meta::CPAN

    print "all class names: @all_class_names\n" if $self->{_debug2};
    # One empty hash per bag index (0 .. _how_many_bags - 1) for each statistic.
    my %numeric_features_valuerange_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
    my %feature_values_how_many_uniques_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
    my %features_and_unique_values_hash_bags   =   map {$_ => {}} 0 .. $self->{_how_many_bags} - 1;
    # Pattern for a number: optional sign, integer or decimal mantissa, optional exponent.
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    # For every bag and every feature in it, record the distinct values, how
    # many there are, and -- when every value matches $numregex -- whatever
    # minmax() returns for them.
    foreach my $i (0 .. $self->{_how_many_bags} - 1) {
        foreach my $feature (keys %{$features_and_values_hash_bags{$i}}) {
            my %seen = ();
            # Distinct values in first-seen order, ignoring 'NA' and empty
            # entries.  A literal "0" is kept (a `$_ if ...` filter would
            # have discarded it).
            my @unique_values_for_feature_in_bag =
                grep { $_ ne 'NA' && !$seen{$_}++ && length($_) } @{$features_and_values_hash_bags{$i}{$feature}};
            $feature_values_how_many_uniques_hash_bags{$i}->{$feature} = scalar @unique_values_for_feature_in_bag;
            # Number of values that do not look numeric; zero means a numeric feature.
            my $non_numeric_count = grep { $_ !~ /^$numregex$/ } @unique_values_for_feature_in_bag;
            if ($non_numeric_count == 0) {
                # presumably the (min, max) pair of the values -- minmax() is defined elsewhere
                my @minmaxvalues = minmax(\@unique_values_for_feature_in_bag);
                $numeric_features_valuerange_hash_bags{$i}->{$feature} = \@minmaxvalues;
            }
            $features_and_unique_values_hash_bags{$i}->{$feature} = \@unique_values_for_feature_in_bag;
        }
    }
    if ($self->{_debug2}) {
        print "\nDisplaying value ranges for numeric features in each bag:\n\n";
        foreach my $bag_index (keys  %numeric_features_valuerange_hash_bags) {        
            my %keyval = %{$numeric_features_valuerange_hash_bags{$bag_index}};

lib/Algorithm/RandomizedTreesForBigData.pm  view on Meta::CPAN

    }
    # One empty hash per tree index (0 .. _how_many_trees - 1) for each statistic.
    my $numeric_features_valuerange_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
    my $feature_values_how_many_uniques_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
    my $features_and_unique_values_all_trees = {map {my $t = $_; $t => {}} 0 .. $self->{_how_many_trees} - 1};
    # Pattern for a number: optional sign, integer or decimal mantissa, optional exponent.
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    # For every tree and every feature (in sorted name order), record the
    # distinct non-'NA' values, their count, and -- when all of them match
    # $numregex -- whatever minmax() returns for them.
    foreach my $t (0 .. $self->{_how_many_trees} - 1) {
        my $values_for_tree = $features_and_values_all_trees->{$t};
        foreach my $feature (sort keys %{$values_for_tree}) {
            # De-duplicate through the keys of a scratch hash.
            my %value_seen;
            @value_seen{ @{$values_for_tree->{$feature}} } = ();
            my @unique_values_for_feature = grep { $_ ne 'NA' } keys %value_seen;
            $feature_values_how_many_uniques_all_trees->{$t}->{$feature} = scalar @unique_values_for_feature;
            # Number of values that do not look numeric; zero means a numeric feature.
            my $non_numeric_count = grep { $_ !~ /^$numregex$/ } @unique_values_for_feature;
            if ($non_numeric_count == 0) {
                my @minmaxvalues = minmax(\@unique_values_for_feature);
                $numeric_features_valuerange_all_trees->{$t}->{$feature} = \@minmaxvalues;
            }
            $features_and_unique_values_all_trees->{$t}->{$feature} = \@unique_values_for_feature;
        }
    }
    if ($self->{_debug1}) {
        print "\nDisplaying value ranges for numeric features for all trees:\n\n";
        foreach my $tree_index (keys  %{$numeric_features_valuerange_all_trees}) {        
            my %keyval = %{$numeric_features_valuerange_all_trees->{$tree_index}};

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

    my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_-1]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : ...
    # Build feature-name => [values] pairs, one per column index listed in
    # _predictor_columns.  Values that consist only of digits are reformatted
    # with one decimal place ("%.1f"); all others are kept as is.
    my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a-1] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_predictor_columns}};
    my %numeric_features_valuerange_hash   =   ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %features_and_unique_values_hash = ();
    # Pattern for a number: optional sign, integer or decimal mantissa, optional exponent.
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    # For every feature, record its distinct values, how many there are, and --
    # when every value matches $numregex -- whatever minmax() returns for them.
    foreach my $feature (keys %features_and_values_hash) {
        my %seen = ();
        # Distinct values in first-seen order, ignoring 'NA' and empty entries.
        # A literal "0" is kept (a `$_ if ...` filter would have discarded it).
        my @unique_values_for_feature =
            grep { $_ ne 'NA' && !$seen{$_}++ && length($_) } @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        # Number of values that do not look numeric; zero means a numeric feature.
        my $non_numeric_count = grep { $_ !~ /^$numregex$/ } @unique_values_for_feature;
        if ($non_numeric_count == 0) {
            # presumably the (min, max) pair of the values -- minmax() is defined elsewhere
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    if ($self->{_debug1_r}) {
        print "\nDependent var values: @dependent_var_values\n";
        print "\nEach sample data record:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
            print "$kee    =>   @{$feature_values_for_samples_hash{$kee}}\n";



( run in 1.158 second using v1.01-cache-2.11-cpan-cc502c75498 )