Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

ExamplesRandomizedTrees/classify_database_records.pl  view on Meta::CPAN

                $record_lbl  =~ s/^\s*\"|\"\s*$//g;
                if (contained_in($record_lbl, @records_to_classify_local)) {
                    @records_to_classify_local = grep {$_ ne $record_lbl} @records_to_classify_local;
                    $record_ids_with_class_labels{$record_lbl} = $parts[$csv_class_column_index];
                    my @fields_of_interest = map {$all_fields[$_]} @{$csv_columns_for_features};
                    my @feature_vals_of_interest = map {$parts[$_]} @{$csv_columns_for_features};
                    my @interleaved = ( @fields_of_interest, @feature_vals_of_interest )[ map { $_, $_ + @fields_of_interest } ( 0 .. $#fields_of_interest ) ];
                    my @features_and_vals = pairmap { "$a=$b" } @interleaved;
                    $record_ids_with_features_and_vals{$record_lbl} = \@features_and_vals;
                }
                last if @records_to_classify_local == 0;
                $record_index++; 
            }
            close FILEIN;
            # Now classify all the records extracted from the database file:    
            foreach my $record_index (keys %record_ids_with_features_and_vals) {
                my $test_sample = $record_ids_with_features_and_vals{$record_index};
                $rt->classify_with_all_trees( $test_sample );
                my $classification = $rt->get_majority_vote_classification();
                printf("\nclassification for %5d: %10s       original classification: %s", $record_index, $classification, $record_ids_with_class_labels{$record_index});
            }
        } else {
            print "\nYou are allowed to enter only 'y' or 'n'.  Try again.";
        }
    }
}

####################################### support functions #################################

sub all_record_labels_in_database {
    my $filename = shift;
    my @record_labels;
    open FILEIN, $filename || die "unable to open $filename: $!";        
    while (<FILEIN>) {
        next if /^[ ]*\r?\n?$/;
        my $label = substr($_, 0, index($_, ','));
        $label  =~ s/^\s*\"|\"\s*$//g;
        push @record_labels, $label
    }
    shift @record_labels;     # the label in the head record not needed
    return \@record_labels;
}

##  Introduced in Version 3.21, I wrote this function in response to a need to
##  create a decision tree for a very large national econometric database.  The
##  fields in the CSV file for this database are allowed to be double quoted and such
##  fields may contain commas inside them.  This function also replaces empty fields
##  with the generic string 'NA' as a shorthand for "Not Available".  IMPORTANT: This
##  function skips over the first field in each record.  It is assumed that the first
##  field in the first record that defines the feature names is the empty string ("")
##  and the same field in all other records is an ID number for the record.
sub cleanup_csv {
    my $line = shift;
    $line =~ tr/()[]{}/      /;
    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
    for (@double_quoted) {
        my $item = $_;
        $item = substr($item, 1, -1);
        $item =~ s/^s+|,|\s+$//g;
        $item = join '_',  split /\s+/, $item;
        substr($line, index($line, $_), length($_)) = $item;
    }
    my @white_spaced = $line =~ /,\s*[^,]+\s+[^,]+\s*,/g;
    for (@white_spaced) {
        my $item = $_;
        $item = substr($item, 0, -1);
        $item = join '_',  split /\s+/, $item unless $item =~ /,\s+$/;
        substr($line, index($line, $_), length($_)) = "$item,";
    }
    $line =~ s/,\s*(?=,)/,NA/g;
    return $line;
}

# checks whether an element is in an array:
sub contained_in {
    my $ele = shift;
    my @array = @_;
    my $count = 0;
    map {$count++ if $ele eq $_} @array;
    return $count;
}



( run in 0.819 second using v1.01-cache-2.11-cpan-d7f47b0818f )