Algorithm-DecisionTree
view release on metacpan or search on metacpan
ExamplesRandomizedTrees/classify_database_records.pl view on Meta::CPAN
$record_lbl =~ s/^\s*\"|\"\s*$//g;
if (contained_in($record_lbl, @records_to_classify_local)) {
@records_to_classify_local = grep {$_ ne $record_lbl} @records_to_classify_local;
$record_ids_with_class_labels{$record_lbl} = $parts[$csv_class_column_index];
my @fields_of_interest = map {$all_fields[$_]} @{$csv_columns_for_features};
my @feature_vals_of_interest = map {$parts[$_]} @{$csv_columns_for_features};
my @interleaved = ( @fields_of_interest, @feature_vals_of_interest )[ map { $_, $_ + @fields_of_interest } ( 0 .. $#fields_of_interest ) ];
my @features_and_vals = pairmap { "$a=$b" } @interleaved;
$record_ids_with_features_and_vals{$record_lbl} = \@features_and_vals;
}
last if @records_to_classify_local == 0;
$record_index++;
}
close FILEIN;
# Now classify all the records extracted from the database file:
foreach my $record_index (keys %record_ids_with_features_and_vals) {
my $test_sample = $record_ids_with_features_and_vals{$record_index};
$rt->classify_with_all_trees( $test_sample );
my $classification = $rt->get_majority_vote_classification();
printf("\nclassification for %5d: %10s original classification: %s", $record_index, $classification, $record_ids_with_class_labels{$record_index});
}
} else {
print "\nYou are allowed to enter only 'y' or 'n'. Try again.";
}
}
}
####################################### support functions #################################
sub all_record_labels_in_database {
my $filename = shift;
my @record_labels;
open FILEIN, $filename || die "unable to open $filename: $!";
while (<FILEIN>) {
next if /^[ ]*\r?\n?$/;
my $label = substr($_, 0, index($_, ','));
$label =~ s/^\s*\"|\"\s*$//g;
push @record_labels, $label
}
shift @record_labels; # the label in the head record not needed
return \@record_labels;
}
## Introduced in Version 3.21, I wrote this function in response to a need to
## create a decision tree for a very large national econometric database. The
## fields in the CSV file for this database are allowed to be double quoted and such
## fields may contain commas inside them. This function also replaces empty fields
## with the generic string 'NA' as a shorthand for "Not Available". IMPORTANT: This
## function skips over the first field in each record. It is assumed that the first
## field in the first record that defines the feature names is the empty string ("")
## and the same field in all other records is an ID number for the record.
sub cleanup_csv {
my $line = shift;
$line =~ tr/()[]{}/ /;
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
for (@double_quoted) {
my $item = $_;
$item = substr($item, 1, -1);
$item =~ s/^s+|,|\s+$//g;
$item = join '_', split /\s+/, $item;
substr($line, index($line, $_), length($_)) = $item;
}
my @white_spaced = $line =~ /,\s*[^,]+\s+[^,]+\s*,/g;
for (@white_spaced) {
my $item = $_;
$item = substr($item, 0, -1);
$item = join '_', split /\s+/, $item unless $item =~ /,\s+$/;
substr($line, index($line, $_), length($_)) = "$item,";
}
$line =~ s/,\s*(?=,)/,NA/g;
return $line;
}
# checks whether an element is in an array:
sub contained_in {
my $ele = shift;
my @array = @_;
my $count = 0;
map {$count++ if $ele eq $_} @array;
return $count;
}
( run in 0.819 second using v1.01-cache-2.11-cpan-d7f47b0818f )