Algorithm-DecisionTree

View this release on MetaCPAN or search on MetaCPAN

Examples/classify_test_data_in_a_file.pl  view on Meta::CPAN

### UNCOMMENT THE NEXT STATEMENT if you would like to see
### the decision tree displayed in your terminal window:
#$root_node->display_decision_tree("   ");

# NOW YOU ARE READY TO CLASSIFY THE FILE BASED TEST DATA:
# Classify every sample loaded by get_test_data_from_csv() and write the
# results to $outputfile.  The output format is keyed off the file name:
# a ".csv" extension produces one comma-separated probability row per
# sample; any other name produces a human-readable listing (only the most
# probable class when $show_hard_classifications is set, otherwise all
# classes in decreasing order of probability).
# NOTE: $dt, $root_node, $outputfile, $show_hard_classifications and
# sample_index() are defined earlier in this script, outside this excerpt.
my (@all_class_names, @feature_names, %class_for_sample_hash, %feature_values_for_samples_hash,
    %features_and_values_hash, %features_and_unique_values_hash, 
    %numeric_features_valuerange_hash, %feature_values_how_many_uniques_hash);

get_test_data_from_csv();
# Three-argument open: the former two-argument form interpolated the mode
# into the filename string, so a hostile or accidental leading character in
# $outputfile could have changed the open mode.  Handle name and error
# message are unchanged for compatibility with the rest of the script.
open OUTPUTHANDLE, '>', $outputfile
    or die "Unable to open the file $outputfile for writing out the classification results: $!";
# Banner lines apply only to the human-readable (non-CSV) output format.
if ($show_hard_classifications && ($outputfile !~ /\.csv$/i)) {
    print OUTPUTHANDLE "\nOnly the most probable class shown for each test sample\n\n";
} elsif (!$show_hard_classifications && ($outputfile !~ /\.csv$/i)) {
    print OUTPUTHANDLE "\nThe classification result for each sample ordered in decreasing order of probability\n\n";
}
if ($outputfile =~ /\.csv$/i) {
    # CSV output: header "sample_index,<class>,<class>,..." followed by one
    # probability row per sample, processed in ascending sample-index order.
    my $class_names_csv = join ',', sort @{$dt->{_class_names}};
    my $output_string = "sample_index,$class_names_csv\n";
    print OUTPUTHANDLE "$output_string";
    foreach my $sample (sort {sample_index($a) <=> sample_index($b)} 
                                       keys %feature_values_for_samples_hash) {
        my @test_sample =  @{$feature_values_for_samples_hash{$sample}};
        my %classification = %{$dt->classify($root_node, \@test_sample)};
        my $sample_index = sample_index($sample);
        # 'solution_path' is bookkeeping, not a class entry; remove it before
        # iterating over the per-class probabilities.
        my @solution_path = @{$classification{'solution_path'}};
        delete $classification{'solution_path'};
        # The remaining keys embed the class name after '=' (see the regex
        # below); a lexical sort keeps the columns aligned with the sorted
        # header written above.
        my @which_classes = sort keys %classification;
        $output_string = "$sample_index";
        foreach my $which_class (@which_classes) {
            $which_class =~ /=(.*)/;
            my $class_name = $1;
            my $valuestring = $classification{$which_class};
            $output_string .= ",$valuestring";
        }
        print OUTPUTHANDLE "$output_string\n";
    }
} else {
    # Human-readable output: classes listed in decreasing probability order.
    foreach my $sample (sort {sample_index($a) <=> sample_index($b)} 
                                       keys %feature_values_for_samples_hash) {
        my @test_sample =  @{$feature_values_for_samples_hash{$sample}};
        my %classification = %{$dt->classify($root_node, \@test_sample)};
        my @solution_path = @{$classification{'solution_path'}};
        delete $classification{'solution_path'};
        my @which_classes = keys %classification;
        @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
        my $result_string = "$sample:   ";
        if ($show_hard_classifications) {
            # Only the top-probability class (first after the sort above).
            my $which_class = $which_classes[0];
            $which_class =~ /=(.*)/;
            my $class_name = $1;
            my $valuestring = sprintf("%-20s", $classification{$which_class});
            $result_string .= "$class_name => $valuestring    ";
            print OUTPUTHANDLE "$result_string\n";
        } else {
            # All classes, each padded to a 20-character column.
            foreach my $which_class (@which_classes) {
                $which_class =~ /=(.*)/;
                my $class_name = $1;
                my $valuestring = sprintf("%-20s", $classification{$which_class});
                $result_string .= "$class_name => $valuestring    ";
            }
            print OUTPUTHANDLE "$result_string\n";
        }
    }
}

sub get_test_data_from_csv {
    # Load the file-based test samples from $test_datafile (CSV only).
    # NOTE(review): two-arg bareword open; a three-arg lexical open with an
    # explicit '<' mode would be safer.
    open FILEIN, $test_datafile or die "Unable to open $test_datafile: $!";
    # NOTE(review): the ".csv" extension check runs only after the file has
    # already been opened; checking first would avoid the pointless open.
    die("Aborted. get_test_data_csv() is only for CSV files") 
                                           unless $test_datafile =~ /\.csv$/;
    # The "- 1" converts the configured class-column index to 0-based --
    # presumably the user-supplied value is 1-based; TODO confirm.
    my $class_name_in_column = $dt->{_csv_class_column_index} - 1; 
    my @all_data =  <FILEIN>;

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

    # Shuffle both record lists so the emitted CSV files are not grouped
    # by class.
    fisher_yates_shuffle(\@training_data_records);
    fisher_yates_shuffle(\@test_data_records);
    if ($self->{_debug}) {
        # Records carry their own trailing newlines, hence the bare prints.
        foreach my $record (@training_data_records) {
            print "$record";
        }
        foreach my $record (@test_data_records) {
            print "$record";
        }
    }
    # NOTE(review): two-arg bareword open with no error check -- failure to
    # create the training CSV file would go unnoticed.
    open OUTPUT, ">$self->{_output_training_csv_file}";
    my @feature_names_training = @{$self->{_features_ordered}};
    my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
    # Header row: empty first cell (sample-index column), then class_name,
    # then the double-quoted feature names.
    my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
    print OUTPUT "$first_row_training\n";
    foreach my $i (0..@training_data_records-1) {
        my $i1 = $i+1;    # sample indices written to the file are 1-based
        my $sample_record = "\"$i1\",$training_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
    # NOTE(review): same unchecked two-arg open for the test CSV file.
    open OUTPUT, ">$self->{_output_test_csv_file}";
    # NOTE(review): the training header used the ordered array in
    # {_features_ordered}, but this header uses hash keys, whose order is
    # unspecified in Perl.  If the records themselves were built in the
    # ordered sequence, the test file's columns may be mislabeled -- verify
    # against the record-construction code.
    my @feature_names_testing = keys %{$self->{_features_with_value_range}};
    my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
    my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
    print OUTPUT "$first_row_testing\n";
    foreach my $i (0..@test_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$test_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
}

# from perl docs:                                                                         
# In-place Fisher-Yates shuffle of the array referenced by the single
# argument -- the classic algorithm as given in the Perl FAQ.  Relies on
# rand(), so results vary per run unless srand() was seeded.
sub fisher_yates_shuffle {
    my $arr =  shift;
    my $i = @$arr;                        # element count (scalar context)
    while (--$i) {                        # loop stops once $i reaches 0
        my $j = int rand( $i + 1 );       # 0 <= $j <= $i
        @$arr[$i, $j] = @$arr[$j, $i];    # swap elements via array slice
    }

lib/Algorithm/DecisionTree.pm  view on Meta::CPAN

sub read_parameter_file_symbolic {
    # Parse the symbolic-data parameter file: extract the class names, the
    # class priors, and (in code beyond this excerpt) the per-feature and
    # bias specifications, storing them on $self.
    my $self = shift;
    my $debug = $self->{_debug};
    my $number_of_training_samples = $self->{_number_of_samples_for_training};
    my $input_parameter_file = $self->{_parameter_file};
    croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
    my $output_file_training = $self->{_output_training_datafile};
    my $output_file_testing = $self->{_output_test_datafile};
    my @all_params;
    my $param_string;
    # NOTE(review): '||' binds tighter than the comma, so this parses as
    # open(INPUT, $input_parameter_file || "unable to ...").  The "error
    # message" string is never issued and a failed open goes undetected;
    # it should read "open ... or die ...", preferably three-arg.
    open INPUT, $input_parameter_file || "unable to open parameter file: $!";
    @all_params = <INPUT>;
    @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
    # s/// in the grep both strips the CR/LF line endings and (as a side
    # effect of grep) keeps every line, since the zero-width match succeeds.
    @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
    $param_string = join ' ', @all_params;
    my ($class_names, $class_priors, $rest_param) = 
              $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
    # NOTE(review): $1/$2 are reused below instead of the $class_names /
    # $class_priors captures assigned just above -- equivalent, but fragile
    # if any pattern match is inserted in between.
    my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
    push @{$self->{_class_names}}, @class_names;
    my @class_priors =   grep {defined($_) && length($_) > 0} split /\s+/, $2;
    push @{$self->{_class_priors}}, @class_priors;    
    my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

}

sub display_all_plots {
    # Plot the regression results: write the training points plus each
    # stored per-node prediction set into a temp data file, then drive
    # gnuplot (Graphics::GnuplotIF) to display them and save a hardcopy
    # as regression_plots.png.
    my $self = shift;
    my $ncols = $self->{_XMatrix}->cols;
    # Remove any hardcopy left over from a previous run.
    unlink "regression_plots.png" if -e "regression_plots.png";
    my $master_datafile = $self->{_training_datafile};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    # NOTE(review): two-arg bareword open; three-arg lexical would be safer.
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!\n";
    if ($ncols == 2) {
        # 2-D case (single predictor column plotted against Y).
        my @predictor_entries = $self->{_XMatrix}->col(0)->as_list;
        my @dependent_val_vals = $self->{_YVector}->col(0)->as_list;
        # Dataset 0: the raw (x, y) training points.  NOTE(review): map in
        # void context used purely for side effects; a for loop is clearer.
        map {print OUTPUT "$predictor_entries[$_] $dependent_val_vals[$_]\n"} 0 .. $self->{_XMatrix}->rows - 1;
        # A blank-line pair separates gnuplot "index" datasets.
        print OUTPUT "\n\n";
        foreach my $plot (sort {$a <=> $b} keys %{$self->{_output_for_plots}}) {
            # Each plot entry holds parallel arrayrefs: [0] = x values,
            # [1] = predicted y values.
            map {print OUTPUT "$self->{_output_for_plots}->{$plot}->[0]->[$_] $self->{_output_for_plots}->{$plot}->[1]->[$_]\n"} 0 .. @{$self->{_output_for_plots}->{$plot}->[0]} - 1;
            print OUTPUT "\n\n"
        }
        close OUTPUT;
        my $gplot = Graphics::GnuplotIF->new( persist => 1 );
        my $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"regression_plots.png\"");        
        $gplot->gnuplot_cmd( "set noclip" );
        $gplot->gnuplot_cmd( "set pointsize 2" );
        # Build one "plot" argument per dataset index in the temp file.
        my $arg_string = "";
        foreach my $i (0 .. scalar(keys %{$self->{_output_for_plots}})) {
            if ($i == 0) {            
                $arg_string .= "\"$temp_file\" index $i using 1:2 notitle with points lt -1 pt 1, ";
            } elsif ($i == 1) {

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

        # NOTE(review): the intent is to strip the trailing ", " left by the
        # loop above, but this line stores the *match result* (1 or '') in
        # $arg_string; only the next line's $1 recovers the wanted text, and
        # a failed match would leave $1 stale from an earlier match.  A
        # direct  $arg_string =~ s/,[ ]+$//;  would be the clean fix.
        $arg_string = $arg_string =~ /^(.*),[ ]+$/;
        $arg_string = $1;
        $hardcopy_plot->gnuplot_cmd( "plot $arg_string" );
        $gplot->gnuplot_cmd( "plot $arg_string" );
        $gplot->gnuplot_pause(-1);   # keep the interactive window open until dismissed
    } elsif ($ncols == 3) {
        # 3-D case (two predictor columns): same temp-file "index" layout as
        # the 2-D branch, but with x1 x2 y rows for surface plotting.
        my @dependent_val_vals = $self->{_YVector}->col(0)->as_list;
        foreach my $i (0 .. $self->{_XMatrix}->rows - 1) {
            my @onerow = $self->{_XMatrix}->row($i)->as_list;
            # Drop the last X column before printing -- presumably the
            # constant/bias column; TODO confirm against XMatrix layout.
            pop @onerow;
            print OUTPUT "@onerow $dependent_val_vals[$i]\n";
        }
        print OUTPUT "\n\n";
        foreach my $plot (sort {$a <=> $b} keys %{$self->{_output_for_surface_plots}}) {
            # Parallel arrayrefs: [0] = predictor coordinates, [1] = predictions.
            my @plot_data = @{$self->{_output_for_surface_plots}->{$plot}};
            my @predictors = @{$plot_data[0]};
            my @predictions = @{$plot_data[1]};
            map {print OUTPUT "$predictors[$_] $predictions[$_]\n"} 0 .. @predictions - 1;
            print OUTPUT "\n\n"
        }
        close OUTPUT;
        my $gplot = Graphics::GnuplotIF->new( persist => 1 );
        my $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"regression_plots.png\"");        
        $gplot->gnuplot_cmd( "set noclip" );
        $gplot->gnuplot_cmd( "set pointsize 2" );
        my $arg_string = "";
        foreach my $i (0 .. scalar(keys %{$self->{_output_for_surface_plots}})) {
            if ($i == 0) {            
                $arg_string .= "\"$temp_file\" index $i using 1:2:3 notitle with points lt -1 pt 1, ";
            } elsif ($i == 1) {



( run in 0.287 second using v1.01-cache-2.11-cpan-4e96b696675 )