AI-Perceptron-Simple
view release on metacpan or search on metacpan
lib/AI/Perceptron/Simple.pm view on Meta::CPAN
=head2 tame ( ... )
=head2 exercise ( ... )
=head2 train ( $stimuli_train_csv, $expected_output_header, $save_nerve_to_file )
=head2 train ( $stimuli_train_csv, $expected_output_header, $save_nerve_to_file, $display_stats, $identifier )
Trains the perceptron.
C<$stimuli_train_csv> is the set of data / input (in CSV format) to train the perceptron while C<$save_nerve_to_file> is
the filename that will be generated each time the perceptron finishes the training process. This data file holds the data of the C<AI::Perceptron::Simple>
object and it is used in the C<validate> method.
C<$expected_output_header> is the header name of the column in the csv file with the actual category or the expected values. This is used to determine whether to tune the nerve up or down. This value should only be 0 or 1 for the sake of simplicity.
C<$display_stats> is B<optional> and the default is 0. It will display more output about the tuning process. It will show the following:
=over 4
=item tuning status
Indicates the nerve was tuned up, down or no tuning needed
=item old sum
The original sum of all C<weightage * input> or C<dendrite_size * binary_input>
=item threshold
The threshold of the nerve
=item new sum
The new sum of all C<weightage * input> after fine-tuning the nerve
=back
If C<$display_stats> is specified i.e. set to C<1>, then you B<MUST> specify the C<$identifier>. C<$identifier> is the column / header name that is used to identify a specific row of data in C<$stimuli_train_csv>.
=cut
sub tame {
# Synonym of train(); hands the complete argument list through unchanged.
return train( @_ );
}
sub exercise {
# Synonym of train(); hands the complete argument list through unchanged.
return train( @_ );
}
# Trains the perceptron against a CSV of stimuli, tuning the nerve's weights
# row by row until each row's output matches its expected value.
# NOTE(review): this sub is truncated in the visible source; the loop exits for
# the remaining output/expected combinations and the saving of the nerve to
# $save_nerve_to_file are presumably below — confirm against the full file.
sub train {
my $self = shift;
my( $stimuli_train_csv, $expected_output_header, $save_nerve_to_file, $display_stats, $identifier ) = @_;
# stats display is off unless the caller explicitly turns it on
$display_stats = 0 if not defined $display_stats;
# the stats output labels each row by $identifier, so it must be given when stats are on
if ( $display_stats and not defined $identifier ) {
croak "Please specifiy a string for \$identifier if you are trying to display stats";
}
# CSV processing is all according to the documentation of Text::CSV
open my $data_fh, "<:encoding(UTF-8)", $stimuli_train_csv
or croak "Can't open $stimuli_train_csv: $!";
my $csv = Text::CSV->new( {auto_diag => 1, binary => 1} );
# first row is the header; it names the columns for getline_hr below
my $attrib = $csv->getline($data_fh);
$csv->column_names( $attrib );
# individual row
ROW: while ( my $row = $csv->getline_hr($data_fh) ) {
# debugging aid (disabled); original labels were mojibake and have been removed
# print $row->{book_name}, " -> ";
# print $row->{$expected_output_header} ? "<label 1>\n" : "<label 0>\n";
# calculate the output and fine tune parameters if necessary
# re-evaluate the same row until its output agrees with the expected value
while (1) {
my $output = _calculate_output( $self, $row );
#print "Sum = ", $output, "\n";
# $expected_output_header to be checked together over here
# if output >= threshold
# then category/result aka output is considered 1
# else output considered 0
# output expected/actual tuning
# 0 0 -
# 1 0 down
# 0 1 up
# 1 1 -
if ( ($output >= $self->threshold) and ( $row->{$expected_output_header} eq 0 ) ) {
# nerve fired but should not have: weaken the weights
_tune( $self, $row, TUNE_DOWN );
if ( $display_stats ) {
print $row->{$identifier}, "\n";
print " -> TUNED DOWN";
print " Old sum = ", $output;
print " Threshold = ", $self->threshold;
print " New Sum = ", _calculate_output( $self, $row ), "\n";
}
} elsif ( ($output < $self->threshold) and ( $row->{$expected_output_header} eq 1 ) ) {
# nerve stayed quiet but should have fired: strengthen the weights
_tune( $self, $row, TUNE_UP );
if ( $display_stats ) {
print $row->{$identifier}, "\n";
print " -> TUNED UP";
print " Old sum = ", $output;
print " Threshold = ", $self->threshold;
print " New Sum = ", _calculate_output( $self, $row ), "\n";
}
} elsif ( ($output < $self->threshold) and ( $row->{$expected_output_header} eq 0 ) ) {
# correct prediction (0/0): nothing to tune, move to the next row
if ( $display_stats ) {
print $row->{$identifier}, "\n";
print " -> NO TUNING NEEDED";
print " Sum = ", _calculate_output( $self, $row );
print " Threshold = ", $self->threshold, "\n";
}
next ROW;
lib/AI/Perceptron/Simple.pm view on Meta::CPAN
=head2 _real_validate_or_test ( $data_hash_ref )
This is where the actual validation or testing takes place.
C<$data_hash_ref> is the list of parameters passed into the C<validate> or C<test> methods.
This is a B<method>, so use the OO way. This is one of the exceptions to the rules where private subroutines are treated as methods :)
=cut
# Carries out the actual validation/testing: fills the predicted column of the
# stimuli CSV and writes the result back out. Called by validate() and test()
# with their parameter hash ref.
sub _real_validate_or_test {
my $self = shift;
my $data_hash_ref = shift;

#####
# both of these keys are mandatory, complain about all the absent ones at once
my @missing_keys = grep { not exists $data_hash_ref->{ $_ } }
qw( stimuli_validate predicted_column_index );
croak "Missing keys: @missing_keys" if @missing_keys;
#####

my $stimuli_validate = $data_hash_ref->{ stimuli_validate };
my $predicted_index = $data_hash_ref->{ predicted_column_index };

# actual processing starts here
# with no results_write_to given, overwrite the validation file itself
my $output_file = $stimuli_validate;
$output_file = $data_hash_ref->{ results_write_to }
if defined $data_hash_ref->{ results_write_to };

# open for writing results
my $aoa = csv( in => $stimuli_validate, encoding => ":encoding(utf-8)" );
my $header_row = shift @$aoa; # 'remove' the header, it's annoying :)

$aoa = _fill_predicted_values( $self, $stimuli_validate, $predicted_index, $aoa );

# put back the array of headers before saving file
unshift @$aoa, $header_row;

print "Saving data to $output_file\n";
csv( in => $aoa, out => $output_file, encoding => ":encoding(utf-8)" );
print "Done saving!\n";
}
=head2 &_fill_predicted_values ( $self, $stimuli_validate, $predicted_index, $aoa )
This is where the filling in of the predicted values takes place. Take note that the parameter names are the same as the ones used in the C<validate> and C<test> methods.
This subroutine should be called in the procedural way.
=cut
# Writes the nerve's prediction (1 if the computed sum reaches the threshold,
# otherwise 0) into column $predicted_index of every row of $aoa, reading the
# stimuli back from $stimuli_validate. Returns the modified $aoa.
# Called procedurally, not as a method.
sub _fill_predicted_values {
my ( $self, $stimuli_validate, $predicted_index, $aoa ) = @_;

# CSV processing is all according to the documentation of Text::CSV
open my $data_fh, "<:encoding(UTF-8)", $stimuli_validate
or croak "Can't open $stimuli_validate: $!";

my $csv = Text::CSV->new( {auto_diag => 1, binary => 1} );
# header row supplies the column names for getline_hr
$csv->column_names( $csv->getline($data_fh) );

# walk the data rows in step with the rows of $aoa
my $row_index = 0;
while ( my $data = $csv->getline_hr($data_fh) ) {
# 1 when the nerve fires (sum >= threshold), 0 otherwise
$aoa->[ $row_index ][ $predicted_index ]
= ( _calculate_output( $self, $data ) >= $self->threshold ) ? 1 : 0;
$row_index++;
}

close $data_fh;
$aoa;
}
=head1 RESULTS RELATED SUBROUTINES/METHODS
This part is related to generating the confusion matrix.
=head2 get_exam_results ( ... )
The parameters and usage are the same as C<get_confusion_matrix>. See the next method.
=head2 get_confusion_matrix ( \%options )
Returns the confusion matrix in the form of a hash. The hash will contain these keys: C<true_positive>, C<true_negative>, C<false_positive>, C<false_negative>, C<accuracy>, C<sensitivity>. More stats like C<precision>, C<specificity> and C<F1_Score> ...
If you are trying to manipulate the confusion matrix hash or something, take note that all the stats are in percentage (%) in decimal (if any) except the total entries.
For C<%options>, the followings are needed unless mentioned:
=over 4
=item full_data_file => $filled_test_file
This is the CSV file filled with the predicted values.
Make sure that you don't do anything to the actual and predicted output in this file after testing the nerve. These two columns must contain binary values only!
=item actual_output_header => $actual_column_name
=item predicted_output_header => $predicted_column_name
The binary values are treated as follows:
=over 4
=item C<0> is negative
=item C<1> is positive
=back
=item more_stats => 1
Optional.
Setting it to C<1> will process more stats that are usually not so important eg. C<precision>, C<specificity> and C<F1_Score>
=back
=cut
sub get_exam_results {
# Synonym of get_confusion_matrix(); same parameters, same return value.
my ( $self, $info ) = @_;
return $self->get_confusion_matrix( $info );
}
sub get_confusion_matrix {
my ( $self, $info ) = @_;
# _collect_stats does the real work: totals, accuracy, sensitivity etc.
my %matrix = _collect_stats( $info );
return %matrix;
}
=head2 &_collect_stats ( \%options )
Generates a hash of confusion matrix based on C<%options> given in the C<get_confusion_matrix> method.
=cut
# Builds the confusion-matrix hash from the filled test/validation CSV.
# NOTE(review): this sub is truncated in the visible source; the closing of the
# more_stats block and the return of %c_matrix are presumably below — confirm
# against the full file.
sub _collect_stats {
my $info = shift;
my $file = $info->{ full_data_file };
my $actual_header = $info->{ actual_output_header };
my $predicted_header = $info->{ predicted_output_header };
# any defined value switches the extra stats on, not only the documented 1
my $more_stats = defined ( $info->{ more_stats } ) ? 1 : 0;
# start every counter at 0 so empty files still produce a complete hash
my %c_matrix = (
true_positive => 0, true_negative => 0, false_positive => 0, false_negative => 0,
accuracy => 0, sensitivity => 0
);
# CSV processing is all according to the documentation of Text::CSV
open my $data_fh, "<:encoding(UTF-8)", $file
or croak "Can't open $file: $!";
my $csv = Text::CSV->new( {auto_diag => 1, binary => 1} );
my $attrib = $csv->getline($data_fh); # get the row of headers, can't specify any column
# shouldn't be a problem, since we're reading line by line :)
$csv->column_names( $attrib );
# individual row
while ( my $row = $csv->getline_hr($data_fh) ) {
# don't pack this part into another subroutine, number of rows can be very big
# tally each actual/predicted pair into one of the four matrix cells
if ( $row->{ $actual_header } == 1 and $row->{ $predicted_header } == 1 ) {
# true positive
$c_matrix{ true_positive }++;
} elsif ( $row->{ $actual_header } == 0 and $row->{ $predicted_header } == 0 ) {
# true negative
$c_matrix{ true_negative }++;
} elsif ( $row->{ $actual_header } == 1 and $row->{ $predicted_header } == 0 ) {
# false negative
$c_matrix{ false_negative }++;
} elsif ( $row->{ $actual_header } == 0 and $row->{ $predicted_header } == 1 ) {
# false positive
$c_matrix{ false_positive }++;
} else {
# non-binary values mean the columns were altered after testing
croak "Something's wrong!\n".
"Make sure that the actual and predicted values in your file are binary ie 0 or 1" ;
}
}
close $data_fh;
# derive the percentage stats from the raw counts, writing into %c_matrix
_calculate_total_entries( \%c_matrix );
_calculate_sensitivity( \%c_matrix );
_calculate_accuracy( \%c_matrix );
if ( $more_stats == 1 ) {
_calculate_precision( \%c_matrix );
_calculate_specificity( \%c_matrix );
_calculate_f1_score( \%c_matrix );
# unimplemented, some more left
_calculate_negative_predicted_value( \%c_matrix ); #
_calculate_false_negative_rate( \%c_matrix ); #
_calculate_false_positive_rate( \%c_matrix ); #
( run in 1.706 second using v1.01-cache-2.11-cpan-39bf76dae61 )