AI-Calibrate


Changes

1.3   Fri Nov 4
      - Removed dependency on Test::Deep, added explicit declaration of
        dependency on Test::More to Makefile.PL

1.2   Thu Nov 3
      - Fixed test ./t/AI-Calibrate-NB.t so that it no longer fails
        spuriously.  It used to call is_deeply, which failed on slight
        differences between floating point numbers; it now compares with a
        small tolerance.

1.1   Thu Feb 28 19:00:06 2008
      - Added new function print_mapping
      - Added new test file AI-Calibrate-NB.t which, if AI::NaiveBayes1 is
        present, trains a classifier and calibrates it.

1.0   Thu Feb 05 11:37:31 2008
      - First public release to CPAN.

0.01  Thu Jan 24 11:37:31 2008
	- original version; created by h2xs 1.23 with options
		-XA -n AI::Calibrate

lib/AI/Calibrate.pm


# This allows declaration:
#	use AI::Calibrate ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = (
    'all' => [
        qw(
              calibrate
              score_prob
              print_mapping
            )
    ]
);

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw( );

use constant DEBUG => 0;

# Structure slot names

lib/AI/Calibrate.pm

Scores will be in descending numerical order.  See the DESCRIPTION section for
how this structure is interpreted.  You can pass this structure to the
B<score_prob> function, along with a new score, to get a probability.
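
For example (with purely illustrative numbers), the returned structure is a
reference to an array of [score, probability] pairs such as:

  [ [0.9, 1.0],
    [0.7, 0.5] ]

meaning that scores of 0.9 and above map to probability 1.0, scores in the
range [0.7, 0.9) map to 0.5, and scores below 0.7 map to 0.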

=cut

sub calibrate {
    my($data, $sorted) = @_;

    if (DEBUG) {
        print "Original data:\n";
        for my $pair (@$data) {
            my($score, $prob) = @$pair;
            print "($score, $prob)\n";
        }
    }

    #  Copy the data over so PAV can clobber the PROB field
    my $new_data = [ map([@$_], @$data) ];

    #   If not already sorted, sort data decreasing by score
    if (!$sorted) {
        $new_data = [ sort { $b->[SCORE] <=> $a->[SCORE] } @$new_data ];
    }

    PAV($new_data);

    if (DEBUG) {
        print("After PAV, vector is:\n");
        print_vector($new_data);
    }

    my(@result);
    my( $last_prob, $last_score);

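    #  Append a sentinel with an impossibly low score and probability 0 so
    #  the loop below emits the last positive-probability segment.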
    push(@$new_data, [-1e10, 0]);

    for my $pair (@$new_data) {
        print "Seeing @$pair\n" if DEBUG;
        my($score, $prob) = @$pair;
        if (defined($last_prob) and $prob < $last_prob) {
            print("Pushing [$last_score, $last_prob]\n") if DEBUG;
            push(@result, [$last_score, $last_prob] );
        }
        $last_prob = $prob;
        $last_score = $score;
    }

    return \@result;
}


sub PAV {
    my ( $result ) = @_;

    for ( my $i = 0; $i < @$result - 1; $i++ ) {
        if ( $result->[$i][PROB] < $result->[ $i + 1 ][PROB] ) {
            $result->[$i][PROB] =
                ( $result->[$i][PROB] + $result->[ $i + 1 ][PROB] ) / 2;
            $result->[ $i + 1 ][PROB] = $result->[$i][PROB];
            print "Averaging elements $i and ", $i + 1, "\n" if DEBUG;

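            # Averaging this pair may break monotonicity with earlier
            # elements, so merge (flatten) backwards until the sequence is
            # non-increasing again.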
            for ( my $j = $i - 1; $j >= 0; $j-- ) {
                if ( $result->[$j][PROB] < $result->[ $i + 1 ][PROB] ) {
                    my $d = ( $i + 1 ) - $j + 1;
                    flatten( $result, $j, $d );
                }
                else {
                    last;
                }
            }
        }
    }
}

sub print_vector {
    my($vec) = @_;
    for my $pair (@$vec) {
        print join(", ", @$pair), "\n";
    }
}


sub flatten {
    my ( $vec, $start, $len ) = @_;
    if (DEBUG) {
        print "Flatten called on vec, $start, $len\n";
        print "Vector before: \n";
        print_vector($vec);
    }

    my $sum = 0;
    for my $i ( $start .. $start + $len-1 ) {
        $sum += $vec->[$i][PROB];
    }
    my $avg = $sum / $len;
    print "Sum = $sum, avg = $avg\n" if DEBUG;
    for my $i ( $start .. $start + $len -1) {
        $vec->[$i][PROB] = $avg;
    }
    if (DEBUG) {
        print "Vector after: \n";
        print_vector($vec);
    }
}

=item B<score_prob>

This is a simple utility function that takes the structure returned by
B<calibrate>, along with a new score, and returns the probability estimate.
Example calling form:

  $p = score_prob($calibrated, $score);

Once you have a trained, calibrated classifier, you could imagine using it
like this:

 $calibrated = calibrate( $calibration_set );
 print "Input instances, one per line:\n";
 while (<>) {
    chomp;
    my(@fields) = split;
    my $score = classifier(@fields);
    my $prob = score_prob($calibrated, $score);
    print "Estimated probability: $prob\n";
 }

=cut

sub score_prob {
    my($calibrated, $score) = @_;

    my $last_prob = 1.0;

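    #  Thresholds are in descending score order; return the probability of
    #  the first segment whose lower bound this score reaches.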
    for my $tuple (@$calibrated) {
        my($bound, $prob) = @$tuple;
        return $prob if $score >= $bound;
        $last_prob = $prob;
    }
    #  If we drop off the end, probability estimate is zero
    return 0;
}


=item B<print_mapping>

This is a simple utility function that takes the structure returned by
B<calibrate> and prints out a simple list of lines describing the mapping
created.

Example calling form:

  print_mapping($calibrated);

Sample output:

  1.00 > SCORE >= 1.00     prob = 1.000
  1.00 > SCORE >= 0.71     prob = 0.667
  0.71 > SCORE >= 0.39     prob = 0.000
  0.39 > SCORE >= 0.00     prob = 0.000

These ranges are not necessarily compressed/optimized, as this sample output
shows.

=back

=cut
sub print_mapping {
    my($calibrated) = @_;
    my $last_bound = 1.0;
    for my $tuple (@$calibrated) {
        my($bound, $prob) = @$tuple;
        printf("%0.3f > SCORE >= %0.3f     prob = %0.3f\n",
               $last_bound, $bound, $prob);
        $last_bound = $bound;
    }
    if ($last_bound != 0) {
        printf("%0.3f > SCORE >= %0.3f     prob = %0.3f\n",
               $last_bound, 0, 0);
    }
}

=head1 DETAILS

The PAV algorithm is conceptually straightforward.  Given a set of training
cases ordered by the scores assigned by the classifier, it first assigns a
probability of one to each positive instance and a probability of zero to each
negative instance, and puts each instance in its own group.  It then looks, at

lib/AI/Calibrate.pm

values.  It continues this process of averaging and replacement until the
entire sequence is monotonically decreasing.  The result is a sequence of
instances, each of which has a score and an associated probability estimate,
which can then be used to map scores into probability estimates.
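
As a small worked example (with purely illustrative numbers), suppose the
calibration set, sorted by descending score, is:

  score:  0.9   0.8   0.7   0.6
  class:  pos   neg   pos   neg

PAV starts with the probabilities 1, 0, 1, 0 and pools the adjacent
violators at scores 0.8 and 0.7 into their average, giving the monotone
sequence 1, 0.5, 0.5, 0.  B<calibrate> then collapses runs of equal
probability, so the returned mapping is:

  [ [0.9, 1.0],
    [0.7, 0.5] ]

with scores below 0.7 mapping to probability 0.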

For further information on the PAV algorithm, you can read the section in my
paper referenced below.

=head1 EXPORT

This module exports three functions: calibrate, score_prob and print_mapping.
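
None of them is exported by default, so import what you need explicitly, or
use the C<:all> tag:

  use AI::Calibrate qw(calibrate score_prob print_mapping);
  # or equivalently:
  use AI::Calibrate ':all';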

=head1 BUGS

None known.  This implementation is straightforward but inefficient (its time
is O(n^2) in the length of the data series).  A linear time algorithm is
known, and in a later version of this module I'll probably implement it.
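
For reference, a linear-time version can be written with the standard
stack-of-blocks formulation.  The sketch below is not part of this module;
it assumes the same PROB slot constant and input already sorted by
descending score:

  # Sketch only (not part of this module): linear-time PAV that pools
  # adjacent violators using a stack of [sum of probs, count] blocks.
  sub PAV_linear {
      my ($data) = @_;
      my @blocks;
      for my $pair (@$data) {
          push @blocks, [ $pair->[PROB], 1 ];
          # Merge while the newest block's average exceeds its predecessor's
          while (@blocks > 1
                 && $blocks[-2][0] / $blocks[-2][1]
                  < $blocks[-1][0] / $blocks[-1][1]) {
              my ($sum, $count) = @{ pop @blocks };
              $blocks[-1][0] += $sum;
              $blocks[-1][1] += $count;
          }
      }
      my $i = 0;    # write the pooled averages back into the PROB slots
      for my $block (@blocks) {
          my $avg = $block->[0] / $block->[1];
          $data->[ $i++ ][PROB] = $avg for 1 .. $block->[1];
      }
  }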

=head1 SEE ALSO

The AI::NaiveBayes1 perl module.

t/AI-Calibrate-1.t


srand;

sub deeply_approx {
    # Like Test::More::is_deeply but uses approx() to compare elements.
    my( $got, $expected ) = @_;
    my $EPSILON = 1.0e-6;
    sub max {  $_[0] > $_[1] ? $_[0] : $_[1] }
    sub approx {
        my($x, $y) = @_;
        print("approx($x, $y)\n");
        if ($x == 0 and $y == 0) {
            return(1);
        } else {
            return(abs($x-$y) / max($x,$y) < $EPSILON);
        }
    }
    for my $i (0 .. $#{$got}) {
        my $g = $got->[$i];
        if (defined($expected->[$i])) {
            my $e = $expected->[$i];

t/AI-Calibrate-1.t

    [.5,  3/4 ],
    [.45, 2/3 ],
    [.35, 2/3 ],
    [.3,  1/2 ],
    [.2,  1/3 ],
    [.02,   0 ],
    [.00001, 0]
);


print "Using this mapping:\n";
print_mapping($calibrated_got);
print "\n";

for my $pair (@test_estimates) {
    my($score, $prob_expected) = @$pair;
    my $prob_got = score_prob($calibrated_got, $score);
    is($prob_got, $prob_expected, "score_prob test @$pair");
}

t/AI-Calibrate-KL.t

0.998 > SCORE >= 0.505     prob = 0.900
0.505 > SCORE >= 0.475     prob = 0.667
0.475 > SCORE >= 0.425     prob = 0.500
0.425 > SCORE >= 0.359     prob = 0.385
0.359 > SCORE >= 0.000     prob = 0.000
";

my $output = '';
open TOOUTPUT, '>', \$output or die "Can't open TOOUTPUT: $!";
my $stdout = select(TOOUTPUT);
print_mapping($calibrated_got);
close(TOOUTPUT);
select $stdout;

is(trim($output), trim($expected_mapping), "printed mapping");

t/AI-Calibrate-NB.t

    my($attrs, $play) = @$inst;

    my $ph = $nb->predict(attributes=>$attrs);

    my $play_score = $ph->{"play=yes"};
    push(@points, [$play_score, ($play eq "yes" ? 1 : 0)]);
}

my $calibrated = calibrate(\@points, 0); # not sorted

print "Mapping:\n";
print_mapping($calibrated);

my(@expected) =
  (
   [0.779495793582905, 1],
   [0.535425255450615, 0.666666666666667]
  );

for my $i (0 .. $#expected) {
    print "$i = @{$expected[$i]}\n";
}

# This fails because two numbers differ at the 15th digit:
# is_deeply($calibrated, \@expected, "Naive Bayes calibration test");

sub close_enough {
    my($x, $y) = @_;
    return(abs($x - $y) < 1.0e-5);
}

sub lists_close_enough {
    my($got, $expected) = @_;
    if (@$got != @$expected) {
        return 0;
    }
    for my $i (0 .. $#{$got}) {
        for my $elem (0, 1) {
            if (! close_enough($got->[$i][$elem], $expected->[$i][$elem])) {
                diag(sprintf( "Got: %f\n", $got->[$i]));
                diag(sprintf( "Expected: %f\n", $expected->[$i]));
                return 0;
            }
        }
    }
    return 1;
}

ok(lists_close_enough($calibrated, \@expected),
   'Calibration of NB1 results');


