Algorithm-VSM

 view release on metacpan or  search on metacpan

examples/significance_testing.pl  view on Meta::CPAN

#!/usr/bin/perl -w

##  significance_testing.pl

##  See Item 11 in the README of the `examples' directory.

use strict;
use Algorithm::VSM;


my $debug_signi = 0;

die "Must supply one command-line argument, which must either be 'randomization' or 't-test'\n" 
    unless @ARGV == 1;
my $significance_testing_method = shift @ARGV;
die "The command-line argument must either be 'randomization' or " .
    "'t-test' for this module to be useful\n"
             if ($significance_testing_method ne 'randomization') and
                ($significance_testing_method ne 't-test');

print "Proceeding with significance testing based on $significance_testing_method\n";
    
my $MAX_ITERATIONS = 100000;
my $THRESHOLD_1    = 0.02;               # for LSA-1
my $THRESHOLD_2    = 0.12;               # for LSA-2

my $corpus_dir = "corpus";  

my $query_file      = "test_queries.txt";
my $stop_words_file = "stop_words.txt";  
my $relevancy_file   = "relevancy.txt"; 


#   Significance testing is applied to the output of two retrieval
#   algorithms.  We want to know if the difference between the MAP values
#   for the two algorithms are statistically significant.  Our example here
#   is based to LSA retrieval algorithms with different values for the
#   singular value acceptance threshold lsa_svd_threshold.  Under the 
#   null hypothesis, we assume that the two algorithms are the same.  
#   Our test statistic will be the difference between the MAP values.

########################  Algorithm 1  #########################

my $lsa1 = Algorithm::VSM->new( 
                   break_camelcased_and_underscored  => 1,  # default: 1
                   case_sensitive      => 0,                # default: 0 
                   corpus_directory    => $corpus_dir,
                   file_types          => ['.txt', '.java'],
                   lsa_svd_threshold   => $THRESHOLD_1,
                   min_word_length     => 4,
                   query_file          => $query_file,
                   relevancy_file      => $relevancy_file,
                   stop_words_file     => $stop_words_file,
                   want_stemming       => 1,                # default: 0
          );

$lsa1->get_corpus_vocabulary_and_word_counts();
$lsa1->generate_document_vectors();
$lsa1->construct_lsa_model();
$lsa1->upload_document_relevancies_from_file();
$lsa1->precision_and_recall_calculator('lsa');
my $avg_precisions_1 = $lsa1->get_query_sorted_average_precision_for_queries();
my $MAP_Algo_1 = 0;
map {$MAP_Algo_1 += $_} @$avg_precisions_1;
$MAP_Algo_1 /= @$avg_precisions_1;
print "MAP value for LSA-1: $MAP_Algo_1\n";
print "Avg precisions for LSA-1: @$avg_precisions_1\n" 
                                            if $debug_signi;


########################  Algorithm 2  #########################

my $lsa2 = Algorithm::VSM->new( 
                   break_camelcased_and_underscored  => 1,  # default: 1
                   case_sensitive      => 0,                # default: 0 
                   corpus_directory    => $corpus_dir,
                   file_types          => ['.txt', '.java'],
                   lsa_svd_threshold   => $THRESHOLD_2,
                   min_word_length     => 4,
                   query_file          => $query_file,
                   relevancy_file      => $relevancy_file,
                   stop_words_file     => $stop_words_file,
                   want_stemming       => 1,                # default: 0
          );

$lsa2->get_corpus_vocabulary_and_word_counts();
$lsa2->generate_document_vectors();
$lsa2->construct_lsa_model();
$lsa2->upload_document_relevancies_from_file();
$lsa2->precision_and_recall_calculator('lsa');
my $avg_precisions_2 = $lsa2->get_query_sorted_average_precision_for_queries();
my $MAP_Algo_2 = 0;
map {$MAP_Algo_2 += $_} @$avg_precisions_2;
$MAP_Algo_2 /= @$avg_precisions_2;
print "MAP value for LSA-2: $MAP_Algo_2\n";
print "Average precisions for LSA-2: @$avg_precisions_2\n"
                                          if $debug_signi;

#  This is the observed value for the test statistic that will be subject to 
#  significance testing:
my $OBSERVED_t = $MAP_Algo_1 - $MAP_Algo_2;

print "\n\nMAP Difference that will be Subject to Significance Testing: $OBSERVED_t\n\n";
                               

########################   Significance Testing  ######################

my @range = 0..@$avg_precisions_1-1;

if ($debug_signi) {
    my $total_number_of_permutations = 2 ** @range;
    print "\n\nTotal num of permuts $total_number_of_permutations\n\n";
}

#   For each permutation of the algorithm labels over the queries, we 
#   will store the test_statistic in the array \@test_statistic.
my @test_statistic = ();

#  At each iteration, we create a random permutation of the algo_1 and
#  algo_2 labels over the queries as explained on slides 39 and 45 of my
#  tutorial on Significance Testing.  For each assignment of average
#  precision values to algo_1, we calculate the MAP value for algo_1, and
#  the same for algo_2.  The difference between the two MAP values is the
#  value of the test_statistic for that iteration.  Our goal is create
#  test_statistic values for, say, 100,000 iterations of this calculation.

my $iter = 0;
while (1) {
    #  Here is the logic we use for permuting the algo_1 and algo_2 labels
    #  over the average precision values.  We first create a random
    #  permutation of the integers between 0 and the size of the query set.
    #  We refer to this permuted list as permuted_range in what follows.
    #  We then walk through the elements of the list permuted_range and at
    #  each position test when the value at that position is less than or
    #  greater than half the size of the number of queries.  This
    #  determines which of the two avg. precision values for a given query
    #  gets algo_1 label and which gets the algo_2 label.
    my @permuted_range = 0..@range-1;
    fisher_yates_shuffle( \@permuted_range );
    my @algo_1 = ();
    my @algo_2 = ();
    foreach (0..@range-1) {
        if ($permuted_range[$_] < @range / 2.0) {
            push @algo_1, $avg_precisions_1->[$_];
            push @algo_2, $avg_precisions_2->[$_];
        } else {
            push @algo_1, $avg_precisions_2->[$_];
            push @algo_2, $avg_precisions_1->[$_];
        }
    }
    my $MAP_1 = 0;
    my $MAP_2 = 0;
    if ($debug_signi) {
        print "\n\nHere come algo_1 and algo_2 average precisions:\n\n";
        print "\npretend produced by algo 1: @algo_1\n\n";
        print "pretend produced by algo 2: @algo_2\n";
    }
    map {$MAP_1 += $_} @algo_1;
    map {$MAP_2 += $_} @algo_2;
    $MAP_1 /= @range;
    $MAP_2 /= @range;        
    if ($debug_signi) {
        print "\nMAP_1: $MAP_1\n";
        print "MAP_2: $MAP_2\n\n";
    }
    $test_statistic[$iter] = $MAP_1 - $MAP_2;
    last if $iter++ == $MAX_ITERATIONS;
    print "." if $iter % 100 == 0;
}

if ($significance_testing_method eq 'randomization') {
    print "\n\nIn randomization based p-value calculation:\n\n";
    print "test-statistic values for different permutations: @test_statistic\n"
        if $debug_signi;

    #  This count keeps track of how many of the test_statistic values are
    #  less than and greater than the value in $OBSERVED_t
    my $count = 0;
    foreach (@test_statistic) {
        $count++ if $_ <= -1 * abs($OBSERVED_t);
        $count++ if $_ > abs($OBSERVED_t);
    }
    my $p_value = $count / @test_statistic;

    print "\n\n\nTesting the significance of the test statistic: $OBSERVED_t\n\n";
  
    print "\n\np_value for THRESHOLD_1 = $THRESHOLD_1 and THRESHOLD_2 = $THRESHOLD_2:   $p_value\n\n";

} elsif ($significance_testing_method eq 't-test') {
    print "\n\nIn Student's t-Test based p-value calculation:\n\n";

    my $mean = 0;
    my $variance = 0;
    my $previous_mean = 0;
    my $index = 0;
    map {    $index++;
             $previous_mean = $mean;
             $mean += ($_-$mean)/$index; 
             $variance = $variance*($index-1)+($_-$mean)*($_-$previous_mean);
             $variance /= $index;
        } @test_statistic;

    print "\n\nMean for test statistic values: $mean  and the variance: $variance\n";
###### The following commented out code is for verification:
#    use Statistics::OnLine;
#    my $S = Statistics::OnLine->new;
#    $S->add_data(@test_statistic);
#    my $verifymean = $S->mean;
#    my $verifyvariance = $S->variance;
#    print "\n\nVerification mean for test statistic values: $verifymean  and the verification variance: $verifyvariance\n";

    print "\n\nMAP Difference that will be Subject to Significance Testing: $OBSERVED_t\n\n";

    my $normalized_bound;
    my $p_value;
    if ($variance > 0.0000001) {
        $normalized_bound = ($OBSERVED_t - $mean) / sqrt($variance);
        print "Normalized bound: $normalized_bound\n\n";
        $p_value = 2*(1-cumulative_distribution_function(abs($normalized_bound)));
    } else {
        $p_value = 1.0;
    }
    print "\n\n\nTesting the significance of the test statistic: $OBSERVED_t\n\n";
    print "\n\np_value for THRESHOLD_1 = $THRESHOLD_1 and THRESHOLD_2 = $THRESHOLD_2:   $p_value\n\n";
}

############################  Utility Functions   #######################

# from perl docs:                                                              
sub fisher_yates_shuffle {                                                     
    my $arr =  shift;                                                          
    my $i = @$arr;                                                             
    while (--$i) {                                                             
        my $j = int rand( $i + 1 );                                            



( run in 1.515 second using v1.01-cache-2.11-cpan-0bd6704ced7 )