Algorithm-VSM

 view release on metacpan or  search on metacpan

examples/significance_testing.pl  view on Meta::CPAN

$MAP_Algo_1 /= @$avg_precisions_1;
print "MAP value for LSA-1: $MAP_Algo_1\n";
print "Avg precisions for LSA-1: @$avg_precisions_1\n" 
                                            if $debug_signi;


########################  Algorithm 2  #########################

my $lsa2 = Algorithm::VSM->new( 
                   break_camelcased_and_underscored  => 1,  # default: 1
                   case_sensitive      => 0,                # default: 0 
                   corpus_directory    => $corpus_dir,
                   file_types          => ['.txt', '.java'],
                   lsa_svd_threshold   => $THRESHOLD_2,
                   min_word_length     => 4,
                   query_file          => $query_file,
                   relevancy_file      => $relevancy_file,
                   stop_words_file     => $stop_words_file,
                   want_stemming       => 1,                # default: 0
          );

$lsa2->get_corpus_vocabulary_and_word_counts();
$lsa2->generate_document_vectors();
$lsa2->construct_lsa_model();
$lsa2->upload_document_relevancies_from_file();
$lsa2->precision_and_recall_calculator('lsa');
my $avg_precisions_2 = $lsa2->get_query_sorted_average_precision_for_queries();
my $MAP_Algo_2 = 0;
map {$MAP_Algo_2 += $_} @$avg_precisions_2;
$MAP_Algo_2 /= @$avg_precisions_2;
print "MAP value for LSA-2: $MAP_Algo_2\n";
print "Average precisions for LSA-2: @$avg_precisions_2\n"
                                          if $debug_signi;

#  This is the observed value for the test statistic that will be subject to 
#  significance testing:
my $OBSERVED_t = $MAP_Algo_1 - $MAP_Algo_2;

print "\n\nMAP Difference that will be Subject to Significance Testing: $OBSERVED_t\n\n";
                               

########################   Significance Testing  ######################

my @range = 0..@$avg_precisions_1-1;

if ($debug_signi) {
    my $total_number_of_permutations = 2 ** @range;
    print "\n\nTotal num of permuts $total_number_of_permutations\n\n";
}

#   For each permutation of the algorithm labels over the queries, we 
#   will store the test_statistic in the array \@test_statistic.
my @test_statistic = ();

#  At each iteration, we create a random permutation of the algo_1 and
#  algo_2 labels over the queries as explained on slides 39 and 45 of my
#  tutorial on Significance Testing.  For each assignment of average
#  precision values to algo_1, we calculate the MAP value for algo_1, and
#  the same for algo_2.  The difference between the two MAP values is the
#  value of the test_statistic for that iteration.  Our goal is create
#  test_statistic values for, say, 100,000 iterations of this calculation.

my $iter = 0;
while (1) {
    #  Here is the logic we use for permuting the algo_1 and algo_2 labels
    #  over the average precision values.  We first create a random
    #  permutation of the integers between 0 and the size of the query set.
    #  We refer to this permuted list as permuted_range in what follows.
    #  We then walk through the elements of the list permuted_range and at
    #  each position test when the value at that position is less than or
    #  greater than half the size of the number of queries.  This
    #  determines which of the two avg. precision values for a given query
    #  gets algo_1 label and which gets the algo_2 label.
    my @permuted_range = 0..@range-1;
    fisher_yates_shuffle( \@permuted_range );
    my @algo_1 = ();
    my @algo_2 = ();
    foreach (0..@range-1) {
        if ($permuted_range[$_] < @range / 2.0) {
            push @algo_1, $avg_precisions_1->[$_];
            push @algo_2, $avg_precisions_2->[$_];
        } else {
            push @algo_1, $avg_precisions_2->[$_];
            push @algo_2, $avg_precisions_1->[$_];
        }
    }
    my $MAP_1 = 0;
    my $MAP_2 = 0;
    if ($debug_signi) {
        print "\n\nHere come algo_1 and algo_2 average precisions:\n\n";
        print "\npretend produced by algo 1: @algo_1\n\n";
        print "pretend produced by algo 2: @algo_2\n";
    }
    map {$MAP_1 += $_} @algo_1;
    map {$MAP_2 += $_} @algo_2;
    $MAP_1 /= @range;
    $MAP_2 /= @range;        
    if ($debug_signi) {
        print "\nMAP_1: $MAP_1\n";
        print "MAP_2: $MAP_2\n\n";
    }
    $test_statistic[$iter] = $MAP_1 - $MAP_2;
    last if $iter++ == $MAX_ITERATIONS;
    print "." if $iter % 100 == 0;
}

if ($significance_testing_method eq 'randomization') {
    print "\n\nIn randomization based p-value calculation:\n\n";
    print "test-statistic values for different permutations: @test_statistic\n"
        if $debug_signi;

    #  This count keeps track of how many of the test_statistic values are
    #  less than and greater than the value in $OBSERVED_t
    my $count = 0;
    foreach (@test_statistic) {
        $count++ if $_ <= -1 * abs($OBSERVED_t);
        $count++ if $_ > abs($OBSERVED_t);
    }
    my $p_value = $count / @test_statistic;

    print "\n\n\nTesting the significance of the test statistic: $OBSERVED_t\n\n";



( run in 1.238 second using v1.01-cache-2.11-cpan-96521ef73a4 )