Algorithm-VSM

 view release on metacpan or  search on metacpan

examples/significance_testing.pl  view on Meta::CPAN

            push @algo_2, $avg_precisions_1->[$_];
        }
    }
    my $MAP_1 = 0;
    my $MAP_2 = 0;
    if ($debug_signi) {
        print "\n\nHere come algo_1 and algo_2 average precisions:\n\n";
        print "\npretend produced by algo 1: @algo_1\n\n";
        print "pretend produced by algo 2: @algo_2\n";
    }
    map {$MAP_1 += $_} @algo_1;
    map {$MAP_2 += $_} @algo_2;
    $MAP_1 /= @range;
    $MAP_2 /= @range;        
    if ($debug_signi) {
        print "\nMAP_1: $MAP_1\n";
        print "MAP_2: $MAP_2\n\n";
    }
    $test_statistic[$iter] = $MAP_1 - $MAP_2;
    last if $iter++ == $MAX_ITERATIONS;
    print "." if $iter % 100 == 0;
}

#  Report a two-sided p-value for the observed MAP difference ($OBSERVED_t)
#  measured against the test-statistic values collected in @test_statistic.
#  $significance_testing_method selects how that p-value is obtained:
#
#    'randomization' -- the fraction of collected test-statistic values that
#                       are at least as extreme as the observed one
#    't-test'        -- the observed value is normalized by the mean and the
#                       standard deviation of the collected values and then
#                       looked up in the normal CDF approximation
if ($significance_testing_method eq 'randomization') {
    print "\n\nIn randomization based p-value calculation:\n\n";
    print "test-statistic values for different permutations: @test_statistic\n"
        if $debug_signi;

    #  Count the test_statistic values whose magnitude is at least that of
    #  $OBSERVED_t.  Both tails use the same inclusive comparison so that a
    #  value equal to +|t| is treated exactly like a value equal to -|t|.
    my $observed_magnitude = abs($OBSERVED_t);
    my $count = grep { abs($_) >= $observed_magnitude } @test_statistic;

    #  With nothing to compare against there is no evidence either way, so
    #  report 1.0 instead of dividing by zero.
    my $p_value = @test_statistic ? $count / @test_statistic : 1.0;

    print "\n\n\nTesting the significance of the test statistic: $OBSERVED_t\n\n";
  
    print "\n\np_value for THRESHOLD_1 = $THRESHOLD_1 and THRESHOLD_2 = $THRESHOLD_2:   $p_value\n\n";

} elsif ($significance_testing_method eq 't-test') {
    print "\n\nIn Student's t-Test based p-value calculation:\n\n";

    #  Single-pass running mean and variance.  After each value is folded in,
    #  $variance holds the variance (divided by the number of values seen so
    #  far) of everything processed up to that point.
    my $mean = 0;
    my $variance = 0;
    my $index = 0;
    for my $value (@test_statistic) {
        $index++;
        my $previous_mean = $mean;
        $mean += ($value-$mean)/$index;
        $variance = $variance*($index-1)+($value-$mean)*($value-$previous_mean);
        $variance /= $index;
    }

    print "\n\nMean for test statistic values: $mean  and the variance: $variance\n";
###### The following commented out code is for verification:
#    use Statistics::OnLine;
#    my $S = Statistics::OnLine->new;
#    $S->add_data(@test_statistic);
#    my $verifymean = $S->mean;
#    my $verifyvariance = $S->variance;
#    print "\n\nVerification mean for test statistic values: $verifymean  and the verification variance: $verifyvariance\n";

    print "\n\nMAP Difference that will be Subject to Significance Testing: $OBSERVED_t\n\n";

    my $normalized_bound;
    my $p_value;
    if ($variance > 0.0000001) {
        #  Express the observed difference in standard-deviation units and
        #  double the upper-tail probability to get a two-sided p-value.
        $normalized_bound = ($OBSERVED_t - $mean) / sqrt($variance);
        print "Normalized bound: $normalized_bound\n\n";
        $p_value = 2*(1-cumulative_distribution_function(abs($normalized_bound)));
    } else {
        #  The collected values are (numerically) all the same, so the
        #  normalization is undefined; fall back to "not significant".
        $p_value = 1.0;
    }
    print "\n\n\nTesting the significance of the test statistic: $OBSERVED_t\n\n";
    print "\n\np_value for THRESHOLD_1 = $THRESHOLD_1 and THRESHOLD_2 = $THRESHOLD_2:   $p_value\n\n";
}

############################  Utility Functions   #######################

# from perl docs:
#
# Shuffles the array referenced by the single argument in place, using the
# Fisher-Yates algorithm driven by Perl's rand().  Nothing useful is returned;
# the caller's array is modified directly.
sub fisher_yates_shuffle {
    my $arr = shift;

    # Arrays with fewer than two elements have nothing to swap.  The guard is
    # essential for the empty array: the countdown below would start under
    # zero and never reach the value that ends the loop.
    return if @$arr < 2;

    my $i = @$arr;
    while (--$i) {
        # Pick a partner anywhere in positions 0 .. $i (possibly $i itself).
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
    return;
}

#  Abramowitz and Stegun's high-quality approximation to the normal CDF
#  (formula 26.2.17 in their Handbook; stated absolute error below 7.5e-8).
#
#  Takes a single number x and returns an approximation to the probability
#  that a standard normal variable is <= x.
#
#  The polynomial itself is valid only for non-negative arguments, so it is
#  always evaluated at |x|; negative arguments are then answered through the
#  symmetry  Phi(-x) = 1 - Phi(x).
sub cumulative_distribution_function {
    my $x = shift;
    my $PI = 4 * atan2(1, 1);
    my $magnitude = abs($x);
    my $normalized_pdf_value = exp(-($magnitude**2)/2.0) / sqrt(2*$PI);
    my $t = 1 / (1 + 0.2316419 * $magnitude);
    # Approximate probability mass lying beyond |x| in one tail.
    my $tail = $normalized_pdf_value * (  0.319381530*$t
                                        - 0.356563782*($t**2)
                                        + 1.781477937*($t**3)
                                        - 1.821255978*($t**4)
                                        + 1.330274429*($t**5));
    return $x >= 0 ? 1 - $tail : $tail;
}



( run in 0.558 second using v1.01-cache-2.11-cpan-13bb782fe5a )