Algorithm-VSM
view release on metacpan or search on metacpan
examples/significance_testing.pl view on Meta::CPAN
push @algo_2, $avg_precisions_1->[$_];
}
}
my $MAP_1 = 0;
my $MAP_2 = 0;
if ($debug_signi) {
print "\n\nHere come algo_1 and algo_2 average precisions:\n\n";
print "\npretend produced by algo 1: @algo_1\n\n";
print "pretend produced by algo 2: @algo_2\n";
}
map {$MAP_1 += $_} @algo_1;
map {$MAP_2 += $_} @algo_2;
$MAP_1 /= @range;
$MAP_2 /= @range;
if ($debug_signi) {
print "\nMAP_1: $MAP_1\n";
print "MAP_2: $MAP_2\n\n";
}
$test_statistic[$iter] = $MAP_1 - $MAP_2;
last if $iter++ == $MAX_ITERATIONS;
print "." if $iter % 100 == 0;
}
# Turn the collected @test_statistic values into a two-sided p-value for the
# observed MAP difference $OBSERVED_t, using the method selected by
# $significance_testing_method ('randomization' or 't-test').
if ($significance_testing_method eq 'randomization') {
    print "\n\nIn randomization based p-value calculation:\n\n";
    print "test-statistic values for different permutations: @test_statistic\n"
        if $debug_signi;
    # Count the test_statistic values that are at least as extreme, in
    # magnitude, as $OBSERVED_t.  Comparing absolute values treats both tails
    # identically (a value equal to +|$OBSERVED_t| is counted just like one
    # equal to -|$OBSERVED_t|) and can never count a single value twice, not
    # even when $OBSERVED_t is 0.
    my $observed_magnitude = abs($OBSERVED_t);
    my $count = grep { abs($_) >= $observed_magnitude } @test_statistic;
    # With no test statistics at all there is nothing to divide by; report
    # "not significant", as the t-test branch does in its degenerate case.
    my $p_value = @test_statistic ? $count / @test_statistic : 1.0;
    print "\n\n\nTesting the significance of the test statistic: $OBSERVED_t\n\n";
    print "\n\np_value for THRESHOLD_1 = $THRESHOLD_1 and THRESHOLD_2 = $THRESHOLD_2: $p_value\n\n";
} elsif ($significance_testing_method eq 't-test') {
    print "\n\nIn Student's t-Test based p-value calculation:\n\n";
    # Single-pass (online) update of the mean and of the population variance
    # of the test statistic values.
    my $mean = 0;
    my $variance = 0;
    my $index = 0;
    for my $value (@test_statistic) {
        $index++;
        my $previous_mean = $mean;
        $mean += ($value - $mean) / $index;
        # Rescale the old variance back to a sum of squares, add the new
        # term, then divide by the new sample count.
        $variance = $variance * ($index - 1)
                  + ($value - $mean) * ($value - $previous_mean);
        $variance /= $index;
    }
    print "\n\nMean for test statistic values: $mean and the variance: $variance\n";
    ###### The following commented out code is for verification:
    # use Statistics::OnLine;
    # my $S = Statistics::OnLine->new;
    # $S->add_data(@test_statistic);
    # my $verifymean = $S->mean;
    # my $verifyvariance = $S->variance;
    # print "\n\nVerification mean for test statistic values: $verifymean and the verification variance: $verifyvariance\n";
    print "\n\nMAP Difference that will be Subject to Significance Testing: $OBSERVED_t\n\n";
    my $normalized_bound;
    my $p_value;
    if ($variance > 0.0000001) {
        # Standardize the observed difference and take the two-sided tail
        # probability under the normal approximation.
        $normalized_bound = ($OBSERVED_t - $mean) / sqrt($variance);
        print "Normalized bound: $normalized_bound\n\n";
        $p_value = 2 * (1 - cumulative_distribution_function(abs($normalized_bound)));
    } else {
        # (Almost) no spread in the test statistics: nothing to normalize
        # by, so report the difference as not significant.
        $p_value = 1.0;
    }
    print "\n\n\nTesting the significance of the test statistic: $OBSERVED_t\n\n";
    print "\n\np_value for THRESHOLD_1 = $THRESHOLD_1 and THRESHOLD_2 = $THRESHOLD_2: $p_value\n\n";
}
############################ Utility Functions #######################
# from perl docs (perlfaq4, "How do I shuffle an array randomly?"):
#
# Shuffles an array in place with the Fisher-Yates algorithm.
#
# Parameter: a reference to the array to be shuffled.
# Returns:   nothing; the referenced array itself is reordered.
sub fisher_yates_shuffle {
    my $arr = shift;
    # An empty array must be rejected up front: the pre-decrement below would
    # turn 0 into -1, which is true, and the loop would run with negative
    # subscripts instead of terminating.
    return unless @$arr;
    my $i = @$arr;
    while (--$i) {
        # Pick a partner from positions 0 .. $i (inclusive) and swap.
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
    return;
}
# Abramowitz and Stegun's high-quality approximation to the normal CDF
# (Handbook of Mathematical Functions, formula 26.2.17).
# The polynomial approximation itself works only for non-negative arguments,
# so a negative argument is evaluated at its absolute value and mapped back
# through the symmetry  Phi(-x) = 1 - Phi(x).
#
# Parameter: $x -- a standard normal deviate (any real number).
# Returns:   the approximate probability that a standard normal variable
#            is less than or equal to $x.
sub cumulative_distribution_function {
    my $x = shift;
    my $magnitude = abs($x);
    my $pi = 4 * atan2(1, 1);
    my $normalized_pdf_value = exp(-($magnitude**2) / 2.0) / sqrt(2 * $pi);
    my $t = 1 / (1 + 0.2316419 * $magnitude);
    # Fifth-degree polynomial in $t (no constant term), in Horner form.
    my $polynomial = $t * ( 0.319381530
                   + $t * (-0.356563782
                   + $t * ( 1.781477937
                   + $t * (-1.821255978
                   + $t *   1.330274429))));
    my $cdf = 1 - $normalized_pdf_value * $polynomial;
    return $x < 0 ? 1 - $cdf : $cdf;
}
( run in 0.558 second using v1.01-cache-2.11-cpan-13bb782fe5a )