view release on metacpan or search on metacpan
examples/significance_testing.pl view on Meta::CPAN
#!/usr/bin/perl -w
## significance_testing.pl
## See Item 11 in the README of the `examples' directory.
use strict;
use Algorithm::VSM;

# Set to 1 for verbose diagnostics throughout the significance test.
my $debug_signi = 0;

# Exactly one command-line argument is required: the testing method.
die "Must supply one command-line argument, which must either be 'randomization' or 't-test'\n"
    unless @ARGV == 1;
my $significance_testing_method = shift @ARGV;

# Only the two supported method names are accepted.
die "The command-line argument must either be 'randomization' or " .
    "'t-test' for this module to be useful\n"
    unless grep { $significance_testing_method eq $_ } qw(randomization t-test);
print "Proceeding with significance testing based on $significance_testing_method\n";
examples/significance_testing.pl view on Meta::CPAN
# Build the LSA model for Algorithm 1 and compute its retrieval quality.
# NOTE(review): $lsa1 is an Algorithm::VSM instance constructed in a part
# of the script not shown here — presumably with a first SVD threshold.
$lsa1->generate_document_vectors();
$lsa1->construct_lsa_model();
$lsa1->upload_document_relevancies_from_file();
$lsa1->precision_and_recall_calculator('lsa');
# Per-query average precisions, sorted by query label (arrayref).
my $avg_precisions_1 = $lsa1->get_query_sorted_average_precision_for_queries();
# MAP = mean of the per-query average precisions.
my $MAP_Algo_1 = 0;
map {$MAP_Algo_1 += $_} @$avg_precisions_1;
$MAP_Algo_1 /= @$avg_precisions_1;
print "MAP value for LSA-1: $MAP_Algo_1\n";
print "Avg precisions for LSA-1: @$avg_precisions_1\n"
if $debug_signi;
######################## Algorithm 2 #########################
my $lsa2 = Algorithm::VSM->new(
break_camelcased_and_underscored => 1, # default: 1
case_sensitive => 0, # default: 0
corpus_directory => $corpus_dir,
file_types => ['.txt', '.java'],
lsa_svd_threshold => $THRESHOLD_2,
examples/significance_testing.pl view on Meta::CPAN
# Build the LSA model for Algorithm 2 (the $lsa2 instance constructed just
# above with lsa_svd_threshold => $THRESHOLD_2) and compute its MAP value
# the same way as for Algorithm 1.
$lsa2->generate_document_vectors();
$lsa2->construct_lsa_model();
$lsa2->upload_document_relevancies_from_file();
$lsa2->precision_and_recall_calculator('lsa');
# Per-query average precisions, sorted by query label (arrayref).
my $avg_precisions_2 = $lsa2->get_query_sorted_average_precision_for_queries();
# MAP = mean of the per-query average precisions.
my $MAP_Algo_2 = 0;
map {$MAP_Algo_2 += $_} @$avg_precisions_2;
$MAP_Algo_2 /= @$avg_precisions_2;
print "MAP value for LSA-2: $MAP_Algo_2\n";
print "Average precisions for LSA-2: @$avg_precisions_2\n"
if $debug_signi;
# This is the observed value for the test statistic that will be subject to
# significance testing:
my $OBSERVED_t = $MAP_Algo_1 - $MAP_Algo_2;
print "\n\nMAP Difference that will be Subject to Significance Testing: $OBSERVED_t\n\n";
######################## Significance Testing ######################
# One index per query; used below to permute algorithm labels over queries.
my @range = 0..@$avg_precisions_1-1;
if ($debug_signi) {
my $total_number_of_permutations = 2 ** @range;
print "\n\nTotal num of permuts $total_number_of_permutations\n\n";
}
# For each permutation of the algorithm labels over the queries, we
# will store the test_statistic in the array \@test_statistic.
my @test_statistic = ();
# At each iteration, we create a random permutation of the algo_1 and
# algo_2 labels over the queries as explained on slides 39 and 45 of my
examples/significance_testing.pl view on Meta::CPAN
if ($permuted_range[$_] < @range / 2.0) {
push @algo_1, $avg_precisions_1->[$_];
push @algo_2, $avg_precisions_2->[$_];
} else {
push @algo_1, $avg_precisions_2->[$_];
push @algo_2, $avg_precisions_1->[$_];
}
}
my $MAP_1 = 0;
my $MAP_2 = 0;
if ($debug_signi) {
print "\n\nHere come algo_1 and algo_2 average precisions:\n\n";
print "\npretend produced by algo 1: @algo_1\n\n";
print "pretend produced by algo 2: @algo_2\n";
}
map {$MAP_1 += $_} @algo_1;
map {$MAP_2 += $_} @algo_2;
$MAP_1 /= @range;
$MAP_2 /= @range;
if ($debug_signi) {
print "\nMAP_1: $MAP_1\n";
print "MAP_2: $MAP_2\n\n";
}
$test_statistic[$iter] = $MAP_1 - $MAP_2;
last if $iter++ == $MAX_ITERATIONS;
print "." if $iter % 100 == 0;
}
if ($significance_testing_method eq 'randomization') {
print "\n\nIn randomization based p-value calculation:\n\n";
print "test-statistic values for different permutations: @test_statistic\n"
if $debug_signi;
# This count keeps track of how many of the test_statistic values are
# less than and greater than the value in $OBSERVED_t
my $count = 0;
foreach (@test_statistic) {
$count++ if $_ <= -1 * abs($OBSERVED_t);
$count++ if $_ > abs($OBSERVED_t);
}
my $p_value = $count / @test_statistic;
lib/Algorithm/VSM.pm view on Meta::CPAN
_case_sensitive => $args{case_sensitive} || 0,
_query_file => $args{query_file} || "",
_file_types => $args{file_types} || [],
_min_word_length => $args{min_word_length} || 4,
_want_stemming => $args{want_stemming} || 0,
_idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
_max_number_retrievals => $args{max_number_retrievals} || 30,
_lsa_svd_threshold => $args{lsa_svd_threshold} || 0.01,
_relevancy_threshold => exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
_relevancy_file => $args{relevancy_file} || "",
_debug => $args{debug} || 0,
_working_directory => cwd,
_vocab_hist_on_disk => {},
_vocab_hist => {},
_doc_hist_template => {},
_corpus_doc_vectors => {},
_normalized_doc_vecs => {},
_query_vector => {},
_stop_words => [],
_term_document_matrix => [],
_corpus_vocab_done => 0,
lib/Algorithm/VSM.pm view on Meta::CPAN
}
###################### Get corpus vocabulary and word counts #########################
# Scans the corpus directory, builds the vocabulary histogram for the whole
# corpus, optionally persists it to an SDBM database, and computes the
# idf(t) value for every vocabulary term.  Dies if no corpus directory was
# supplied to the constructor.
sub get_corpus_vocabulary_and_word_counts {
my $self = shift;
die "You must supply the name of the corpus directory to the constructor"
unless $self->{_corpus_directory};
print "Scanning the directory '$self->{_corpus_directory}' for\n" .
" model construction\n\n" if $self->{_debug};
# _scan_directory fills _vocab_hist (and related histograms) as it walks
# the corpus tree.
$self->_scan_directory( $self->{_corpus_directory} );
$self->_drop_stop_words() if $self->{_stop_words_file};
if ($self->{_debug}) {
foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
}
}
# When asked to save the model, wipe any stale DBM/Storable files and
# mirror the in-memory vocabulary histogram into a fresh SDBM database.
if ($self->{_save_model_on_disk}) {
unlink glob "$self->{_corpus_vocab_db}.*";
unlink glob "$self->{_doc_vectors_db}.*";
unlink glob "$self->{_normalized_doc_vecs_db}.*";
tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
$self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
or die "Can't create DBM files: $!";
foreach (keys %{$self->{_vocab_hist}}) {
$self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
}
untie %{$self->{_vocab_hist_on_disk}};
}
$self->{_corpus_vocab_done} = 1;
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
if $self->{_debug};
# Calculate idf(t):
# idf(t) = | (1 + log( N / (1 + df(t)) )) / log(10) |, where N is the
# total number of documents and df(t) the number of docs containing t.
foreach (keys %{$self->{_vocab_idf_hist}}) {
$self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
/
(1 + $self->{_vocab_idf_hist}->{$_})))
/ log(10) );
}
}
sub display_corpus_vocab {
lib/Algorithm/VSM.pm view on Meta::CPAN
my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
@clean_words = map {$_ =~ /$_regex/g} @brokenup;
@clean_words = $self->{_case_sensitive} ?
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
} else {
my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
}
$query = \@clean_words;
print "\nYour query words are: @$query\n" if $self->{_debug};
if ($self->{_idf_filter_option}) {
die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
unless scalar(keys %{$self->{_vocab_hist}})
&& scalar(keys %{$self->{_normalized_doc_vecs}});
} else {
die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
unless scalar(keys %{$self->{_vocab_hist}})
&& scalar(keys %{$self->{_corpus_doc_vectors}});
}
foreach ( keys %{$self->{_vocab_hist}} ) {
lib/Algorithm/VSM.pm view on Meta::CPAN
$self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
}
}
my @query_word_counts = values %{$self->{_query_vector}};
my $query_word_count_total = sum(\@query_word_counts);
die "\nYour query does not contain corpus words. Nothing retrieved.\n"
unless $query_word_count_total;
my %retrievals;
if ($self->{_idf_filter_option}) {
print "\n\nUsing idf filter option for retrieval:\n\n"
if $self->{_debug};
foreach (sort {$self->_doc_vec_comparator}
keys %{$self->{_normalized_doc_vecs}}) {
$retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
}
} else {
print "\n\nNOT using idf filter option for retrieval:\n\n"
if $self->{_debug};
foreach (sort {$self->_doc_vec_comparator}
keys %{$self->{_corpus_doc_vectors}}) {
$retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
}
}
if ($self->{_debug}) {
print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
print "$_ => $retrievals{$_}\n";
}
}
return \%retrievals;
}
######################### Upload a Previously Constructed Model #########################
# Restores a previously saved VSM model from disk: the corpus document
# vectors come from a Storable file and the vocabulary histogram from an
# SDBM database.  Dies if either file is missing or empty, or if the DBM
# database cannot be opened.
sub upload_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_doc_vectors_db};
    # Deserialize the document vectors exactly once.  (The original code
    # called retrieve() on the same file a second time at the end of this
    # routine, redundantly re-reading and re-deserializing it.)
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    # Attach the on-disk vocabulary histogram read-only.
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    # Copy the on-disk histogram into the in-memory vocabulary hash so the
    # rest of the module can work without the tied database.
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}
# Restores a previously saved normalized VSM model: the normalized document
# vectors come from a Storable file and the vocabulary histogram from an
# SDBM database.  Dies if either file is missing/empty or the DBM database
# cannot be opened.
sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    # Both model files must exist and be non-empty before we proceed.
    -s "$self->{_corpus_vocab_db}.pag" && -s $self->{_normalized_doc_vecs_db}
        or die "\nCannot find the database files for the VSM model";
    # Deserialize the normalized document vectors.
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    # Attach the on-disk vocabulary histogram read-only.
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} )
            for sort keys %{$self->{_vocab_hist_on_disk}};
    }
    # Mirror the on-disk histogram into the in-memory vocabulary hash.
    $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_}
        for keys %{$self->{_vocab_hist_on_disk}};
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}
############################## Display Retrieval Results ################################
sub display_retrievals {
my $self = shift;
my $retrievals = shift;
print "\n\nShowing the retrievals and the similarity scores:\n\n";
my $iter = 0;
lib/Algorithm/VSM.pm view on Meta::CPAN
my $term_frequency_vec;
foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
push @$term_frequency_vec,
$self->{_corpus_doc_vectors}->{$_}->{$word};
}
push @{$self->{_term_document_matrix}}, $term_frequency_vec;
}
}
my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
my ($U,$SIGMA,$V) = svd $A;
print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
my $index = return_index_of_last_value_above_threshold($SIGMA,
$self->{_lsa_svd_threshold});
my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
# When you measure the size of a matrix in PDL, the zeroth dimension
# is considered to be along the horizontal and the one-th dimension
# along the rows. This is opposite of how we want to look at
# matrices. For a matrix of size MxN, we mean M rows and N columns.
# With this 'rows x columns' convention for matrix size, if you had
# to check the size of, say, U matrix, you would call
# my @size = ( $U->getdim(1), $U->getdim(0) );
# print "\nsize of U: @size\n";
my $U_trunc = $U->slice("0:$index,:")->sever;
my $V_trunc = $V->slice("0:$index,0:$index")->sever;
$self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
PDL::Basic::transpose($U_trunc);
print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
if $self->{_debug};
my @sorted_doc_names = $self->{_idf_filter_option} ?
sort keys %{$self->{_normalized_doc_vecs}} :
sort keys %{$self->{_corpus_doc_vectors}};
my $i = 0;
foreach (@{$self->{_term_document_matrix}}) {
my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
PDL::Basic::transpose(pdl($_));
my $doc_name = $sorted_doc_names[$i++];
print "\n\nTruncated doc vec for $doc_name: " .
$truncated_doc_vec . "\n" if $self->{_debug};
$self->{_doc_vecs_trunc_lsa}->{$doc_name}
= $truncated_doc_vec;
}
chdir $self->{_working_directory};
}
# Retrieves corpus documents for a query using the previously constructed
# LSA model.  The query (arrayref of words) is cleaned, mapped into a term
# vector over the corpus vocabulary, projected into the truncated LSA space,
# and compared by dot product against every truncated document vector.
# Returns a hashref mapping document name => similarity score (> 0 only).
sub retrieve_with_lsa {
my $self = shift;
my $query = shift;
my @clean_words;
my $min = $self->{_min_word_length};
# Tokenize the query, either by splitting camel-cased/underscored tokens
# (using the file-scope $_regex) or by splitting on punctuation/whitespace.
if ($self->{_break_camelcased_and_underscored}) {
my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
@clean_words = map {$_ =~ /$_regex/g} @brokenup;
@clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
} else {
my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
}
$query = \@clean_words;
print "\nYour processed query words are: @$query\n" if $self->{_debug};
die "Your vocabulary histogram is empty"
unless scalar(keys %{$self->{_vocab_hist}});
die "You must first construct an LSA model"
unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
# Build the query term vector: zero for every vocabulary word, then count
# (lowercased) query words that occur in the vocabulary.
foreach ( keys %{$self->{_vocab_hist}} ) {
$self->{_query_vector}->{$_} = 0;
}
foreach (@$query) {
$self->{_query_vector}->{"\L$_"}++
if exists $self->{_vocab_hist}->{"\L$_"};
}
my @query_word_counts = values %{$self->{_query_vector}};
my $query_word_count_total = sum(\@query_word_counts);
die "Query does not contain corpus words. Nothing retrieved."
unless $query_word_count_total;
# Flatten the query vector in sorted-key order so its components line up
# with the columns of the term-document matrix.
my $query_vec;
foreach (sort keys %{$self->{_query_vector}}) {
push @$query_vec, $self->{_query_vector}->{$_};
}
print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
# Project the query into the truncated LSA space with the precomputed
# truncator (inverse truncated SIGMA times transposed truncated U).
my $truncated_query_vec = $self->{_lsa_vec_truncator} x
PDL::Basic::transpose(pdl($query_vec));
print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
if $self->{_debug};
my %retrievals;
# Similarity = dot product of the truncated query and each truncated doc
# vector; only strictly positive scores are kept.
foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
my $dot_product = PDL::Basic::transpose($truncated_query_vec)
x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
print "\n\nLSA: dot product of truncated query and\n" .
" truncated vec for doc $_ is " . $dot_product->sclr . "\n"
if $self->{_debug};
$retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
}
if ($self->{_debug}) {
print "\n\nShowing LSA retrievals and similarity scores:\n\n";
foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
print "$_ => $retrievals{$_}\n";
}
print "\n\n";
}
return \%retrievals;
}
sub _construct_doc_vector {
my $self = shift;
my $file = shift;
my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
foreach ( sort keys %{$self->{_doc_hist_template}} ) {
$document_vector{$_} = 0;
}
my $min = $self->{_min_word_length};
my $total_words_in_doc = 0;
unless (open IN, $file) {
print "Unable to open file $file in the corpus: $!\n"
if $self->{_debug};
return;
}
while (<IN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
next unless @clean_words;
@clean_words = grep $_,
map &simple_stemmer($_, $self->{_debug}), @clean_words
if $self->{_want_stemming};
$self->{_case_sensitive} ?
map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
map { $document_vector{"\L$_"}++ }
grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
}
close IN;
die "Something went wrong. Doc vector size unequal to vocab size"
unless $self->{_vocab_size} == scalar(keys %document_vector);
foreach (keys %document_vector) {
lib/Algorithm/VSM.pm view on Meta::CPAN
next if /^#/;
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
/^[ ]*(q[0-9]+):[ ]*(.*)/;
my $query_label = $1;
my $query = $2;
next unless $query;
$self->{_queries_for_relevancy}->{$query_label} = $query;
}
if ($self->{_debug}) {
foreach (sort keys %{$self->{_queries_for_relevancy}}) {
print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
}
}
$self->{_scan_dir_for_rels} = 1;
$self->_scan_directory($self->{_corpus_directory});
$self->{_scan_dir_for_rels} = 0;
chdir $self->{_working_directory};
open(OUT, ">$self->{_relevancy_file}")
or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
lib/Algorithm/VSM.pm view on Meta::CPAN
$_ =~ s/\r?\n?$//;
die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
/^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
my $query_label = $1;
my $relevancy_docs_string = $2;
next unless $relevancy_docs_string;
my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
my %relevancies = map {$_ => 1} @relevancy_docs;
$self->{_relevancy_estimates}->{$query_label} = \%relevancies;
}
if ($self->{_debug}) {
for (sort keys %{$self->{_relevancy_estimates}}) {
my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
print "$_ => @rels\n";
}
}
}
sub display_doc_relevancies {
my $self = shift;
die "You must first estimate or provide the doc relevancies"
lib/Algorithm/VSM.pm view on Meta::CPAN
my $self = shift;
my $file = shift;
open IN, $file;
my @all_text = <IN>;
@all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
my $all_text = join ' ', @all_text;
foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
my $count = 0;
my @query_words = grep $_,
split /\s+/, $self->{_queries_for_relevancy}->{$query};
print "Query words for $query: @query_words\n" if $self->{_debug};
foreach my $word (@query_words) {
my @matches = $all_text =~ /$word/gi;
print "Number of occurrences for word '$word' in file $file: " .
scalar(@matches) . "\n" if $self->{_debug};
$count += @matches if @matches;
}
print "\nRelevancy count for query $query and file $file: $count\n\n"
if $self->{_debug};
$self->{_relevancy_estimates}->{$query}->{$file} = $count
if $count >= $self->{_relevancy_threshold};
}
}
######################### Calculate Precision versus Recall ##########################
sub precision_and_recall_calculator {
my $self = shift;
my $retrieval_type = shift;
lib/Algorithm/VSM.pm view on Meta::CPAN
next if /^#/;
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
/^[ ]*(q[0-9]+):[ ]*(.*)/;
my $query_label = $1;
my $query = $2;
next unless $query;
$self->{_queries_for_relevancy}->{$query_label} = $query;
}
if ($self->{_debug}) {
print "\n\nDisplaying queries in the query file:\n\n";
foreach (sort keys %{$self->{_queries_for_relevancy}}) {
print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
}
}
}
foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
print "\n\n====================================== query: $query ========================================\n\n"
if $self->{_debug};
print "\n\n\nQuery $query:\n" if $self->{_debug};
my @query_words = grep $_,
split /\s+/, $self->{_queries_for_relevancy}->{$query};
croak "\n\nYou have not specified the retrieval type for " .
"precision-recall calculation. See code in 'examples'" .
"directory:" if !defined $retrieval_type;
my $retrievals;
eval {
if ($retrieval_type eq 'vsm') {
$retrievals = $self->retrieve_with_vsm( \@query_words );
} elsif ($retrieval_type eq 'lsa') {
lib/Algorithm/VSM.pm view on Meta::CPAN
"Will skip over this query for precision and\n" .
"recall calculations\n\n";
next;
}
my %ranked_retrievals;
my $i = 1;
foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
keys %$retrievals) {
$ranked_retrievals{$i++} = $_;
}
if ($self->{_debug}) {
print "\n\nDisplaying ranked retrievals for query $query:\n\n";
foreach (sort {$a <=> $b} keys %ranked_retrievals) {
print "$_ => $ranked_retrievals{$_}\n";
}
}
# At this time, ranking of relevant documents based on their
# relevancy counts serves no particular purpose since all we want
# for the calculation of Precision and Recall are the total
# number of relevant documents. However, I believe such a
# ranking will play an important role in the future.
lib/Algorithm/VSM.pm view on Meta::CPAN
my %ranked_relevancies;
$i = 1;
foreach my $file (sort {
$self->{_relevancy_estimates}->{$query}->{$b}
<=>
$self->{_relevancy_estimates}->{$query}->{$a}
}
keys %{$self->{_relevancy_estimates}->{$query}}) {
$ranked_relevancies{$i++} = $file;
}
if ($self->{_debug}) {
print "\n\nDisplaying ranked relevancies for query $query:\n\n";
foreach (sort {$a <=> $b} keys %ranked_relevancies) {
print "$_ => $ranked_relevancies{$_}\n";
}
}
my @relevant_set = values %ranked_relevancies;
warn "\n\nNo relevant docs found for query $query.\n" .
"Will skip over this query for precision and\n" .
"recall calculations\n\n" unless @relevant_set;
next unless @relevant_set;
print "\n\nRelevant set for query $query: @relevant_set\n\n"
if $self->{_debug};
# @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.
my @retrieved;
foreach (keys %ranked_retrievals) {
push @retrieved, $ranked_retrievals{$_};
}
print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
if $self->{_debug};
my @Precision_values = ();
my @Recall_values = ();
my $rank = 1;
while ($rank < @retrieved + 1) {
my $index = 1;
my @retrieved_at_rank = ();
while ($index <= $rank) {
push @retrieved_at_rank, $ranked_retrievals{$index};
$index++;
}
my $intersection =set_intersection(\@retrieved_at_rank,
\@relevant_set);
my $precision_at_rank = @retrieved_at_rank ?
(@$intersection / @retrieved_at_rank) : 0;
push @Precision_values, $precision_at_rank;
my $recall_at_rank = @$intersection / @relevant_set;
push @Recall_values, $recall_at_rank;
$rank++;
}
print "\n\nFor query $query, precision values: @Precision_values\n"
if $self->{_debug};
print "\nFor query $query, recall values: @Recall_values\n"
if $self->{_debug};
$self->{_precision_for_queries}->{$query} = \@Precision_values;
my $avg_precision;
$avg_precision += $_ for @Precision_values;
$self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
$self->{_recall_for_queries}->{$query} = \@Recall_values;
}
print "\n\n========= query by query processing for Precision vs. Recall calculations finished ========\n\n"
if $self->{_debug};
my @avg_precisions;
foreach (keys %{$self->{_avg_precision_for_queries}}) {
push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
}
$self->{_map} += $_ for @avg_precisions;
$self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
}
sub display_average_precision_for_queries_and_map {
my $self = shift;
lib/Algorithm/VSM.pm view on Meta::CPAN
case_sensitive
max_number_retrievals
query_file
relevancy_file
min_word_length
want_stemming
lsa_svd_threshold
relevancy_threshold
break_camelcased_and_underscored
save_model_on_disk
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
lib/Algorithm/VSM.pm view on Meta::CPAN
my $vec = shift;
my $result;
for my $item (@$vec) {
$result += $item;
}
return $result;
}
# A small heuristic stemmer: strips common English suffixes (plurals,
# -ing, -ed, -tions, ...) by applying a fixed sequence of rewrite rules.
# Takes the word and a debug flag; returns the (possibly) stemmed word.
sub simple_stemmer {
    my ($word, $debug) = @_;
    print "\nStemming the word: $word\n" if $debug;
    # Ordered rewrite rules: [pattern, tail].  On a match the word becomes
    # the captured stem ($1) followed by the tail.  The order is significant
    # and every rule is tried in turn, exactly as in the original chain of
    # substitutions.
    my @suffix_rules = (
        [ qr/(.*[a-z][^aeious])s$/i,  ''     ],
        [ qr/(.*[a-z]s)es$/i,         ''     ],
        [ qr/(.*[a-z][ck])es$/i,      'e'    ],
        [ qr/(.*[a-z]+)tions$/i,      'tion' ],
        [ qr/(.*[a-z]+)mming$/i,      'm'    ],
        [ qr/(.*[a-z]+[^rl])ing$/i,   ''     ],
        [ qr/(.*[a-z]+o[sn])ing$/i,   'e'    ],
        [ qr/(.*[a-z]+)tices$/i,      'tex'  ],
        [ qr/(.*[a-z]+)pes$/i,        'pe'   ],
        [ qr/(.*[a-z]+)sed$/i,        'se'   ],
        [ qr/(.*[a-z]+)ed$/i,         ''     ],
        [ qr/(.*[a-z]+)tation$/i,     't'    ],
    );
    foreach my $rule (@suffix_rules) {
        my ($pattern, $tail) = @$rule;
        $word =~ s/$pattern/$1$tail/;
    }
    print "Stemmed word: $word\n\n" if $debug;
    return $word;
}
# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
sub return_index_of_last_value_above_threshold {
my $pdl_obj = shift;
my $size = $pdl_obj->getdim(0);
my $threshold = shift;
my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;