Algorithm-VSM
view release on metacpan or search on metacpan
examples/calculate_precision_and_recall_for_LSA.pl view on Meta::CPAN
#!/usr/bin/perl -w
### calculate_precision_and_recall_for_LSA.pl
## This is a self-contained script for precision-and-recall calculatins with
## LSA. Therefore, it is NOT necessary that you first create the disk-based
## hash tables by calling retrieve_with_VSM_and_also_create_disk_based_model.pl
## See Item 8 of the README in the `examples' directory for further information.
use strict;
use Algorithm::VSM;
my $corpus_dir = "corpus"; # This is the directory containing
# the corpus
my $stop_words_file = "stop_words.txt"; # Will typically include the
# keywords of the programming
# language(s) used in the software.
my $query_file = "test_queries.txt"; # This file contains the queries
# to be used for precision vs.
# recall analysis. Its format
# must be as shown in test_queries.txt
my $relevancy_file = "relevancy.txt"; # The generated relevancies will
# be stored in this file.
my $lsa = Algorithm::VSM->new(
break_camelcased_and_underscored => 1, # default: 0
case_sensitive => 0, # default: 0
corpus_directory => $corpus_dir,
file_types => ['.txt', '.java'],
lsa_svd_threshold => 0.05, # Used for rejecting singular
# values that are smaller than
# this threshold fraction of
# the largest singular value.
min_word_length => 4,
query_file => $query_file,
relevancy_file => $relevancy_file, # Relevancy judgments
# are deposited in
# this file.
relevancy_threshold => 5, # Used when estimating relevancies
# with the method
# estimate_doc_relevancies(). A
# doc must have at least this
# number of query words to be
# considered relevant.
stop_words_file => $stop_words_file,
want_stemming => 1, # default: 0
);
$lsa->get_corpus_vocabulary_and_word_counts();
$lsa->generate_document_vectors();
# Uncomment the following statement if you want to see the corpus
# vocabulary:
#$lsa->display_corpus_vocab();
# Uncomment the following statement if you want to see the individual
# document vectors:
#$lsa->display_doc_vectors();
$lsa->construct_lsa_model();
# The argument below is the file that contains the queries to be used
# for precision-recall analysis. The format of this file must be
# according to what is shown in the file test_queries.txt in this
# directory:
$lsa->estimate_doc_relevancies();
# Uncomment the following statement if you wish to see the list of all
# the documents relevant to each of the queries:
#$lsa->display_doc_relevancies();
$lsa->precision_and_recall_calculator('lsa');
$lsa->display_precision_vs_recall_for_queries();
$lsa->display_average_precision_for_queries_and_map();
( run in 0.495 second using v1.01-cache-2.11-cpan-140bd7fdf52 )