Algorithm-VSM
view release on metacpan or search on metacpan
lib/Algorithm/VSM.pm view on Meta::CPAN
my $term_frequency_vec;
foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
push @$term_frequency_vec,
$self->{_corpus_doc_vectors}->{$_}->{$word};
}
push @{$self->{_term_document_matrix}}, $term_frequency_vec;
}
}
my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
my ($U,$SIGMA,$V) = svd $A;
print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
my $index = return_index_of_last_value_above_threshold($SIGMA,
$self->{_lsa_svd_threshold});
my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
# When you measure the size of a matrix in PDL, the zeroth dimension
# is considered to be along the horizontal and the one-th dimension
# along the rows. This is opposite of how we want to look at
# matrices. For a matrix of size MxN, we mean M rows and N columns.
# With this 'rows x columns' convention for matrix size, if you had
# to check the size of, say, U matrix, you would call
# my @size = ( $U->getdim(1), $U->getdim(0) );
# print "\nsize of U: @size\n";
my $U_trunc = $U->slice("0:$index,:")->sever;
my $V_trunc = $V->slice("0:$index,0:$index")->sever;
$self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
PDL::Basic::transpose($U_trunc);
print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
if $self->{_debug};
my @sorted_doc_names = $self->{_idf_filter_option} ?
sort keys %{$self->{_normalized_doc_vecs}} :
sort keys %{$self->{_corpus_doc_vectors}};
my $i = 0;
foreach (@{$self->{_term_document_matrix}}) {
my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
PDL::Basic::transpose(pdl($_));
my $doc_name = $sorted_doc_names[$i++];
print "\n\nTruncated doc vec for $doc_name: " .
$truncated_doc_vec . "\n" if $self->{_debug};
$self->{_doc_vecs_trunc_lsa}->{$doc_name}
= $truncated_doc_vec;
}
chdir $self->{_working_directory};
}
sub retrieve_with_lsa {
my $self = shift;
my $query = shift;
my @clean_words;
my $min = $self->{_min_word_length};
if ($self->{_break_camelcased_and_underscored}) {
my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
@clean_words = map {$_ =~ /$_regex/g} @brokenup;
@clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
} else {
my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
}
$query = \@clean_words;
print "\nYour processed query words are: @$query\n" if $self->{_debug};
die "Your vocabulary histogram is empty"
unless scalar(keys %{$self->{_vocab_hist}});
die "You must first construct an LSA model"
unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
foreach ( keys %{$self->{_vocab_hist}} ) {
$self->{_query_vector}->{$_} = 0;
}
foreach (@$query) {
$self->{_query_vector}->{"\L$_"}++
if exists $self->{_vocab_hist}->{"\L$_"};
}
my @query_word_counts = values %{$self->{_query_vector}};
my $query_word_count_total = sum(\@query_word_counts);
die "Query does not contain corpus words. Nothing retrieved."
unless $query_word_count_total;
my $query_vec;
foreach (sort keys %{$self->{_query_vector}}) {
push @$query_vec, $self->{_query_vector}->{$_};
}
print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
my $truncated_query_vec = $self->{_lsa_vec_truncator} x
PDL::Basic::transpose(pdl($query_vec));
print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
if $self->{_debug};
my %retrievals;
foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
my $dot_product = PDL::Basic::transpose($truncated_query_vec)
x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
print "\n\nLSA: dot product of truncated query and\n" .
" truncated vec for doc $_ is " . $dot_product->sclr . "\n"
if $self->{_debug};
$retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
}
if ($self->{_debug}) {
print "\n\nShowing LSA retrievals and similarity scores:\n\n";
foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
print "$_ => $retrievals{$_}\n";
}
print "\n\n";
}
return \%retrievals;
}
sub _construct_doc_vector {
my $self = shift;
my $file = shift;
my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
foreach ( sort keys %{$self->{_doc_hist_template}} ) {
$document_vector{$_} = 0;
}
my $min = $self->{_min_word_length};
my $total_words_in_doc = 0;
unless (open IN, $file) {
print "Unable to open file $file in the corpus: $!\n"
if $self->{_debug};
return;
}
while (<IN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
lib/Algorithm/VSM.pm view on Meta::CPAN
# same rank.
my %ranked_relevancies;
$i = 1;
foreach my $file (sort {
$self->{_relevancy_estimates}->{$query}->{$b}
<=>
$self->{_relevancy_estimates}->{$query}->{$a}
}
keys %{$self->{_relevancy_estimates}->{$query}}) {
$ranked_relevancies{$i++} = $file;
}
if ($self->{_debug}) {
print "\n\nDisplaying ranked relevancies for query $query:\n\n";
foreach (sort {$a <=> $b} keys %ranked_relevancies) {
print "$_ => $ranked_relevancies{$_}\n";
}
}
my @relevant_set = values %ranked_relevancies;
warn "\n\nNo relevant docs found for query $query.\n" .
"Will skip over this query for precision and\n" .
"recall calculations\n\n" unless @relevant_set;
next unless @relevant_set;
print "\n\nRelevant set for query $query: @relevant_set\n\n"
if $self->{_debug};
# @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.
my @retrieved;
foreach (keys %ranked_retrievals) {
push @retrieved, $ranked_retrievals{$_};
}
print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
if $self->{_debug};
my @Precision_values = ();
my @Recall_values = ();
my $rank = 1;
while ($rank < @retrieved + 1) {
my $index = 1;
my @retrieved_at_rank = ();
while ($index <= $rank) {
push @retrieved_at_rank, $ranked_retrievals{$index};
$index++;
}
my $intersection =set_intersection(\@retrieved_at_rank,
\@relevant_set);
my $precision_at_rank = @retrieved_at_rank ?
(@$intersection / @retrieved_at_rank) : 0;
push @Precision_values, $precision_at_rank;
my $recall_at_rank = @$intersection / @relevant_set;
push @Recall_values, $recall_at_rank;
$rank++;
}
print "\n\nFor query $query, precision values: @Precision_values\n"
if $self->{_debug};
print "\nFor query $query, recall values: @Recall_values\n"
if $self->{_debug};
$self->{_precision_for_queries}->{$query} = \@Precision_values;
my $avg_precision;
$avg_precision += $_ for @Precision_values;
$self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
$self->{_recall_for_queries}->{$query} = \@Recall_values;
}
print "\n\n========= query by query processing for Precision vs. Recall calculations finished ========\n\n"
if $self->{_debug};
my @avg_precisions;
foreach (keys %{$self->{_avg_precision_for_queries}}) {
push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
}
$self->{_map} += $_ for @avg_precisions;
$self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
}
sub display_average_precision_for_queries_and_map {
my $self = shift;
die "You must first invoke precision_and_recall_calculator function"
unless scalar(keys %{$self->{_avg_precision_for_queries}});
print "\n\nDisplaying average precision for different queries:\n\n";
foreach my $query (sort
{get_integer_suffix($a) <=> get_integer_suffix($b)}
keys %{$self->{_avg_precision_for_queries}}) {
my $output = sprintf "Query %s => %.3f",
$query, $self->{_avg_precision_for_queries}->{$query};
print "$output\n";
}
print "\n\nMAP value: $self->{_map}\n\n";
}
sub display_precision_vs_recall_for_queries {
my $self = shift;
die "You must first invoke precision_and_recall_calculator function"
unless scalar(keys %{$self->{_precision_for_queries}});
print "\n\nDisplaying precision and recall values for different queries:\n\n";
foreach my $query (sort
{get_integer_suffix($a) <=> get_integer_suffix($b)}
keys %{$self->{_avg_precision_for_queries}}) {
print "\n\nQuery $query:\n";
print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
my @precision_vals = @{$self->{_precision_for_queries}->{$query}};
@precision_vals = map {sprintf "%.3f", $_} @precision_vals;
print " Precision at rank => @precision_vals\n";
my @recall_vals = @{$self->{_recall_for_queries}->{$query}};
@recall_vals = map {sprintf "%.3f", $_} @recall_vals;
print "\n Recall at rank => @recall_vals\n";
}
print "\n\n";
}
sub get_query_sorted_average_precision_for_queries {
my $self = shift;
die "You must first invoke precision_and_recall_calculator function"
unless scalar(keys %{$self->{_avg_precision_for_queries}});
my @average_precisions_for_queries = ();
foreach my $query (sort
{get_integer_suffix($a) <=> get_integer_suffix($b)}
keys %{$self->{_avg_precision_for_queries}}) {
my $output = sprintf "%.3f", $self->{_avg_precision_for_queries}->{$query};
push @average_precisions_for_queries, $output;
}
return \@average_precisions_for_queries;
}
################################### Utility Routines ###################################
lib/Algorithm/VSM.pm view on Meta::CPAN
Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
document file. What that means is that the words that appear in most of the documents
get reduced weighting since such words are non-discriminatory with respect to the
retrieval of the documents. A typical formula that is used to calculate the IDF
weight for a word is the logarithm of the ratio of the total number of documents to
the number of documents in which the word appears. So if a word were to appear in
all the documents, its IDF multiplier would be zero in the vector representation of a
document. If so desired, you can turn off the IDF weighting of the words by
explicitly setting the constructor parameter C<use_idf_filter> to zero.
Version 1.2 includes a code correction and some general code and documentation
cleanup.
With Version 1.1, you can access the retrieval precision results so that you can
compare two different retrieval algorithms (VSM or LSA with different choices for
some of the constructor parameters) with significance testing. (Version 1.0 merely
sent those results to standard output, typically your terminal window.) In Version
1.1, the new script B<significance_testing.pl> in the 'examples' directory
illustrates significance testing with Randomization and with Student's Paired t-Test.
=head1 DESCRIPTION
B<Algorithm::VSM> is a I<perl5> module for constructing a Vector Space Model (VSM) or
a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
to as a corpus, and then retrieving the documents in response to search words in a
query.
VSM and LSA models have been around for a long time in the Information Retrieval (IR)
community. More recently such models have been shown to be effective in retrieving
files/documents from software libraries. For an account of this research that was
presented by Shivani Rao and the author of this module at the 2011 Mining Software
Repositories conference, see L<http://portal.acm.org/citation.cfm?id=1985451>.
VSM modeling consists of: (1) Extracting the vocabulary used in a corpus. (2)
Stemming the words so extracted and eliminating the designated stop words from the
vocabulary. Stemming means that closely related words like 'programming' and
'programs' are reduced to the common root word 'program' and the stop words are the
non-discriminating words that can be expected to exist in virtually all the
documents. (3) Constructing document vectors for the individual files in the corpus
--- the document vectors taken together constitute what is usually referred to as a
'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to
factor out the effect of document size and, if desired, multiplying the term
frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
the weight of the words that appear in a large number of documents. (5) Constructing
a query vector for the search query after the query is subject to the same stemming
and stop-word elimination rules that were applied to the corpus. And, lastly, (6)
Using a similarity metric to return the set of documents that are most similar to the
query vector. The commonly used similarity metric is one based on the cosine
distance between two vectors. Also note that all the vectors mentioned here are of
the same size, the size of the vocabulary. An element of a vector is the frequency
of occurrence of the word corresponding to that position in the vector.
LSA modeling is a small variation on VSM modeling. Now you take VSM modeling one
step further by subjecting the term-frequency matrix for the corpus to singular value
decomposition (SVD). By retaining only a subset of the singular values (usually the
N largest for some value of N), you can construct reduced-dimensionality vectors for
the documents and the queries. In VSM, as mentioned above, the size of the document
and the query vectors is equal to the size of the vocabulary. For large corpora,
this size may involve tens of thousands of words --- this can slow down the VSM
modeling and retrieval process. So you are very likely to get faster performance
with retrieval based on LSA modeling, especially if you store the model once
constructed in a database file on the disk and carry out retrievals using the
disk-based model.
=head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?
This module has only been tested for software retrieval. For more general text
retrieval, you would need to replace the simple stemmer used in the module by one
based on, say, Porter's Stemming Algorithm. You would also need to vastly expand the
list of stop words appropriate to the text corpora of interest to you. As previously
mentioned, the stop words are the commonly occurring words that do not carry much
discriminatory power from the standpoint of distinguishing between the documents.
See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
formatted.
=head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?
It is not uncommon for large software libraries to consist of tens of thousands of
documents that include source-code files, documentation files, README files,
configuration files, etc. The bug-localization work presented recently by Shivani
Rao and this author at the 2011 Mining Software Repository conference (MSR11) was
based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
size of 7553 unique words. (Here is a link to this work:
L<http://portal.acm.org/citation.cfm?id=1985451>. Also note that the iBUGS dataset
was originally put together by V. Dallmeier and T. Zimmermann for the evaluation of
automated bug detection and localization tools.) If C<V> is the size of the
vocabulary and C<M> the number of the documents in the corpus, the size of each
vector will be C<V> and size of the term-frequency matrix for the entire corpus will
be C<V>xC<M>. So if you were to duplicate the bug localization experiments in
L<http://portal.acm.org/citation.cfm?id=1985451> you would be dealing with vectors of
size 7553 and a term-frequency matrix of size 7553x6546. Extrapolating these numbers
to really large libraries/corpora, we are obviously talking about very large matrices
for SVD decomposition. For large libraries/corpora, it would be best to store away
the model in a disk file and to base all subsequent retrievals on the disk-stored
models. The 'examples' directory contains scripts that carry out retrievals on the
basis of disk-based models. Further speedup in retrieval can be achieved by using
LSA to create reduced-dimensionality representations for the documents and by basing
retrievals on the stored versions of such reduced-dimensionality representations.
=head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS
The performance of a retrieval algorithm is typically measured by two properties:
C<Precision at rank> and C<Recall at rank>. As mentioned in my tutorial
L<https://engineering.purdue.edu/kak/Tutorials/SignificanceTesting.pdf>, at a given
rank C<r>, C<Precision> is the ratio of the number of retrieved documents that are
relevant to the total number of retrieved documents up to that rank. And, along the
same lines, C<Recall> at a given rank C<r> is the ratio of the number of retrieved
documents that are relevant to the total number of relevant documents. The Average
Precision associated with a query is the average of all the Precision-at-rank values
for all the documents relevant to that query. When query specific Average Precision
is averaged over all the queries, you get Mean Average Precision (MAP) as a
single-number characterizer of the retrieval power of an algorithm for a given
corpus. For an oracle, the value of MAP should be 1.0. On the other hand, for
purely random retrieval from a corpus, the value of MAP will be inversely
proportional to the size of the corpus. (See the discussion in
L<https://engineering.purdue.edu/kak/Tutorials/SignificanceTesting.pdf> for further
explanation on these retrieval precision evaluators.)
lib/Algorithm/VSM.pm view on Meta::CPAN
will be stored after it is subject to stemming and the elimination of stop words.
Once a disk-based VSM model is created and stored away in the file named by this
parameter and the parameter to be described next, it can subsequently be used
directly for speedier retrieval.
=item I<doc_vectors_db:>
The database named by B<doc_vectors_db> stores the document vector representation for
each document in the corpus. Each document vector has the same size as the
corpus-wide vocabulary; each element of such a vector is the number of occurrences of
the word that corresponds to that position in the vocabulary vector.
=item I<file_types:>
This parameter tells the module what types of files in the corpus directory you want
scanned for creating the database model. The value supplied for this parameter is an
anonymous list of the file suffixes for the file types. For example, if you wanted
only Java and text files to be scanned, you will set this parameter to C<['.java',
'.txt']>. The module throws an exception if this parameter is left unspecified.
(This constructor parameter was introduced in Version 1.61.)
=item I<lsa_svd_threshold:>
The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
smaller than this threshold fraction of the largest singular value. This plays a
critical role in creating reduced-dimensionality document vectors in LSA modeling of
a corpus.
=item I<max_number_retrievals:>
The constructor parameter B<max_number_retrievals> stands for what it means.
=item I<min_word_length:>
The parameter B<min_word_length> sets the minimum number of characters in a
word in order for it to be included in the corpus vocabulary.
=item I<normalized_doc_vecs_db:>
The database named by B<normalized_doc_vecs_db> stores the normalized document
vectors. Normalization consists of factoring out the size of the documents by
dividing the term frequency for each word in a document by the number of words in the
document, and then multiplying the result by the idf (Inverse Document Frequency)
value for the word.
=item I<query_file:>
The parameter B<query_file> points to a file that contains the queries to be used for
calculating retrieval performance with C<Precision> and C<Recall> numbers. The format
of the query file must be as shown in the sample file C<test_queries.txt> in the
'examples' directory.
=item I<relevancy_file:>
This option names the disk file for storing the relevancy judgments.
=item I<relevancy_threshold:>
The constructor parameter B<relevancy_threshold> is used for automatic determination
of document relevancies to queries on the basis of the number of occurrences of query
words in a document. You can exercise control over the process of determining
relevancy of a document to a query by giving a suitable value to the constructor
parameter B<relevancy_threshold>. A document is considered relevant to a query only
when the document contains at least B<relevancy_threshold> number of query words.
=item I<save_model_on_disk:>
The constructor parameter B<save_model_on_disk> will cause the basic
information about the VSM and the LSA models to be stored on the disk.
Subsequently, any retrievals can be carried out from the disk-based model.
=item I<stop_words_file:>
The parameter B<stop_words_file> is for naming the file that contains the stop words
that you do not wish to include in the corpus vocabulary. The format of this file
must be as shown in the sample file C<stop_words.txt> in the 'examples' directory.
=item I<use_idf_filter:>
The constructor parameter B<use_idf_filter> is set by default. If you want
to turn off the normalization of the document vectors, including turning
off the weighting of the term frequencies of the words by their idf values,
you must set this parameter explicitly to 0.
=item I<want_stemming:>
The boolean parameter B<want_stemming> determines whether or not the words extracted
from the documents would be subject to stemming. As mentioned elsewhere, stemming
means that related words like 'programming' and 'programs' would both be reduced to
the root word 'program'.
=back
=begin html
<br>
=end html
=item B<construct_lsa_model():>
You call this subroutine for constructing an LSA model for your corpus
after you have extracted the corpus vocabulary and constructed document
vectors:
$vsm->construct_lsa_model();
The SVD decomposition that is carried out in LSA model construction uses the
constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
to retain for the LSA model. A singular is retained only if it is larger than the
C<lsa_svd_threshold> fraction of the largest singular value.
=item B<display_average_precision_for_queries_and_map():>
The Average Precision for a query is the average of the Precision-at-rank values
associated with each of the corpus documents relevant to the query. The mean of the
Average Precision values for all the queries is the Mean Average Precision (MAP).
The C<Average Precision> values for the queries and the overall C<MAP> can be printed
out by calling
( run in 1.647 second using v1.01-cache-2.11-cpan-d7f47b0818f )