Algorithm-VSM

 view release on metacpan or  search on metacpan

lib/Algorithm/VSM.pm  view on Meta::CPAN

    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
               if $self->{_debug};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    untie %{$self->{_vocab_hist_on_disk}};
}

# Loads a previously stored VSM model (normalized document vectors plus the
# corpus vocabulary histogram) from its disk files into memory, so that
# retrieval can proceed without re-scanning the corpus.
sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    # Both database components must exist and be non-empty before loading.
    unless (-s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_normalized_doc_vecs_db}) {
        die "\nCannot find the database files for the VSM model";
    }
    # The normalized document vectors come back in one shot via Storable.
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    # The vocabulary histogram lives in an SDBM file; attach it read-only.
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
            or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        printf("%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_})
            for sort keys %{$self->{_vocab_hist_on_disk}};
    }
    # Copy the on-disk histogram into fast memory for the retrieval phase.
    my $disk_hist = $self->{_vocab_hist_on_disk};
    $self->{_vocab_hist}->{$_} = $disk_hist->{$_} for keys %$disk_hist;
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar keys %{$self->{_vocab_hist}};
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n" if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}

############################## Display Retrieval Results  ################################

# Prints the retrieved documents in descending order of their similarity
# scores, stopping after at most _max_number_retrievals entries.
#
#   $retrievals -- hashref mapping document name => similarity score
sub display_retrievals {
    my $self = shift;
    my $retrievals = shift;
    print "\n\nShowing the retrievals and the similarity scores:\n\n";
    my $iter = 0;
    foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
        print "$_   =>   $retrievals->{$_}\n"; 
        $iter++;
        # BUG FIX: the original tested ($iter > max) AFTER the increment,
        # which printed max+1 entries; '>=' caps the output at exactly
        # _max_number_retrievals documents.
        last if $iter >= $self->{_max_number_retrievals};
    }
    print "\n\n";
}

###############################    Directory Scanner      ################################

# Recursively walks $dir, dispatching every qualifying plain-text file to
# the vocabulary scanner, the document-vector builder, or the relevancy
# scanner depending on the current processing phase.  Restores the caller's
# working directory before returning.
sub _scan_directory {
    my $self = shift;
    my $dir = rel2abs( shift );
    my $current_dir = cwd;
    chdir $dir or die "Unable to change directory to $dir: $!";
    foreach ( glob "*" ) {                                            
        # Recurse into subdirectories, but never follow symbolic links.
        if ( -d and !(-l) ) {
            $self->_scan_directory( $_ );
            # The recursive call chdir'd away; come back to this level.
            chdir $dir                                                
                or die "Unable to change directory to $dir: $!";
        # The bare '_' filehandle below reuses the stat buffer populated by
        # the -d test above, avoiding extra stat() calls.
        } elsif (-r _ and 
                 -T _ and 
                 -M _ > 0.00001 and  # modification age must exceed 0.00001 days (~0.86 sec)
                !( -l $_ ) and 
                $self->ok_to_filetype($_) ) {
            # Phase dispatch: vocabulary collection happens on the first pass;
            # document vectors are built on a second pass once the corpus
            # vocabulary is complete.
            $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
            $self->_scan_file($_) unless $self->{_corpus_vocab_done};
            $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
        }
    }
    chdir $current_dir;
}

# Scans one corpus file: tokenizes each line, filters tokens through the
# minimum-word-length rule (with optional camelcase/underscore splitting,
# stemming, and case folding), then updates the corpus term-frequency
# histogram (_vocab_hist), the document-frequency histogram
# (_vocab_idf_hist), and the document count (_total_num_of_docs).
sub _scan_file {
    my $self = shift;
    my $file = shift;
    # BUG FIX: the original used an unchecked 2-arg bareword open; a failed
    # open silently produced an empty scan while still bumping the doc count.
    open my $IN, '<', $file or die "Unable to open file $file: $!";
    my $min = $self->{_min_word_length};
    my %uniques = ();                  # words seen in THIS document (for IDF)
    while (<$IN>) {
        next if /^[ ]*\r?\n?$/;        # skip blank lines
        $_ =~ s/\r?\n?$//;             # strip DOS or Unix line terminators
        my @clean_words;
        if ($self->{_break_camelcased_and_underscored}) {
            # Split on non-word chars/underscores, then break camel-cased
            # tokens via the package-level $_regex compiled elsewhere in
            # this module.
            my @brokenup = grep $_, split /\W|_|\s+/, $_;
            @clean_words = map {$_ =~ /$_regex/g} @brokenup;
            # Enforce the minimum word length; fold to lower case unless
            # the model is case sensitive.
            @clean_words = $self->{_case_sensitive} ?
                grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
                grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
        } else {
            # Simple tokenization on quotes, punctuation, and whitespace.
            my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
            @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        }
        next unless @clean_words;
        @clean_words = grep $_, map &simple_stemmer($_), @clean_words
            if $self->{_want_stemming};
        # Update the term-frequency and per-document histograms (for-loops
        # instead of the original's map-in-void-context).
        if ($self->{_case_sensitive}) {
            $self->{_vocab_hist}->{$_}++ for grep $_, @clean_words;
            $uniques{$_}++ for @clean_words;
        } else {
            $self->{_vocab_hist}->{"\L$_"}++ for grep $_, @clean_words;
            $uniques{"\L$_"}++ for @clean_words;
        }
    }
    close $IN or die "Unable to close file $file: $!";
    # Each distinct word counts once per document toward the IDF statistics.
    $self->{_vocab_idf_hist}->{$_}++ for keys %uniques;
    $self->{_total_num_of_docs}++;
}

# Returns 1 if $filename's extension (everything from the first dot onward)
# is one of the file types supplied via the constructor's file_types
# parameter, 0 otherwise.  Croaks when no file types were specified.
sub ok_to_filetype {
    my ($self, $filename) = @_;
    my ($base, $dir, $suffix) = fileparse($filename, '\..*');
    croak "You called this module without specifying the file types in the constructor"
        unless @{$self->{_file_types}} > 0;
    return contained_in($suffix, @{$self->{_file_types}}) ? 1 : 0;
}

############################## LSA Modeling and Retrieval ################################

sub construct_lsa_model {

lib/Algorithm/VSM.pm  view on Meta::CPAN

        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} 
                                                      keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }      
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_  =>   $ranked_retrievals{$_}\n";   
            }      
        }
        #   At this time, ranking of relevant documents based on their
        #   relevancy counts serves no particular purpose since all we want
        #   for the calculation of Precision and Recall are the total
        #   number of relevant documents.  However, I believe such a
        #   ranking will play an important role in the future.
        #   IMPORTANT:  The relevancy judgments are ranked only when
        #               estimated by the method estimate_doc_relevancies()
        #               of the VSM class.  When relevancies are supplied
        #               directly through a disk file, they all carry the
        #               same rank.
        my %ranked_relevancies;
        $i = 1;
        foreach my $file (sort {
                          $self->{_relevancy_estimates}->{$query}->{$b}
                          <=>
                          $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            $ranked_relevancies{$i++} = $file;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked relevancies for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_relevancies) {
                print "$_  =>   $ranked_relevancies{$_}\n";   
            }      
        }
        my @relevant_set = values %ranked_relevancies;
        warn "\n\nNo relevant docs found for query $query.\n" .
             "Will skip over this query for precision and\n" .
             "recall calculations\n\n" unless @relevant_set;
        next unless @relevant_set;    
        print "\n\nRelevant set for query $query:  @relevant_set\n\n"
            if $self->{_debug};
        # @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.  
        my @retrieved; 
        foreach (keys %ranked_retrievals) {
            push @retrieved, $ranked_retrievals{$_};
        }
        print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
            if $self->{_debug};
        my @Precision_values = ();
        my @Recall_values = ();
        my $rank = 1;
        while ($rank < @retrieved + 1) {
            my $index = 1;      
            my @retrieved_at_rank = ();
            while ($index <= $rank) {
                push @retrieved_at_rank, $ranked_retrievals{$index};
                $index++;
            }
            my $intersection =set_intersection(\@retrieved_at_rank,
                                               \@relevant_set);
            my $precision_at_rank = @retrieved_at_rank ? 
                                 (@$intersection / @retrieved_at_rank) : 0;
            push @Precision_values, $precision_at_rank;
            my $recall_at_rank = @$intersection / @relevant_set;
            push @Recall_values, $recall_at_rank;
            $rank++;
        }
        print "\n\nFor query $query, precision values: @Precision_values\n"
            if $self->{_debug};
        print "\nFor query $query, recall values: @Recall_values\n"
            if $self->{_debug};      
        $self->{_precision_for_queries}->{$query} = \@Precision_values;
        my $avg_precision;
        $avg_precision += $_ for @Precision_values;        
        $self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
        $self->{_recall_for_queries}->{$query} = \@Recall_values;
    }
    print "\n\n=========  query by query processing for Precision vs. Recall calculations finished  ========\n\n"  
                    if $self->{_debug};
    my @avg_precisions;
    foreach (keys %{$self->{_avg_precision_for_queries}}) {
        push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
    }
    $self->{_map} += $_ for @avg_precisions;
    $self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
}

# Prints the average retrieval precision for each query (sorted by the
# query label's integer suffix) followed by the overall MAP value.
# Requires precision_and_recall_calculator() to have been run first.
sub display_average_precision_for_queries_and_map {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function" 
        unless scalar(keys %{$self->{_avg_precision_for_queries}});
    print "\n\nDisplaying average precision for different queries:\n\n";
    my @sorted_queries =
        sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
        keys %{$self->{_avg_precision_for_queries}};
    for my $query (@sorted_queries) {
        printf "Query %s  =>   %.3f\n",
               $query, $self->{_avg_precision_for_queries}->{$query};
    }
    print "\n\nMAP value: $self->{_map}\n\n";
}

# Prints, for each query, the Precision@rank and Recall@rank sequences
# computed by precision_and_recall_calculator(); values are shown to three
# decimal places, one entry per rank.
sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function" 
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    my @sorted_queries =
        sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
        keys %{$self->{_avg_precision_for_queries}};
    for my $query (@sorted_queries) {
        print "\n\nQuery $query:\n";
        print "\n   (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @precision_vals = map { sprintf "%.3f", $_ }
                             @{$self->{_precision_for_queries}->{$query}};
        print "   Precision at rank  =>  @precision_vals\n";
        my @recall_vals = map { sprintf "%.3f", $_ }
                          @{$self->{_recall_for_queries}->{$query}};
        print "\n   Recall at rank   =>  @recall_vals\n";
    }
    print "\n\n";
}

# Returns a reference to an array of average-precision values ("%.3f"
# formatted strings), ordered by the integer suffix of the query labels.
# Intended for significance testing across retrieval algorithms.
sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function" 
        unless scalar(keys %{$self->{_avg_precision_for_queries}});
    my $avg_of = $self->{_avg_precision_for_queries};
    return [ map  { sprintf "%.3f", $avg_of->{$_} }
             sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
             keys %$avg_of ];
}

###################################  Utility Routines  ###################################

# Validates constructor parameter names.  Returns 1 when every name in
# @params is a recognized constructor option, 0 otherwise.
#
# BUG FIX: the original reset its match flag inside the inner comparison
# loop (a fragile flag-juggling pattern) and returned undef when @params
# was empty; an empty parameter list is vacuously legal, so we return 1.
sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    # Hash lookup instead of a nested linear scan.
    my %is_legal = map { $_ => 1 } @legal_params;
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    return 1;
}

lib/Algorithm/VSM.pm  view on Meta::CPAN

    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}

# Returns the Euclidean (L2) norm of the vector referenced by $vec.
sub vec_magnitude {
    my $vec = shift;
    my $sum_of_squares = 0;
    $sum_of_squares += $_ * $_ for @$vec;
    return sqrt $sum_of_squares;
}

# Returns the sum of the elements of the array referenced by $vec.
#
# BUG FIX: the accumulator is initialized to 0 so that an empty vector
# sums to 0; the original left it uninitialized and returned undef,
# which triggers warnings (and surprises) in numeric contexts.
sub sum {
    my $vec = shift;
    my $result = 0;
    $result += $_ for @$vec;
    return $result;
}

# A small rule-based suffix stripper (NOT a full Porter stemmer).  Applies
# a fixed cascade of substitutions to reduce plural/gerund/past-tense
# forms toward a common root (e.g. 'programs' and 'programming' both map
# to 'program').  The rules are order-dependent -- an earlier s/// may
# feed a later one -- so do not reorder them.  Pass a true second
# argument to trace each stemming step.
sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word:        $word\n" if $debug;
    # Drop a trailing plural 's' (but not after vowels/'s', e.g. keeps 'gas').
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;
    # '...ses' -> '...s'  (e.g. 'classes' -> 'class').
    $word =~ s/(.*[a-z]s)es$/$1/i;
    # '...ces'/'...kes' -> restore the 'e' (e.g. 'races' -> 'race').
    $word =~ s/(.*[a-z][ck])es$/$1e/i;
    # '...tions' -> '...tion'.
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;
    # '...mming' -> '...m'  (e.g. 'programming' -> 'program').
    $word =~ s/(.*[a-z]+)mming$/$1m/i;
    # Generic '...ing' removal, except after 'r' or 'l'.
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
    # '...osing'/'...oning' -> restore the trailing 'e'.
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
    # '...tices' -> '...tex'  (e.g. 'vertices' -> 'vertex').
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;
    # '...pes' -> '...pe'  (e.g. 'types' -> 'type').
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;
    # '...sed' -> '...se'  (e.g. 'used' -> 'use').
    $word =~ s/(.*[a-z]+)sed$/$1se/i;
    # Generic past-tense '...ed' removal.
    $word =~ s/(.*[a-z]+)ed$/$1/i;
    # '...tation' -> '...t'  (e.g. 'computation' -> 'comput').
    $word =~ s/(.*[a-z]+)tation$/$1t/i;
    print "Stemmed word:                           $word\n\n" if $debug;
    return $word;
}

# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
# Given a PDL vector sorted in descending order (e.g. singular values from
# an SVD), returns the index of the last element that is strictly greater
# than (first_element * $threshold).  Returns -1 when even the first
# element fails the test.
sub return_index_of_last_value_above_threshold {
    my ($pdl_obj, $threshold) = @_;
    my $size = $pdl_obj->getdim(0);
    # Cutoff is the largest (first) value scaled by the threshold fraction.
    my $cutoff = $pdl_obj->slice(0)->sclr * $threshold;
    my $idx = 0;
    $idx++ while $idx < $size && $pdl_obj->slice($idx)->sclr > $cutoff;
    return $idx - 1;
}

# Returns a reference to the intersection of the two array refs: the
# elements of @$set2 that also appear in @$set1.  Result order follows
# $set2; duplicates within $set2 are preserved.  Always returns an
# arrayref (possibly empty).
sub set_intersection {
    my ($set1, $set2) = @_;
    my %in_first = map { $_ => 1 } @$set1;
    return [ grep { $in_first{$_} } @$set2 ];
}

# Returns the trailing run of digits in $label (used to sort query labels
# like "q1", "q2", ... numerically).  Because \d* can match zero
# characters, a label with no digit suffix yields the empty string,
# which numeric comparisons treat as 0.
sub get_integer_suffix {
    my $label = shift;
    my ($digits) = $label =~ /(\d*)$/;
    return $digits;
}

1;

=pod

=head1 NAME

Algorithm::VSM --- A Perl module for retrieving files and documents from a software
library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
algorithms in response to search words and phrases.

=head1 SYNOPSIS

  # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:

        use Algorithm::VSM;

        my $corpus_dir = "corpus";
        my @query = qw/ program ListIterator add ArrayList args /;
        my $stop_words_file = "stop_words.txt";  
        my $vsm = Algorithm::VSM->new( 
                            break_camelcased_and_underscored  => 1, 
                            case_sensitive         => 0,
                            corpus_directory       => $corpus_dir,
                            file_types             => ['.txt', '.java'],
                            max_number_retrievals  => 10,
                            min_word_length        => 4,
                            stop_words_file        => $stop_words_file,
                            use_idf_filter         => 1,
                            want_stemming          => 1,
        );
        $vsm->get_corpus_vocabulary_and_word_counts();
        $vsm->display_corpus_vocab();
        $vsm->display_corpus_vocab_size();
        $vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
        $vsm->display_inverse_document_frequencies();
        $vsm->generate_document_vectors();
        $vsm->display_doc_vectors();
        $vsm->display_normalized_doc_vectors();
        my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
        $vsm->display_retrievals( $retrievals );

     The purpose of each constructor option and what is accomplished by the method
     calls should be obvious by their names.  If not, they are explained in greater
     detail elsewhere in this documentation page.  Note that the methods
     display_corpus_vocab() and display_doc_vectors() are there only for testing
     purposes with small corpora.  If you must use them for large libraries/corpora,
     you might wish to redirect the output to a file.

lib/Algorithm/VSM.pm  view on Meta::CPAN

                   break_camelcased_and_underscored  => 1,  
                   case_sensitive           => 0,           
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   min_word_length          => 4,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,           
        );
        $vsm->get_corpus_vocabulary_and_word_counts();
        $vsm->generate_document_vectors();
        # code for calculating pairwise similarities as shown in the
        # script calculate_similarity_matrix_for_all_docs.pl in the
        # examples directory.  This script makes calls to
        #
        #   $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);        
        #
        # for every pair of documents.

=head1 CHANGES

Version 1.70: All of the changes made in this version affect only that part of the
module that is used for calculating precision-vs.-recall curve for the estimation of
MAP (Mean Average Precision).  The new formulas that go into estimating MAP are
presented in the author's tutorial on significance testing.  Additionally, when
estimating the average retrieval precision for a query, this version explicitly
disregards all documents that have zero similarity with the query.

Version 1.62 removes the Perl version restriction on the module. This version also
fixes two bugs, one in the file scanner code and the other in the
precision-and-recall calculator.  The file scanner bug was related to the new
constructor parameter C<case_sensitive> that was introduced in Version 1.61.  And the
precision-and-recall calculator bug was triggered if a query consisted solely of
non-vocabulary words.

Version 1.61 improves the implementation of the directory scanner to make it more
platform independent.  Additionally, you are now required to specify in the
constructor call the file types to be considered for computing the database model.
If, say, you have a large software library and you want only Java and text files to
be scanned for creating the VSM (or the LSA) model, you must supply that information
to the module by setting the constructor parameter C<file_types> to the anonymous
list C<['.java', '.txt']>.  An additional constructor parameter introduced in this
version is C<case_sensitive>.  If you set it to 1, that will force the database model
and query matching to become case sensitive.

Version 1.60 reflects the fact that people are now more likely to use this module by
keeping the model constructed for a corpus in the fast memory (as opposed to storing
the models in disk-based hash tables) for its repeated invocation for different
queries.  As a result, the default value for the constructor option
C<save_model_on_disk> was changed from 1 to 0.  For those who still wish to store on
a disk the model that is constructed, the script
C<retrieve_with_VSM_and_also_create_disk_based_model.pl> shows how you can do that.
Other changes in 1.60 include a slight reorganization of the scripts in the
C<examples> directory.  Most scripts now do not by default store their models in
disk-based hash tables.  This reorganization is reflected in the description of the
C<examples> directory in this documentation.  The basic logic of constructing VSM and
LSA models and how these are used for retrievals remains unchanged.

Version 1.50 incorporates a couple of new features: (1) You now have the option to
split camel-cased and underscored words for constructing your vocabulary set; and (2)
Storing the VSM and LSA models in database files on the disk is now optional.  The
second feature, in particular, should prove useful to those who are using this module
for large collections of documents.

Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
You can get a better sense of the overall vocabulary being used by the module for
file retrieval by examining the contents of a dump file whose name is supplied as an
argument to C<write_corpus_vocab_to_file()>.

Version 1.41 downshifts the required version of the PDL module. Also cleaned up are
the dependencies between this module and the submodules of PDL.

Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
documents in the corpus. The elements of such a matrix express pairwise similarities
between the documents.  The pairwise similarities are based on the dot product of two
document vectors divided by the product of the vector magnitudes.  The 'examples'
directory contains two scripts to illustrate how such matrices can be calculated by
the user.  The similarity matrix is output as a CSV file.

Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
document file. What that means is that the words that appear in most of the documents
get reduced weighting since such words are non-discriminatory with respect to the
retrieval of the documents. A typical formula that is used to calculate the IDF
weight for a word is the logarithm of the ratio of the total number of documents to
the number of documents in which the word appears.  So if a word were to appear in
all the documents, its IDF multiplier would be zero in the vector representation of a
document.  If so desired, you can turn off the IDF weighting of the words by
explicitly setting the constructor parameter C<use_idf_filter> to zero.

Version 1.2 includes a code correction and some general code and documentation
cleanup.

With Version 1.1, you can access the retrieval precision results so that you can
compare two different retrieval algorithms (VSM or LSA with different choices for
some of the constructor parameters) with significance testing. (Version 1.0 merely
sent those results to standard output, typically your terminal window.)  In Version
1.1, the new script B<significance_testing.pl> in the 'examples' directory
illustrates significance testing with Randomization and with Student's Paired t-Test.

=head1 DESCRIPTION

B<Algorithm::VSM> is a I<perl5> module for constructing a Vector Space Model (VSM) or
a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
to as a corpus, and then retrieving the documents in response to search words in a
query.

VSM and LSA models have been around for a long time in the Information Retrieval (IR)
community.  More recently such models have been shown to be effective in retrieving
files/documents from software libraries. For an account of this research that was
presented by Shivani Rao and the author of this module at the 2011 Mining Software
Repositories conference, see L<http://portal.acm.org/citation.cfm?id=1985451>.

VSM modeling consists of: (1) Extracting the vocabulary used in a corpus.  (2)
Stemming the words so extracted and eliminating the designated stop words from the
vocabulary.  Stemming means that closely related words like 'programming' and
'programs' are reduced to the common root word 'program' and the stop words are the
non-discriminating words that can be expected to exist in virtually all the
documents. (3) Constructing document vectors for the individual files in the corpus
--- the document vectors taken together constitute what is usually referred to as a
'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to
factor out the effect of document size and, if desired, multiplying the term

lib/Algorithm/VSM.pm  view on Meta::CPAN

C<relevancy_file>.


=item B<get_all_document_names():>

If you want to get hold of all the filenames in the corpus in your own script, you
can call

    my @docs = @{$vsm->get_all_document_names()};

The array on the left will contain an alphabetized list of the files.


=item B<generate_document_vectors():>

This is a necessary step after the vocabulary used by a corpus is constructed. (Of
course, if you will be doing document retrieval through a disk-stored VSM or LSA
model, then you do not need to call this method.)  You construct document vectors
through the following call:

    $vsm->generate_document_vectors();


=item B<get_corpus_vocabulary_and_word_counts():>

After you have constructed a new instance of the C<Algorithm::VSM> class, you must
now scan the corpus documents for constructing the corpus vocabulary. This you do by:

    $vsm->get_corpus_vocabulary_and_word_counts();

The only time you do NOT need to call this method is when you are using a previously
constructed disk-stored VSM model for retrieval.


=item B<get_query_sorted_average_precision_for_queries():>

If you want to run significance tests on the retrieval accuracies you obtain on a
given corpus and with different algorithms (VSM or LSA with different choices for the
constructor parameters), your own script would need access to the average precision
data for a set of queries. You can get hold of this data by calling

    $vsm->get_query_sorted_average_precision_for_queries();

The script C<significance_testing.pl> in the 'examples' directory shows how you can
use this method for significance testing.


=item B<pairwise_similarity_for_docs():>

=item B<pairwise_similarity_for_normalized_docs():>

If you would like to compare in your own script any two documents in the corpus, you
can call

    my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");
or
    my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");

Both these calls return a number that is the dot product of the two document vectors
normalized by the product of their magnitudes.  The first call uses the regular
document vectors and the second the normalized document vectors.


=item B<precision_and_recall_calculator():>

After you have created or obtained the relevancy judgments for your test queries, you
can make the following call to calculate C<Precision@rank> and C<Recall@rank>:

    $vsm->precision_and_recall_calculator('vsm');
or 
    $vsm->precision_and_recall_calculator('lsa');

depending on whether you are testing VSM-based retrieval or LSA-based retrieval.

=item B<retrieve_with_lsa():>

After you have built an LSA model through the call to C<construct_lsa_model()>, you
can retrieve the document names most similar to the query by:

    my $retrievals = $vsm->retrieve_with_lsa( \@query );

Subsequently, you can display the retrievals by calling the
C<display_retrievals($retrieval)> method described previously.


=item B<retrieve_with_vsm():>

After you have constructed a VSM model, you call this method for document retrieval
for a given query C<@query>.  The call syntax is:

    my $retrievals = $vsm->retrieve_with_vsm( \@query );

The argument, C<@query>, is simply a list of words that you wish to use for
retrieval. The method returns a hash whose keys are the document names and whose
values the similarity distance between the document and the query.  As is commonly
the case with VSM, this module uses the cosine similarity distance when comparing a
document vector with the query vector.


=item B<upload_document_relevancies_from_file():>

When human-supplied relevancies are available, you can upload them into the program
by calling

    $vsm->upload_document_relevancies_from_file();

These relevance judgments will be read from a file that is named with the
C<relevancy_file> constructor parameter.


=item B<upload_normalized_vsm_model_from_disk():>

When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and
C<generate_document_vectors()>, that automatically deposits the VSM model in the
database files named with the constructor parameters C<corpus_vocab_db>,
C<doc_vectors_db> and C<normalized_doc_vecs_db>.  Subsequently, you can carry out
retrieval by directly using this disk-based VSM model for speedier performance.  In
order to do so, you must upload the disk-based model by

    $vsm->upload_normalized_vsm_model_from_disk();



( run in 0.509 second using v1.01-cache-2.11-cpan-39bf76dae61 )