Algorithm-VSM

 view release on metacpan or  search on metacpan

examples/retrieve_with_disk_based_LSA.pl  view on Meta::CPAN

#!/usr/bin/perl -w

### retrieve_with_disk_based_LSA.pl


##  This scripts shows how you can carry out LSA-based retrieval using the
##  disk-based database files created by first running the script:

##      retrieve_with_VSM_and_also_create_disk_based_model.pl

##  You must therefore run the above-named script before executing the current
##  script. 


use strict;
use Algorithm::VSM;

print "\nIMPORTANT:  We assume that you have previously called\n\n" .
      "    retrieve_with_VSM_and_also_create_disk_based_model.pl \n\n" .
      "on the same corpus with the following constructor options:\n\n" .
      "           use_idf_filter  => 1,                 \n" .
      "           save_model_on_disk  => 1,           \n\n" .

      "The call to the above script generates the disk-based hashtables\n" .
      "needed by the current script\n";


my @query = qw/ string getAllChars throw IOException distinct TreeMap histogram map /;

#     The three databases mentioned in the next three statements are
#     created by calling the script
#     retrieve_with_VSM_and_also_create_disk_based_model.pl.  The first of
#     the databases stores the corpus vocabulary and term frequencies for
#     the vocabulary words.  The second database stores the term frequency
#     vectors for the individual documents in the corpus. The third
#     database stores the normalized document vectors.  As to what is meant
#     by document normalization, see the script retrieve_with_VSM.pl

my $corpus_vocab_db = "corpus_vocab_db";
my $doc_vectors_db  = "doc_vectors_db";
my $normalized_doc_vecs_db = "normalized_doc_vecs_db";

my $lsa = Algorithm::VSM->new( 
                   corpus_vocab_db          => $corpus_vocab_db,
                   doc_vectors_db           => $doc_vectors_db,
                   normalized_doc_vecs_db   => $normalized_doc_vecs_db,
                   max_number_retrievals    => 10,
          );

$lsa->upload_normalized_vsm_model_from_disk();

#   Uncomment the following if you would like to see the corpus vocabulary:
#$lsa->display_corpus_vocab();

#   Uncomment the following if you would like to see the doc vectors for
#   each of the documents in the corpus:
#$lsa->display_doc_vectors();


$lsa->construct_lsa_model();

my $retrievals = $lsa->retrieve_with_lsa( \@query );

$lsa->display_retrievals( $retrievals );



( run in 0.521 second using v1.01-cache-2.11-cpan-39bf76dae61 )