Algorithm-VSM
view release on metacpan or search on metacpan
examples/retrieve_with_LSA.pl view on Meta::CPAN
#!/usr/bin/perl -w
## retrieve_with_LSA.pl
## This is a demonstration of retrieval using the LSA model
use strict;
use Algorithm::VSM;
my $corpus_dir = "corpus";
my @query = qw/ string getAllChars throw IOException distinct TreeMap histogram map /;
my $stop_words_file = "stop_words.txt"; # This file will typically include the
# keywords of the programming
# language(s) used in the software.
my $lsa = Algorithm::VSM->new(
break_camelcased_and_underscored => 1, # default: 1
case_sensitive => 0, # default: 0
corpus_directory => $corpus_dir,
file_types => ['.txt', '.java'],
lsa_svd_threshold => 0.01,# Used for rejecting singular
# values that are smaller than
# this threshold fraction of
# the largest singular value.
max_number_retrievals => 10,
min_word_length => 4,
stop_words_file => $stop_words_file,
use_idf_filter => 1,
want_stemming => 1, # Default: 0
);
$lsa->get_corpus_vocabulary_and_word_counts();
# Uncomment the following statement if you would like to see the corpus
# vocabulary:
#$lsa->display_corpus_vocab();
# Uncomment the following statement if you would like to see the corpus
# vocabulary size:
$lsa->display_corpus_vocab_size();
# Uncomment the following statement if you would like to dump the corpus
# vocabulary in a file that you supply as an argument in the following call:
$lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
# Uncomment the following statement if you would like to see the inverse
# document frequencies:
#$lsa->display_inverse_document_frequencies();
$lsa->generate_document_vectors();
# Uncomment the following if you would like to see the doc vectors for
# each of the documents in the corpus:
#$lsa->display_doc_vectors();
# Uncomment the folloiwng statement if you would like to the individual
# normalized document vectors:
#$lsa->display_normalized_doc_vectors();
$lsa->construct_lsa_model();
my $retrievals = $lsa->retrieve_with_lsa( \@query );
$lsa->display_retrievals( $retrievals );
( run in 0.502 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )