Algorithm-VSM
view release on metacpan or search on metacpan
lib/Algorithm/VSM.pm view on Meta::CPAN
package Algorithm::VSM;
#---------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::VSM is a Perl module for retrieving the documents from a software
# library that match a list of words in a query. The matching criterion used depends
# on whether you ask the module to construct a full-dimensionality VSM or a
# reduced-dimensionality LSA model for the library.
# ---------------------------------------------------------------------------
#use 5.10.0;
use strict;
use warnings;
use Carp;
use SDBM_File;
use PDL::Lite;
use PDL::MatrixOps;
use File::Basename;
use File::Spec::Functions qw(rel2abs);
use Fcntl;
use Storable;
use Cwd;
our $VERSION = '1.70';
# for camelcase splits (from perlmonks):
#
# Precompiled pattern that matches ONE segment of a camelCased identifier.
# It has two alternatives:
#   1. a run of one or more lowercase letters and/or digits; or
#   2. a single uppercase letter or digit followed by EITHER one or more
#      uppercase letters/digits OR zero or more lowercase letters/digits,
#      accepted only when the segment is immediately followed by the end of
#      the string or by another uppercase letter/digit (zero-width lookahead).
# NOTE(review): presumably applied with a global match to split identifiers
# into words when break_camelcased_and_underscored is enabled --- the call
# site is not visible in this part of the file; confirm before changing.
my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
################################### Constructor #######################################
# Constructor for creating a VSM or LSA model of a corpus. The model instance
# returned by the constructor can be used for retrieving documents from the corpus
# in response to queries.
sub new {
    # Build a new model object from keyword arguments.
    #
    # Arguments : the class name followed by a list of key => value options.
    # Returns   : a hash reference blessed into $class.
    # Croaks    : when _check_for_illegal_params() returns 0 for the supplied
    #             option names.
    my ($class, %args) = @_;

    if (_check_for_illegal_params(keys %args) == 0) {
        croak "\nYou have used a wrong name for a keyword argument " .
              "--- perhaps a misspelling\n";
    }

    # Options for which ANY false value supplied by the caller (undef, 0, "")
    # is replaced by the default shown here.  The object stores each one
    # under the option name prefixed with an underscore.  The table is
    # rebuilt on every call, so the [] default is a fresh array per object.
    my %default_when_false = (
        corpus_directory       => "",
        save_model_on_disk     => 0,
        corpus_vocab_db        => "corpus_vocab_db",
        doc_vectors_db         => "doc_vectors_db",
        normalized_doc_vecs_db => "normalized_doc_vecs_db",
        stop_words_file        => "",
        case_sensitive         => 0,
        query_file             => "",
        file_types             => [],
        min_word_length        => 4,
        want_stemming          => 0,
        max_number_retrievals  => 30,
        lsa_svd_threshold      => 0.01,
        relevancy_file         => "",
        debug                  => 0,
    );

    # Options for which the mere presence of the key decides: a caller-supplied
    # value is kept even when it is false; the default applies only when the
    # key is absent.
    my %default_when_absent = (
        break_camelcased_and_underscored => 1,
        relevancy_threshold              => 1,
    );

    # Internal bookkeeping slots that are not settable through the constructor.
    my %model = (
        _working_directory         => cwd(),
        _vocab_hist_on_disk        => {},
        _vocab_hist                => {},
        _doc_hist_template         => {},
        _corpus_doc_vectors        => {},
        _normalized_doc_vecs       => {},
        _query_vector              => {},
        _stop_words                => [],
        _term_document_matrix      => [],
        _corpus_vocab_done         => 0,
        _scan_dir_for_rels         => 0,
        _vocab_size                => undef,
        _doc_vecs_trunc_lsa        => {},
        _lsa_vec_truncator         => undef,
        _queries_for_relevancy     => {},
        _relevancy_estimates       => {},
        _precision_for_queries     => {},
        _avg_precision_for_queries => {},
        _recall_for_queries        => {},
        _map                       => undef,
        _vocab_idf_hist            => {},
        _idf_t                     => {},
        _total_num_of_docs         => 0,
    );

    foreach my $option (keys %default_when_false) {
        $model{"_$option"} = $args{$option} || $default_when_false{$option};
    }

    foreach my $option (keys %default_when_absent) {
        $model{"_$option"} = exists $args{$option}
                           ? $args{$option}
                           : $default_when_absent{$option};
    }

    # This option is stored under a different name than the keyword the
    # caller uses, so it cannot go through the tables above.
    $model{_idf_filter_option} = exists $args{use_idf_filter}
                               ? $args{use_idf_filter}
                               : 1;

    return bless \%model, $class;
}
###################### Get corpus vocabulary and word counts #########################
sub get_corpus_vocabulary_and_word_counts {
my $self = shift;
die "You must supply the name of the corpus directory to the constructor"
unless $self->{_corpus_directory};
print "Scanning the directory '$self->{_corpus_directory}' for\n" .
" model construction\n\n" if $self->{_debug};
$self->_scan_directory( $self->{_corpus_directory} );
$self->_drop_stop_words() if $self->{_stop_words_file};
if ($self->{_debug}) {
foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
}
( run in 2.007 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )