Algorithm-VSM

 view release on metacpan or  search on metacpan

lib/Algorithm/VSM.pm  view on Meta::CPAN

package Algorithm::VSM;

#---------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::VSM is a Perl module for retrieving the documents from a software
# library that match a list of words in a query. The matching criterion used depends
# on whether you ask the module to construct a full-dimensionality VSM or a
# reduced-dimensionality LSA model for the library.
# ---------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;
use SDBM_File;
use PDL::Lite;
use PDL::MatrixOps;
use File::Basename;
use File::Spec::Functions qw(rel2abs);
use Fcntl;
use Storable;
use Cwd;

our $VERSION = '1.70';

# for camelcase splits (from perlmonks):
#
# One match of this pattern is a single "piece" of an identifier.  A piece is
# either
#   (a) a run of lowercase letters and/or digits, or
#   (b) one uppercase letter or digit, followed by EITHER more uppercase
#       letters/digits OR zero or more lowercase letters/digits, with the
#       requirement (the trailing lookahead) that the piece is immediately
#       followed by the end of the string or by another uppercase letter/digit.
# The lookahead belongs to alternative (b) only.
#
# Note on `(?=$|...)`: inside a pattern Perl does not interpolate the special
# variable $| , so `$|` here is the end-of-string anchor followed by the
# alternation bar, not the autoflush variable.
#
# NOTE(review): the code that applies this pattern is not in this part of the
# file; presumably it is used with a global match to collect the pieces ---
# confirm against the caller.
my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/; 

###################################   Constructor  #######################################

#  Constructor for creating a VSM or LSA model of a corpus.  The model instance
#  returned by the constructor can be used for retrieving documents from the corpus
#  in response to queries.
sub new {
    my ($class, %args) = @_;

    # Validate the keyword names first; the checker signals a problem by
    # returning 0, in which case construction is abandoned.
    my @supplied_keywords = keys %args;
    if (_check_for_illegal_params(@supplied_keywords) == 0) {
        croak "\nYou have used a wrong name for a keyword argument " .
              "--- perhaps a misspelling\n";
    }

    # Two defaulting policies are used for the user-settable options:
    #
    #   $value_or    -- any false value (undef, 0, "") is replaced by the default
    #   $supplied_or -- the default is used only when the keyword is absent, so
    #                   an explicit false value from the caller is honored
    my $value_or = sub {
        my ($keyword, $default) = @_;
        return $args{$keyword} || $default;
    };
    my $supplied_or = sub {
        my ($keyword, $default) = @_;
        return exists $args{$keyword} ? $args{$keyword} : $default;
    };

    my $self = {
        # ---- where the corpus lives and where the model is kept ----
        _corpus_directory       => $value_or->('corpus_directory', ""),
        _working_directory      => cwd(),
        _save_model_on_disk     => $value_or->('save_model_on_disk', 0),
        _corpus_vocab_db        => $value_or->('corpus_vocab_db', "corpus_vocab_db"),
        _doc_vectors_db         => $value_or->('doc_vectors_db', "doc_vectors_db"),
        _normalized_doc_vecs_db => $value_or->('normalized_doc_vecs_db',
                                               "normalized_doc_vecs_db"),

        # ---- auxiliary input files ----
        _stop_words_file        => $value_or->('stop_words_file', ""),
        _query_file             => $value_or->('query_file', ""),
        _relevancy_file         => $value_or->('relevancy_file', ""),

        # ---- text-processing options ----
        _file_types             => $value_or->('file_types', []),
        _case_sensitive         => $value_or->('case_sensitive', 0),
        _min_word_length        => $value_or->('min_word_length', 4),
        _want_stemming          => $value_or->('want_stemming', 0),
        _break_camelcased_and_underscored
                                => $supplied_or->('break_camelcased_and_underscored', 1),

        # ---- retrieval options ----
        _idf_filter_option      => $supplied_or->('use_idf_filter', 1),
        _max_number_retrievals  => $value_or->('max_number_retrievals', 30),
        _lsa_svd_threshold      => $value_or->('lsa_svd_threshold', 0.01),
        _relevancy_threshold    => $supplied_or->('relevancy_threshold', 1),
        _debug                  => $value_or->('debug', 0),

        # ---- internal state, empty until the model is built ----
        _vocab_hist_on_disk     => {},
        _vocab_hist             => {},
        _vocab_idf_hist         => {},
        _idf_t                  => {},
        _vocab_size             => undef,
        _corpus_vocab_done      => 0,
        _total_num_of_docs      => 0,
        _stop_words             => [],
        _doc_hist_template      => {},
        _corpus_doc_vectors     => {},
        _normalized_doc_vecs    => {},
        _term_document_matrix   => [],
        _doc_vecs_trunc_lsa     => {},
        _lsa_vec_truncator      => undef,
        _query_vector           => {},

        # ---- internal state for relevancy / precision-recall bookkeeping ----
        _scan_dir_for_rels          => 0,
        _queries_for_relevancy      => {},
        _relevancy_estimates        => {},
        _precision_for_queries      => {},
        _avg_precision_for_queries  => {},
        _recall_for_queries         => {},
        _map                        => undef,
    };

    return bless $self, $class;
}


######################    Get corpus vocabulary and word counts  #########################

sub get_corpus_vocabulary_and_word_counts {
    my $self = shift;
    die "You must supply the name of the corpus directory to the constructor"
        unless $self->{_corpus_directory};
    print "Scanning the directory '$self->{_corpus_directory}' for\n" .
        "  model construction\n\n" if $self->{_debug};
    $self->_scan_directory( $self->{_corpus_directory} );
    $self->_drop_stop_words() if $self->{_stop_words_file};
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }



( run in 2.007 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )