Algorithm-VSM

 view release on metacpan or  search on metacpan

lib/Algorithm/VSM.pm  view on Meta::CPAN


#---------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::VSM is a Perl module for retrieving the documents from a software
# library that match a list of words in a query. The matching criterion used depends
# on whether you ask the module to construct a full-dimensionality VSM or a
# reduced-dimensionality LSA model for the library.
# ---------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;
use SDBM_File;
use PDL::Lite;
use PDL::MatrixOps;
use File::Basename;
use File::Spec::Functions qw(rel2abs);
use Fcntl;
use Storable;
use Cwd;

our $VERSION = '1.70';

# for camelcase splits (from perlmonks):
my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/; 

###################################   Constructor  #######################################

#  Constructor for creating a VSM or LSA model of a corpus.  The model instance
#  returned by the constructor can be used for retrieving documents from the corpus
#  in response to queries.
sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    # Abort on any constructor keyword that is not recognized by
    # _check_for_illegal_params() -- typically a misspelled option name.
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if _check_for_illegal_params(@params) == 0;
    # NOTE(review): options defaulted with '||' treat an explicit
    # user-supplied 0 or "" as "use the default"; only the 'exists'-guarded
    # options (whose default is true) honor an explicit false value.
    bless {
        # ----- user-settable options -----
        _corpus_directory           =>  $args{corpus_directory}   || "",
        _save_model_on_disk         =>  $args{save_model_on_disk} || 0,
        # default 1: CamelCased and under_scored tokens are split into parts
        _break_camelcased_and_underscored  => exists $args{break_camelcased_and_underscored} ?
                                              $args{break_camelcased_and_underscored} : 1,
        _corpus_vocab_db            =>  $args{corpus_vocab_db} || "corpus_vocab_db",
        _doc_vectors_db             =>  $args{doc_vectors_db} || "doc_vectors_db",
        _normalized_doc_vecs_db     =>  $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
        _stop_words_file            =>  $args{stop_words_file} || "",
        _case_sensitive             =>  $args{case_sensitive} || 0,
        _query_file                 =>  $args{query_file} || "",
        _file_types                 =>  $args{file_types} || [],
        _min_word_length            =>  $args{min_word_length} || 4,
        _want_stemming              =>  $args{want_stemming} || 0,
        # default 1: retrieval uses idf-normalized document vectors
        _idf_filter_option          =>  exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
        _max_number_retrievals      =>  $args{max_number_retrievals} || 30,
        _lsa_svd_threshold          =>  $args{lsa_svd_threshold} || 0.01,
        _relevancy_threshold        =>  exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
        _relevancy_file             =>  $args{relevancy_file} || "",
        _debug                      =>  $args{debug} || 0,
        # ----- internal state (not user-settable) -----
        _working_directory          =>  cwd,
        _vocab_hist_on_disk         =>  {},     # tied SDBM image of the vocabulary
        _vocab_hist                 =>  {},     # word => occurrence count over the corpus
        _doc_hist_template          =>  {},     # zeroed per-document vector template
        _corpus_doc_vectors         =>  {},     # doc name => term-frequency vector
        _normalized_doc_vecs        =>  {},     # doc name => idf-normalized vector
        _query_vector               =>  {},     # query word counts over the vocabulary
        _stop_words                 =>  [],     # words dropped from the vocabulary
        _term_document_matrix       =>  [],     # rows feed the LSA SVD
        _corpus_vocab_done          =>  0,      # flips scan passes in _scan_directory()
        _scan_dir_for_rels          =>  0,      # enables the relevancy-count pass
        _vocab_size                 =>  undef,
        _doc_vecs_trunc_lsa         =>  {},     # doc name => LSA-truncated vector
        _lsa_vec_truncator          =>  undef,  # maps full vectors into LSA space
        _queries_for_relevancy      =>  {},
        _relevancy_estimates        =>  {},
        _precision_for_queries      =>  {},
        _avg_precision_for_queries  =>  {},
        _recall_for_queries         =>  {},
        _map                        =>  undef,  # presumably mean average precision -- confirm
        _vocab_idf_hist             =>  {},     # word => number of docs containing it
        _idf_t                      =>  {},     # word => idf(t) value
        _total_num_of_docs          =>  0,
    }, $class;
}


######################    Get corpus vocabulary and word counts  #########################

#  Scans the corpus directory, builds the vocabulary histogram and the
#  per-word document frequencies, optionally persists the vocabulary to an
#  SDBM database, and finally computes idf(t) for every vocabulary word.
sub get_corpus_vocabulary_and_word_counts {
    my $self = shift;
    die "You must supply the name of the corpus directory to the constructor"
        unless $self->{_corpus_directory};
    print "Scanning the directory '$self->{_corpus_directory}' for\n" .
        "  model construction\n\n" if $self->{_debug};
    # The recursive scan fills _vocab_hist, _vocab_idf_hist and
    # _total_num_of_docs (this is the vocabulary pass: _corpus_vocab_done
    # is still 0 at this point).
    $self->_scan_directory( $self->{_corpus_directory} );
    $self->_drop_stop_words() if $self->{_stop_words_file};
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    if ($self->{_save_model_on_disk}) {
        # Remove any stale model files before writing the new model.
        unlink glob "$self->{_corpus_vocab_db}.*";   
        unlink glob "$self->{_doc_vectors_db}.*";   
        unlink glob "$self->{_normalized_doc_vecs_db}.*";   
        # Copy the in-memory vocabulary into the tied SDBM hash, then untie
        # to flush it to disk.
        tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',  
                 $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
                or die "Can't create DBM files: $!";       
        foreach (keys %{$self->{_vocab_hist}}) {
            $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
        }
        untie %{$self->{_vocab_hist_on_disk}};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
            if $self->{_debug};
    # Calculate idf(t): base-10 (log(x)/log(10)) with add-one smoothing in
    # the denominator; abs() guards against a negative value when a word
    # appears in (almost) every document.
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
                                      /
                                      (1 + $self->{_vocab_idf_hist}->{$_}))) 
                                      / log(10) ); 
    }
}

#  Prints every word in the corpus vocabulary together with its occurrence
#  count, one per line, in sorted word order.  Dies if the vocabulary has
#  not been built yet.
sub display_corpus_vocab {
    my $self = shift;
    my $vocab = $self->{_vocab_hist};
    die "corpus vocabulary not yet constructed"
        unless keys %$vocab;
    print "\n\nDisplaying corpus vocabulary:\n\n";
    for my $term (sort keys %$vocab) {
        printf "%30s     %d\n", $term, $vocab->{$term};
    }
}

#  Prints the number of distinct words in the corpus vocabulary.  Dies if
#  the vocabulary has not been built yet.
sub display_corpus_vocab_size {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    print "\nSize of the corpus vocabulary: "
          . scalar(keys %{$self->{_vocab_hist}}) . "\n\n";
}

#  Writes the corpus vocabulary (word and occurrence count, one pair per
#  line, sorted by word) to the named file.  Dies if the vocabulary has not
#  been built or if the file cannot be written.
sub write_corpus_vocab_to_file {
    my $self = shift;
    my $file = shift;
    die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
    # FIX: 3-arg open with a lexical filehandle replaces the original's
    # bareword 2-arg open (which is unsafe if $file starts with '>', '|', etc.).
    open my $out_fh, '>', $file
       or die "unable to open for output a file with name `$file': $!";
    foreach my $word (sort keys %{$self->{_vocab_hist}}) {
        printf {$out_fh} "%30s     %d\n", $word, $self->{_vocab_hist}->{$word};
    }
    # FIX: check close on a write handle -- buffered write errors surface here.
    close $out_fh
       or die "unable to close the file `$file': $!";
}

#  Prints two tables: the raw document frequencies d(t) for every vocabulary
#  word, and the derived idf(t) values.  Warns when the idf filter option is
#  off, since in that case these values are not used for retrieval.
sub display_inverse_document_frequencies {
    my $self = shift;
    my $idf_hist = $self->{_vocab_idf_hist};
    die "corpus vocabulary not yet constructed"
        unless keys %$idf_hist;
    print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
        unless $self->{_idf_filter_option};
    print "\n\nDisplaying inverse document frequencies:\n";
    printf "%30s     %d\n", $_, $idf_hist->{$_} for sort keys %$idf_hist;
    print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
    printf "%30s     %f\n", $_, $self->{_idf_t}->{$_} for sort keys %{$self->{_idf_t}};
}

lib/Algorithm/VSM.pm  view on Meta::CPAN

    my $doc2 = shift;
    my @all_files = keys %{$self->{_corpus_doc_vectors}};
    croak "The file $doc1 does not exist in the corpus:  " unless contained_in($doc1, @all_files);
    croak "The file $doc2 does not exist in the corpus:  " unless contained_in($doc2, @all_files);
    my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
    my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
    my @vec1 = ();
    my @vec2 = ();
    foreach my $word (sort keys %$vec_hash_ref1) {
        push @vec1, $vec_hash_ref1->{$word};
        push @vec2, $vec_hash_ref2->{$word};
    }
    my $vec_mag1 = vec_magnitude(\@vec1);
    my $vec_mag2 = vec_magnitude(\@vec2);
    my $product = vec_scalar_product(\@vec1, \@vec2);
    $product /= $vec_mag1 * $vec_mag2;
    return $product;
}

#  Returns the cosine similarity between the idf-normalized vectors of two
#  corpus documents.  Croaks if either document is not in the normalized
#  model.
sub pairwise_similarity_for_normalized_docs {
    my $self = shift;
    my $doc1 = shift;
    my $doc2 = shift;
    # BUG FIX: the original validated $doc1/$doc2 against the keys of
    # _corpus_doc_vectors but then read the vectors from
    # _normalized_doc_vecs.  Validate against the hash actually used so a
    # missing normalized vector fails loudly instead of dereferencing undef.
    my @all_files = keys %{$self->{_normalized_doc_vecs}};
    croak "The file $doc1 does not exist in the corpus:  " unless contained_in($doc1, @all_files);
    croak "The file $doc2 does not exist in the corpus:  " unless contained_in($doc2, @all_files);
    my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
    my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
    my @vec1 = ();
    my @vec2 = ();
    # Walk the words in sorted order so the two vectors are aligned
    # component-by-component.
    foreach my $word (sort keys %$vec_hash_ref1) {
        push @vec1, $vec_hash_ref1->{$word};
        push @vec2, $vec_hash_ref2->{$word};
    }
    # Cosine similarity: dot product over the product of the magnitudes.
    my $vec_mag1 = vec_magnitude(\@vec1);
    my $vec_mag2 = vec_magnitude(\@vec2);
    my $product = vec_scalar_product(\@vec1, \@vec2);
    $product /= $vec_mag1 * $vec_mag2;
    return $product;
}

###############################  Retrieve with VSM Model  ################################

#  Retrieves corpus documents for a query (an array ref of words).  The query
#  is tokenized exactly like the corpus was, turned into a word-count vector
#  over the vocabulary, and each document with a positive cosine similarity
#  to it is returned in a hash ref mapping document name => similarity score.
#  Dies if the model (vocabulary plus document vectors) has not been built.
sub retrieve_with_vsm {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    # Tokenize the query the same way the corpus was tokenized so query
    # words can actually hit the vocabulary.
    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = $self->{_case_sensitive} ? 
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour query words are: @$query\n" if $self->{_debug};
    if ($self->{_idf_filter_option}) {
        die "\nYou need to first generate normalized document vectors before you can call  retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}}) 
                  && scalar(keys %{$self->{_normalized_doc_vecs}});
    } else {
        die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}}) 
                  && scalar(keys %{$self->{_corpus_doc_vectors}});
    }
    # Build the query vector over the full vocabulary: zero every slot,
    # then count occurrences of each query word found in the vocabulary.
    foreach ( keys %{$self->{_vocab_hist}} ) {        
        $self->{_query_vector}->{$_} = 0;    
    }
    foreach (@$query) {
        if ($self->{_case_sensitive}) {
            $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
        } else {
            $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
        }
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = sum(\@query_word_counts);
    die "\nYour query does not contain corpus words. Nothing retrieved.\n"
        unless $query_word_count_total;
    if ($self->{_debug}) {
        print $self->{_idf_filter_option}
            ? "\n\nUsing idf filter option for retrieval:\n\n"
            : "\n\nNOT using idf filter option for retrieval:\n\n";
    }
    # PERFORMANCE FIX: the original called _similarity_to_query() twice per
    # document (once in the condition, once in the assignment).  Compute it
    # once per document instead.
    my $doc_vecs = $self->{_idf_filter_option}
                       ? $self->{_normalized_doc_vecs}
                       : $self->{_corpus_doc_vectors};
    my %retrievals;
    foreach my $doc (sort {$self->_doc_vec_comparator} keys %$doc_vecs) {
        my $similarity = $self->_similarity_to_query($doc);
        $retrievals{$doc} = $similarity if $similarity > 0;
    }
    if ($self->{_debug}) {
        print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_   =>   $retrievals{$_}\n";
        }
    }
    return \%retrievals;
}

######################### Upload a Previously Constructed Model  #########################

#  Restores a previously saved VSM model: deserializes the corpus document
#  vectors with Storable and copies the SDBM-tied vocabulary into the
#  in-memory vocabulary histogram.  Dies if either database file is missing
#  or empty.
sub upload_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag" 
            && -s $self->{_doc_vectors_db};
    # FIX: the original deserialized the doc-vectors file twice (once here
    # and once again just before the untie); one retrieve() suffices.
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File', 
                      $self->{_corpus_vocab_db}, O_RDONLY, 0640
            or die "Can't open DBM file: $!";       
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    # Copy the tied on-disk vocabulary into the in-memory histogram.
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
               if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}

#  Restores a previously saved normalized VSM model: deserializes the
#  idf-normalized document vectors with Storable and copies the SDBM-tied
#  vocabulary into the in-memory vocabulary histogram.  Dies if either
#  database file is missing or empty.
sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_normalized_doc_vecs_db};
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
                      $self->{_corpus_vocab_db}, O_RDONLY, 0640
            or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        printf("%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_})
            for sort keys %{$self->{_vocab_hist_on_disk}};
    }
    # Copy the tied on-disk vocabulary into the in-memory histogram.
    while (my ($word, $count) = each %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$word} = $count;
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
               if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}

############################## Display Retrieval Results  ################################

#  Prints the retrieved documents and their similarity scores in descending
#  score order, stopping after _max_number_retrievals entries.
sub display_retrievals {
    my $self = shift;
    my $retrievals = shift;
    print "\n\nShowing the retrievals and the similarity scores:\n\n";
    my $iter = 0;
    foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
        print "$_   =>   $retrievals->{$_}\n"; 
        $iter++;
        # OFF-BY-ONE FIX: the original tested '$iter > max' and so printed
        # max+1 entries; '>=' caps the output at exactly max entries.
        last if $iter >= $self->{_max_number_retrievals};
    }   
    print "\n\n";
}

###############################    Directory Scanner      ################################

#  Recursively walks the given directory tree.  The same walk serves three
#  different passes -- vocabulary scan, doc-vector construction, and
#  relevancy counting -- selected by the object-state flags tested in the
#  loop body.  Uses chdir so that 'glob "*"' always lists the directory
#  being visited; the caller's working directory is restored before return.
sub _scan_directory {
    my $self = shift;
    my $dir = rel2abs( shift );
    my $current_dir = cwd;
    chdir $dir or die "Unable to change directory to $dir: $!";
    foreach ( glob "*" ) {                                            
        # Recurse into real subdirectories; symlinked dirs are skipped to
        # avoid cycles.
        if ( -d and !(-l) ) {
            $self->_scan_directory( $_ );
            # The recursive call chdir'd away; come back before continuing
            # the glob loop.
            chdir $dir                                                
                or die "Unable to change directory to $dir: $!";
        # The '_' filehandle below reuses the stat results of the preceding
        # file test instead of re-stat'ing the file.
        } elsif (-r _ and 
                 -T _ and 
                 -M _ > 0.00001 and  # modification age is at least 1 sec
                !( -l $_ ) and 
                $self->ok_to_filetype($_) ) {
            # Which pass runs is controlled by object state set elsewhere:
            $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
            $self->_scan_file($_) unless $self->{_corpus_vocab_done};
            $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
        }
    }
    chdir $current_dir;
}

sub _scan_file {
    my $self = shift;
    my $file = shift;
    open IN, $file;
    my $min = $self->{_min_word_length};
    my %uniques = ();
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @clean_words;
        if ($self->{_break_camelcased_and_underscored}) {
            my @brokenup = grep $_, split /\W|_|\s+/, $_;
            @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = $self->{_case_sensitive} ? 
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
        } else {

lib/Algorithm/VSM.pm  view on Meta::CPAN

        @clean_words = grep $_, map &simple_stemmer($_), @clean_words
               if $self->{_want_stemming};
        $self->{_case_sensitive} ?
            map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
            map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
        if ($self->{_case_sensitive}) {
            for (@clean_words) { $uniques{$_}++ }
        } else {
           for (@clean_words) { $uniques{"\L$_"}++ }
        }
    }
    close( IN );
    map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
    $self->{_total_num_of_docs}++;
}

#  Returns 1 when the file's suffix (everything from the first dot on, as
#  extracted by fileparse) is one of the file types given to the
#  constructor, 0 otherwise.  Croaks if no file types were specified.
sub ok_to_filetype {
    my ($self, $filename) = @_;
    my ($base, $dir, $suffix) = fileparse($filename, '\..*');
    croak "You called this module without specifying the file types in the constructor"
        unless @{$self->{_file_types}} > 0;
    return contained_in($suffix, @{$self->{_file_types}}) ? 1 : 0;
}

############################## LSA Modeling and Retrieval ################################

#  Builds the LSA model: assembles the term-document matrix from the
#  (normalized or raw) document vectors, computes its SVD with PDL,
#  truncates the singular values according to _lsa_svd_threshold, and stores
#  a reduced-dimensionality vector for every document in _doc_vecs_trunc_lsa.
sub construct_lsa_model {
    my $self = shift;
    if ($self->{_idf_filter_option}) {
        # Lazily reload the normalized doc vectors from disk if they are
        # not already in memory.
        if (!$self->{_normalized_doc_vecs} and 
                            -s $self->{_normalized_doc_vecs_db}) { 
            $self->{_normalized_doc_vecs} = 
                             retrieve($self->{_normalized_doc_vecs_db});
        }
        # Each document contributes one row, with components in sorted-word
        # order so all rows are aligned.
        foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys 
                      %{$self->{_normalized_doc_vecs}->{$_}}){
                push @$term_frequency_vec,   
                    $self->{_normalized_doc_vecs}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        } 
    } else {
        # Same construction, but from the raw term-frequency vectors.
        if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) { 
            $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
        }
        foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
                push @$term_frequency_vec,   
                        $self->{_corpus_doc_vectors}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    }
    # Transpose so that documents become columns of A before the SVD.
    my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
    my ($U,$SIGMA,$V) = svd $A;
    print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
    print "size of svd SIGMA:  ", $SIGMA->dims, "\n" if $self->{_debug};
    # Truncation point chosen by the helper using _lsa_svd_threshold (see
    # return_index_of_last_value_above_threshold for the exact criterion).
    my $index = return_index_of_last_value_above_threshold($SIGMA, 
                                          $self->{_lsa_svd_threshold});
    my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
    print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
    # When you measure the size of a matrix in PDL, the zeroth dimension
    # is considered to be along the horizontal and the one-th dimension
    # along the rows.  This is opposite of how we want to look at
    # matrices.  For a matrix of size MxN, we mean M rows and N columns.
    # With this 'rows x columns' convention for matrix size, if you had
    # to check the size of, say, U matrix, you would call
    #  my @size = ( $U->getdim(1), $U->getdim(0) );
    #  print "\nsize of U: @size\n";
    my $U_trunc = $U->slice("0:$index,:")->sever;
    my $V_trunc = $V->slice("0:$index,0:$index")->sever;    
    # The truncator maps a full-dimensionality doc/query vector into the
    # reduced LSA space:  inverse(SIGMA_trunc) x transpose(U_trunc).
    $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x 
                                             PDL::Basic::transpose($U_trunc);
    print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
            if $self->{_debug};
    # Rows of _term_document_matrix were built in sorted-doc-name order, so
    # the same sort here recovers the matching document names.
    my @sorted_doc_names = $self->{_idf_filter_option} ? 
                       sort keys %{$self->{_normalized_doc_vecs}} :
                       sort keys %{$self->{_corpus_doc_vectors}};
    my $i = 0;
    foreach (@{$self->{_term_document_matrix}}) {
        my $truncated_doc_vec = $self->{_lsa_vec_truncator} x 
                                               PDL::Basic::transpose(pdl($_));
        my $doc_name = $sorted_doc_names[$i++];
        print "\n\nTruncated doc vec for $doc_name: " . 
                 $truncated_doc_vec . "\n" if $self->{_debug};
        $self->{_doc_vecs_trunc_lsa}->{$doc_name} 
                                                 = $truncated_doc_vec;
    }
    chdir $self->{_working_directory};
}

#  Retrieves documents for a query using the LSA model: builds the query
#  vector over the vocabulary, projects it into the truncated LSA space with
#  _lsa_vec_truncator, and scores each document by the dot product with its
#  truncated vector.  Returns a hash ref mapping document name => score.
#  Requires construct_lsa_model() to have been called first.
sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    # Tokenize the query the same way the corpus was tokenized.
    # NOTE(review): unlike retrieve_with_vsm(), this path always lowercases
    # the query words -- the _case_sensitive option is not consulted here;
    # confirm that is intentional.
    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty" 
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model" 
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    # Build the full-dimensionality query vector: zero every vocabulary
    # slot, then count occurrences of each (lowercased) query word.
    foreach ( keys %{$self->{_vocab_hist}} ) {        
        $self->{_query_vector}->{$_} = 0;    
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++ 
                       if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = sum(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    # Serialize the query vector in sorted-word order, matching the
    # component ordering used for the document vectors.
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    # Project the query into the reduced LSA space.
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x 
                                               PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " .  $truncated_query_vec . "\n"
                                   if $self->{_debug};                  
    my %retrievals;
    # Score each document by the (un-normalized) dot product in LSA space;
    # only documents with a positive score are retrieved.
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                     x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              "     truncated vec for doc $_ is " . $dot_product->sclr . "\n"
                                        if $self->{_debug};                  
        $retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_   =>   $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}

#  Builds the term-frequency vector for one corpus file and, when the idf
#  filter is on, the idf-normalized vector as well.  Both are stored under
#  the file's path relative to the corpus directory.
sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    # Start from a zeroed copy of the vocabulary-wide template so every doc
    # vector has exactly one slot per vocabulary word.
    my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
    foreach ( sort keys %{$self->{_doc_hist_template}} ) {  
        $document_vector{$_} = 0;    
    }
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    # Unreadable files are skipped silently (noted only in debug mode).
    unless (open IN, $file) {
        print "Unable to open file $file in the corpus: $!\n" 
            if $self->{_debug};
        return;
    }
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        # NOTE(review): this pass always uses the simple punctuation split;
        # the _break_camelcased_and_underscored option honored in
        # _scan_file() is not consulted here -- confirm the asymmetry is
        # intentional.
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_, 
                       map &simple_stemmer($_, $self->{_debug}), @clean_words
                       if $self->{_want_stemming};
        # Only words already present in the corpus vocabulary are counted.
        $self->{_case_sensitive} ? 
            map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
            map { $document_vector{"\L$_"}++ } 
                                       grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words; 
    }
    close IN;
    # The template guarantees the vector spans the whole vocabulary.
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    foreach (keys %document_vector) {        
        $total_words_in_doc += $document_vector{$_};
    }
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        # tf-idf style normalization: count * idf(t) / total words in doc.
        foreach (keys %document_vector) {        
            $normalized_doc_vec{$_} = $document_vector{$_}
                                      *
                                      $self->{_idf_t}->{$_}
                                      /
                                      $total_words_in_doc;
        }
    }
    # Derive the file's path relative to the corpus directory so vector keys
    # are stable.  NOTE(review): _corpus_directory is interpolated unescaped
    # into the regex -- a directory name containing regex metacharacters
    # would misbehave; consider \Q...\E.
    my $pwd = cwd;
    $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name;
    unless ( $1 eq "" ) {
        $file_path_name = "$1/$file";
    } else {
        $file_path_name = $file;
    }
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}

###################################   Drop Stop Words  ###################################

#  Reads the stop-words file (one word per line; lines starting with '#'
#  are comments) and deletes every listed word from the corpus vocabulary
#  histogram, recording each word in the _stop_words list.
sub _drop_stop_words {
    my $self = shift;
    my $stop_words_path = "$self->{_working_directory}/$self->{_stop_words_file}";
    # FIX: 3-arg open with a lexical filehandle replaces the original's
    # bareword 2-arg open.
    open my $sw_fh, '<', $stop_words_path
                     or die "unable to open stop words file: $!";
    while (<$sw_fh>) {
        next if /^#/;                  # comment line
        next if /^[ ]*\r?\n?$/;        # blank line
        $_ =~ s/\r?\n?$//;             # strip DOS or Unix line terminator
        delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
        unshift @{$self->{_stop_words}}, $_;
    }
    close $sw_fh;                      # FIX: the original never closed the handle
}

###################################  Support Methods  ####################################

sub _doc_vec_comparator {
    my $self = shift;
    my %query_vector = %{$self->{_query_vector}};
    my $vec1_hash_ref = $self->{_idf_filter_option} ?
                                $self->{_normalized_doc_vecs}->{$a} :
                                $self->{_corpus_doc_vectors}->{$a};
    my $vec2_hash_ref = $self->{_idf_filter_option} ?
                                $self->{_normalized_doc_vecs}->{$b} :
                                $self->{_corpus_doc_vectors}->{$b};

lib/Algorithm/VSM.pm  view on Meta::CPAN

    return 1 if $product1 < $product2;
    return 0  if $product1 == $product2;
    return -1  if $product1 > $product2;
}

#  Returns the cosine similarity between the named document's vector (the
#  idf-normalized one when the idf filter is on, the raw term-frequency one
#  otherwise) and the current query vector in _query_vector.
sub _similarity_to_query {
    my ($self, $doc_name) = @_;
    my $doc_vec_ref = $self->{_idf_filter_option}
                          ? $self->{_normalized_doc_vecs}->{$doc_name}
                          : $self->{_corpus_doc_vectors}->{$doc_name};
    my (@doc_vec, @query_vec);
    # Walk the words in sorted order so both vectors are aligned
    # component-by-component.
    for my $word (sort keys %$doc_vec_ref) {
        push @doc_vec,   $doc_vec_ref->{$word};
        push @query_vec, $self->{_query_vector}->{$word};
    }
    # Cosine similarity: dot product over the product of the magnitudes.
    return vec_scalar_product(\@doc_vec, \@query_vec)
           / ( vec_magnitude(\@doc_vec) * vec_magnitude(\@query_vec) );
}

######################  Relevance Judgments for Testing Purposes   #######################

## IMPORTANT: This estimation of document relevancies to queries is NOT for
##            serious work.  A document is considered to be relevant to a
##            query if it contains several of the query words.  As to the
##            minimum number of query words that must exist in a document
##            in order for the latter to be considered relevant is
##            determined by the relevancy_threshold parameter in the VSM
##            constructor.  (See the relevancy and precision-recall related
##            scripts in the 'examples' directory.)  The reason for why the
##            function shown below is not for serious work is because
##            ultimately it is the humans who are the best judges of the
##            relevancies of documents to queries.  The humans bring to
##            bear semantic considerations on the relevancy determination
##            problem that are beyond the scope of this module.

#  Estimates which corpus documents are relevant to each query in the query
#  file by re-scanning the corpus with the relevancy-counting pass enabled
#  (see the warning in the comment block above: this is NOT a substitute for
#  human relevancy judgments).  The estimates are written to _relevancy_file.
sub estimate_doc_relevancies {
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    # FIX: 3-arg opens with lexical filehandles (the original used the
    # bareword handles IN/OUT with 2-arg open and never closed either).
    open my $query_fh, '<', $self->{_query_file}
               or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file in \n" .
        " in which the relevancy judgments will be dumped." 
                                 unless  $self->{_relevancy_file};
    while (<$query_fh>) {
        next if /^#/;
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        # Each line must look like:  q3: word1 word2 ...
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} =  $query;
    }
    close $query_fh;
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_   =>   $self->{_queries_for_relevancy}->{$_}\n"; 
        }
    }
    # Re-scan the corpus with the relevancy-counting pass switched on.
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open my $rel_fh, '>', $self->{_relevancy_file}
       or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    my @relevancy_list_for_query;
    # Sort queries numerically by their label suffix (q2 before q10).
    foreach (sort 
               {get_integer_suffix($a) <=> get_integer_suffix($b)} 
               keys %{$self->{_relevancy_estimates}}) {    
        @relevancy_list_for_query = 
                        keys %{$self->{_relevancy_estimates}->{$_}};
        print {$rel_fh} "$_   =>   @relevancy_list_for_query\n\n"; 
        print "Number of relevant docs for query $_: " . 
                         scalar(@relevancy_list_for_query) . "\n";
    }
    # FIX: check close on the write handle so buffered write errors surface.
    close $rel_fh
       or die "unable to close the relevancy file $self->{_relevancy_file}: $!";
}

#   If there are available human-supplied relevancy judgments in a disk
#   file, use this script to upload that information.  One of the scripts
#   in the 'examples' directory carries out the precision-recall analysis 
#   by using this approach.  IMPORTANT:  The human-supplied relevancy
#   judgments must be in a format that is shown in the sample file
#   relevancy.txt in the 'examples' directory.
#  Uploads human-supplied relevancy judgments from _relevancy_file.  Each
#  line must look like:  q3  =>  doc1 doc2 ...   Populates
#  _relevancy_estimates as {query_label}{doc_name} => 1.
sub upload_document_relevancies_from_file {
    my $self = shift;
    chdir $self->{_working_directory};
    # FIX: 3-arg open with a lexical filehandle (the original used a
    # bareword handle with 2-arg open and never closed it).
    open my $rel_fh, '<', $self->{_relevancy_file}
       or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (<$rel_fh>) {
        next if /^#/;
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        # Map each doc name to 1 so later membership tests are O(1).
        my @relevancy_docs  =  grep $_, split / /, $relevancy_docs_string;
        my %relevancies =     map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    close $rel_fh;
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_   =>   @rels\n";
        }
    }
}

#  Prints the relevancy judgments stored in {_relevancy_estimates}, one query
#  at a time, with each query's files listed in descending order of their
#  relevancy counts.  Dies if no relevancies have been estimated or uploaded.
sub display_doc_relevancies {
    my $self = shift;
    my $estimates = $self->{_relevancy_estimates};
    die "You must first estimate or provide the doc relevancies" 
        unless scalar(keys %{$estimates});
    print "\nDisplaying relevancy judgments:\n\n";
    for my $query (sort keys %{$estimates}) {
        print "Query $query\n";
        my $counts_for_file = $estimates->{$query};
        # rank the files for this query by descending relevancy count
        my @ranked_files = sort { $counts_for_file->{$b} <=> $counts_for_file->{$a} }
                           keys %{$counts_for_file};
        for my $file (@ranked_files) {
            print "     $file  => $counts_for_file->{$file}\n";
        }
    }
}

#  Scans one corpus file and, for every query in {_queries_for_relevancy},
#  counts case-insensitive occurrences of the query's words in the file's
#  text.  Files whose count reaches {_relevancy_threshold} are recorded in
#  {_relevancy_estimates}->{$query}->{$file}.
sub _scan_file_for_rels {
    my $self = shift;
    my $file = shift;
    # Three-arg, checked open with a lexical filehandle (the old code used
    # an unchecked 2-arg open on the bareword IN and never closed it).
    open my $fh, '<', $file
        or die "unable to open file $file for relevancy scanning: $!";
    my @all_text = <$fh>;
    close $fh;
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_, 
                split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query:   @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            # \Q...\E matches the word literally, so regex metacharacters in
            # a query word (e.g. 'c++') cannot crash or distort the match
            my @matches = $all_text =~ /\Q$word\E/gi;
            print "Number of occurrences for word '$word' in file $file: " . 
                scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;         
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count 
            if $count >= $self->{_relevancy_threshold};
    }
}

#########################   Calculate Precision versus Recall   ##########################

#  Computes Precision and Recall at every retrieval rank for each query in
#  {_queries_for_relevancy}, then the Mean Average Precision (MAP) over all
#  queries.
#
#  Argument:    $retrieval_type -- the string 'vsm' or 'lsa', selecting which
#               retrieval method is exercised.
#  Populates:   {_precision_for_queries}   query => ref to precision-at-rank list
#               {_recall_for_queries}      query => ref to recall-at-rank list
#               {_avg_precision_for_queries}  query => average precision
#               {_map}                     mean of the average precisions
#  Requires:    {_relevancy_estimates} must already be filled in, either by
#               estimate_doc_relevancies() or by
#               upload_document_relevancies_from_file().
sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;
    die "You must specify the retrieval type through an argument to the method " .
        "precision_and_recall_calculator().  The retrieval type must either be 'vsm' " .
        "or 'lsa' \n" unless $retrieval_type;
    die "You must first estimate or provide the doc relevancies" 
        unless scalar(keys %{$self->{_relevancy_estimates}});
    # When relevancies were uploaded from a file (rather than estimated),
    # the queries themselves have not been read yet -- parse the query file
    # now.  Expected line format:  "qN: word word ..."
    unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
        open( IN, $self->{_query_file})
               or die "unable to open the query file $self->{_query_file}: $!";
        while (<IN>) {
            next if /^#/;
            next if /^[ ]*\r?\n?$/;
            $_ =~ s/\r?\n?$//;
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} =  $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_   =>   $self->{_queries_for_relevancy}->{$_}\n"; 
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n====================================== query: $query ========================================\n\n"
                    if $self->{_debug};
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_, 
                split /\s+/, $self->{_queries_for_relevancy}->{$query};
        croak "\n\nYou have not specified the retrieval type for " . 
              "precision-recall calculation.  See code in 'examples'" .
              "directory:" if !defined $retrieval_type;
        my $retrievals;
        # retrieve_with_vsm()/retrieve_with_lsa() are defined elsewhere in
        # this module; a die inside either one (presumably when no query
        # word survives the vocabulary filters -- confirm against those
        # methods) is treated as "nothing retrievable for this query".
        eval {
            if ($retrieval_type eq 'vsm') {
                $retrievals = $self->retrieve_with_vsm( \@query_words );
            } elsif ($retrieval_type eq 'lsa') {
                $retrievals = $self->retrieve_with_lsa( \@query_words );
            }
        };
        if ($@) {
            warn "\n\nNo relevant docs found for query $query.\n" .
                 "Will skip over this query for precision and\n" .
                 "recall calculations\n\n";
            next;
        }
        # Rank retrievals by descending similarity score: rank 1 is the best
        # match.  %ranked_retrievals maps rank => document name.
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} 
                                                      keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }      
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_  =>   $ranked_retrievals{$_}\n";   
            }      
        }
        #   At this time, ranking of relevant documents based on their
        #   relevancy counts serves no particular purpose since all we want
        #   for the calculation of Precision and Recall are the total
        #   number of relevant documents.  However, I believe such a
        #   ranking will play an important role in the future.
        #   IMPORTANT:  The relevancy judgments are ranked only when
        #               estimated by the method estimate_doc_relevancies()
        #               of the VSM class.  When relevancies are supplied
        #               directly through a disk file, they all carry the
        #               same rank.
        my %ranked_relevancies;
        $i = 1;
        foreach my $file (sort {
                          $self->{_relevancy_estimates}->{$query}->{$b}
                          <=>
                          $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            $ranked_relevancies{$i++} = $file;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked relevancies for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_relevancies) {
                print "$_  =>   $ranked_relevancies{$_}\n";   
            }      
        }
        my @relevant_set = values %ranked_relevancies;
        warn "\n\nNo relevant docs found for query $query.\n" .
             "Will skip over this query for precision and\n" .
             "recall calculations\n\n" unless @relevant_set;
        next unless @relevant_set;    
        print "\n\nRelevant set for query $query:  @relevant_set\n\n"
            if $self->{_debug};
        # @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.  
        my @retrieved; 
        foreach (keys %ranked_retrievals) {
            push @retrieved, $ranked_retrievals{$_};
        }
        print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
            if $self->{_debug};
        # Precision@rank = |retrieved[1..rank] INTERSECT relevant| / rank
        # Recall@rank    = |retrieved[1..rank] INTERSECT relevant| / |relevant|
        my @Precision_values = ();
        my @Recall_values = ();
        my $rank = 1;
        while ($rank < @retrieved + 1) {
            my $index = 1;      
            my @retrieved_at_rank = ();
            while ($index <= $rank) {
                push @retrieved_at_rank, $ranked_retrievals{$index};
                $index++;
            }
            my $intersection =set_intersection(\@retrieved_at_rank,
                                               \@relevant_set);
            my $precision_at_rank = @retrieved_at_rank ? 
                                 (@$intersection / @retrieved_at_rank) : 0;
            push @Precision_values, $precision_at_rank;
            my $recall_at_rank = @$intersection / @relevant_set;
            push @Recall_values, $recall_at_rank;
            $rank++;
        }
        print "\n\nFor query $query, precision values: @Precision_values\n"
            if $self->{_debug};
        print "\nFor query $query, recall values: @Recall_values\n"
            if $self->{_debug};      
        $self->{_precision_for_queries}->{$query} = \@Precision_values;
        # Average precision for this query = mean of the precision-at-rank
        # values.  NOTE(review): the '+=' accumulates across repeated calls
        # to this method on the same instance -- confirm that is intended.
        my $avg_precision;
        $avg_precision += $_ for @Precision_values;        
        $self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
        $self->{_recall_for_queries}->{$query} = \@Recall_values;
    }
    print "\n\n=========  query by query processing for Precision vs. Recall calculations finished  ========\n\n"  
                    if $self->{_debug};
    # MAP = mean of the per-query average precisions.  NOTE(review): the
    # denominator is the TOTAL number of queries, including any that were
    # skipped above for having no retrievals or no relevant docs -- confirm
    # that skipped queries are meant to count as zero.
    my @avg_precisions;
    foreach (keys %{$self->{_avg_precision_for_queries}}) {
        push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
    }
    $self->{_map} += $_ for @avg_precisions;
    $self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
}

#  Prints the average precision for every query (three decimal places),
#  ordered by the integer in the query label (q1, q2, ..., q10, ...),
#  followed by the MAP value.  Dies if precision_and_recall_calculator()
#  has not been run first.
sub display_average_precision_for_queries_and_map {
    my $self = shift;
    my $avg_precision_of = $self->{_avg_precision_for_queries};
    die "You must first invoke precision_and_recall_calculator function" 
        unless scalar(keys %{$avg_precision_of});
    print "\n\nDisplaying average precision for different queries:\n\n";
    # numeric sort on the label suffix, so 'q10' comes after 'q9'
    my @ordered_queries = sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
                          keys %{$avg_precision_of};
    for my $query (@ordered_queries) {
        printf "Query %s  =>   %.3f\n", $query, $avg_precision_of->{$query};
    }
    print "\n\nMAP value: $self->{_map}\n\n";
}

#  Prints, for every query, the precision-at-rank and recall-at-rank lists
#  (three decimal places each), with queries ordered by the integer in their
#  labels.  Dies if precision_and_recall_calculator() has not been run first.
sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function" 
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    my @ordered_queries = sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
                          keys %{$self->{_avg_precision_for_queries}};
    for my $query (@ordered_queries) {
        print "\n\nQuery $query:\n";
        print "\n   (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        # format each value to three decimals before display
        my @precision_vals = map { sprintf "%.3f", $_ }
                             @{$self->{_precision_for_queries}->{$query}};
        print "   Precision at rank  =>  @precision_vals\n";
        my @recall_vals = map { sprintf "%.3f", $_ }
                          @{$self->{_recall_for_queries}->{$query}};
        print "\n   Recall at rank   =>  @recall_vals\n";
    }
    print "\n\n";
}

#  Returns a ref to a list of the per-query average precisions, each
#  formatted to three decimal places, ordered by the integer in the query
#  labels.  Dies if precision_and_recall_calculator() has not been run first.
sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function" 
        unless scalar(keys %{$self->{_avg_precision_for_queries}});
    my @sorted_queries = sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
                         keys %{$self->{_avg_precision_for_queries}};
    my @average_precisions_for_queries =
        map { sprintf "%.3f", $self->{_avg_precision_for_queries}->{$_} }
        @sorted_queries;
    return \@average_precisions_for_queries;
}

###################################  Utility Routines  ###################################

#  Returns 1 if every name in @params is a recognized constructor keyword,
#  0 as soon as an unrecognized name is found.  An empty argument list is
#  legal and returns 1.  (The previous flag-juggling version returned undef
#  for an empty list, which the caller's '== 0' test treated as illegal,
#  spuriously croaking on a no-argument constructor.)
sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    # hash lookup replaces the nested-loop linear scan
    my %is_legal = map { $_ => 1 } @legal_params;
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    return 1;
}

# Checks whether an element is in an array: returns the number of exact
# (string-equality) occurrences of $ele in the remaining arguments, so the
# result is true iff the element is present.  Uses grep in scalar context
# instead of the original map-in-void-context counter.
sub contained_in {
    my $ele = shift;
    return scalar grep { $_ eq $ele } @_;
}

# Meant only for an un-nested hash: returns a ref to a fresh hash holding
# the same key/value pairs.  (For a flat hash a list-flattening copy is a
# full copy; nested references would still be shared.)
sub deep_copy_hash {
    my $ref_in = shift;
    my %copy = %{$ref_in};
    return \%copy;
}

# Returns the dot product of two equal-length numeric vectors (array refs).
# Croaks if the lengths differ.  The accumulator starts at 0 so that two
# empty vectors yield 0 rather than undef.
sub vec_scalar_product {
    my ($vec1, $vec2) = @_;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product = 0;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}

# Returns the Euclidean (L2) norm of a numeric vector given as an array ref.
# An empty vector has magnitude 0.
sub vec_magnitude {
    my $vec = shift;
    my $sum_of_squares = 0;
    $sum_of_squares += $_ * $_ for @$vec;
    return sqrt $sum_of_squares;
}

# Returns the sum of the elements of a numeric vector given as an array ref.
# The accumulator starts at 0 so an empty vector sums to 0 instead of undef.
sub sum {
    my $vec = shift;
    my $result = 0;
    $result += $_ for @$vec;
    return $result;
}

# A lightweight, ad-hoc stemmer: strips common English suffixes (plurals,
# '-ing', '-ed', '-tion' forms) by applying the substitutions below IN
# ORDER.  The rule order matters -- earlier rules can change the word so
# that later rules no longer match -- so do not reorder them.  Returns the
# (possibly unchanged) stemmed word; prints before/after when $debug is true.
sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word:        $word\n" if $debug;
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;       # drop trailing plural 's' (but not after vowels/'s')
    $word =~ s/(.*[a-z]s)es$/$1/i;              # 'classes' -> 'class'
    $word =~ s/(.*[a-z][ck])es$/$1e/i;          # 'spaces' -> 'space', 'cakes' -> 'cake'
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;       # 'actions' -> 'action'
    $word =~ s/(.*[a-z]+)mming$/$1m/i;          # 'programming' -> 'program'
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;        # generic '-ing' (not after 'r'/'l')
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;       # 'closing' -> 'close'  [NOTE(review): previous
                                                # rule may consume these first -- confirm reachable]
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;        # 'vertices' -> 'vertex'
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;           # 'types' -> 'type'
    $word =~ s/(.*[a-z]+)sed$/$1se/i;           # 'used' -> 'use'
    $word =~ s/(.*[a-z]+)ed$/$1/i;              # generic '-ed'
    $word =~ s/(.*[a-z]+)tation$/$1t/i;         # 'computation' -> 'comput'
    print "Stemmed word:                           $word\n\n" if $debug;
    return $word;
}

# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm.
#
# Given a 1-D PDL object and a fractional threshold (note the argument
# order: the PDL object first, the threshold second), returns the index of
# the last element strictly greater than (first element * threshold).  Used
# to decide how many singular values to keep for the LSA model.  Can return
# -1 when even the first element fails the test (possible only for a
# threshold >= 1).
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;
    my $size = $pdl_obj->getdim(0);                       # number of values
    my $threshold = shift;
    # cutoff is a fraction of the largest (i.e. first) singular value
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
    my $i = 0;
    while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) {$i++;}
    return $i-1;
}

# Returns a ref to the list of elements of the second set that also appear
# in the first (order follows the second set); an empty arrayref when the
# two sets are disjoint.
sub set_intersection {
    my ($set1, $set2) = @_;
    my %seen_in_first;
    @seen_in_first{@$set1} = (1) x @$set1;      # membership lookup via hash slice
    my @shared = grep { $seen_in_first{$_} } @$set2;
    return \@shared;
}

# Extracts the trailing integer from a query label such as 'q17' (-> 17),
# for numeric sorting of query labels.  Labels with no trailing digits now
# yield 0 instead of the empty string the old /(\d*)$/ captured, so numeric
# comparisons on the result never emit "isn't numeric" warnings.
sub get_integer_suffix {
    my $label = shift;
    my ($suffix) = $label =~ /(\d+)$/;
    return defined $suffix ? $suffix : 0;
}

1;

=pod

=head1 NAME

Algorithm::VSM --- A Perl module for retrieving files and documents from a software
library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
algorithms in response to search words and phrases.

=head1 SYNOPSIS

  # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:

        use Algorithm::VSM;

        my $corpus_dir = "corpus";
        my @query = qw/ program ListIterator add ArrayList args /;
        my $stop_words_file = "stop_words.txt";  
        my $vsm = Algorithm::VSM->new( 
                            break_camelcased_and_underscored  => 1, 
                            case_sensitive         => 0,
                            corpus_directory       => $corpus_dir,
                            file_types             => ['.txt', '.java'],
                            max_number_retrievals  => 10,
                            min_word_length        => 4,
                            stop_words_file        => $stop_words_file,
                            use_idf_filter         => 1,
                            want_stemming          => 1,
        );
        $vsm->get_corpus_vocabulary_and_word_counts();



( run in 0.465 second using v1.01-cache-2.11-cpan-0bd6704ced7 )