Algorithm-VSM
view release on metacpan or search on metacpan
lib/Algorithm/VSM.pm view on Meta::CPAN
#---------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::VSM is a Perl module for retrieving the documents from a software
# library that match a list of words in a query. The matching criterion used depends
# on whether you ask the module to construct a full-dimensionality VSM or a
# reduced-dimensionality LSA model for the library.
# ---------------------------------------------------------------------------
#use 5.10.0;
use strict;
use warnings;
use Carp;
use SDBM_File;
use PDL::Lite;
use PDL::MatrixOps;
use File::Basename;
use File::Spec::Functions qw(rel2abs);
use Fcntl;
use Storable;
use Cwd;
our $VERSION = '1.70';
# for camelcase splits (from perlmonks):
my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
################################### Constructor #######################################
# Constructor for creating a VSM or LSA model of a corpus. The model instance
# returned by the constructor can be used for retrieving documents from the corpus
# in response to queries.
# Constructor.  Validates the caller's keyword arguments against the list of
# legal constructor options and returns a blessed hash that holds (a) the
# user-settable options with their defaults filled in, and (b) the internal
# state that the model-building and retrieval methods populate later.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    # _check_for_illegal_params() returns 0 when it encounters a keyword that
    # is not one of the legal constructor options.
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
        if _check_for_illegal_params(@params) == 0;
    bless {
        # ----- user-settable options -----
        _corpus_directory          => $args{corpus_directory} || "",
        _save_model_on_disk        => $args{save_model_on_disk} || 0,
        # 'exists'-style defaulting (rather than '||') is used for options
        # whose default is true but whose legitimate user value may be 0:
        _break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
                                             $args{break_camelcased_and_underscored} : 1,
        _corpus_vocab_db           => $args{corpus_vocab_db} || "corpus_vocab_db",
        _doc_vectors_db            => $args{doc_vectors_db} || "doc_vectors_db",
        _normalized_doc_vecs_db    => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
        _stop_words_file           => $args{stop_words_file} || "",
        _case_sensitive            => $args{case_sensitive} || 0,
        _query_file                => $args{query_file} || "",
        _file_types                => $args{file_types} || [],
        _min_word_length           => $args{min_word_length} || 4,
        _want_stemming             => $args{want_stemming} || 0,
        _idf_filter_option         => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
        _max_number_retrievals     => $args{max_number_retrievals} || 30,
        _lsa_svd_threshold         => $args{lsa_svd_threshold} || 0.01,
        _relevancy_threshold       => exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
        _relevancy_file            => $args{relevancy_file} || "",
        _debug                     => $args{debug} || 0,
        # ----- internal state (filled in by later method calls) -----
        _working_directory         => cwd,    # remembered so later chdir's can be undone
        _vocab_hist_on_disk        => {},     # tied to the SDBM vocabulary database
        _vocab_hist                => {},     # term => occurrence count over whole corpus
        _doc_hist_template         => {},
        _corpus_doc_vectors        => {},     # file => raw term-frequency vector
        _normalized_doc_vecs       => {},     # file => idf-normalized vector
        _query_vector              => {},
        _stop_words                => [],
        _term_document_matrix      => [],
        _corpus_vocab_done         => 0,
        _scan_dir_for_rels         => 0,
        _vocab_size                => undef,
        _doc_vecs_trunc_lsa        => {},     # file => LSA-truncated doc vector
        _lsa_vec_truncator         => undef,  # projects vectors into reduced LSA space
        _queries_for_relevancy     => {},
        _relevancy_estimates       => {},
        _precision_for_queries     => {},
        _avg_precision_for_queries => {},
        _recall_for_queries        => {},
        _map                       => undef,  # mean average precision
        _vocab_idf_hist            => {},     # term => number of docs containing it
        _idf_t                     => {},     # term => idf(t) value
        _total_num_of_docs         => 0,
    }, $class;
}
###################### Get corpus vocabulary and word counts #########################
# Recursively scans the corpus directory to build the vocabulary histogram
# (_vocab_hist) and the per-term document counts (_vocab_idf_hist), drops any
# stop words, optionally persists the vocabulary to an SDBM database, and
# finally computes an idf(t) value for every vocabulary term.
sub get_corpus_vocabulary_and_word_counts {
    my $self = shift;
    die "You must supply the name of the corpus directory to the constructor"
        unless $self->{_corpus_directory};
    print "Scanning the directory '$self->{_corpus_directory}' for\n" .
          " model construction\n\n" if $self->{_debug};
    $self->_scan_directory( $self->{_corpus_directory} );
    $self->_drop_stop_words() if $self->{_stop_words_file};
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    if ($self->{_save_model_on_disk}) {
        # Remove any stale database files before writing fresh ones.
        unlink glob "$self->{_corpus_vocab_db}.*";
        unlink glob "$self->{_doc_vectors_db}.*";
        unlink glob "$self->{_normalized_doc_vecs_db}.*";
        # Copy the in-memory vocabulary histogram into the tied SDBM hash so
        # it gets persisted, then untie to flush it to disk.
        tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
            $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";
        foreach (keys %{$self->{_vocab_hist}}) {
            $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
        }
        untie %{$self->{_vocab_hist_on_disk}};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    # Calculate idf(t): a smoothed, base-10 inverse document frequency.
    # D = _total_num_of_docs, d(t) = number of docs containing term t; the
    # "1 +" terms smooth the ratio and abs() keeps the result non-negative.
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
                                              /
                                              (1 + $self->{_vocab_idf_hist}->{$_})))
                                     / log(10) );
    }
}
# Prints every corpus vocabulary term, right-justified in a 30-character
# field, together with its occurrence count.  Dies if the vocabulary has not
# been built yet.
sub display_corpus_vocab {
    my $self  = shift;
    my $vocab = $self->{_vocab_hist};
    die "corpus vocabulary not yet constructed" unless keys %$vocab;
    print "\n\nDisplaying corpus vocabulary:\n\n";
    for my $term (sort keys %$vocab) {
        printf "%30s %d\n", $term, $vocab->{$term};
    }
}
# Prints the number of distinct terms in the corpus vocabulary.  Dies if the
# vocabulary has not been built yet.
sub display_corpus_vocab_size {
    my $self  = shift;
    my @terms = keys %{$self->{_vocab_hist}};
    die "corpus vocabulary not yet constructed" unless @terms;
    print "\nSize of the corpus vocabulary: " . scalar(@terms) . "\n\n";
}
# Writes the corpus vocabulary, one "term count" pair per line (term
# right-justified in a 30-character field), to the named file.  Dies if the
# vocabulary has not been built yet or the file cannot be written.
sub write_corpus_vocab_to_file {
    my $self = shift;
    my $file = shift;
    die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
    # FIX: three-arg open with a lexical handle replaces the old two-arg
    # bareword form (open OUT, "> $file"), which misparses filenames that
    # contain leading/trailing whitespace or mode characters.
    open my $out, '>', $file
        or die "unable to open for output a file with name `$file': $!";
    foreach my $term (sort keys %{$self->{_vocab_hist}}) {
        printf {$out} "%30s %d\n", $term, $self->{_vocab_hist}->{$term};
    }
    # Buffered write errors only surface at close, so check it.
    close $out or die "unable to close the file `$file': $!";
}
# Prints two tables: the raw document frequencies d(t) for every term, and
# the derived idf(t) values.  Warns when idf filtering is disabled, since in
# that case these numbers play no role in retrieval.
sub display_inverse_document_frequencies {
    my $self     = shift;
    my $idf_hist = $self->{_vocab_idf_hist};
    die "corpus vocabulary not yet constructed" unless keys %$idf_hist;
    print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
        unless $self->{_idf_filter_option};
    print "\n\nDisplaying inverse document frequencies:\n";
    for my $term (sort keys %$idf_hist) {
        printf "%30s %d\n", $term, $idf_hist->{$term};
    }
    print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
    for my $term (sort keys %{$self->{_idf_t}}) {
        printf "%30s %f\n", $term, $self->{_idf_t}->{$term};
    }
}
lib/Algorithm/VSM.pm view on Meta::CPAN
my $doc2 = shift;
my @all_files = keys %{$self->{_corpus_doc_vectors}};
croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
my @vec1 = ();
my @vec2 = ();
foreach my $word (sort keys %$vec_hash_ref1) {
push @vec1, $vec_hash_ref1->{$word};
push @vec2, $vec_hash_ref2->{$word};
}
my $vec_mag1 = vec_magnitude(\@vec1);
my $vec_mag2 = vec_magnitude(\@vec2);
my $product = vec_scalar_product(\@vec1, \@vec2);
$product /= $vec_mag1 * $vec_mag2;
return $product;
}
# Returns the cosine similarity between the idf-normalized vectors of two
# corpus documents.  Croaks if either document name is not present among the
# normalized document vectors.
sub pairwise_similarity_for_normalized_docs {
    my $self = shift;
    my $doc1 = shift;
    my $doc2 = shift;
    # BUGFIX: the existence guard used to consult _corpus_doc_vectors even
    # though the vectors actually used below come from _normalized_doc_vecs.
    # When the two hashes disagree (e.g. idf filtering disabled, so the
    # normalized vectors were never built), the old guard passed and the
    # code then dereferenced undef.  Check the hash we actually read.
    my @all_files = keys %{$self->{_normalized_doc_vecs}};
    croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
    croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
    my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
    my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
    my @vec1 = ();
    my @vec2 = ();
    # Both vectors are laid out in the same sorted-word order so that
    # corresponding components line up.
    foreach my $word (sort keys %$vec_hash_ref1) {
        push @vec1, $vec_hash_ref1->{$word};
        push @vec2, $vec_hash_ref2->{$word};
    }
    my $vec_mag1 = vec_magnitude(\@vec1);
    my $vec_mag2 = vec_magnitude(\@vec2);
    my $product = vec_scalar_product(\@vec1, \@vec2);
    $product /= $vec_mag1 * $vec_mag2;
    return $product;
}
############################### Retrieve with VSM Model ################################
# Retrieves corpus documents matching the query (an array ref of words) with
# the full-dimensionality VSM model.  The query is cleaned the same way the
# corpus was scanned (camelcase/underscore splitting, minimum word length,
# optional case folding), turned into a term-frequency vector over the
# corpus vocabulary, and compared by cosine similarity against either the
# idf-normalized or the raw document vectors.  Returns a hash ref mapping
# document name => similarity score (only positive scores are kept).
sub retrieve_with_vsm {
    my $self  = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    if ($self->{_break_camelcased_and_underscored}) {
        # Split on non-word chars and underscores, then break camelCased
        # runs with the file-level $_regex pattern.
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = $self->{_case_sensitive} ?
            grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
            grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour query words are: @$query\n" if $self->{_debug};
    if ($self->{_idf_filter_option}) {
        die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}})
                && scalar(keys %{$self->{_normalized_doc_vecs}});
    } else {
        die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}})
                && scalar(keys %{$self->{_corpus_doc_vectors}});
    }
    # Build the query vector over the full vocabulary, zero-initialized.
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        if ($self->{_case_sensitive}) {
            $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
        } else {
            $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
        }
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = sum(\@query_word_counts);
    die "\nYour query does not contain corpus words. Nothing retrieved.\n"
        unless $query_word_count_total;
    my %retrievals;
    if ($self->{_idf_filter_option}) {
        print "\n\nUsing idf filter option for retrieval:\n\n"
            if $self->{_debug};
        foreach my $doc (sort {$self->_doc_vec_comparator}
                         keys %{$self->{_normalized_doc_vecs}}) {
            # PERF FIX: compute the cosine similarity once per document; it
            # used to be evaluated twice (once in the condition, once in the
            # assigned value).
            my $similarity = $self->_similarity_to_query($doc);
            $retrievals{$doc} = $similarity if $similarity > 0;
        }
    } else {
        print "\n\nNOT using idf filter option for retrieval:\n\n"
            if $self->{_debug};
        foreach my $doc (sort {$self->_doc_vec_comparator}
                         keys %{$self->{_corpus_doc_vectors}}) {
            my $similarity = $self->_similarity_to_query($doc);
            $retrievals{$doc} = $similarity if $similarity > 0;
        }
    }
    if ($self->{_debug}) {
        print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
    }
    return \%retrievals;
}
######################### Upload a Previously Constructed Model #########################
# Restores a previously saved VSM model: the document vectors come from a
# Storable dump and the vocabulary histogram from an SDBM database.  Dies if
# either database file is missing or empty.
sub upload_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_doc_vectors_db};
    # FIX: the original called retrieve() on the doc-vectors database twice
    # (here and again just before untie), deserializing the same file a
    # second time for no benefit; the duplicate call has been removed.
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    # Copy the tied (on-disk) histogram into the in-memory one.
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}
# Restores a previously saved, idf-normalized VSM model: the normalized
# document vectors come from a Storable dump and the vocabulary histogram
# from an SDBM database.  Dies if either database file is missing or empty.
sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_normalized_doc_vecs_db};
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    # Read-only tie: we only copy the persisted histogram into memory.
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    # Copy the tied (on-disk) histogram into the in-memory one.
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}
############################## Display Retrieval Results ################################
# Prints the retrieved documents in decreasing order of similarity score,
# showing at most _max_number_retrievals entries.
sub display_retrievals {
    my $self = shift;
    my $retrievals = shift;
    print "\n\nShowing the retrievals and the similarity scores:\n\n";
    my $shown = 0;
    foreach my $doc (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals) {
        # OFF-BY-ONE FIX: the old post-increment-then-compare loop printed
        # _max_number_retrievals + 1 entries; stop before exceeding the cap.
        last if $shown >= $self->{_max_number_retrievals};
        print "$doc => $retrievals->{$doc}\n";
        $shown++;
    }
    print "\n\n";
}
############################### Directory Scanner ################################
# Recursively walks the directory tree rooted at the argument.  Depending on
# the object's current phase flags, each qualifying file is scanned for
# relevancy counts (_scan_file_for_rels), for vocabulary construction
# (_scan_file), or for document-vector construction (_construct_doc_vector).
# The walk works by chdir'ing into each directory so that glob "*" and the
# file tests see bare filenames; the caller's directory is restored at the end.
sub _scan_directory {
    my $self = shift;
    my $dir = rel2abs( shift );
    my $current_dir = cwd;
    chdir $dir or die "Unable to change directory to $dir: $!";
    foreach ( glob "*" ) {
        # Recurse into subdirectories, but do not follow symbolic links.
        if ( -d and !(-l) ) {
            $self->_scan_directory( $_ );
            # The recursive call chdir'd away; come back before continuing.
            chdir $dir
                or die "Unable to change directory to $dir: $!";
        } elsif (-r _ and                  # '_' reuses the stat info from the preceding file test
                 -T _ and                  # text files only
                 -M _ > 0.00001 and # modification age is at least 1 sec (0.00001 day is roughly 0.86 s)
                 !( -l $_ ) and
                 $self->ok_to_filetype($_) ) {
            # Only one of these three runs per pass, selected by the flags.
            $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
            $self->_scan_file($_) unless $self->{_corpus_vocab_done};
            $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
        }
    }
    chdir $current_dir;
}
sub _scan_file {
my $self = shift;
my $file = shift;
open IN, $file;
my $min = $self->{_min_word_length};
my %uniques = ();
while (<IN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my @clean_words;
if ($self->{_break_camelcased_and_underscored}) {
my @brokenup = grep $_, split /\W|_|\s+/, $_;
@clean_words = map {$_ =~ /$_regex/g} @brokenup;
@clean_words = $self->{_case_sensitive} ?
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
} else {
lib/Algorithm/VSM.pm view on Meta::CPAN
@clean_words = grep $_, map &simple_stemmer($_), @clean_words
if $self->{_want_stemming};
$self->{_case_sensitive} ?
map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
if ($self->{_case_sensitive}) {
for (@clean_words) { $uniques{$_}++ }
} else {
for (@clean_words) { $uniques{"\L$_"}++ }
}
}
close( IN );
map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
$self->{_total_num_of_docs}++;
}
# Returns 1 when the filename's suffix (including the dot, as produced by
# fileparse with the '\..*' pattern) is one of the file types supplied to the
# constructor, 0 otherwise.  Croaks if no file types were specified.
sub ok_to_filetype {
    my ($self, $filename) = @_;
    my (undef, undef, $suffix) = fileparse($filename, '\..*');
    croak "You called this module without specifying the file types in the constructor"
        unless @{$self->{_file_types}} > 0;
    return contained_in($suffix, @{$self->{_file_types}}) ? 1 : 0;
}
############################## LSA Modeling and Retrieval ################################
# Builds the LSA (Latent Semantic Analysis) model: assembles the
# term-document matrix from either the idf-normalized or the raw document
# vectors, computes its SVD with PDL, truncates the singular-value spectrum
# at the index chosen by return_index_of_last_value_above_threshold() using
# _lsa_svd_threshold, and stores (a) a "vector truncator" for projecting
# vectors into the reduced space and (b) the truncated vector for every
# corpus document in _doc_vecs_trunc_lsa.
sub construct_lsa_model {
    my $self = shift;
    if ($self->{_idf_filter_option}) {
        # Lazily reload the normalized doc vectors from a Storable dump if
        # they are not already in memory.
        if (!$self->{_normalized_doc_vecs} and
            -s $self->{_normalized_doc_vecs_db}) {
            $self->{_normalized_doc_vecs} =
                retrieve($self->{_normalized_doc_vecs_db});
        }
        # One row per document (transposed into columns below), components
        # in sorted-word order so all rows line up.
        foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys
                              %{$self->{_normalized_doc_vecs}->{$_}}){
                push @$term_frequency_vec,
                     $self->{_normalized_doc_vecs}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    } else {
        if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
            $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
        }
        foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
                push @$term_frequency_vec,
                     $self->{_corpus_doc_vectors}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    }
    # A is terms x documents after the transpose; factor it as U.SIGMA.V'.
    my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
    my ($U,$SIGMA,$V) = svd $A;
    print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
    print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
    # Keep only the singular values above the relative threshold.
    my $index = return_index_of_last_value_above_threshold($SIGMA,
                                                           $self->{_lsa_svd_threshold});
    my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
    print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
    # When you measure the size of a matrix in PDL, the zeroth dimension
    # is considered to be along the horizontal and the one-th dimension
    # along the rows. This is opposite of how we want to look at
    # matrices. For a matrix of size MxN, we mean M rows and N columns.
    # With this 'rows x columns' convention for matrix size, if you had
    # to check the size of, say, U matrix, you would call
    # my @size = ( $U->getdim(1), $U->getdim(0) );
    # print "\nsize of U: @size\n";
    my $U_trunc = $U->slice("0:$index,:")->sever;
    my $V_trunc = $V->slice("0:$index,0:$index")->sever;
    # The truncator maps a full-dimensionality column vector into the
    # reduced LSA space: inv(SIGMA_trunc) * U_trunc'.
    $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
                                  PDL::Basic::transpose($U_trunc);
    print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
        if $self->{_debug};
    # The rows of _term_document_matrix were pushed in sorted-doc-name
    # order, so this parallel sorted list recovers each row's document name.
    my @sorted_doc_names = $self->{_idf_filter_option} ?
        sort keys %{$self->{_normalized_doc_vecs}} :
        sort keys %{$self->{_corpus_doc_vectors}};
    my $i = 0;
    foreach (@{$self->{_term_document_matrix}}) {
        my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
                                PDL::Basic::transpose(pdl($_));
        my $doc_name = $sorted_doc_names[$i++];
        print "\n\nTruncated doc vec for $doc_name: " .
              $truncated_doc_vec . "\n" if $self->{_debug};
        $self->{_doc_vecs_trunc_lsa}->{$doc_name}
            = $truncated_doc_vec;
    }
    chdir $self->{_working_directory};
}
# Retrieves corpus documents matching the query (an array ref of words) with
# the reduced-dimensionality LSA model built by construct_lsa_model().  The
# query is cleaned, turned into a vocabulary-sized vector, projected into
# the LSA space with the stored truncator, and scored against each
# truncated document vector by dot product.  Returns a hash ref mapping
# document name => score (only positive scores are kept).
# NOTE(review): unlike retrieve_with_vsm(), query words are always
# lowercased here, i.e. the _case_sensitive option is not honored --
# confirm whether that is intentional.
sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    if ($self->{_break_camelcased_and_underscored}) {
        # Same cleanup pipeline used when the corpus was scanned.
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty"
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model"
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    # Build the query vector over the full vocabulary, zero-initialized.
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++
            if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = sum(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    # Flatten to an ordinary array in sorted-word order (the same component
    # ordering used for the document vectors).
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    # Project the query into the reduced LSA space.
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x
                              PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
        if $self->{_debug};
    my %retrievals;
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                          x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
            if $self->{_debug};
        $retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}
# Builds the term-frequency vector for a single corpus file and stores it in
# _corpus_doc_vectors; when idf filtering is enabled, also stores an
# idf-weighted, length-normalized version in _normalized_doc_vecs.  The
# document key is the file's path relative to the corpus directory.
sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    # Start from a zeroed copy of the vocabulary-wide template so every
    # document vector has exactly the same component set.
    my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
    foreach ( sort keys %{$self->{_doc_hist_template}} ) {
        $document_vector{$_} = 0;
    }
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    unless (open IN, $file) {
        print "Unable to open file $file in the corpus: $!\n"
            if $self->{_debug};
        return;
    }
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;       # skip blank lines
        $_ =~ s/\r?\n?$//;            # strip DOS or Unix line terminator
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_,
            map &simple_stemmer($_, $self->{_debug}), @clean_words
            if $self->{_want_stemming};
        # Count only words that are in the corpus vocabulary.
        $self->{_case_sensitive} ?
            map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
            map { $document_vector{"\L$_"}++ }
                grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
    }
    close IN;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    foreach (keys %document_vector) {
        $total_words_in_doc += $document_vector{$_};
    }
    # Normalized component = tf * idf(t) / (total words in doc).
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        foreach (keys %document_vector) {
            $normalized_doc_vec{$_} = $document_vector{$_}
                                      *
                                      $self->{_idf_t}->{$_}
                                      /
                                      $total_words_in_doc;
        }
    }
    # Derive the file's path relative to the corpus directory from the
    # current working directory (we are chdir'd into the file's directory
    # by _scan_directory).
    # NOTE(review): _corpus_directory is interpolated into the pattern
    # unescaped, and $1 is used even if the match failed -- a corpus path
    # containing regex metacharacters would break this; confirm and consider
    # \Q...\E plus an explicit match check.
    my $pwd = cwd;
    $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name;
    unless ( $1 eq "" ) {
        $file_path_name = "$1/$file";
    } else {
        $file_path_name = $file;
    }
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}
################################### Drop Stop Words ###################################
# Reads the stop-words file (one word per line; '#' lines are comments),
# deletes each stop word from the vocabulary histogram, and records it in
# the _stop_words list.
sub _drop_stop_words {
    my $self = shift;
    my $path = "$self->{_working_directory}/$self->{_stop_words_file}";
    # FIX: three-arg open with a lexical handle replaces the old bareword
    # two-arg form, and the handle is now explicitly closed.
    open my $in, '<', $path
        or die "unable to open stop words file: $!";
    while (my $line = <$in>) {
        next if $line =~ /^#/;            # comment lines
        next if $line =~ /^[ ]*\r?\n?$/;  # blank lines
        $line =~ s/\r?\n?$//;             # strip DOS or Unix line terminator
        delete $self->{_vocab_hist}->{$line} if exists $self->{_vocab_hist}->{$line};
        unshift @{$self->{_stop_words}}, $line;
    }
    close $in;
}
################################### Support Methods ####################################
sub _doc_vec_comparator {
my $self = shift;
my %query_vector = %{$self->{_query_vector}};
my $vec1_hash_ref = $self->{_idf_filter_option} ?
$self->{_normalized_doc_vecs}->{$a} :
$self->{_corpus_doc_vectors}->{$a};
my $vec2_hash_ref = $self->{_idf_filter_option} ?
$self->{_normalized_doc_vecs}->{$b} :
$self->{_corpus_doc_vectors}->{$b};
lib/Algorithm/VSM.pm view on Meta::CPAN
return 1 if $product1 < $product2;
return 0 if $product1 == $product2;
return -1 if $product1 > $product2;
}
# Cosine similarity between the named document's vector (idf-normalized when
# the idf filter is on, raw otherwise) and the current _query_vector.  Both
# vectors are flattened in sorted-word order so components correspond.
sub _similarity_to_query {
    my ($self, $doc_name) = @_;
    my $doc_vec_hash = $self->{_idf_filter_option}
                       ? $self->{_normalized_doc_vecs}->{$doc_name}
                       : $self->{_corpus_doc_vectors}->{$doc_name};
    my (@doc_vec, @query_vec);
    for my $word (sort keys %$doc_vec_hash) {
        push @doc_vec,   $doc_vec_hash->{$word};
        push @query_vec, $self->{_query_vector}->{$word};
    }
    my $cosine = vec_scalar_product(\@doc_vec, \@query_vec);
    $cosine /= vec_magnitude(\@doc_vec) * vec_magnitude(\@query_vec);
    return $cosine;
}
###################### Relevance Judgments for Testing Purposes #######################
## IMPORTANT: This estimation of document relevancies to queries is NOT for
## serious work. A document is considered to be relevant to a
## query if it contains several of the query words.  The
## minimum number of query words that must exist in a document
## in order for the latter to be considered relevant is
## determined by the relevancy_threshold parameter in the VSM
## constructor. (See the relevancy and precision-recall related
## scripts in the 'examples' directory.) The reason for why the
## function shown below is not for serious work is because
## ultimately it is the humans who are the best judges of the
## relevancies of documents to queries. The humans bring to
## bear semantic considerations on the relevancy determination
## problem that are beyond the scope of this module.
# Reads the query file (lines of the form "qN: word word ..."), scans the
# corpus counting query-word occurrences per document (via _scan_directory
# with the _scan_dir_for_rels flag set), and writes the resulting relevancy
# judgments to the relevancy file, one "qN => doc doc ..." line per query.
sub estimate_doc_relevancies {
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    # FIX: three-arg opens with lexical handles replace the old bareword
    # two-arg forms, and both handles are now explicitly closed.
    open my $qfh, '<', $self->{_query_file}
        or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file in \n" .
          " in which the relevancy judgments will be dumped."
        unless $self->{_relevancy_file};
    while (my $line = <$qfh>) {
        next if $line =~ /^#/;            # comment lines
        next if $line =~ /^[ ]*\r?\n?$/;  # blank lines
        $line =~ s/\r?\n?$//;             # strip DOS or Unix line terminator
        die "Format of query file is not correct" unless $line =~ /^[ ]*q[0-9]+:/;
        my ($query_label, $query) = $line =~ /^[ ]*(q[0-9]+):[ ]*(.*)/;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} = $query;
    }
    close $qfh;
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
        }
    }
    # Re-walk the corpus in relevancy-counting mode.
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open my $rfh, '>', $self->{_relevancy_file}
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    # Queries are emitted in numeric order of their integer suffix (q1, q2, ...).
    foreach my $qlabel (sort
                        {get_integer_suffix($a) <=> get_integer_suffix($b)}
                        keys %{$self->{_relevancy_estimates}}) {
        my @relevancy_list_for_query =
            keys %{$self->{_relevancy_estimates}->{$qlabel}};
        print {$rfh} "$qlabel => @relevancy_list_for_query\n\n";
        print "Number of relevant docs for query $qlabel: " .
              scalar(@relevancy_list_for_query) . "\n";
    }
    close $rfh or die "unable to close the relevancy file $self->{_relevancy_file}: $!";
}
# If human-supplied relevancy judgments are available in a disk
# file, use this method to upload that information. One of the scripts
# in the 'examples' directory carries out the precision-recall analysis
# by using this approach. IMPORTANT: The human-supplied relevancy
# judgments must be in a format that is shown in the sample file
# relevancy.txt in the 'examples' directory.
# Uploads human-supplied relevancy judgments from the relevancy file.  Each
# line has the form "qN => doc1 doc2 ..."; every listed document gets a
# relevancy "count" of 1 for that query (all equally ranked).
sub upload_document_relevancies_from_file {
    my $self = shift;
    chdir $self->{_working_directory};
    # FIX: three-arg open with a lexical handle replaces the old bareword
    # two-arg form, and the handle is now explicitly closed.
    open my $in, '<', $self->{_relevancy_file}
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (my $line = <$in>) {
        next if $line =~ /^#/;            # comment lines
        next if $line =~ /^[ ]*\r?\n?$/;  # blank lines
        $line =~ s/\r?\n?$//;             # strip DOS or Unix line terminator
        die "Format of query file is not correct" unless $line =~ /^[ ]*q[0-9]+[ ]*=>/;
        my ($query_label, $relevancy_docs_string) = $line =~ /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        next unless $relevancy_docs_string;
        my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
        my %relevancies = map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    close $in;
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_ => @rels\n";
        }
    }
}
# Prints the relevancy judgments, query by query; within each query the
# documents are listed in decreasing order of their relevancy counts.
sub display_doc_relevancies {
    my $self = shift;
    my $estimates = $self->{_relevancy_estimates};
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %$estimates);
    print "\nDisplaying relevancy judgments:\n\n";
    for my $query (sort keys %$estimates) {
        print "Query $query\n";
        my $counts = $estimates->{$query};
        for my $file (sort { $counts->{$b} <=> $counts->{$a} } keys %$counts) {
            print "    $file => $counts->{$file}\n";
        }
    }
}
# Counts, for every query, the total number of occurrences of its words in
# the given file; the file is recorded as relevant to a query when that
# count reaches _relevancy_threshold.
sub _scan_file_for_rels {
    my $self = shift;
    my $file = shift;
    # FIX: lexical three-arg open, checked (the old unchecked bareword open
    # silently produced an empty scan on failure), and closed after slurping.
    open my $in, '<', $file or die "unable to open file $file: $!";
    my @all_text = <$in>;
    close $in;
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_,
            split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query: @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            # SECURITY/CORRECTNESS FIX: \Q...\E treats the query word as a
            # literal string; previously it was interpolated as a regex, so
            # a word containing metacharacters (e.g. "c++") could die or
            # match the wrong thing.
            my @matches = $all_text =~ /\Q$word\E/gi;
            print "Number of occurrences for word '$word' in file $file: " .
                  scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count
            if $count >= $self->{_relevancy_threshold};
    }
}
######################### Calculate Precision versus Recall ##########################
# Computes rank-by-rank Precision and Recall for every query (against the
# relevancy judgments in _relevancy_estimates), the per-query average
# precision, and the overall MAP (mean average precision).  $retrieval_type
# selects which retrieval engine to use ('vsm' or 'lsa').  Results land in
# _precision_for_queries, _recall_for_queries, _avg_precision_for_queries
# and _map.
# NOTE(review): _avg_precision_for_queries and _map are updated with '+='
# without being reset first, so invoking this method twice on the same
# object would accumulate across calls -- confirm whether single-call use
# is assumed.
sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;
    die "You must specify the retrieval type through an argument to the method " .
        "precision_and_recall_calculator(). The retrieval type must either be 'vsm' " .
        "or 'lsa' \n" unless $retrieval_type;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    # If the queries were not already loaded (by estimate_doc_relevancies),
    # read them from the query file now.
    unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
        open( IN, $self->{_query_file})
            or die "unable to open the query file $self->{_query_file}: $!";
        while (<IN>) {
            next if /^#/;                 # comment lines
            next if /^[ ]*\r?\n?$/;       # blank lines
            $_ =~ s/\r?\n?$//;            # strip DOS or Unix line terminator
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} = $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n====================================== query: $query ========================================\n\n"
            if $self->{_debug};
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_,
            split /\s+/, $self->{_queries_for_relevancy}->{$query};
        croak "\n\nYou have not specified the retrieval type for " .
              "precision-recall calculation. See code in 'examples'" .
              "directory:" if !defined $retrieval_type;
        # Run the chosen retrieval engine; both engines die when the query
        # contains no corpus words, which we treat as "skip this query".
        my $retrievals;
        eval {
            if ($retrieval_type eq 'vsm') {
                $retrievals = $self->retrieve_with_vsm( \@query_words );
            } elsif ($retrieval_type eq 'lsa') {
                $retrievals = $self->retrieve_with_lsa( \@query_words );
            }
        };
        if ($@) {
            warn "\n\nNo relevant docs found for query $query.\n" .
                 "Will skip over this query for precision and\n" .
                 "recall calculations\n\n";
            next;
        }
        # Rank the retrieved documents by decreasing similarity score:
        # rank (1-based) => document name.
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
                 keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_ => $ranked_retrievals{$_}\n";
            }
        }
        # At this time, ranking of relevant documents based on their
        # relevancy counts serves no particular purpose since all we want
        # for the calculation of Precision and Recall are the total
        # number of relevant documents. However, I believe such a
        # ranking will play an important role in the future.
        # IMPORTANT: The relevancy judgments are ranked only when
        # estimated by the method estimate_doc_relevancies()
        # of the VSM class. When relevancies are supplied
        # directly through a disk file, they all carry the
        # same rank.
        my %ranked_relevancies;
        $i = 1;
        foreach my $file (sort {
                              $self->{_relevancy_estimates}->{$query}->{$b}
                              <=>
                              $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            $ranked_relevancies{$i++} = $file;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked relevancies for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_relevancies) {
                print "$_ => $ranked_relevancies{$_}\n";
            }
        }
        my @relevant_set = values %ranked_relevancies;
        warn "\n\nNo relevant docs found for query $query.\n" .
             "Will skip over this query for precision and\n" .
             "recall calculations\n\n" unless @relevant_set;
        next unless @relevant_set;
        print "\n\nRelevant set for query $query: @relevant_set\n\n"
            if $self->{_debug};
        # @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.
        my @retrieved;
        foreach (keys %ranked_retrievals) {
            push @retrieved, $ranked_retrievals{$_};
        }
        print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
            if $self->{_debug};
        # Precision@rank = |retrieved[1..rank] intersect relevant| / rank
        # Recall@rank    = |retrieved[1..rank] intersect relevant| / |relevant|
        my @Precision_values = ();
        my @Recall_values = ();
        my $rank = 1;
        while ($rank < @retrieved + 1) {
            my $index = 1;
            my @retrieved_at_rank = ();
            while ($index <= $rank) {
                push @retrieved_at_rank, $ranked_retrievals{$index};
                $index++;
            }
            my $intersection =set_intersection(\@retrieved_at_rank,
                                               \@relevant_set);
            my $precision_at_rank = @retrieved_at_rank ?
                (@$intersection / @retrieved_at_rank) : 0;
            push @Precision_values, $precision_at_rank;
            my $recall_at_rank = @$intersection / @relevant_set;
            push @Recall_values, $recall_at_rank;
            $rank++;
        }
        print "\n\nFor query $query, precision values: @Precision_values\n"
            if $self->{_debug};
        print "\nFor query $query, recall values: @Recall_values\n"
            if $self->{_debug};
        $self->{_precision_for_queries}->{$query} = \@Precision_values;
        # Average precision for this query = mean of the precision@rank values.
        my $avg_precision;
        $avg_precision += $_ for @Precision_values;
        $self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
        $self->{_recall_for_queries}->{$query} = \@Recall_values;
    }
    print "\n\n========= query by query processing for Precision vs. Recall calculations finished ========\n\n"
        if $self->{_debug};
    # MAP = mean of the per-query average precisions, over ALL queries
    # (skipped queries still count in the denominator).
    my @avg_precisions;
    foreach (keys %{$self->{_avg_precision_for_queries}}) {
        push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
    }
    $self->{_map} += $_ for @avg_precisions;
    $self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
}
# Prints the average precision for each query (ordered by the numeric suffix
# of the query label) followed by the overall MAP value.
sub display_average_precision_for_queries_and_map {
    my $self = shift;
    my $avg_precisions = $self->{_avg_precision_for_queries};
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %$avg_precisions);
    print "\n\nDisplaying average precision for different queries:\n\n";
    for my $query (sort
                   {get_integer_suffix($a) <=> get_integer_suffix($b)}
                   keys %$avg_precisions) {
        printf "Query %s => %.3f\n", $query, $avg_precisions->{$query};
    }
    print "\n\nMAP value: $self->{_map}\n\n";
}
# Prints, for every query, the precision and recall values at each retrieval
# rank, formatted to three decimal places.
sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    for my $query (sort
                   {get_integer_suffix($a) <=> get_integer_suffix($b)}
                   keys %{$self->{_avg_precision_for_queries}}) {
        print "\n\nQuery $query:\n";
        print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @precision_vals =
            map { sprintf "%.3f", $_ } @{$self->{_precision_for_queries}->{$query}};
        print " Precision at rank => @precision_vals\n";
        my @recall_vals =
            map { sprintf "%.3f", $_ } @{$self->{_recall_for_queries}->{$query}};
        print "\n Recall at rank => @recall_vals\n";
    }
    print "\n\n";
}
# Returns a ref to an array of average-precision values, one per query,
# each formatted to three decimal places and ordered by the queries'
# integer suffixes.  Requires a prior call to
# precision_and_recall_calculator().
sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_avg_precision_for_queries}});
    my @sorted_queries = sort {get_integer_suffix($a) <=> get_integer_suffix($b)}
                         keys %{$self->{_avg_precision_for_queries}};
    my @average_precisions_for_queries =
        map { sprintf "%.3f", $self->{_avg_precision_for_queries}->{$_} } @sorted_queries;
    return \@average_precisions_for_queries;
}
################################### Utility Routines ###################################
# Validates the keyword-argument names handed to the constructor.  Returns 1
# when every supplied name is recognized, 0 as soon as an unrecognized name
# is encountered (the caller croaks on a 0 return).
#
# Fix: the original left its flag variable undefined when called with an
# empty parameter list, so the caller's "== 0" test would croak spuriously;
# we now return 1 explicitly in that case.  The fragile reset-the-flag-
# inside-the-inner-loop logic is replaced by a scalar-context grep.
sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    foreach my $param (@params) {
        # grep in scalar context yields the match count: true iff legal.
        return 0 unless grep { $param eq $_ } @legal_params;
    }
    return 1;
}
# checks whether an element is in an array:
# Returns the number of times $ele appears in @array (0 when absent), so the
# result serves both as a boolean membership test and as an occurrence count.
sub contained_in {
    my $ele = shift;
    my @array = @_;
    # grep in scalar context returns the match count; this replaces the
    # original map-in-void-context-for-side-effects idiom.
    my $count = grep { $ele eq $_ } @array;
    return $count;
}
# Meant only for an un-nested hash:
# Returns a fresh hash ref holding copies of the top-level key/value pairs
# of the input hash.  Only suitable for un-nested hashes: any reference
# values are copied as references, not cloned.
sub deep_copy_hash {
    my $ref_in = shift;
    my %copy = %{$ref_in};
    return \%copy;
}
# Returns the scalar (dot) product of two equal-length numeric vectors
# passed in as array refs.  Croaks when the lengths differ.
#
# Fix: $product was previously left uninitialized, so two empty vectors
# returned undef; it is now initialized to 0.
sub vec_scalar_product {
    my $vec1 = shift;
    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product = 0;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}
# Returns the Euclidean (L2) norm of a numeric vector passed in as an
# array ref; an empty vector has magnitude 0.
sub vec_magnitude {
    my $vec = shift;
    my $sum_of_squares = 0;
    $sum_of_squares += $_ * $_ for @$vec;
    return sqrt $sum_of_squares;
}
# Returns the sum of the numeric elements of the array ref argument.
#
# Fix: $result was previously left uninitialized, so an empty vector
# returned undef; it is now initialized to 0.
sub sum {
    my $vec = shift;
    my $result = 0;
    for my $item (@$vec) {
        $result += $item;
    }
    return $result;
}
# A crude rule-based suffix stripper for English words --- NOT a full Porter
# stemmer.  Each s/// rule rewrites $word in place, and later rules see the
# output of earlier ones, so the ordering below is significant: do not
# reorder the substitutions.  All rules are case-insensitive (/i).
# Takes the word and an optional debug flag; returns the stemmed word.
sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word: $word\n" if $debug;
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;       # strip plural -s (but not after a vowel, -s, or -u)
    $word =~ s/(.*[a-z]s)es$/$1/i;              # "-ses" endings: drop the "es"
    $word =~ s/(.*[a-z][ck])es$/$1e/i;          # "-ces"/"-kes": keep a trailing "e"
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;       # plural "-tions" -> singular "-tion"
    $word =~ s/(.*[a-z]+)mming$/$1m/i;          # doubled-consonant gerund, e.g. "programming" -> "program"
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;        # strip "-ing" except after "r" or "l"
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;       # "-osing"/"-oning": restore the "e"
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;        # e.g. "vertices" -> "vertex"
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;           # "-pes": keep the "e", e.g. "types" -> "type"
    $word =~ s/(.*[a-z]+)sed$/$1se/i;           # "-sed": keep the "e", e.g. "used" -> "use"
    $word =~ s/(.*[a-z]+)ed$/$1/i;              # strip remaining past-tense "-ed"
    $word =~ s/(.*[a-z]+)tation$/$1t/i;         # "-tation" -> "-t", e.g. "computation" -> "comput"
    print "Stemmed word: $word\n\n" if $debug;
    return $word;
}
# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
# Given a 1-D PDL object sorted in descending order (e.g. singular values
# from SVD) and a threshold fraction, returns the index of the last value
# strictly greater than (largest value * threshold).  Returns -1 when even
# the first value fails the test (possible only when threshold >= 1).
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;
    my $num_vals = $pdl_obj->getdim(0);
    my $threshold = shift;
    # The cutoff is expressed as a fraction of the largest (first) value.
    my $cutoff = $pdl_obj->slice(0)->sclr * $threshold;
    my $last_good = -1;
    for my $idx (0 .. $num_vals - 1) {
        last unless $pdl_obj->slice($idx)->sclr > $cutoff;
        $last_good = $idx;
    }
    return $last_good;
}
# Returns a ref to the elements of the second set that also appear in the
# first; ordering and duplicates follow the second set.  Always returns an
# array ref (empty when the sets are disjoint).
sub set_intersection {
    my ($set1, $set2) = @_;
    my %seen;
    @seen{@$set1} = (1) x @$set1;
    my @common = grep { $seen{$_} } @$set2;
    return \@common;
}
# Extracts the trailing run of digits from a query label such as "query21".
# Returns the empty string when the label carries no numeric suffix
# (the \d* pattern always matches, possibly emptily).
sub get_integer_suffix {
    my $label = shift;
    my ($suffix) = $label =~ /(\d*)$/;
    return $suffix;
}
1;
=pod
=head1 NAME
Algorithm::VSM --- A Perl module for retrieving files and documents from a software
library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
algorithms in response to search words and phrases.
=head1 SYNOPSIS
# FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:
use Algorithm::VSM;
my $corpus_dir = "corpus";
my @query = qw/ program ListIterator add ArrayList args /;
my $stop_words_file = "stop_words.txt";
my $vsm = Algorithm::VSM->new(
break_camelcased_and_underscored => 1,
case_sensitive => 0,
corpus_directory => $corpus_dir,
file_types => ['.txt', '.java'],
max_number_retrievals => 10,
min_word_length => 4,
stop_words_file => $stop_words_file,
use_idf_filter => 1,
want_stemming => 1,
);
$vsm->get_corpus_vocabulary_and_word_counts();
( run in 0.465 second using v1.01-cache-2.11-cpan-0bd6704ced7 )