Algorithm-VSM
view release on metacpan or search on metacpan
lib/Algorithm/VSM.pm view on Meta::CPAN
_break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
$args{break_camelcased_and_underscored} : 1,
_corpus_vocab_db => $args{corpus_vocab_db} || "corpus_vocab_db",
_doc_vectors_db => $args{doc_vectors_db} || "doc_vectors_db",
_normalized_doc_vecs_db => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
_stop_words_file => $args{stop_words_file} || "",
_case_sensitive => $args{case_sensitive} || 0,
_query_file => $args{query_file} || "",
_file_types => $args{file_types} || [],
_min_word_length => $args{min_word_length} || 4,
_want_stemming => $args{want_stemming} || 0,
_idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
_max_number_retrievals => $args{max_number_retrievals} || 30,
_lsa_svd_threshold => $args{lsa_svd_threshold} || 0.01,
_relevancy_threshold => exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
_relevancy_file => $args{relevancy_file} || "",
_debug => $args{debug} || 0,
_working_directory => cwd,
_vocab_hist_on_disk => {},
_vocab_hist => {},
_doc_hist_template => {},
_corpus_doc_vectors => {},
_normalized_doc_vecs => {},
_query_vector => {},
_stop_words => [],
_term_document_matrix => [],
_corpus_vocab_done => 0,
_scan_dir_for_rels => 0,
_vocab_size => undef,
_doc_vecs_trunc_lsa => {},
_lsa_vec_truncator => undef,
_queries_for_relevancy => {},
_relevancy_estimates => {},
_precision_for_queries => {},
_avg_precision_for_queries => {},
_recall_for_queries => {},
_map => undef,
_vocab_idf_hist => {},
_idf_t => {},
_total_num_of_docs => 0,
}, $class;
}
###################### Get corpus vocabulary and word counts #########################
sub get_corpus_vocabulary_and_word_counts {
my $self = shift;
die "You must supply the name of the corpus directory to the constructor"
unless $self->{_corpus_directory};
print "Scanning the directory '$self->{_corpus_directory}' for\n" .
" model construction\n\n" if $self->{_debug};
$self->_scan_directory( $self->{_corpus_directory} );
$self->_drop_stop_words() if $self->{_stop_words_file};
if ($self->{_debug}) {
foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
}
}
if ($self->{_save_model_on_disk}) {
unlink glob "$self->{_corpus_vocab_db}.*";
unlink glob "$self->{_doc_vectors_db}.*";
unlink glob "$self->{_normalized_doc_vecs_db}.*";
tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
$self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
or die "Can't create DBM files: $!";
foreach (keys %{$self->{_vocab_hist}}) {
$self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
}
untie %{$self->{_vocab_hist_on_disk}};
}
$self->{_corpus_vocab_done} = 1;
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
if $self->{_debug};
# Calculate idf(t):
foreach (keys %{$self->{_vocab_idf_hist}}) {
$self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
/
(1 + $self->{_vocab_idf_hist}->{$_})))
/ log(10) );
}
}
sub display_corpus_vocab {
my $self = shift;
die "corpus vocabulary not yet constructed"
unless keys %{$self->{_vocab_hist}};
print "\n\nDisplaying corpus vocabulary:\n\n";
foreach (sort keys %{$self->{_vocab_hist}}){
my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
print "$outstring\n";
}
}
sub display_corpus_vocab_size {
my $self = shift;
die "corpus vocabulary not yet constructed"
unless keys %{$self->{_vocab_hist}};
my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
print "\nSize of the corpus vocabulary: $vocab_size\n\n";
}
sub write_corpus_vocab_to_file {
my $self = shift;
my $file = shift;
die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
open OUT, "> $file"
or die "unable to open for output a file with name `$file': $!";
foreach (sort keys %{$self->{_vocab_hist}}){
my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
print OUT "$outstring\n";
}
close OUT;
}
sub display_inverse_document_frequencies {
my $self = shift;
die "corpus vocabulary not yet constructed"
unless keys %{$self->{_vocab_idf_hist}};
print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
unless $self->{_idf_filter_option};
print "\n\nDisplaying inverse document frequencies:\n";
lib/Algorithm/VSM.pm view on Meta::CPAN
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
}
}
foreach (keys %{$self->{_vocab_hist_on_disk}}) {
$self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
}
$self->{_corpus_vocab_done} = 1;
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
if $self->{_debug};
$self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
untie %{$self->{_vocab_hist_on_disk}};
}
sub upload_normalized_vsm_model_from_disk {
my $self = shift;
die "\nCannot find the database files for the VSM model"
unless -s "$self->{_corpus_vocab_db}.pag"
&& -s $self->{_normalized_doc_vecs_db};
$self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
$self->{_corpus_vocab_db}, O_RDONLY, 0640
or die "Can't open DBM file: $!";
if ($self->{_debug}) {
foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
}
}
foreach (keys %{$self->{_vocab_hist_on_disk}}) {
$self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
}
$self->{_corpus_vocab_done} = 1;
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
if $self->{_debug};
untie %{$self->{_vocab_hist_on_disk}};
}
############################## Display Retrieval Results ################################
sub display_retrievals {
my $self = shift;
my $retrievals = shift;
print "\n\nShowing the retrievals and the similarity scores:\n\n";
my $iter = 0;
foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
print "$_ => $retrievals->{$_}\n";
$iter++;
last if $iter > $self->{_max_number_retrievals};
}
print "\n\n";
}
############################### Directory Scanner ################################
sub _scan_directory {
my $self = shift;
my $dir = rel2abs( shift );
my $current_dir = cwd;
chdir $dir or die "Unable to change directory to $dir: $!";
foreach ( glob "*" ) {
if ( -d and !(-l) ) {
$self->_scan_directory( $_ );
chdir $dir
or die "Unable to change directory to $dir: $!";
} elsif (-r _ and
-T _ and
-M _ > 0.00001 and # modification age is at least 1 sec
!( -l $_ ) and
$self->ok_to_filetype($_) ) {
$self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
$self->_scan_file($_) unless $self->{_corpus_vocab_done};
$self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
}
}
chdir $current_dir;
}
sub _scan_file {
my $self = shift;
my $file = shift;
open IN, $file;
my $min = $self->{_min_word_length};
my %uniques = ();
while (<IN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my @clean_words;
if ($self->{_break_camelcased_and_underscored}) {
my @brokenup = grep $_, split /\W|_|\s+/, $_;
@clean_words = map {$_ =~ /$_regex/g} @brokenup;
@clean_words = $self->{_case_sensitive} ?
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
} else {
my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
}
next unless @clean_words;
@clean_words = grep $_, map &simple_stemmer($_), @clean_words
if $self->{_want_stemming};
$self->{_case_sensitive} ?
map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
if ($self->{_case_sensitive}) {
for (@clean_words) { $uniques{$_}++ }
} else {
for (@clean_words) { $uniques{"\L$_"}++ }
}
}
close( IN );
map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
$self->{_total_num_of_docs}++;
}
sub ok_to_filetype {
my $self = shift;
my $filename = shift;
my ($base, $dir, $suffix) = fileparse($filename, '\..*');
croak "You called this module without specifying the file types in the constructor"
unless @{$self->{_file_types}} > 0;
( run in 1.948 second using v1.01-cache-2.11-cpan-13bb782fe5a )