Algorithm-VSM
view release on metacpan or search on metacpan
examples/calculate_similarity_matrix_for_all_normalized_docs.pl view on Meta::CPAN
#!/usr/bin/perl -w
use lib '../blib/lib', '../blib/arch';
## calculate_similarity_matrix_for_all_normalized_docs.pl
# This script demonstrates how you can calculate the similarity matrix for
# all of the documents in your corpus. The (i,j)th element of the output
# matrix is the dot-vector based similarity between the i-th document and
# the j-th document. The index associated with a document is its place in
# an alphabetically sorted list of all the documents.
# This script compares documents using their NORMALIZED vector
# representations.
# The similarity matrices are stored in a CSV file whose column headings
# are the names of the documents. The same is the case with the entries in
# the first column.
use strict;
use Algorithm::VSM;
use Text::CSV;
# Where the corpus and the stop-words list are looked up; both values are
# handed to the constructor below.
my ($corpus_dir, $stop_words_file) = ("minicorpus", "stop_words.txt");

# Constructor options, kept as an ordered key/value list so that they reach
# Algorithm::VSM->new() in exactly the order written here.
my @vsm_options = (
    break_camelcased_and_underscored => 1,                     # default: 1
    case_sensitive                   => 0,                     # default: 0
    corpus_directory                 => $corpus_dir,
    file_types                       => [ '.txt', '.java' ],
    min_word_length                  => 4,
    stop_words_file                  => $stop_words_file,
    want_stemming                    => 1,                     # default: 0
);
my $vsm = Algorithm::VSM->new(@vsm_options);

# Run the two preparation steps, in this order, before the similarity
# calculations further down in the script.
$vsm->get_corpus_vocabulary_and_word_counts();
$vsm->generate_document_vectors();
# If you would like to directly measure the similarity between two
# specific documents, uncomment the following two statements.
# Obviously, you will have to change the arguments to suit your needs.
# Note that the arguments "AddArray.java" and "ArrayBasic.java" are
# names of specific documents in the subdirectory 'corpus' of the
# 'examples' directory of the distro. You must change these to the
# filenames of the documents you want to compare.
#my $similarity = $vsm->pairwise_similarity_for_docs("AddArray.java", "ArrayBasic.java");
#print "Similarity score: $similarity\n";
# If you would like the above calculation to be carried out with normalized
# document vectors, uncomment the following two statements. Again, you
# must change the argument strings "AddArray.java" and "ArrayBasic.java"
# to the names of the documents you want to compare.
#my $similarity2 = $vsm->pairwise_similarity_for_normalized_docs("AddArray.java", "ArrayBasic.java");
#print "Similarity score for normalized docs: $similarity2\n";
# Document names as reported by the VSM object; the same list labels both
# the rows and the columns of the matrix.
my @docs = @{ $vsm->get_all_document_names() };

# Build the full square matrix: row i holds the normalized-vector similarity
# of document i against every document, in @docs order.
my @similarity_matrix = map {
    my $row_doc = $_;
    [
        map {
            my $col_doc = $_;
            $vsm->pairwise_similarity_for_normalized_docs($row_doc, $col_doc);
        } @docs
    ];
} @docs;
# Echo the matrix to STDOUT, one row per line, each value followed by a
# single space.  Plain decimal values are cut (not rounded) to at most four
# fractional digits; anything the pattern does not match is printed as is.
for my $row_ref (@similarity_matrix) {
    my @cells;
    for my $value (@{$row_ref}) {
        # Work on a copy so the matrix itself keeps full precision for the
        # CSV output written later.
        (my $shown = $value) =~ s/^(\d+\.\d{1,4})\d*$/$1/;
        push @cells, "$shown ";
    }
    print @cells, "\n";
}
# Prefix every data row with the name of the document it belongs to, so the
# first CSV column carries the row labels.
unshift @{ $similarity_matrix[$_] }, $docs[$_] for 0 .. $#similarity_matrix;

# Turn the list of document names into the header row: a one-space corner
# cell goes above the label column, then the row is placed at the top of
# the matrix.  Note that @docs itself is modified and shared by reference.
unshift @docs, " ";
unshift @similarity_matrix, \@docs;
# Write the labelled similarity matrix to a CSV file, one matrix row per
# CSV record.  Any failure (creating the CSV object, opening the file,
# writing a record, closing the file) aborts the script with a message
# that names the output file.
my $csv_file = "SimilarityMatrixNormalizedDocs.csv";

my $csv = Text::CSV->new({ binary => 1 })    # should set binary attribute.
    or die "Cannot use CSV: " . Text::CSV->error_diag();
$csv->eol("\r\n");

open my $fh, ">:encoding(utf8)", $csv_file
    or die "$csv_file: $!";

foreach my $row_ref (@similarity_matrix) {
    # print() signals failure through its return value; check it so that a
    # failed write is reported instead of silently leaving a partial file.
    $csv->print($fh, $row_ref)
        or die "$csv_file: cannot write row: " . $csv->error_diag();
}

# Buffered write errors only surface at close time, so check that as well.
close $fh or die "$csv_file: $!";
( run in 0.590 second using v1.01-cache-2.11-cpan-39bf76dae61 )