Algorithm-VSM
view release on metacpan or search on metacpan
examples/calculate_similarity_matrix_for_all_normalized_docs.pl view on Meta::CPAN
#!/usr/bin/perl -w
use lib '../blib/lib', '../blib/arch';
## calculate_similarity_matrix_for_all_normalized_docs.pl
# This script demonstrates how you can calculate the similarity matrix for
# all of the documents in your corpus. The (i,j)th element of the output
# matrix is the dot-vector based similarity between the i-th document and
# the j-th document. The index associated with a document is its place in
# an alphabetically sorted list of all the documents.
# This script compares documents using their NORMALIZED vector
# representations.
# The similarity matrices are stored in a CSV file whose column headings
# are the names of the documents. The same is the case with the entries in
# the first column.
use strict;
use Algorithm::VSM;
use Text::CSV;
# Names of the corpus directory and the stop-words file handed to the
# constructor below (presumably resolved relative to the directory the
# script is run from -- confirm against the module).
my ($corpus_dir, $stop_words_file) = ("minicorpus", "stop_words.txt");

# Constructor options, collected in a list so that each setting is easy to
# read and tweak. The list is passed to new() in exactly this order.
my @vsm_constructor_args = (
    break_camelcased_and_underscored => 1,                  # default: 1
    case_sensitive                   => 0,                  # default: 0
    corpus_directory                 => $corpus_dir,
    file_types                       => ['.txt', '.java'],
    min_word_length                  => 4,
    stop_words_file                  => $stop_words_file,
    want_stemming                    => 1,                  # default: 0
);
my $vsm = Algorithm::VSM->new(@vsm_constructor_args);

# Prepare the model before any similarity is computed: first the corpus
# vocabulary and word counts, then the per-document vectors.
$vsm->get_corpus_vocabulary_and_word_counts();
$vsm->generate_document_vectors();
# If you would like to directly measure the similarity between two
# specific documents, uncomment the following two statements.
# Obviously, you will have to change the arguments to suit your needs.
# Note that the arguments "AddArray.java" and "ArrayBasic.java" are
# names of specific documents in the subdirectory 'corpus' of the
# 'examples' directory of the distro. You must change these to the
# filenames of the documents you want to compare.
#my $similarity = $vsm->pairwise_similarity_for_docs("AddArray.java", "ArrayBasic.java");
#print "Similarity score: $similarity\n";
# If you would like the above calculation to be carried out with normalized
# document vectors, uncomment the following two statements. Again, you
# must change the arguments strings "AddArray.java" and "ArrayBasic.java"
# to the names of the documents you want to compare.
#my $similarity2 = $vsm->pairwise_similarity_for_normalized_docs("AddArray.java", "ArrayBasic.java");
#print "Similarity score for normalized docs: $similarity2\n";
# Copy the document names out of the array reference returned by the model.
# The loops that follow index into this list by position. (The header
# comments describe a document's index as its place in an alphabetically
# sorted list -- presumably the order returned here; confirm against the
# module.)
my @docs = @{ $vsm->get_all_document_names };

# Starts out empty; intended to hold the similarity matrix.
my @similarity_matrix = ();
foreach my $i (0..@docs-1) {
my @one_row = ();
foreach my $j (0..@docs-1) {
( run in 0.863 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )