Algorithm-VSM

 view release on metacpan or  search on metacpan

examples/calculate_precision_and_recall_for_VSM.pl  view on Meta::CPAN

#!/usr/bin/perl -w

##  calculate_precision_and_recall_for_VSM.pl

use strict;
use Algorithm::VSM;

##    This is a self-contained script for precision-and-recall calculations with
##    VSM.  Therefore, it is NOT necessary that you first create the disk-based
##    hash tables by calling retrieve_with_VSM_and_also_create_disk_based_model.pl

##    See Item 7 of the README of the `examples' directory for further information.



##  Locations of the inputs read and the output written by this script.

my $corpus_location  = "corpus";            # Directory that holds the corpus.

my $stop_list_file   = "stop_words.txt";    # Stop words; will typically include
                                            # the keywords of the programming
                                            # language(s) used in the software.

my $queries_file     = "test_queries.txt";  # Queries used for the precision
                                            # vs. recall analysis.  The format
                                            # must be as shown in
                                            # test_queries.txt

my $relevancies_file = "relevancy.txt";     # The generated relevancies are
                                            # stored in this file.

##  Constructor options.  They are held in a list (not a hash) so that they
##  are handed to new() in exactly the order in which they are written here.
my @constructor_options = (
    break_camelcased_and_underscored => 1,                    # default: 1
    case_sensitive                   => 0,                    # default: 0
    corpus_directory                 => $corpus_location,
    file_types                       => [ '.txt', '.java' ],
    min_word_length                  => 4,
    query_file                       => $queries_file,

    # Relevancy judgments are deposited in this file.
    relevancy_file                   => $relevancies_file,

    # Used when estimating relevancies with the method
    # estimate_doc_relevancies().  A doc must have at least this number of
    # query words to be considered relevant.
    relevancy_threshold              => 5,

    stop_words_file                  => $stop_list_file,
    want_stemming                    => 1,                    # default: 0
);

my $vsm = Algorithm::VSM->new(@constructor_options);

##  The processing steps, in the order in which they are executed.  Each
##  entry holds the name of a method to call on $vsm, followed by the
##  arguments (if any) for that call.  The commented-out entries are optional
##  displays; uncomment an entry to enable it.
my @steps = (
    [ 'get_corpus_vocabulary_and_word_counts' ],
    [ 'generate_document_vectors' ],

    # Enable this entry if you want to see the corpus vocabulary:
    # [ 'display_corpus_vocab' ],

    # Enable this entry if you want to see the individual document vectors:
    # [ 'display_doc_vectors' ],

    [ 'estimate_doc_relevancies' ],

    # Enable this entry if you wish to see the list of all the documents
    # relevant to each of the queries:
    # [ 'display_doc_relevancies' ],

    [ 'precision_and_recall_calculator', 'vsm' ],
    [ 'display_precision_vs_recall_for_queries' ],
    [ 'display_average_precision_for_queries_and_map' ],
);

for my $step (@steps) {
    my ($method_name, @method_arguments) = @{$step};

    # Calling a method through a name held in a scalar is permitted under
    # `use strict'.
    $vsm->$method_name(@method_arguments);
}



( run in 0.799 second using v1.01-cache-2.11-cpan-39bf76dae61 )