Algorithm-VSM

 view release on metacpan or  search on metacpan

examples/significance_testing.pl  view on Meta::CPAN

#!/usr/bin/perl -w

##  significance_testing.pl

##  See Item 11 in the README of the `examples' directory.

use strict;
use Algorithm::VSM;


my $debug_signi = 0;

die "Must supply one command-line argument, which must either be 'randomization' or 't-test'\n" 
    unless @ARGV == 1;
my $significance_testing_method = shift @ARGV;
die "The command-line argument must either be 'randomization' or " .
    "'t-test' for this module to be useful\n"
             if ($significance_testing_method ne 'randomization') and
                ($significance_testing_method ne 't-test');

print "Proceeding with significance testing based on $significance_testing_method\n";

examples/significance_testing.pl  view on Meta::CPAN

$lsa1->generate_document_vectors();
$lsa1->construct_lsa_model();
$lsa1->upload_document_relevancies_from_file();
$lsa1->precision_and_recall_calculator('lsa');
my $avg_precisions_1 = $lsa1->get_query_sorted_average_precision_for_queries();
my $MAP_Algo_1 = 0;
map {$MAP_Algo_1 += $_} @$avg_precisions_1;
$MAP_Algo_1 /= @$avg_precisions_1;
print "MAP value for LSA-1: $MAP_Algo_1\n";
print "Avg precisions for LSA-1: @$avg_precisions_1\n" 
                                            if $debug_signi;


########################  Algorithm 2  #########################

my $lsa2 = Algorithm::VSM->new( 
                   break_camelcased_and_underscored  => 1,  # default: 1
                   case_sensitive      => 0,                # default: 0 
                   corpus_directory    => $corpus_dir,
                   file_types          => ['.txt', '.java'],
                   lsa_svd_threshold   => $THRESHOLD_2,

examples/significance_testing.pl  view on Meta::CPAN

$lsa2->generate_document_vectors();
$lsa2->construct_lsa_model();
$lsa2->upload_document_relevancies_from_file();
$lsa2->precision_and_recall_calculator('lsa');
my $avg_precisions_2 = $lsa2->get_query_sorted_average_precision_for_queries();
my $MAP_Algo_2 = 0;
map {$MAP_Algo_2 += $_} @$avg_precisions_2;
$MAP_Algo_2 /= @$avg_precisions_2;
print "MAP value for LSA-2: $MAP_Algo_2\n";
print "Average precisions for LSA-2: @$avg_precisions_2\n"
                                          if $debug_signi;

#  This is the observed value for the test statistic that will be subject to 
#  significance testing:
my $OBSERVED_t = $MAP_Algo_1 - $MAP_Algo_2;

print "\n\nMAP Difference that will be Subject to Significance Testing: $OBSERVED_t\n\n";
                               

########################   Significance Testing  ######################

my @range = 0..@$avg_precisions_1-1;

if ($debug_signi) {
    my $total_number_of_permutations = 2 ** @range;
    print "\n\nTotal num of permuts $total_number_of_permutations\n\n";
}

#   For each permutation of the algorithm labels over the queries, we 
#   will store the test_statistic in the array \@test_statistic.
my @test_statistic = ();

#  At each iteration, we create a random permutation of the algo_1 and
#  algo_2 labels over the queries as explained on slides 39 and 45 of my

examples/significance_testing.pl  view on Meta::CPAN

        if ($permuted_range[$_] < @range / 2.0) {
            push @algo_1, $avg_precisions_1->[$_];
            push @algo_2, $avg_precisions_2->[$_];
        } else {
            push @algo_1, $avg_precisions_2->[$_];
            push @algo_2, $avg_precisions_1->[$_];
        }
    }
    my $MAP_1 = 0;
    my $MAP_2 = 0;
    if ($debug_signi) {
        print "\n\nHere come algo_1 and algo_2 average precisions:\n\n";
        print "\npretend produced by algo 1: @algo_1\n\n";
        print "pretend produced by algo 2: @algo_2\n";
    }
    map {$MAP_1 += $_} @algo_1;
    map {$MAP_2 += $_} @algo_2;
    $MAP_1 /= @range;
    $MAP_2 /= @range;        
    if ($debug_signi) {
        print "\nMAP_1: $MAP_1\n";
        print "MAP_2: $MAP_2\n\n";
    }
    $test_statistic[$iter] = $MAP_1 - $MAP_2;
    last if $iter++ == $MAX_ITERATIONS;
    print "." if $iter % 100 == 0;
}

if ($significance_testing_method eq 'randomization') {
    print "\n\nIn randomization based p-value calculation:\n\n";
    print "test-statistic values for different permutations: @test_statistic\n"
        if $debug_signi;

    #  This count keeps track of how many of the test_statistic values are
    #  less than and greater than the value in $OBSERVED_t
    my $count = 0;
    foreach (@test_statistic) {
        $count++ if $_ <= -1 * abs($OBSERVED_t);
        $count++ if $_ > abs($OBSERVED_t);
    }
    my $p_value = $count / @test_statistic;

lib/Algorithm/VSM.pm  view on Meta::CPAN

        _case_sensitive             =>  $args{case_sensitive} || 0,
        _query_file                 =>  $args{query_file} || "",
        _file_types                 =>  $args{file_types} || [],
        _min_word_length            =>  $args{min_word_length} || 4,
        _want_stemming              =>  $args{want_stemming} || 0,
        _idf_filter_option          =>  exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
        _max_number_retrievals      =>  $args{max_number_retrievals} || 30,
        _lsa_svd_threshold          =>  $args{lsa_svd_threshold} || 0.01,
        _relevancy_threshold        =>  exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
        _relevancy_file             =>  $args{relevancy_file} || "",
        _debug                      =>  $args{debug} || 0,
        _working_directory          =>  cwd,
        _vocab_hist_on_disk         =>  {},
        _vocab_hist                 =>  {},
        _doc_hist_template          =>  {},
        _corpus_doc_vectors         =>  {},
        _normalized_doc_vecs        =>  {},
        _query_vector               =>  {},
        _stop_words                 =>  [],
        _term_document_matrix       =>  [],
        _corpus_vocab_done          =>  0,

lib/Algorithm/VSM.pm  view on Meta::CPAN

}


######################    Get corpus vocabulary and word counts  #########################

sub get_corpus_vocabulary_and_word_counts {
    my $self = shift;
    die "You must supply the name of the corpus directory to the constructor"
        unless $self->{_corpus_directory};
    print "Scanning the directory '$self->{_corpus_directory}' for\n" .
        "  model construction\n\n" if $self->{_debug};
    $self->_scan_directory( $self->{_corpus_directory} );
    $self->_drop_stop_words() if $self->{_stop_words_file};
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    if ($self->{_save_model_on_disk}) {
        unlink glob "$self->{_corpus_vocab_db}.*";   
        unlink glob "$self->{_doc_vectors_db}.*";   
        unlink glob "$self->{_normalized_doc_vecs_db}.*";   
        tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',  
                 $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
                or die "Can't create DBM files: $!";       
        foreach (keys %{$self->{_vocab_hist}}) {
            $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
        }
        untie %{$self->{_vocab_hist_on_disk}};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
            if $self->{_debug};
    # Calculate idf(t):
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
                                      /
                                      (1 + $self->{_vocab_idf_hist}->{$_}))) 
                                      / log(10) ); 
    }
}

sub display_corpus_vocab {

lib/Algorithm/VSM.pm  view on Meta::CPAN

        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = $self->{_case_sensitive} ? 
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour query words are: @$query\n" if $self->{_debug};
    if ($self->{_idf_filter_option}) {
        die "\nYou need to first generate normalized document vectors before you can call  retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}}) 
                  && scalar(keys %{$self->{_normalized_doc_vecs}});
    } else {
        die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}}) 
                  && scalar(keys %{$self->{_corpus_doc_vectors}});
    }
    foreach ( keys %{$self->{_vocab_hist}} ) {        

lib/Algorithm/VSM.pm  view on Meta::CPAN

            $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
        }
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = sum(\@query_word_counts);
    die "\nYour query does not contain corpus words. Nothing retrieved.\n"
        unless $query_word_count_total;
    my %retrievals;
    if ($self->{_idf_filter_option}) {
        print "\n\nUsing idf filter option for retrieval:\n\n" 
                                                if $self->{_debug};
        foreach (sort {$self->_doc_vec_comparator} 
                         keys %{$self->{_normalized_doc_vecs}}) {
            $retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
        }
    } else {
        print "\n\nNOT using idf filter option for retrieval:\n\n"
                                                if $self->{_debug};
        foreach (sort {$self->_doc_vec_comparator} 
                         keys %{$self->{_corpus_doc_vectors}}) {
            $retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
        }
    }
    if ($self->{_debug}) {
        print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_   =>   $retrievals{$_}\n";
        }
    }
    return \%retrievals;
}

######################### Upload a Previously Constructed Model  #########################

sub upload_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag" 
            && -s $self->{_doc_vectors_db};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File', 
                      $self->{_corpus_vocab_db}, O_RDONLY, 0640
            or die "Can't open DBM file: $!";       
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
               if $self->{_debug};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    untie %{$self->{_vocab_hist_on_disk}};
}

sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag" 
            && -s $self->{_normalized_doc_vecs_db};
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File', 
                      $self->{_corpus_vocab_db}, O_RDONLY, 0640
            or die "Can't open DBM file: $!";       
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
               if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}

############################## Display Retrieval Results  ################################

sub display_retrievals {
    my $self = shift;
    my $retrievals = shift;
    print "\n\nShowing the retrievals and the similarity scores:\n\n";
    my $iter = 0;

lib/Algorithm/VSM.pm  view on Meta::CPAN

            my $term_frequency_vec;
            foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
                push @$term_frequency_vec,   
                        $self->{_corpus_doc_vectors}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    }
    my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
    my ($U,$SIGMA,$V) = svd $A;
    print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
    print "size of svd SIGMA:  ", $SIGMA->dims, "\n" if $self->{_debug};
    my $index = return_index_of_last_value_above_threshold($SIGMA, 
                                          $self->{_lsa_svd_threshold});
    my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
    print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
    # When you measure the size of a matrix in PDL, the zeroth dimension
    # is considered to be along the horizontal and the one-th dimension
    # along the rows.  This is opposite of how we want to look at
    # matrices.  For a matrix of size MxN, we mean M rows and N columns.
    # With this 'rows x columns' convention for matrix size, if you had
    # to check the size of, say, U matrix, you would call
    #  my @size = ( $U->getdim(1), $U->getdim(0) );
    #  print "\nsize of U: @size\n";
    my $U_trunc = $U->slice("0:$index,:")->sever;
    my $V_trunc = $V->slice("0:$index,0:$index")->sever;    
    $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x 
                                             PDL::Basic::transpose($U_trunc);
    print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
            if $self->{_debug};
    my @sorted_doc_names = $self->{_idf_filter_option} ? 
                       sort keys %{$self->{_normalized_doc_vecs}} :
                       sort keys %{$self->{_corpus_doc_vectors}};
    my $i = 0;
    foreach (@{$self->{_term_document_matrix}}) {
        my $truncated_doc_vec = $self->{_lsa_vec_truncator} x 
                                               PDL::Basic::transpose(pdl($_));
        my $doc_name = $sorted_doc_names[$i++];
        print "\n\nTruncated doc vec for $doc_name: " . 
                 $truncated_doc_vec . "\n" if $self->{_debug};
        $self->{_doc_vecs_trunc_lsa}->{$doc_name} 
                                                 = $truncated_doc_vec;
    }
    chdir $self->{_working_directory};
}

sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty" 
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model" 
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    foreach ( keys %{$self->{_vocab_hist}} ) {        
        $self->{_query_vector}->{$_} = 0;    
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++ 
                       if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = sum(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x 
                                               PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " .  $truncated_query_vec . "\n"
                                   if $self->{_debug};                  
    my %retrievals;
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                     x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              "     truncated vec for doc $_ is " . $dot_product->sclr . "\n"
                                        if $self->{_debug};                  
        $retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_   =>   $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}

sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
    foreach ( sort keys %{$self->{_doc_hist_template}} ) {  
        $document_vector{$_} = 0;    
    }
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    unless (open IN, $file) {
        print "Unable to open file $file in the corpus: $!\n" 
            if $self->{_debug};
        return;
    }
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_, 
                       map &simple_stemmer($_, $self->{_debug}), @clean_words
                       if $self->{_want_stemming};
        $self->{_case_sensitive} ? 
            map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
            map { $document_vector{"\L$_"}++ } 
                                       grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words; 
    }
    close IN;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    foreach (keys %document_vector) {        

lib/Algorithm/VSM.pm  view on Meta::CPAN

        next if /^#/;
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} =  $query;
    }
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_   =>   $self->{_queries_for_relevancy}->{$_}\n"; 
        }
    }
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open(OUT, ">$self->{_relevancy_file}") 
       or die "unable to open the relevancy file $self->{_relevancy_file}: $!";

lib/Algorithm/VSM.pm  view on Meta::CPAN

        $_ =~ s/\r?\n?$//;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        my @relevancy_docs  =  grep $_, split / /, $relevancy_docs_string;
        my %relevancies =     map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_   =>   @rels\n";
        }
    }
}

sub display_doc_relevancies {
    my $self = shift;
    die "You must first estimate or provide the doc relevancies" 

lib/Algorithm/VSM.pm  view on Meta::CPAN

    my $self = shift;
    my $file = shift;
    open IN, $file;
    my @all_text = <IN>;
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_, 
                split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query:   @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            my @matches = $all_text =~ /$word/gi;
            print "Number of occurrences for word '$word' in file $file: " . 
                scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;         
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count 
            if $count >= $self->{_relevancy_threshold};
    }
}

#########################   Calculate Precision versus Recall   ##########################

sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;

lib/Algorithm/VSM.pm  view on Meta::CPAN

            next if /^#/;
            next if /^[ ]*\r?\n?$/;
            $_ =~ s/\r?\n?$//;
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} =  $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_   =>   $self->{_queries_for_relevancy}->{$_}\n"; 
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n====================================== query: $query ========================================\n\n"
                    if $self->{_debug};
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_, 
                split /\s+/, $self->{_queries_for_relevancy}->{$query};
        croak "\n\nYou have not specified the retrieval type for " . 
              "precision-recall calculation.  See code in 'examples'" .
              "directory:" if !defined $retrieval_type;
        my $retrievals;
        eval {
            if ($retrieval_type eq 'vsm') {
                $retrievals = $self->retrieve_with_vsm( \@query_words );
            } elsif ($retrieval_type eq 'lsa') {

lib/Algorithm/VSM.pm  view on Meta::CPAN

                 "Will skip over this query for precision and\n" .
                 "recall calculations\n\n";
            next;
        }
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} 
                                                      keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }      
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_  =>   $ranked_retrievals{$_}\n";   
            }      
        }
        #   At this time, ranking of relevant documents based on their
        #   relevancy counts serves no particular purpose since all we want
        #   for the calculation of Precision and Recall are the total
        #   number of relevant documents.  However, I believe such a
        #   ranking will play an important role in the future.

lib/Algorithm/VSM.pm  view on Meta::CPAN

        my %ranked_relevancies;
        $i = 1;
        foreach my $file (sort {
                          $self->{_relevancy_estimates}->{$query}->{$b}
                          <=>
                          $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            $ranked_relevancies{$i++} = $file;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked relevancies for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_relevancies) {
                print "$_  =>   $ranked_relevancies{$_}\n";   
            }      
        }
        my @relevant_set = values %ranked_relevancies;
        warn "\n\nNo relevant docs found for query $query.\n" .
             "Will skip over this query for precision and\n" .
             "recall calculations\n\n" unless @relevant_set;
        next unless @relevant_set;    
        print "\n\nRelevant set for query $query:  @relevant_set\n\n"
            if $self->{_debug};
        # @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.  
        my @retrieved; 
        foreach (keys %ranked_retrievals) {
            push @retrieved, $ranked_retrievals{$_};
        }
        print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
            if $self->{_debug};
        my @Precision_values = ();
        my @Recall_values = ();
        my $rank = 1;
        while ($rank < @retrieved + 1) {
            my $index = 1;      
            my @retrieved_at_rank = ();
            while ($index <= $rank) {
                push @retrieved_at_rank, $ranked_retrievals{$index};
                $index++;
            }
            my $intersection =set_intersection(\@retrieved_at_rank,
                                               \@relevant_set);
            my $precision_at_rank = @retrieved_at_rank ? 
                                 (@$intersection / @retrieved_at_rank) : 0;
            push @Precision_values, $precision_at_rank;
            my $recall_at_rank = @$intersection / @relevant_set;
            push @Recall_values, $recall_at_rank;
            $rank++;
        }
        print "\n\nFor query $query, precision values: @Precision_values\n"
            if $self->{_debug};
        print "\nFor query $query, recall values: @Recall_values\n"
            if $self->{_debug};      
        $self->{_precision_for_queries}->{$query} = \@Precision_values;
        my $avg_precision;
        $avg_precision += $_ for @Precision_values;        
        $self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
        $self->{_recall_for_queries}->{$query} = \@Recall_values;
    }
    print "\n\n=========  query by query processing for Precision vs. Recall calculations finished  ========\n\n"  
                    if $self->{_debug};
    my @avg_precisions;
    foreach (keys %{$self->{_avg_precision_for_queries}}) {
        push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
    }
    $self->{_map} += $_ for @avg_precisions;
    $self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
}

sub display_average_precision_for_queries_and_map {
    my $self = shift;

lib/Algorithm/VSM.pm  view on Meta::CPAN

                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }

lib/Algorithm/VSM.pm  view on Meta::CPAN

    my $vec = shift;
    my $result;
    for my $item (@$vec) {
        $result += $item;
    }
    return $result;
}

sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word:        $word\n" if $debug;
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;
    $word =~ s/(.*[a-z]s)es$/$1/i;
    $word =~ s/(.*[a-z][ck])es$/$1e/i;
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;
    $word =~ s/(.*[a-z]+)mming$/$1m/i;
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;
    $word =~ s/(.*[a-z]+)sed$/$1se/i;
    $word =~ s/(.*[a-z]+)ed$/$1/i;
    $word =~ s/(.*[a-z]+)tation$/$1t/i;
    print "Stemmed word:                           $word\n\n" if $debug;
    return $word;
}

# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;
    my $size = $pdl_obj->getdim(0);
    my $threshold = shift;
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;



( run in 0.648 second using v1.01-cache-2.11-cpan-49f99fa48dc )