Algorithm-VSM

 view release on metacpan or  search on metacpan

lib/Algorithm/VSM.pm  view on Meta::CPAN

        _break_camelcased_and_underscored  => exists $args{break_camelcased_and_underscored} ?
                                              $args{break_camelcased_and_underscored} : 1,
        _corpus_vocab_db            =>  $args{corpus_vocab_db} || "corpus_vocab_db",
        _doc_vectors_db             =>  $args{doc_vectors_db} || "doc_vectors_db",
        _normalized_doc_vecs_db     =>  $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
        _stop_words_file            =>  $args{stop_words_file} || "",
        _case_sensitive             =>  $args{case_sensitive} || 0,
        _query_file                 =>  $args{query_file} || "",
        _file_types                 =>  $args{file_types} || [],
        _min_word_length            =>  $args{min_word_length} || 4,
        _want_stemming              =>  $args{want_stemming} || 0,
        _idf_filter_option          =>  exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
        _max_number_retrievals      =>  $args{max_number_retrievals} || 30,
        _lsa_svd_threshold          =>  $args{lsa_svd_threshold} || 0.01,
        _relevancy_threshold        =>  exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
        _relevancy_file             =>  $args{relevancy_file} || "",
        _debug                      =>  $args{debug} || 0,
        _working_directory          =>  cwd,
        _vocab_hist_on_disk         =>  {},
        _vocab_hist                 =>  {},
        _doc_hist_template          =>  {},
        _corpus_doc_vectors         =>  {},
        _normalized_doc_vecs        =>  {},
        _query_vector               =>  {},
        _stop_words                 =>  [],
        _term_document_matrix       =>  [],
        _corpus_vocab_done          =>  0,
        _scan_dir_for_rels          =>  0,
        _vocab_size                 =>  undef,
        _doc_vecs_trunc_lsa         =>  {},
        _lsa_vec_truncator          =>  undef,
        _queries_for_relevancy      =>  {},
        _relevancy_estimates        =>  {},
        _precision_for_queries      =>  {},
        _avg_precision_for_queries  =>  {},
        _recall_for_queries         =>  {},
        _map                        =>  undef,
        _vocab_idf_hist             =>  {},
        _idf_t                      =>  {},
        _total_num_of_docs          =>  0,
    }, $class;
}


######################    Get corpus vocabulary and word counts  #########################

sub get_corpus_vocabulary_and_word_counts {
    my $self = shift;
    die "You must supply the name of the corpus directory to the constructor"
        unless $self->{_corpus_directory};
    print "Scanning the directory '$self->{_corpus_directory}' for\n" .
        "  model construction\n\n" if $self->{_debug};
    $self->_scan_directory( $self->{_corpus_directory} );
    $self->_drop_stop_words() if $self->{_stop_words_file};
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    if ($self->{_save_model_on_disk}) {
        unlink glob "$self->{_corpus_vocab_db}.*";   
        unlink glob "$self->{_doc_vectors_db}.*";   
        unlink glob "$self->{_normalized_doc_vecs_db}.*";   
        tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',  
                 $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
                or die "Can't create DBM files: $!";       
        foreach (keys %{$self->{_vocab_hist}}) {
            $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
        }
        untie %{$self->{_vocab_hist_on_disk}};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
            if $self->{_debug};
    # Calculate idf(t):
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
                                      /
                                      (1 + $self->{_vocab_idf_hist}->{$_}))) 
                                      / log(10) ); 
    }
}

sub display_corpus_vocab {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    print "\n\nDisplaying corpus vocabulary:\n\n";
    foreach (sort keys %{$self->{_vocab_hist}}){
        my $outstring = sprintf("%30s     %d", $_,$self->{_vocab_hist}->{$_});
        print "$outstring\n";
    }
}

sub display_corpus_vocab_size {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
    print "\nSize of the corpus vocabulary: $vocab_size\n\n";
}

sub write_corpus_vocab_to_file {
    my $self = shift;
    my $file = shift;
    die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
    open OUT, "> $file" 
       or die "unable to open for output a file with name `$file': $!";
    foreach (sort keys %{$self->{_vocab_hist}}){
        my $outstring = sprintf("%30s     %d", $_,$self->{_vocab_hist}->{$_});
        print OUT "$outstring\n";
    }
    close OUT;
}

sub display_inverse_document_frequencies {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_idf_hist}};
    print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
        unless $self->{_idf_filter_option};
    print "\n\nDisplaying inverse document frequencies:\n";

lib/Algorithm/VSM.pm  view on Meta::CPAN

            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
               if $self->{_debug};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    untie %{$self->{_vocab_hist_on_disk}};
}

sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag" 
            && -s $self->{_normalized_doc_vecs_db};
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File', 
                      $self->{_corpus_vocab_db}, O_RDONLY, 0640
            or die "Can't open DBM file: $!";       
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {               
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );    
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
               if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}

############################## Display Retrieval Results  ################################

sub display_retrievals {
    my $self = shift;
    my $retrievals = shift;
    print "\n\nShowing the retrievals and the similarity scores:\n\n";
    my $iter = 0;
    foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
        print "$_   =>   $retrievals->{$_}\n"; 
        $iter++;
        last if $iter > $self->{_max_number_retrievals};
    }   
    print "\n\n";
}

###############################    Directory Scanner      ################################

sub _scan_directory {
    my $self = shift;
    my $dir = rel2abs( shift );
    my $current_dir = cwd;
    chdir $dir or die "Unable to change directory to $dir: $!";
    foreach ( glob "*" ) {                                            
        if ( -d and !(-l) ) {
            $self->_scan_directory( $_ );
            chdir $dir                                                
                or die "Unable to change directory to $dir: $!";
        } elsif (-r _ and 
                 -T _ and 
                 -M _ > 0.00001 and  # modification age is at least 1 sec
                !( -l $_ ) and 
                $self->ok_to_filetype($_) ) {
            $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
            $self->_scan_file($_) unless $self->{_corpus_vocab_done};
            $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
        }
    }
    chdir $current_dir;
}

sub _scan_file {
    my $self = shift;
    my $file = shift;
    open IN, $file;
    my $min = $self->{_min_word_length};
    my %uniques = ();
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @clean_words;
        if ($self->{_break_camelcased_and_underscored}) {
            my @brokenup = grep $_, split /\W|_|\s+/, $_;
            @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = $self->{_case_sensitive} ? 
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
                       grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
        } else {
            my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
            @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        }
        next unless @clean_words;
        @clean_words = grep $_, map &simple_stemmer($_), @clean_words
               if $self->{_want_stemming};
        $self->{_case_sensitive} ?
            map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
            map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
        if ($self->{_case_sensitive}) {
            for (@clean_words) { $uniques{$_}++ }
        } else {
           for (@clean_words) { $uniques{"\L$_"}++ }
        }
    }
    close( IN );
    map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
    $self->{_total_num_of_docs}++;
}

sub ok_to_filetype {
    my $self = shift;    
    my $filename = shift;
    my ($base, $dir, $suffix) = fileparse($filename, '\..*');
    croak "You called this module without specifying the file types in the constructor"
        unless @{$self->{_file_types}} > 0;



( run in 1.948 second using v1.01-cache-2.11-cpan-13bb782fe5a )