Algorithm-LDA

 view release on metacpan or  search on metacpan

lib/Algorithm/LDA.pm  view on Meta::CPAN

# Algorithm::LDA
#
# Perl implementation of an example module 
#
# Copyright (c) 2016
#
# Bridget T McInnes, Virginia Commonwealth University 
# bmcinnes at vcu.edu
#
# Nicholas Jordan, Virginia Commonwealth University 
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to 
#
# The Free Software Foundation, Inc., 
# 59 Temple Place - Suite 330, 
# Boston, MA  02111-1307, USA.

=head1 NAME

Algorithm::LDA

=head1 SYNOPSIS

 use Algorithm::LDA;
 
 my $lda = new Algorithm::LDA("Data", 5, 100, 100, 0, 10, 0.1, 10, "stoplist.txt");
 
=head1 DESCRIPTION

Algorithm::LDA is an implementation of Latent Dirichlet Allocation in Algorithm

=cut

package Algorithm::LDA;


use strict;
use 5.006;
use strict;
use warnings FATAL => 'all';

use constant pi => 4*atan2(1, 1);
use constant e  => exp(1);
use parent qw/Class::Accessor::Fast/;
use List::Util qw(shuffle sum max);
use List::MoreUtils qw(uniq first_index);
use JSON::XS;


use vars qw($VERSION);

$VERSION = '0.03';


#Used for accessing $self->documents
__PACKAGE__->mk_accessors(qw/documents/);


# $documents - Data directory (TXT files)
# $stop - Stopword list (regex)
# $K - Number of Topics
# $k - $K-1 (for convenience)
# %vocabulary - hashmap containing words and IDs
# @words - array containing all words
# @documents - array of arrays of words in each document
    # Doc1 = word1, word2, word3
    # Doc2 = word4, word5, word6
# %map - hashmap used for getting word frequencies

# $V - vocabulary size
# $v - $V-1 (for convenience)
# @alpha - array of alpha values (parameter of topic distribution)
# @theta - array of theta values (topic distribution)
# @beta - array of beta values (parameter of word distribution)
# @phi - array of phi values (word distribution)

# $totalDocs - Total Documents (Only used for computing completeness when loading)
# $maxIterations - Maximum Iterations
# $updateCorpus - 1 = Force update documents, 0 = allow loading from JSON
# $threshold - Minimum number of documents a word must appear in
# $numWords - Number of words per topic
# $alpha - Default alpha value

# $documentNum - Number of documents


my $data;
my $docs;
my $stop; 

my $K;
my $k;
my %vocabulary;
my @words;
my @documents;
my %map = ();

my $V;
my $v;
my @alpha;
my @theta;
my @beta;
my @phi;

my $totalDocs;
my $maxIterations;
my $updateCorpus;
my $threshold;
my $numWords;
my $alpha;

my $documentNum = 0;


my $self;

sub new 
{
    my $class = shift;
    $self = {
        _data => shift,
        _numTopics => shift,
        _maxIterations => shift,
        _totalDocs => shift,
        _updateCorpus => shift,
        _wordThreshold => shift,
        _alpha => shift,
        _numWords => shift,
        _stop => shift,

        docs          => [],

        document_topic_map => {},
        topic_word_map     => {},
        document_map       => {},
        topic_map          => {},
        word_map           => {},
    };
    

lib/Algorithm/LDA.pm  view on Meta::CPAN

sub printResults
{
    print STDERR "Printing Results\n";

    my $iter = shift; 

    if(! (-e "Results")) { 
	system "mkdir Results"; 
    }
    
    if(! (-e "Results/$iter") ) { 
	system "mkdir Results/$iter"; 
    }

    my $file = "Results/" . $iter . "/Topics." . $iter . ".txt";
    open(my $fh, '>', $file) or die "Could not open file '$file' $!";
    
    my $file2 = "Results/" . $iter . "/Documents." . $iter . ".txt";
    open(my $fh2, '>', $file2) or die "Could not open file '$file2' $!";
    
    for my $topic (0 .. $k) 
	{
        my $words_on_topic = wordsPerTopic(topic => $topic);
        splice(@$words_on_topic, $numWords);
        print $fh join("\n", "Topic[$topic]:\n", map { "$_->{word}\t$_->{prob}"; } @$words_on_topic)."\n\n\n";
    }
    
    for my $doc (0 .. $#documents) {
        my $topics_on_document= topicsPerDocument(document => $doc);
        splice(@$topics_on_document, $numWords);
        print $fh2 join("\n", "Document[$doc]:\n", map { "$_->{topic}\t$_->{prob}"; } @$topics_on_document)."\n\n\n";
    }

    close($fh);
    close($fh2);
    
    my $file3 = "Results/" . $iter . "/phi." . $iter . ".txt";
    open(my $fh3, '>', $file3) or die "Could not open file '$file3' $!";
    
    my $file4 = "Results/" . $iter . "/theta." . $iter . ".txt";
    open(my $fh4, '>', $file4) or die "Could not open file '$file4' $!";
    
    for my $i (0..$k)
    {
        print $fh3 "$i  :  " . join(", ", @{$phi[$i]}) . "\n";
    }
    
    for my $i (0..scalar @{$self->documents} - 1)
    {
        print $fh4 "$i  :  " . join(", ", @{$theta[$i]}) . "\n";
    }
    
    close($fh3);
    close($fh4);
}

=head3 load

description:

 Loads documents from text files (in "data/$data") or JSON file (in "Documents")

input:   

 None

output:

 None

example:

 load();

=cut

#Loads document data from files or JSON

sub load
{    
    #open data directory
    opendir(DH, "$docs"); 
    my @files = grep { $_ ne '.' and $_ ne '..' } readdir DH;
    closedir(DH);
    
    #array holding string of words in each document
    my @documents1 = ();
    
    #stopword regex
    
    my $regex = ""; 
    if(defined $stop) { 
	my $rstop = stop();
	$regex = qr/($rstop)/;
    }
        
    #Load Files from TXTs
                
    print "Loading Documents from TXTs...\n";
    
    foreach my $filename (@files)
    {
	print "Loading Document $documentNum ($filename): Corpus ";
	print (($documentNum + 1) * 100 / $totalDocs);
	print "% completed...\n";
	open(FILE, "$docs/$filename")
	    || die "Could not open file '$docs/$filename' $!";
	
	#Load file into single string, remove stopwords, and split by whitespace into array
	my $document = do { local $/; lc(<FILE>)};
	$document =~ s/($regex)//g;
	my @temp = split(/\s+/, $document);
	
	#remove all special characters and add to @words and @documents1
	for my $i (0..scalar @temp - 1)
	{
	    $temp[$i] = removeSpecialChars($temp[$i]);
	}
	push(@words, @temp);
	
	$documents1[$documentNum] = join(" ", @temp);
	$documentNum++;
    }
    

    chomp @words;
    @words = grep {$_ if $_ } @words;
    %vocabulary = map {   $words[$_]=>$_   } (0..$#words);
    $V = scalar keys %vocabulary;
    $v = $V-1;
    
    #Loop through @documents1, remove special characters and populate @documents
    for my $text (@documents1) 
    {
        my @ws = split(/\s+/, $text);
        chomp @ws;
        @ws = map { removeSpecialChars($_) } @ws;
        @ws  = grep {  exists $vocabulary{$_} } @ws;
        @ws = uniq(@ws);
        push @documents, \@ws;
    }
    
    print "Vocabulary (Uncleaned): $V\n";
    
    #Get word frequencies
    for my $d (0..$#documents) 
    {
        for my $wrd (@{$documents[$d]}) 
        {
            next unless exists $vocabulary{$wrd};
            $map{$wrd}=0 unless exists $map{$wrd};
            $map{$wrd}++; 
        }
    }
    
    #Remove words that appear in more than half of the corpus, and less than $threshold documents
    #Also remove words of less than three letters
    my $D = @documents;
    for my $wd (0..$#words) 
    {
        my $times = $map{$words[$wd]};
        my $test = ($times > 0.5*$D  || $times<=$threshold || length($words[$wd]) <=3);    
	
        if($test)   
        {   
            $words[$wd]=0;
        }
    }
    
    #Repopulate %vocabulary with cleaned words
    @words = grep { $_ } (@words);
    @words = uniq(@words);
    %vocabulary = map { $words[$_] => $_ } (0..$#words);
    $V = scalar keys %vocabulary;
    $v = $V-1;
    
    print "Vocabulary (Cleaned): $V\n";
    
    
    #Convert words to hashmap (for use of "exists") and remove unclean 
    # words from documents array
    my %h;
    @h{@words} = ();
    for my $i (0..$#documents)
    {
        @{$documents[$i]} = grep{exists $h{$_}} @{$documents[$i]};
    }

    
    open(my $fh, '>', "JSON") or die "Could not open file 'JSON' $!";
	
    foreach my $i (@documents)
    {
	print $fh "{\"data\":[\"" . join("\", \"", @{$i})."\"]}\n";
    }
    close $fh;
	
   
        open(my $fh2, '<', "JSON") or die "Could not open file 'JSON' $!";
        while (my $line = <$fh2>) {
            my $obj = decode_json($line);
            add(%$obj);
        }
        close $fh2;
}


=head3 wordsPerTopic
    
  description:
    
 Creates an array of words in each topic

input:   

 %args -> hash containing topic

output:

 @words -> Array containing words and probabilities (phi value) for $args{topic}

example:

 my $words_on_topic = wordsPerTopic(topic => $topic);

=cut

sub wordsPerTopic 
{
    my (%args) = @_;

    return unless (defined $args{topic});
    my @words = sort { $b->{prob} <=> $a->{prob} } map {
        { word => $_, prob => $self->computePhi($args{topic}, $_) }
    } keys %{$self->{word_map}};
    return \@words;
}

=head3 topicsPerDocument

description:

 Creates an array of topics in each document

input:   

 %args -> hash containing document

output:

 @topics -> Array containing topics and probabilities (theta value) for $args{document}

example:

 my $topics_on_document= topicsPerDocument(document => $doc);

=cut

sub topicsPerDocument 



( run in 0.656 second using v1.01-cache-2.11-cpan-140bd7fdf52 )