Algorithm-LDA
view release on metacpan or search on metacpan
lib/Algorithm/LDA.pm view on Meta::CPAN
# Algorithm::LDA
#
# Perl implementation of an example module
#
# Copyright (c) 2016
#
# Bridget T McInnes, Virginia Commonwealth University
# bmcinnes at vcu.edu
#
# Nicholas Jordan, Virginia Commonwealth University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
=head1 NAME
Algorithm::LDA
=head1 SYNOPSIS
use Algorithm::LDA;
my $lda = new Algorithm::LDA("Data", 5, 100, 100, 0, 10, 0.1, 10, "stoplist.txt");
=head1 DESCRIPTION
Algorithm::LDA is an implementation of Latent Dirichlet Allocation in Algorithm
=cut
package Algorithm::LDA;
use strict;
use 5.006;
use strict;
use warnings FATAL => 'all';
use constant pi => 4*atan2(1, 1);
use constant e => exp(1);
use parent qw/Class::Accessor::Fast/;
use List::Util qw(shuffle sum max);
use List::MoreUtils qw(uniq first_index);
use JSON::XS;
use vars qw($VERSION);
$VERSION = '0.03';
#Used for accessing $self->documents
__PACKAGE__->mk_accessors(qw/documents/);
# $documents - Data directory (TXT files)
# $stop - Stopword list (regex)
# $K - Number of Topics
# $k - $K-1 (for convenience)
# %vocabulary - hashmap containing words and IDs
# @words - array containing all words
# @documents - array of arrays of words in each document
# Doc1 = word1, word2, word3
# Doc2 = word4, word5, word6
# %map - hashmap used for getting word frequencies
# $V - vocabulary size
# $v - $V-1 (for convenience)
# @alpha - array of alpha values (parameter of topic distribution)
# @theta - array of theta values (topic distribution)
# @beta - array of beta values (parameter of word distribution)
# @phi - array of phi values (word distribution)
# $totalDocs - Total Documents (Only used for computing completeness when loading)
# $maxIterations - Maximum Iterations
# $updateCorpus - 1 = Force update documents, 0 = allow loading from JSON
# $threshold - Minimum number of documents a word must appear in
# $numWords - Number of words per topic
# $alpha - Default alpha value
# $documentNum - Number of documents
my $data;
my $docs;
my $stop;
my $K;
my $k;
my %vocabulary;
my @words;
my @documents;
my %map = ();
my $V;
my $v;
my @alpha;
my @theta;
my @beta;
my @phi;
my $totalDocs;
my $maxIterations;
my $updateCorpus;
my $threshold;
my $numWords;
my $alpha;
my $documentNum = 0;
my $self;
sub new
{
my $class = shift;
$self = {
_data => shift,
_numTopics => shift,
_maxIterations => shift,
_totalDocs => shift,
_updateCorpus => shift,
_wordThreshold => shift,
_alpha => shift,
_numWords => shift,
_stop => shift,
docs => [],
document_topic_map => {},
topic_word_map => {},
document_map => {},
topic_map => {},
word_map => {},
};
lib/Algorithm/LDA.pm view on Meta::CPAN
sub printResults
{
print STDERR "Printing Results\n";
my $iter = shift;
if(! (-e "Results")) {
system "mkdir Results";
}
if(! (-e "Results/$iter") ) {
system "mkdir Results/$iter";
}
my $file = "Results/" . $iter . "/Topics." . $iter . ".txt";
open(my $fh, '>', $file) or die "Could not open file '$file' $!";
my $file2 = "Results/" . $iter . "/Documents." . $iter . ".txt";
open(my $fh2, '>', $file2) or die "Could not open file '$file2' $!";
for my $topic (0 .. $k)
{
my $words_on_topic = wordsPerTopic(topic => $topic);
splice(@$words_on_topic, $numWords);
print $fh join("\n", "Topic[$topic]:\n", map { "$_->{word}\t$_->{prob}"; } @$words_on_topic)."\n\n\n";
}
for my $doc (0 .. $#documents) {
my $topics_on_document= topicsPerDocument(document => $doc);
splice(@$topics_on_document, $numWords);
print $fh2 join("\n", "Document[$doc]:\n", map { "$_->{topic}\t$_->{prob}"; } @$topics_on_document)."\n\n\n";
}
close($fh);
close($fh2);
my $file3 = "Results/" . $iter . "/phi." . $iter . ".txt";
open(my $fh3, '>', $file3) or die "Could not open file '$file3' $!";
my $file4 = "Results/" . $iter . "/theta." . $iter . ".txt";
open(my $fh4, '>', $file4) or die "Could not open file '$file4' $!";
for my $i (0..$k)
{
print $fh3 "$i : " . join(", ", @{$phi[$i]}) . "\n";
}
for my $i (0..scalar @{$self->documents} - 1)
{
print $fh4 "$i : " . join(", ", @{$theta[$i]}) . "\n";
}
close($fh3);
close($fh4);
}
=head3 load
description:
Loads documents from text files (in "data/$data") or JSON file (in "Documents")
input:
None
output:
None
example:
load();
=cut
#Loads document data from files or JSON
sub load
{
#open data directory
opendir(DH, "$docs");
my @files = grep { $_ ne '.' and $_ ne '..' } readdir DH;
closedir(DH);
#array holding string of words in each document
my @documents1 = ();
#stopword regex
my $regex = "";
if(defined $stop) {
my $rstop = stop();
$regex = qr/($rstop)/;
}
#Load Files from TXTs
print "Loading Documents from TXTs...\n";
foreach my $filename (@files)
{
print "Loading Document $documentNum ($filename): Corpus ";
print (($documentNum + 1) * 100 / $totalDocs);
print "% completed...\n";
open(FILE, "$docs/$filename")
|| die "Could not open file '$docs/$filename' $!";
#Load file into single string, remove stopwords, and split by whitespace into array
my $document = do { local $/; lc(<FILE>)};
$document =~ s/($regex)//g;
my @temp = split(/\s+/, $document);
#remove all special characters and add to @words and @documents1
for my $i (0..scalar @temp - 1)
{
$temp[$i] = removeSpecialChars($temp[$i]);
}
push(@words, @temp);
$documents1[$documentNum] = join(" ", @temp);
$documentNum++;
}
chomp @words;
@words = grep {$_ if $_ } @words;
%vocabulary = map { $words[$_]=>$_ } (0..$#words);
$V = scalar keys %vocabulary;
$v = $V-1;
#Loop through @documents1, remove special characters and populate @documents
for my $text (@documents1)
{
my @ws = split(/\s+/, $text);
chomp @ws;
@ws = map { removeSpecialChars($_) } @ws;
@ws = grep { exists $vocabulary{$_} } @ws;
@ws = uniq(@ws);
push @documents, \@ws;
}
print "Vocabulary (Uncleaned): $V\n";
#Get word frequencies
for my $d (0..$#documents)
{
for my $wrd (@{$documents[$d]})
{
next unless exists $vocabulary{$wrd};
$map{$wrd}=0 unless exists $map{$wrd};
$map{$wrd}++;
}
}
#Remove words that appear in more than half of the corpus, and less than $threshold documents
#Also remove words of less than three letters
my $D = @documents;
for my $wd (0..$#words)
{
my $times = $map{$words[$wd]};
my $test = ($times > 0.5*$D || $times<=$threshold || length($words[$wd]) <=3);
if($test)
{
$words[$wd]=0;
}
}
#Repopulate %vocabulary with cleaned words
@words = grep { $_ } (@words);
@words = uniq(@words);
%vocabulary = map { $words[$_] => $_ } (0..$#words);
$V = scalar keys %vocabulary;
$v = $V-1;
print "Vocabulary (Cleaned): $V\n";
#Convert words to hashmap (for use of "exists") and remove unclean
# words from documents array
my %h;
@h{@words} = ();
for my $i (0..$#documents)
{
@{$documents[$i]} = grep{exists $h{$_}} @{$documents[$i]};
}
open(my $fh, '>', "JSON") or die "Could not open file 'JSON' $!";
foreach my $i (@documents)
{
print $fh "{\"data\":[\"" . join("\", \"", @{$i})."\"]}\n";
}
close $fh;
open(my $fh2, '<', "JSON") or die "Could not open file 'JSON' $!";
while (my $line = <$fh2>) {
my $obj = decode_json($line);
add(%$obj);
}
close $fh2;
}
=head3 wordsPerTopic
description:
Creates an array of words in each topic
input:
%args -> hash containing topic
output:
@words -> Array containing words and probabilities (phi value) for $args{topic}
example:
my $words_on_topic = wordsPerTopic(topic => $topic);
=cut
sub wordsPerTopic
{
my (%args) = @_;
return unless (defined $args{topic});
my @words = sort { $b->{prob} <=> $a->{prob} } map {
{ word => $_, prob => $self->computePhi($args{topic}, $_) }
} keys %{$self->{word_map}};
return \@words;
}
=head3 topicsPerDocument
description:
Creates an array of topics in each document
input:
%args -> hash containing document
output:
@topics -> Array containing topics and probabilities (theta value) for $args{document}
example:
my $topics_on_document= topicsPerDocument(document => $doc);
=cut
sub topicsPerDocument
( run in 0.656 second using v1.01-cache-2.11-cpan-140bd7fdf52 )