Algorithm-LDA
view release on metacpan or search on metacpan
lib/Algorithm/LDA.pm view on Meta::CPAN
{
print "Loading Document $documentNum ($filename): Corpus ";
print (($documentNum + 1) * 100 / $totalDocs);
print "% completed...\n";
open(FILE, "$docs/$filename")
|| die "Could not open file '$docs/$filename' $!";
#Load file into single string, remove stopwords, and split by whitespace into array
my $document = do { local $/; lc(<FILE>)};
$document =~ s/($regex)//g;
my @temp = split(/\s+/, $document);
#remove all special characters and add to @words and @documents1
for my $i (0..scalar @temp - 1)
{
$temp[$i] = removeSpecialChars($temp[$i]);
}
push(@words, @temp);
$documents1[$documentNum] = join(" ", @temp);
$documentNum++;
}
chomp @words;
@words = grep {$_ if $_ } @words;
%vocabulary = map { $words[$_]=>$_ } (0..$#words);
$V = scalar keys %vocabulary;
$v = $V-1;
#Loop through @documents1, remove special characters and populate @documents
for my $text (@documents1)
{
my @ws = split(/\s+/, $text);
chomp @ws;
@ws = map { removeSpecialChars($_) } @ws;
@ws = grep { exists $vocabulary{$_} } @ws;
@ws = uniq(@ws);
push @documents, \@ws;
}
print "Vocabulary (Uncleaned): $V\n";
#Get word frequencies
for my $d (0..$#documents)
{
for my $wrd (@{$documents[$d]})
{
next unless exists $vocabulary{$wrd};
$map{$wrd}=0 unless exists $map{$wrd};
$map{$wrd}++;
}
}
#Remove words that appear in more than half of the corpus, and less than $threshold documents
#Also remove words of less than three letters
my $D = @documents;
for my $wd (0..$#words)
{
my $times = $map{$words[$wd]};
my $test = ($times > 0.5*$D || $times<=$threshold || length($words[$wd]) <=3);
if($test)
{
$words[$wd]=0;
}
}
#Repopulate %vocabulary with cleaned words
@words = grep { $_ } (@words);
@words = uniq(@words);
%vocabulary = map { $words[$_] => $_ } (0..$#words);
$V = scalar keys %vocabulary;
$v = $V-1;
print "Vocabulary (Cleaned): $V\n";
#Convert words to hashmap (for use of "exists") and remove unclean
# words from documents array
my %h;
@h{@words} = ();
for my $i (0..$#documents)
{
@{$documents[$i]} = grep{exists $h{$_}} @{$documents[$i]};
}
open(my $fh, '>', "JSON") or die "Could not open file 'JSON' $!";
foreach my $i (@documents)
{
print $fh "{\"data\":[\"" . join("\", \"", @{$i})."\"]}\n";
}
close $fh;
open(my $fh2, '<', "JSON") or die "Could not open file 'JSON' $!";
while (my $line = <$fh2>) {
my $obj = decode_json($line);
add(%$obj);
}
close $fh2;
}
=head3 wordsPerTopic
description:
Creates an array of words in each topic
input:
%args -> hash containing topic
output:
@words -> Array containing words and probabilities (phi value) for $args{topic}
example:
lib/Algorithm/LDA.pm view on Meta::CPAN
$stop_regex -> regex containing stopwords
example:
my $stop = stop();
my $regex = qr/($stop)/;
=cut
#STOPWORD SUBROUTINE
sub stop
{
my $stop_regex = "";
my $stop_mode = "AND";
open ( STP, $stop ) ||
die ("Couldn't open the stoplist file $stop\n");
while ( <STP> )
{
chomp;
if(/\@stop.mode\s*=\s*(\w+)\s*$/)
{
$stop_mode=$1;
if(!($stop_mode=~/^(AND|and|OR|or)$/))
{
print STDERR "Requested Stop Mode $1 is not supported.\n";
exit;
}
next;
}
# accepting Perl Regexs from Stopfile
s/^\s+//;
s/\s+$//;
#handling a blank lines
if(/^\s*$/) { next; }
#check if a valid Perl Regex
if(!(/^\//))
{
print STDERR "Stop token regular expression <$_> should start with '/'\n";
exit;
}
if(!(/\/$/))
{
print STDERR "Stop token regular expression <$_> should end with '/'\n";
exit;
}
#remove the / s from beginning and end
s/^\///;
s/\/$//;
#form a single big regex
$stop_regex.="(".$_.")|";
}
if(length($stop_regex)<=0)
{
print STDERR "No valid Perl Regular Experssion found in Stop file $stop";
exit;
}
chop $stop_regex;
# making AND a default stop mode
if(!defined $stop_mode)
{
$stop_mode="AND";
}
close STP;
return $stop_regex;
}
1;
__END__
=head1 REFERENCING
If you have a reference paper for this module put it here in bibtex form
=head1 CONTACT US
If you have any trouble installing and using <module name>
please contact us via :
Bridget T. McInnes: btmcinnes at vcu.edu
=head1 SEE ALSO
Additional modules associated with the package
=head1 AUTHORS
Nick Jordan, Virginia Commonwealth University
Bridget McInnes, Virginia Commonwealth University
=head1 COPYRIGHT AND LICENSE
Copyright 2016 by Bridget McInnes, Nicholas Jordan
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to
The Free Software Foundation, Inc.,
( run in 1.653 second using v1.01-cache-2.11-cpan-140bd7fdf52 )