Algorithm-LDA
view release on metacpan or search on metacpan
lib/Algorithm/LDA.pm view on Meta::CPAN
# Algorithm::LDA
#
# Perl implementation of Latent Dirichlet Allocation (LDA)
#
# Copyright (c) 2016
#
# Bridget T McInnes, Virginia Commonwealth University
# bmcinnes at vcu.edu
#
# Nicholas Jordan, Virginia Commonwealth University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
=head1 NAME
Algorithm::LDA
=head1 SYNOPSIS
use Algorithm::LDA;
my $lda = new Algorithm::LDA("Data", 5, 100, 100, 0, 10, 0.1, 10, "stoplist.txt");
=head1 DESCRIPTION
Algorithm::LDA is a Perl implementation of Latent Dirichlet Allocation (LDA).
=cut
package Algorithm::LDA;

# Package preamble: minimum Perl version, strictures, numeric constants,
# base class, helper imports, version number and generated accessors.
use 5.006;
use strict;
# NOTE(review): FATAL => 'all' makes every warning raised in this file's
# scope die. Kept as-is because the rest of the module may rely on it, but
# it is worth reconsidering for library code.
use warnings FATAL => 'all';

# Mathematical constants, evaluated once when the module is compiled.
use constant pi => 4*atan2(1, 1);
use constant e => exp(1);

# Class::Accessor::Fast provides mk_accessors, used at the end of this block.
use parent qw/Class::Accessor::Fast/;
use List::Util qw(shuffle sum max);
use List::MoreUtils qw(uniq first_index);
use JSON::XS;

# Package version, declared with `our` instead of the obsolete `use vars`.
our $VERSION = '0.03';

# Generates the accessor used for $self->documents.
__PACKAGE__->mk_accessors(qw/documents/);
# $documents - Data directory (TXT files)
# $stop - Stopword list (regex)
# $K - Number of Topics
# $k - $K-1 (for convenience)
# %vocabulary - hashmap containing words and IDs
# @words - array containing all words
# @documents - array of arrays of words in each document
# Doc1 = word1, word2, word3
# Doc2 = word4, word5, word6
# %map - hashmap used for getting word frequencies
# $V - vocabulary size
# $v - $V-1 (for convenience)
# @alpha - array of alpha values (parameter of topic distribution)
# @theta - array of theta values (topic distribution)
# @beta - array of beta values (parameter of word distribution)
# @phi - array of phi values (word distribution)
# $totalDocs - Total Documents (Only used for computing completeness when loading)
# $maxIterations - Maximum Iterations
# $updateCorpus - 1 = Force update documents, 0 = allow loading from JSON
# $threshold - Minimum number of documents a word must appear in
# $numWords - Number of words per topic
# $alpha - Default alpha value
# $documentNum - Number of documents
# File-scoped model state. Nothing is initialised here: every scalar starts
# out undefined and every array and hash starts out empty.

# Corpus inputs. $stop is the stopword list (regex). $data and $docs are not
# described in the legend above -- presumably the data directory and the
# loaded documents; confirm against their first use.
my ($data, $docs, $stop);

# Number of topics ($K) and $K-1 ($k, kept for convenience).
my ($K, $k);

# Hashmap of words and IDs, the array of all words, and the per-document
# arrays of words.
my (%vocabulary, @words, @documents);

# Hashmap used for getting word frequencies.
my %map;

# Vocabulary size ($V) and $V-1 ($v, kept for convenience).
my ($V, $v);

# @alpha: parameter of the topic distribution; @theta: topic distribution.
# @beta:  parameter of the word distribution;  @phi:   word distribution.
my (@alpha, @theta, @beta, @phi);

# Loading/run control: total documents (only used for computing completeness
# when loading), maximum iterations, and the force-update flag
# (1 = force update documents, 0 = allow loading from JSON).
my ($totalDocs, $maxIterations, $updateCorpus);
( run in 1.576 second using v1.01-cache-2.11-cpan-5b529ec07f3 )