Algorithm-LDA

 view release on metacpan or  search on metacpan

lib/Algorithm/LDA.pm  view on Meta::CPAN

# Algorithm::LDA
#
# Perl implementation of Latent Dirichlet Allocation (LDA)
#
# Copyright (c) 2016
#
# Bridget T McInnes, Virginia Commonwealth University 
# bmcinnes at vcu.edu
#
# Nicholas Jordan, Virginia Commonwealth University 
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to 
#
# The Free Software Foundation, Inc., 
# 59 Temple Place - Suite 330, 
# Boston, MA  02111-1307, USA.

=head1 NAME

Algorithm::LDA

=head1 SYNOPSIS

 use Algorithm::LDA;
 
 my $lda = Algorithm::LDA->new("Data", 5, 100, 100, 0, 10, 0.1, 10, "stoplist.txt");
 
=head1 DESCRIPTION

Algorithm::LDA is an implementation of Latent Dirichlet Allocation in Perl.

=cut

package Algorithm::LDA;


use strict;
use 5.006;
use strict;
use warnings FATAL => 'all';

use constant pi => 4*atan2(1, 1);
use constant e  => exp(1);
use parent qw/Class::Accessor::Fast/;
use List::Util qw(shuffle sum max);
use List::MoreUtils qw(uniq first_index);
use JSON::XS;


# Package version. Declared with `our` instead of the obsolete `use vars`
# pragma; `our` is available from Perl 5.6, which matches the minimum
# version this file requires.
our $VERSION = '0.03';


# Generate the `documents` accessor method on this class, so callers can
# read or set the value via $self->documents.
__PACKAGE__->mk_accessors('documents');


# $documents - Data directory (TXT files)
# $stop - Stopword list (regex)
# $K - Number of Topics
# $k - $K-1 (for convenience)
# %vocabulary - hashmap containing words and IDs
# @words - array containing all words
# @documents - array of arrays of words in each document
    # Doc1 = word1, word2, word3
    # Doc2 = word4, word5, word6
# %map - hashmap used for getting word frequencies

# $V - vocabulary size
# $v - $V-1 (for convenience)
# @alpha - array of alpha values (parameter of topic distribution)
# @theta - array of theta values (topic distribution)
# @beta - array of beta values (parameter of word distribution)
# @phi - array of phi values (word distribution)

# $totalDocs - Total Documents (Only used for computing completeness when loading)
# $maxIterations - Maximum Iterations
# $updateCorpus - 1 = Force update documents, 0 = allow loading from JSON
# $threshold - Minimum number of documents a word must appear in
# $numWords - Number of words per topic
# $alpha - Default alpha value

# $documentNum - Number of documents


# File-scoped lexicals, grouped by role.

# Corpus inputs. $stop is the stop-word list (regex). The roles of $data and
# $docs are not described in this file's variable notes -- TODO confirm
# against the code that assigns them.
my ($data, $docs, $stop);

# Number of topics ($K) and the convenience value $K-1 ($k).
my ($K, $k);

# Vocabulary and corpus containers: word => ID map, array of all words,
# array of per-document word arrays, and the map used for word frequencies.
my (%vocabulary, @words, @documents, %map);

# Vocabulary size ($V) and the convenience value $V-1 ($v).
my ($V, $v);

# Model arrays: @alpha / @beta are the parameters of the topic / word
# distributions; @theta / @phi are the topic / word distributions.
my (@alpha, @theta, @beta, @phi);

# Run settings: total document count (only used for computing completeness
# when loading), maximum iterations, and the corpus-update flag
# (1 = force update documents, 0 = allow loading from JSON).
my ($totalDocs, $maxIterations, $updateCorpus);



( run in 1.576 second using v1.01-cache-2.11-cpan-5b529ec07f3 )