Algorithm-LDA

 view release on metacpan or  search on metacpan

lib/Algorithm/LDA.pm  view on Meta::CPAN

    {
	print "Loading Document $documentNum ($filename): Corpus ";
	print (($documentNum + 1) * 100 / $totalDocs);
	print "% completed...\n";
	open(FILE, "$docs/$filename")
	    || die "Could not open file '$docs/$filename' $!";
	
	#Load file into single string, remove stopwords, and split by whitespace into array
	my $document = do { local $/; lc(<FILE>)};
	$document =~ s/($regex)//g;
	my @temp = split(/\s+/, $document);
	
	#remove all special characters and add to @words and @documents1
	for my $i (0..scalar @temp - 1)
	{
	    $temp[$i] = removeSpecialChars($temp[$i]);
	}
	push(@words, @temp);
	
	$documents1[$documentNum] = join(" ", @temp);
	$documentNum++;
    }
    

    chomp @words;
    @words = grep {$_ if $_ } @words;
    %vocabulary = map {   $words[$_]=>$_   } (0..$#words);
    $V = scalar keys %vocabulary;
    $v = $V-1;
    
    #Loop through @documents1, remove special characters and populate @documents
    for my $text (@documents1) 
    {
        my @ws = split(/\s+/, $text);
        chomp @ws;
        @ws = map { removeSpecialChars($_) } @ws;
        @ws  = grep {  exists $vocabulary{$_} } @ws;
        @ws = uniq(@ws);
        push @documents, \@ws;
    }
    
    print "Vocabulary (Uncleaned): $V\n";
    
    #Get word frequencies
    for my $d (0..$#documents) 
    {
        for my $wrd (@{$documents[$d]}) 
        {
            next unless exists $vocabulary{$wrd};
            $map{$wrd}=0 unless exists $map{$wrd};
            $map{$wrd}++; 
        }
    }
    
    #Remove words that appear in more than half of the corpus, and less than $threshold documents
    #Also remove words of less than three letters
    my $D = @documents;
    for my $wd (0..$#words) 
    {
        my $times = $map{$words[$wd]};
        my $test = ($times > 0.5*$D  || $times<=$threshold || length($words[$wd]) <=3);    
	
        if($test)   
        {   
            $words[$wd]=0;
        }
    }
    
    #Repopulate %vocabulary with cleaned words
    @words = grep { $_ } (@words);
    @words = uniq(@words);
    %vocabulary = map { $words[$_] => $_ } (0..$#words);
    $V = scalar keys %vocabulary;
    $v = $V-1;
    
    print "Vocabulary (Cleaned): $V\n";
    
    
    #Convert words to hashmap (for use of "exists") and remove unclean 
    # words from documents array
    my %h;
    @h{@words} = ();
    for my $i (0..$#documents)
    {
        @{$documents[$i]} = grep{exists $h{$_}} @{$documents[$i]};
    }

    
    open(my $fh, '>', "JSON") or die "Could not open file 'JSON' $!";
	
    foreach my $i (@documents)
    {
	print $fh "{\"data\":[\"" . join("\", \"", @{$i})."\"]}\n";
    }
    close $fh;
	
   
        open(my $fh2, '<', "JSON") or die "Could not open file 'JSON' $!";
        while (my $line = <$fh2>) {
            my $obj = decode_json($line);
            add(%$obj);
        }
        close $fh2;
}


=head3 wordsPerTopic
    
  description:
    
 Creates an array of words in each topic

input:   

 %args -> hash containing topic

output:

 @words -> Array containing words and probabilities (phi value) for $args{topic}

example:

lib/Algorithm/LDA.pm  view on Meta::CPAN

 $stop_regex -> regex containing stopwords

example:

 my $stop = stop();
 my $regex = qr/($stop)/;

=cut

#STOPWORD SUBROUTINE
sub stop 
{ 
    my $stop_regex = "";
    my $stop_mode = "AND";

    open ( STP, $stop ) ||
        die ("Couldn't open the stoplist file $stop\n");
    
    while ( <STP> ) 
    {
	chomp; 
	
	if(/\@stop.mode\s*=\s*(\w+)\s*$/) 
        {
	    $stop_mode=$1;
	    if(!($stop_mode=~/^(AND|and|OR|or)$/)) 
            {
		print STDERR "Requested Stop Mode $1 is not supported.\n";
		exit;
	    }
	    next;
	} 
	
	# accepting Perl Regexs from Stopfile
	s/^\s+//;
	s/\s+$//;
	
	#handling a blank lines
	if(/^\s*$/) { next; }
	
	#check if a valid Perl Regex
        if(!(/^\//)) 
        {
	    print STDERR "Stop token regular expression <$_> should start with '/'\n";
	    exit;
        }
        if(!(/\/$/)) 
        {
	    print STDERR "Stop token regular expression <$_> should end with '/'\n";
	    exit;
        }

        #remove the / s from beginning and end
        s/^\///;
        s/\/$//;
        
	#form a single big regex
        $stop_regex.="(".$_.")|";
    }

    if(length($stop_regex)<=0) 
    {
	print STDERR "No valid Perl Regular Experssion found in Stop file $stop";
	exit;
    }
    
    chop $stop_regex;
    
    # making AND a default stop mode
    if(!defined $stop_mode) 
    {
	$stop_mode="AND";
    }
    
    close STP;
    
    return $stop_regex; 
}

1;

__END__

=head1 REFERENCING

    If you have a reference paper for this module put it here in bibtex form

=head1 CONTACT US

  If you have any trouble installing and using <module name> 
  please contact us via :

      Bridget T. McInnes: btmcinnes at vcu.edu

=head1 SEE ALSO

Additional modules associated with the package

=head1 AUTHORS

  Nick Jordan, Virginia Commonwealth University 

  Bridget McInnes, Virginia Commonwealth University

=head1 COPYRIGHT AND LICENSE

Copyright 2016 by Bridget McInnes, Nicholas Jordan

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to 

 The Free Software Foundation, Inc.,



( run in 1.653 second using v1.01-cache-2.11-cpan-140bd7fdf52 )