App-WIoZ

 view release on metacpan or  search on metacpan

doc/freq.pl  view on Meta::CPAN

#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';

#
# Usage :
#  $ curl -s http:// .... .html | html2text > file.txt
#  $ ./freq.pl file.txt > words.txt
#

binmode(STDOUT, ":utf8");

# Print a one-line usage summary and terminate the program.
#
# The message is written to STDERR and the exit status is non-zero, so an
# invocation such as `./freq.pl > words.txt` neither puts the usage text
# into the output file nor reports success to the calling shell.
# Takes no arguments and never returns.
sub usage {
    say {*STDERR} './freq.pl file';
    exit 1;
}

# Load the list of stop words (words to be ignored when counting).
#
# Parameters:
#   $path - optional name of the stop-word file; defaults to
#           './stop-words-fr.txt', resolved against the current working
#           directory. The file is read through the :utf8 layer.
#
# Returns: the list of words found in the file. Every line containing a
#   '#' is skipped entirely (comment lines as well as lines with a trailing
#   comment); the remaining lines are split on non-word characters and
#   empty fields are dropped.
#
# Dies with the file name and the system error when the file cannot be
#   opened, instead of silently returning an empty list.
sub load_stopwords {
    my $path = shift // './stop-words-fr.txt';

    open my $fh, '<:utf8', $path
        or die "Cannot open stop-word file '$path': $!";

    my @SWords;
    while ( my $line = <$fh> ) {
        next if $line =~ /#/;

        # Leading or repeated separators yield empty fields; they can never
        # be real words, so they are filtered out here.
        push @SWords, grep { length } split /\W/, $line;
    }
    close $fh;

    return @SWords;
}


# Main program: count how often each word occurs in the file named on the
# command line and print one "word;count" line per word, most frequent first.
my $File = $ARGV[0];

# A missing or empty argument is a usage error; "0" is a legal file name.
usage() if !defined $File || $File eq '';

open my $F, '<:utf8', $File or die "Cannot open '$File': $!";

# Keep the stop words in a hash so that each candidate word costs a single
# exact lookup instead of a regex match against the whole list.
my %is_stopword = map { $_ => 1 } load_stopwords();

my %seen = ();
while ( my $line = <$F> ) {
    foreach my $word ( split /\W/, $line ) {

        # Splitting on \W already removed every punctuation character, so
        # only case folding is left to do.
        $word = lc $word;

        next if $word =~ /\d/;         # ignore anything containing a digit
        next if length($word) <= 2;    # ignore words of one or two characters
        next if $is_stopword{$word};

        $seen{$word}++;
    }
}
close $F;

# Highest count first; words with the same count are listed alphabetically
# so that the output is reproducible from run to run.
foreach my $word ( sort { $seen{$b} <=> $seen{$a} || $a cmp $b } keys %seen ) {
    say $word . ';' . $seen{$word};
}




( run in 0.502 second using v1.01-cache-2.11-cpan-39bf76dae61 )