Alvis-Bags
view release on metacpan or search on metacpan
bin/linkTables view on Meta::CPAN
#!/usr/bin/perl -w
use strict;
use POSIX;
use HTML::Entities;
use Alvis::URLs;
use Getopt::Long;
use Pod::Usage;
# Encoding setup; these pragmas follow any includes like "use".
# The "encoding" pragma was deprecated in Perl 5.18 and its default mode is
# a fatal error from Perl 5.26 on, so its effects are spelled out directly:
# "use utf8" declares the source text as UTF-8, and STDOUT gets an explicit
# UTF-8 layer (STDIN was already handled below).
use utf8;
# default UTF-8 layer for handles opened without an explicit layer
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
# Locale-forcing shell snippet that makes sort handle UTF-8 order
# (presumably prepended to shell commands elsewhere in the script).
my $SORTCODE = 'LC_ALL=en_US.UTF-8; export LC_ALL;';
# Defaults for the command-line options.
my $MINCOUNT = 1;
my ($linktext, $titletext) = (0, 0);
my $stopfile = '';
# set this to fix everything but .docs
my $fixdocs = 0;
# Stop words, keyed by the lower-cased word.
my %stops;
# Check options.  GetOptions returns false when it meets an unknown or
# malformed option; in that case print the usage text and exit non-zero
# instead of carrying on with a partially parsed command line.
GetOptions(
    'man'        => sub { pod2usage(-exitstatus => 0, -verbose => 2) },
    'stopfile=s' => \$stopfile,
    'mincount=i' => \$MINCOUNT,
    'docs'       => \$fixdocs,
    'linktext'   => \$linktext,
    'titletext'  => \$titletext,
    'noclean'    => \$Alvis::URLs::noclean,
    'nocase'     => \$Alvis::URLs::nocase,
    'h|help'     => sub { pod2usage(1) }
) or pod2usage(2);

# Exactly two positional arguments must remain: the input file and the stem.
pod2usage(-message => "ERROR: need input file and stem")
    if ( $#ARGV != 1 );
my $file = shift(@ARGV);
my $stem = shift(@ARGV);
# Counters, both starting at zero (presumably documents and features seen;
# their use lies outside this excerpt).
my ($doccount, $featcount) = (0, 0);
# cleaned URL's hash -> docID
my %docmap;
# docID -> sequence number
my %docid;
# token value plus count
my %token;
my %tokencnt;
# Load the optional stop-word list: one word per line, each stored
# lower-cased as a key of %stops.
if ( $stopfile ) {
    # Three-argument open on a lexical handle, so characters in the file
    # name cannot be taken as an open mode; die on failure rather than
    # silently continuing with an empty stop list.
    open(my $stopfh, '<', $stopfile)
        or die "Cannot open stop file '$stopfile': $!\n";
    while ( defined(my $word = <$stopfh>) ) {
        chomp($word);
        $stops{lc($word)} = 1;
    }
    close($stopfh);
}
sub tabletext() {
my $tw = $_[0];
# strip punctuation
$tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
# break at spaces
$tw =~ s/\s+/ /g;
$tw =~ s/^\s//;
$tw =~ s/\s$//;
foreach my $k ( split(/ /,$tw) ) {
( run in 1.455 second using v1.01-cache-2.11-cpan-437f7b0c052 )