Alvis-Bags

 view release on metacpan or  search on metacpan

bin/linkTables  view on Meta::CPAN

#!/usr/bin/perl -w

use strict;
use POSIX;
use HTML::Entities;
use Alvis::URLs;
use Getopt::Long;
use Pod::Usage;


# encoding pragmas follow any includes like "use"
#
# The "encoding" pragma is deprecated since Perl 5.18 and its default mode
# is no longer supported as of Perl 5.26, so its effects are spelled out
# explicitly instead:
#   - "use utf8" declares that this source file is UTF-8;
#   - STDIN and STDOUT get a UTF-8 layer via binmode (the pragma used to
#     set both of them implicitly).
use utf8;
# default UTF-8 layer for files opened in this lexical scope
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

#  shell prefix that exports a UTF-8 locale, to
#  ensure sort handles UTF8 order
my $SORTCODE = 'LC_ALL=en_US.UTF-8; export LC_ALL;';

#  defaults for the values filled in by GetOptions
my ( $linktext, $titletext ) = ( 0, 0 );
my $fixdocs  = 0;    # set this to fix everything but .docs 
my $MINCOUNT = 1;
my $stopfile = '';

#  lower-cased lines of $stopfile, used as a set
my %stops;


#  check options

#  Parse the command line into the option variables declared above.
#  GetOptions returns false when it meets an unknown or malformed option
#  (after printing its own warning); in that case stop with the usage
#  text rather than running with a partially applied configuration.
GetOptions(
      'man'       => sub {pod2usage(-exitstatus => 0, -verbose => 2)},
      'stopfile=s' => \$stopfile,
      'mincount=i' => \$MINCOUNT,
      'docs' => \$fixdocs,
      'linktext' => \$linktext,
      'titletext' => \$titletext,
      'noclean' => \$Alvis::URLs::noclean,
      'nocase' => \$Alvis::URLs::nocase,
      'h|help'       => sub {pod2usage(1)}
) or pod2usage(2);

#  exactly two positional arguments must remain: the input file and the stem
pod2usage(-message => "ERROR: need input file and stem")
      if ( @ARGV != 2 );

my $file = shift(@ARGV);
my $stem = shift(@ARGV);

#  both counters start at zero
my ( $doccount, $featcount ) = ( 0, 0 );

#  lookup tables, all initially empty:
#    %docmap             maps a cleaned URL's hash to a docID
#    %docid              maps a docID to a sequence number
#    %token, %tokencnt   token value plus count
my ( %docmap, %docid, %token, %tokencnt );

#  Load the optional stop-word list: every line of $stopfile, with its
#  newline removed and lower-cased, becomes a key of %stops.
if ( $stopfile ) {
  #  Three-argument open with a lexical handle, so the file name can never
  #  be read as an open mode.  A file that cannot be opened is fatal:
  #  the user asked for it explicitly, and continuing would silently
  #  run with an empty stop list.
  open(my $stopfh, '<', $stopfile)
    or die "Cannot open stopfile '$stopfile': $!\n";
  #  a named lexical keeps the global $_ untouched
  while ( defined(my $line = <$stopfh>) ) {
    chomp($line);
    $stops{lc($line)} = 1;
  }
  close($stopfh);
}

sub tabletext() {
  my $tw = $_[0];
  #  strip punctuation
  $tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
  #  break at spaces
  $tw =~ s/\s+/ /g; 
  $tw =~ s/^\s//; 
  $tw =~ s/\s$//; 
  foreach my $k ( split(/ /,$tw) ) {



( run in 1.455 second using v1.01-cache-2.11-cpan-437f7b0c052 )