Alvis-Bags

 view release on metacpan or  search on metacpan

bin/linkBags  view on Meta::CPAN

#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use Pod::Usage;
use POSIX;
use Encode;
use Alvis::URLs;
use IO::Handle; 

###################### CONFIGURATION #####################

#  ensure sort handles UTF8 order
#  (shell fragment that sets and exports a UTF-8 locale; presumably
#  prepended to a sort command elsewhere in the script -- confirm)
my $SORTCODE = "LC_ALL=en_US.UTF-8; export LC_ALL;" ;

############ END CONFIGURATION ######################

#  autoflush STDERR (IO::Handle is loaded with the other modules above)
STDERR->autoflush(1);

# encoding pragmas follow any includes like "use"
# "use encoding 'utf8'" is deprecated since Perl 5.18 and croaks from
# Perl 5.26 on.  "use utf8" declares the source text as UTF-8, and the
# STDOUT layer that the old pragma installed implicitly is now set
# explicitly next to the STDIN and STDERR ones.
use utf8;
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

#  command line inputs 
my $gzip = 0;
my $stem = "";
my $file = "";
my $linktext = 0;
my $titletext = 0;
my $update = 0;

# shared vars
my $doccount = 0;
my $featcount = 0;
my %featmap = ();        #  maps feature hash to feature index
my %docfeat = ();        #  true if it's a doc/internal link

#################################################################
#
#  Build routines
#
#################################################################

#   Counts one occurrence of the feature ($tp, $tk) in the bag $ft;
#   discards feature if it doesn't hash to something known.
#
#     $tp  feature type; "link" is the only type given special treatment
#     $tk  feature token
#     $ft  hash ref of counts keyed by feature hash, updated in place
#
#   The key is Alvis::URLs::easyhash64char applied to "$tp $tk".
#   A hash flagged in %docfeat is counted only when the type is "link";
#   any other hash is counted only if %featmap holds a true value for it.
#   Returns nothing.
#
#   Declared without a prototype: the sub takes three arguments, and the
#   former empty prototype made it callable only through the & form.
sub table {
  my ($tp, $tk, $ft) = @_;
  #  & form kept so the call works whatever prototype the helper declares
  my $h = &Alvis::URLs::easyhash64char("$tp $tk");
  if ( $docfeat{$h} ) {
    $ft->{$h}++ if $tp eq "link";
  } elsif ( $featmap{$h} ) {
    #  NOTE(review): this is a truth test, so a feature whose mapped
    #  value is 0 would be discarded -- confirm indices never are 0
    $ft->{$h}++;
  }
  return;
}

#  copied from linkTables.pl, with changes
#
#  Splits the text $tw into words and passes each one, lower-cased, to
#  table() as a "text" feature.
#
#    $tw  text to tokenise; an undefined value is treated as no words
#    $ft  hash ref of feature counts, handed through to table() unchanged
#
#  Returns nothing.
#
#  Declared without a prototype: the sub takes two arguments, and the
#  former empty prototype made it callable only through the & form.
sub tabletext {
  my ($tw, $ft) = @_;
  return unless defined $tw;
  #  strip punctuation
  $tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
  #  break at spaces; split ' ' skips leading whitespace and treats each
  #  run of whitespace as one separator, so no collapsing or trimming
  #  pass is needed first
  foreach my $word ( split(' ', $tw) ) {
    #  lower case by default
    #  (& form bypasses any prototype declared on table)
    &table("text", lc($word), $ft);
  }
  return;
}

#  Processes the file of links read through filehandle IN.
#  Outputs the bag to filehandle OUT.
#  Outputs a bagged version to make it shorter, but it's pretty
#  complex.
sub MakeBags() {



( run in 0.951 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )