Alvis-Bags
view release on metacpan or search on metacpan
bin/linkBags view on Meta::CPAN
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
use POSIX;
use Encode;
use Alvis::URLs;
use IO::Handle;
###################### CONFIGURATION #####################
# ensure sort handles UTF8 order
# Shell fragment that exports a UTF-8 locale; presumably prefixed to an
# external "sort" command later in the file -- verify against its users.
my $SORTCODE = "LC_ALL=en_US.UTF-8; export LC_ALL;" ;
############ END CONFIGURATION ######################

# autoflush STDERR so diagnostics appear immediately
STDERR->autoflush(1);

# encoding pragmas follow any includes like "use"
# The "encoding" pragma used here originally is deprecated and no longer
# supported by Perl 5.26 and later.  "use utf8" declares the source as
# UTF-8, and STDOUT (which the old pragma switched implicitly) is now
# set up explicitly alongside STDIN and STDERR.
use utf8;
use open ':utf8';
binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# command line inputs (defaults; NOTE(review): presumably overridden by
# Getopt::Long option parsing further down the file)
my $gzip      = 0;
my $stem      = "";
my $file      = "";
my $linktext  = 0;
my $titletext = 0;
my $update    = 0;

# shared vars
my $doccount  = 0;
my $featcount = 0;
my %featmap   = ();    # maps feature hash to feature index
my %docfeat   = ();    # true if its a doc/internal link
#################################################################
#
# Build routines
#
#################################################################
# Count one occurrence of a feature in a bag.
# Discards the feature if it doesn't hash to something known.
#
# Arguments:
#   1. feature type, e.g. "link" or "text"
#   2. feature token
#   3. hash reference for the bag; the entry keyed by the feature's
#      hash code is incremented when the feature is accepted
#
# The feature is hashed as "<type> <token>".  If the hash code is
# flagged in %docfeat, only features of type "link" are counted;
# otherwise the feature is counted when %featmap holds a true value
# for the hash code.  Returns nothing.
#
# NOTE(review): the %featmap test is a truth test, so a feature whose
# stored index is 0 would be discarded -- confirm indices start at 1.
sub table {
    my ( $tp, $tk, $ft ) = @_;

    # Called with "&" as in the original so that any prototype declared
    # on the helper is bypassed.
    my $h = &Alvis::URLs::easyhash64char("$tp $tk");

    if ( $docfeat{$h} ) {
        $ft->{$h}++ if $tp eq "link";
    }
    elsif ( $featmap{$h} ) {
        $ft->{$h}++;
    }
    return;
}
# copied from linkTables.pl, with changes
#
# Split a piece of text into words and table each one as a "text"
# feature.
#
# Arguments:
#   1. the text to tokenise (undef is treated as "no text")
#   2. hash reference for the bag, passed through to table()
#
# ASCII punctuation is replaced by spaces, the text is split on runs of
# whitespace, and every word is lower-cased before being tabled.
# Returns nothing.
sub tabletext {
    my ( $tw, $ft ) = @_;
    return if !defined $tw;

    # strip punctuation
    $tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;

    # break at spaces: split ' ' skips leading whitespace and splits on
    # whitespace runs, so no empty words are produced
    foreach my $k ( split ' ', $tw ) {
        # lower case by default; "&" call form keeps working even while
        # table() is declared with an empty prototype
        &table( "text", lc($k), $ft );
    }
    return;
}
# Processes the file of links read through filehandle IN.
# Outputs the bags to filehandle OUT.
# The output is a bagged version to make it shorter, but it's pretty
# complex.
sub MakeBags() {
( run in 0.951 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )