Alvis-Bags
view release on metacpan or search on metacpan
bin/linkBags view on Meta::CPAN
# Keyed by feature hash code; a true value means the feature is a
# doc/internal link.
my %docfeat = ();
#################################################################
#
# Build routines
#
#################################################################
# table(TYPE, TOKEN, COUNTS)
#
# Counts one occurrence of the feature "TYPE TOKEN" in the hash referenced
# by COUNTS, keyed by the feature's hash code.  A feature is discarded if
# its hash code is known to neither %docfeat nor %featmap.  A hash code
# flagged in %docfeat is only counted when TYPE is "link".
#
# Invoke as &table(...): a plain table(...) call compiled after this
# definition would be rejected because of the empty prototype.
# NOTE(review): the lookups test truthiness, so a %featmap entry whose
# value is 0 or "" is treated as unknown -- confirm that is intended.
sub table() {
    my ( $kind, $token, $counts ) = @_;
    my $hash = &Alvis::URLs::easyhash64char("$kind $token");

    # Document hashes are restricted to links; any other feature only has
    # to be present in the feature map.
    my $known = $docfeat{$hash} ? $kind eq "link" : $featmap{$hash};
    $counts->{$hash}++ if $known;
}
# tabletext(TEXT, COUNTS)
#
# Splits TEXT into lower-cased words and records each word as a "text"
# feature through table(), which does the counting in the hash referenced
# by COUNTS.  Adapted from the routine of the same name in linkTables.pl.
# COUNTS is handed to table() unchanged, so a caller that omits it never
# sees the counts.
sub tabletext() {
    my ( $text, $counts ) = @_;

    for ($text) {
        # replace runs of punctuation with a single space
        s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
        # collapse whitespace runs, then trim one blank from each end
        s/\s+/ /g;
        s/^\s//;
        s/\s$//;
    }

    # words are lower-cased by default
    for my $word ( map { lc($_) } split( /\s/, $text ) ) {
        &table( "text", $word, $counts );
    }
}
# Processes the file of links read through filehandle IN.
# Outputs the bag to filehandle OUT.
# The output is a bagged version to make it shorter, but it's pretty
# complex.
sub MakeBags() {
while (($_=<IN>) ) {
chomp();
if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
my $titles = $3;
my %feats = ();
if ( $titletext ) {
&tabletext($titles);
}
# now process links
for ( $_=<IN>,chomp(); $_ ne "EOD" && $_ ne "EOL";
bin/linkBags view on Meta::CPAN
#################################################################
#
# Load routines
#
#################################################################
# Load up symbol table info.
# $doccount, $featcount
# %featmap (hashcode to feature number map)
sub LoadTables() {
open(FEATS,"<$stem.words");
# load up the mappings, precomputed
%featmap = ();
$featcount = 0;
while ( ($_=<FEATS>) ) {
chomp();
my @a = split();
$featmap{$a[2]} = $a[0];
if ( $a[1] eq "doc" ) {
$docfeat{$a[2]} = 1;
bin/linkRedir view on Meta::CPAN
# NOTE(review): $usezip and $init are only initialised here; their uses are
# outside this excerpt.
my $usezip = "";
my $init = 0;
# Both hash tables below only store URLs cleaned with StandardURL().
# %redirect holds a single redirect for a URL: it is indexed by the
# incoming URL and compared against the outgoing URL in updateMaps().
my %redirect = ();
# %direct holds a space-delimited set of entries with the same target.
my %direct = ();
sub updateMaps() {
my $inu = shift();
my $outu = shift();
my $direct_add = "";
# no duplicates
if ( defined($redirect{$inu}) &&
$redirect{$inu} ne $outu ) {
print STDERR "Previous definition '$inu'->'" . $redirect{$inu}
. "' for line:\n $_\n";
# exit(1);
}
bin/linkTables view on Meta::CPAN
# Load the optional stop-word list: one word per line, stored lower-cased
# as keys of %stops.
if ( $stopfile ) {
    # Three-argument open keeps characters in $stopfile from being read as
    # an open mode, and a failed open is reported instead of silently
    # leaving %stops empty.
    open( my $stop_fh, "<", $stopfile )
        or die "Cannot open stop file '$stopfile': $!\n";
    # A lexical line variable avoids clobbering the global $_.
    while ( my $word = <$stop_fh> ) {
        chomp($word);
        $stops{ lc($word) } = 1;
    }
    close($stop_fh);
}
# tabletext(TEXT)
#
# Breaks TEXT into lower-cased words and passes every word that has no
# defined entry in %stops to table() as a "text" entry.
sub tabletext() {
    my ($text) = @_;

    for ($text) {
        # replace runs of punctuation with a single space
        s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
        # collapse whitespace runs, then trim one blank from each end
        s/\s+/ /g;
        s/^\s//;
        s/\s$//;
    }

    # words are lower-cased by default
    for my $word ( map { lc($_) } split( / /, $text ) ) {
        next if defined( $stops{$word} );
        &table( "text", $word );
    }
}
# ensure to make "link" entries dominate, they should never be
# dropped in favor of non-link entries
sub table() {
my $tp = $_[0];
my $text = $_[1];
my $code = "$tp $text";
# print STDERR "Table $code\n";
my $h = &Alvis::URLs::easyhash64char($code);
if ( defined($token{$h}) ) {
if ( $token{$h} ne $code ) {
if ( defined($docmap{$h}) ) {
# documents always override
if ( $tp eq "link" ) {
lib/Alvis/URLs.pm view on Meta::CPAN
my $dig = md5_hex($string);
# print $dig . " \n";
return substr($dig,0,16);
}
# URL switches
# nocase: when true, StandardURL() lower-cases the URL.
$Alvis::URLs::nocase = 0;
# noclean: when 0, StandardURL() passes the URL through CleanURL().
$Alvis::URLs::noclean = 0;
# keepfrag: when false, CleanURL() removes the fragment from the URL.
$Alvis::URLs::keepfrag = 0;
# CleanURL(URL)
#
# Returns the result of URI's canonical() method for URL, with the
# fragment removed first unless $Alvis::URLs::keepfrag is true.
# Returns undef when URL is false (undef, "" or "0").
sub CleanURL() {
    my $raw = $_[0];
    if ( !$raw ) {
        # Kept as an explicit undef (rather than a bare return) so that
        # callers using list context still receive exactly one value.
        return undef;
    }
    # Direct method call instead of the ambiguous indirect object syntax
    # "new URI(...)".
    my $uri = URI->new($raw);
    if ( !$Alvis::URLs::keepfrag ) {
        $uri->fragment(undef);
    }
    return $uri->canonical;
}
# StandardURL(URL)
#
# Normalises URL according to the package switches: it is lower-cased when
# $Alvis::URLs::nocase is true, and passed through CleanURL() when
# $Alvis::URLs::noclean is 0.  Returns the resulting value.
sub StandardURL() {
    my $url = shift;
    $url = lc($url) if $Alvis::URLs::nocase;
    $url = &Alvis::URLs::CleanURL($url) if $Alvis::URLs::noclean == 0;
    return $url;
}
( run in 0.738 second using v1.01-cache-2.11-cpan-65fba6d93b7 )