Alvis-Bags
view release on metacpan or search on metacpan
bin/linkBags view on Meta::CPAN
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
use POSIX;
use Encode;
use Alvis::URLs;
use IO::Handle;
###################### CONFIGURATION #####################
# ensure sort handles UTF8 order
my $SORTCODE = "LC_ALL=en_US.UTF-8; export LC_ALL;" ;
############ END CONFIGURATION ######################
# autoflush STDERR so progress messages show up immediately
STDERR->autoflush(1);
# encoding pragmas follow any includes like "use"
# "use encoding 'utf8'" was deprecated in Perl 5.18 and is a fatal error
# from Perl 5.26 on.  Its effects are reproduced explicitly instead:
# "use utf8" declares the source itself to be UTF-8, and the binmode
# calls below put the UTF-8 layer on the standard handles (the pragma
# used to do this implicitly for STDIN and STDOUT).
use utf8;
# default UTF-8 layer for every handle opened in this file
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
# command line inputs
my $gzip = 0;       # --gzip: compress the .txtbag file at the end
my $stem = "";      # STEM argument: common prefix of the files read/written
my $file = "";      # LINK-FILE argument: input file of links and tokens
my $linktext = 0;   # --linktext flag
my $titletext = 0;  # --titletext flag
my $update = 0;     # --update: adds "update" to the generated .cnf file
# shared vars
my $doccount = 0;   # number of documents, written into the .txtbag header
my $featcount = 0;  # number of features, written into the .txtbag header
my %featmap = (); # maps feature hash to feature index
my %docfeat = (); # true if its a doc/internal link
#################################################################
#
# Build routines
#
#################################################################
# table(TYPE, TOKEN, BAG)
#
# Count one occurrence of a feature in a document bag; the feature is
# discarded if it doesn't hash to something known.
#
#   TYPE   feature type string, e.g. "link" or "text"
#   TOKEN  the feature's token string
#   BAG    hash reference keyed by feature hash; the matching count is
#          incremented in place
#
# The feature is identified by hashing the string "TYPE TOKEN" with
# Alvis::URLs::easyhash64char.  A hash flagged in %docfeat is counted only
# when TYPE is "link"; any other hash is counted only when it has a true
# entry in %featmap.
#
# Declared without a prototype: the routine takes three arguments, so an
# empty "()" prototype would be wrong and would only be tolerated by
# &-style calls.  Both table(...) and &table(...) work.
sub table {
    my ( $tp, $tk, $ft ) = @_;

    # The &-form is used deliberately: it ignores any prototype the
    # external routine may declare.
    my $h = &Alvis::URLs::easyhash64char("$tp $tk");

    if ( $docfeat{$h} ) {
        # document/internal-link feature: only link occurrences count
        $ft->{$h}++ if $tp eq "link";
    }
    elsif ( $featmap{$h} ) {
        # NOTE(review): this is a truthiness test, so a feature stored in
        # %featmap with the value 0 would be discarded -- confirm that
        # stored feature indices are never 0.
        $ft->{$h}++;
    }
    return;
}
# tabletext(TEXT, BAG)
#
# Tokenise a piece of text and count every token as a "text" feature.
# Copied from linkTables.pl, with changes.
#
#   TEXT   string to tokenise (a private copy is modified, not the
#          caller's variable)
#   BAG    hash reference handed through to table() for counting
#
# ASCII punctuation is replaced by spaces, whitespace runs are collapsed,
# and each remaining word is lower-cased before being passed to table().
#
# Declared without a prototype: the routine takes two arguments, so an
# empty "()" prototype would be wrong and would only be tolerated by
# &-style calls.
sub tabletext {
    my ( $tw, $ft ) = @_;

    # strip punctuation
    $tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
    # collapse whitespace runs, then drop the single leading/trailing blank
    $tw =~ s/\s+/ /g;
    $tw =~ s/^\s//;
    $tw =~ s/\s$//;

    foreach my $word ( split( /\s/, $tw ) ) {
        # lower case by default; the &-form keeps the call valid whether
        # or not table() carries a prototype
        &table( "text", lc($word), $ft );
    }
    return;
}
bin/linkBags view on Meta::CPAN
}
close(FEATS);
print STDERR "Loading feature map, size = " . %featmap . ".\n";
print STDERR "Loading document map, size = " . %docfeat . ".\n";
open(PAR,"grep '^maxdoc=' $stem.srcpar |");
my $par = <PAR>;
chomp($par);
close(PAR);
$par =~ s/.*=//;
$doccount = int($par);
print STDERR "Loaded $stem with $doccount docs and $featcount features\n";
}
#################################################################
#
# Run
#
#################################################################
# Parse the command line; an unknown or malformed option aborts with the
# usage message instead of being silently ignored.
GetOptions(
    'man'       => sub { pod2usage( -exitstatus => 0, -verbose => 2 ) },
    'update'    => \$update,
    'linktext'  => \$linktext,
    'titletext' => \$titletext,
    'noclean'   => \$Alvis::URLs::noclean,
    'nocase'    => \$Alvis::URLs::nocase,
    'gzip'      => \$gzip,
    'h|help'    => sub { pod2usage(1) }
) or pod2usage(2);
pod2usage( -message => "ERROR: need input file and stem" )
    if ( $#ARGV != 1 );
$file = shift();
$stem = shift();

print STDERR "\nNow build the document text bag file\n";
print STDERR "======================================\n";
&LoadTables();

# The bareword handles IN and OUT are kept on purpose: MakeBags() is called
# without arguments, so it presumably reads IN and writes OUT directly
# (its body is not visible here -- confirm before switching to lexicals).
my $tfile = "$stem.txtbag";
if ( -f $tfile ) {
    # existing bag file: append the new documents
    open( OUT, '>>', $tfile ) or die "Cannot append to $tfile: $!";
}
else {
    open( OUT, '>', $tfile ) or die "Cannot create $tfile: $!";
    # first time write header, correct it later
    printf OUT "%8d\n%8d\n", $doccount, $featcount;
}

# bags generated with the document number to assist sorting;
# sorts the bags prior to saving, and cut out the document number
if ( $file eq '-' ) {
    # '-' means standard input; the two-argument "<-" form is what the
    # original "<$file" open expanded to in this case
    open( IN, '<-' ) or die "Cannot read standard input: $!";
}
else {
    open( IN, '<', $file ) or die "Cannot open $file: $!";
}
&MakeBags();
close(IN);
close(OUT) or die "Error writing $tfile: $!";

# we want to write to the front of the file,
# without destroying rest of contents, this open does it
sysopen( OUT, $tfile, O_RDWR ) or die "Cannot reopen $tfile: $!";
my $oldfh = select(OUT); $| = 1; select($oldfh);
# now, rewrite header (same fixed width as the placeholder written above,
# so the bag data that follows is not disturbed)
printf OUT "%8d\n%8d\n", $doccount, $featcount;
close(OUT) or die "Error rewriting header of $tfile: $!";

print STDERR "\nNow build the document bags with mpdata\n";
print STDERR "=========================================\n";
my $cnffile = "$stem.cnf";
open( OUT, '>', $cnffile ) or die "Cannot create $cnffile: $!";
print OUT "#\nindexdensity=10\ninput=\"$stem.txtbag\"\nbaggedinput\n";
print OUT "update\n" if $update;
close(OUT) or die "Error writing $cnffile: $!";

# List-form system() bypasses the shell, so a stem containing spaces or
# shell metacharacters is passed to mpdata unchanged.
my $status = system( "mpdata", $stem );
if ( $status == -1 ) {
    print STDERR "mpdata error, could not be run: $!\n";
    exit(1);
}
elsif ( $status & 127 ) {
    print STDERR "mpdata error, killed by signal " . ( $status & 127 ) . "\n";
    exit(1);
}
elsif ( $status >> 8 ) {
    # any non-zero exit status is a failure, not just status 1
    print STDERR "mpdata error, status: " . ( $status >> 8 ) . "\n";
    exit(1);
}

if ( $gzip ) {
    # compression is a convenience step: report a failure but do not
    # turn an otherwise successful run into an error
    system( "gzip", "$stem.txtbag" ) == 0
        or print STDERR "gzip of $stem.txtbag failed\n";
}
exit 0;
__END__
=head1 NAME
linkBags - input file of links and tokens for document set,
plus tables generated with
I<linkTables>, to produce forward bags.
=head1 SYNOPSIS
linkBags [--gzip|--linktext|--nocase|--noclean|--titletext|--update] LINK-FILE STEM
Options:
LINK-FILE Filename for input link file usually created by XSL.
STEM Stem for output file, several extensions read and made.
--gzip compress the STEM.txtbag file with gzip after mpdata has run
--linktext add link text to the bag
--nocase set nocase flag in Alvis::URLs
--noclean set noclean flag in Alvis::URLs
--titletext add title text to the bag
--update indicate to the output config file that this is an update
-h, --help display help message and exit.
--man print man page and exit.
=head1 DESCRIPTION
This package works in conjunction with an XSL script which is used to
generate a text file giving URL+title+link+tag information for the input XML
files. Use the name '-' to read the input from stdin. The final output files are created
when the MPCA
I<mpdata>(1) utility is called.
Input file of links and tags is assumed to be in UTF-8 encoding
in the format given in
I<linkTables>(1).
Separate
tables (
( run in 2.601 seconds using v1.01-cache-2.11-cpan-0d23b851a93 )