Alvis-Bags
view release on metacpan or search on metacpan
bin/linkTables view on Meta::CPAN
#!/usr/bin/perl -w
use strict;
use POSIX;
use HTML::Entities;
use Alvis::URLs;
use Getopt::Long;
use Pod::Usage;
# encoding pragmas follow any includes like "use"
use encoding 'utf8';
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDERR, ":utf8";
# ensure sort handles UTF8 order
my $SORTCODE = "LC_ALL=en_US.UTF-8; export LC_ALL;" ;
my $MINCOUNT = 1;
my $linktext = 0;
my $titletext = 0;
my $stopfile = "";
my $fixdocs = 0; # set this to fix everything but .docs
my %stops = ();
# check options
GetOptions(
'man' => sub {pod2usage(-exitstatus => 0, -verbose => 2)},
'stopfile=s' => \$stopfile,
'mincount=i' => \$MINCOUNT,
'docs' => \$fixdocs,
'linktext' => \$linktext,
'titletext' => \$titletext,
'noclean' => \$Alvis::URLs::noclean,
'nocase' => \$Alvis::URLs::nocase,
'h|help' => sub {pod2usage(1)}
);
pod2usage(-message => "ERROR: need input file and stem")
if ( $#ARGV != 1 );
my $file = shift();
my $stem = shift();
my $doccount = 0;
my $featcount = 0;
# maps a cleaned URL's hash to a docID
my %docmap = ();
# maps a docID to a sequence number
my %docid = ();
# token value plus count
my %token = ();
my %tokencnt = ();
if ( $stopfile ) {
open(S,"<$stopfile");
while ( ($_=<S>) ) {
chomp();
$stops{lc($_)} = 1;
}
close(S);
}
sub tabletext() {
my $tw = $_[0];
# strip punctuation
$tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
# break at spaces
$tw =~ s/\s+/ /g;
$tw =~ s/^\s//;
$tw =~ s/\s$//;
foreach my $k ( split(/ /,$tw) ) {
# lower case by default
$k = lc($k);
if ( ! defined($stops{$k}) ) {
&table("text",$k);
}
}
}
# ensure to make "link" entries dominate, they should never be
# dropped in favor of non-link entries
sub table() {
my $tp = $_[0];
my $text = $_[1];
my $code = "$tp $text";
# print STDERR "Table $code\n";
my $h = &Alvis::URLs::easyhash64char($code);
if ( defined($token{$h}) ) {
if ( $token{$h} ne $code ) {
if ( defined($docmap{$h}) ) {
# documents always override
if ( $tp eq "link" ) {
print STDERR "Dropping token '$token{$h}' with hash $h due to clash\n";
$token{$h} = $code;
} else {
print STDERR "Dropping token '$code' with hash $h due to clash\n";
}
} else {
print STDERR "Dropping token '$code' with hash $h due to clash\n";
}
} else {
$tokencnt{$h}++;
}
} else {
if ( $tp eq "link" || ! defined($docmap{$h}) ) {
$token{$h} = $code;
$tokencnt{$h}++;
}
}
bin/linkTables view on Meta::CPAN
$_ = <ND>;
close(ND);
if ( /^([0-9]+) / ) {
$line = int($1) + 1;
} else {
print STDERR "Cannot read document index from $stem.docs\n";
exit(1);
}
# now start from here, update .docs
open(DOCS,">>$stem.docs");
open(I,"<$file") or die "Cannot open input linkdata file $file: $!";
while (($_=<I>) ) {
chomp();
if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
my $inu = &Alvis::URLs::StandardURL($1);
my $id = uc($2);
my $titles = $3;
my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
print DOCS "$line $inu $id $hash $titles\n";
$line ++;
for ( $_=<I>,chomp(); $_ && $_ ne "EOD";
$_=<I>,chomp() ) {
# skip to end of record
}
}
}
close(I);
close(DOCS);
# update .srcpar
open(SRCPAR,"<$stem.srcpar");
my $sp = "";
while ( ($_=<SRCPAR>) ) {
$sp .= $_;
}
close(SRCPAR);
$sp =~ s/\nmaxdoc=.*/\nmaxdoc=$line/;
open(SRCPAR,">$stem.srcpar");
print SRCPAR $sp;
close(SRCPAR);
exit(0);
} else {
print STDERR "Cannot open $stem.docs\n";
exit(1);
}
}
# one pass fills tables
open(DOCS,">$stem.docs");
open(I,"<$file") or die "Cannot open input linkdata file $file: $!";
my $line = 0;
while (($_=<I>) ) {
chomp();
if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
my $inu = &Alvis::URLs::StandardURL($1);
my $id = uc($2);
my $titles = $3;
my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
# print STDERR "DOCS > $line $hash $inu $id $titles\n";
print DOCS "$line $inu $id $hash $titles\n";
# notice we overwrite any previous docID
$docid{$id} = $line;
if ( defined($docmap{$hash}) ) {
$docmap{$hash} .= " $id";
} else {
$docmap{$hash} = $id;
}
$line ++;
if ( $titletext ) {
&tabletext($titles);
}
# now process links
for ( $_=<I>,chomp(); $_ && $_ ne "EOD" && $_ ne "EOL";
$_=<I>,chomp() ) {
my $link = $_;
$link =~ s/ .*//;
# print STDERR "LINK: $link $_\n";
$link = &Alvis::URLs::StandardURL($link);
&table("link",$link);
if ( $linktext && /^([^ ]+) (.*)$/ ) {
&tabletext($2);
}
}
if ( $_ eq "EOL" ) {
# now process tokens
for ( $_=<I>,chomp(); $_ && $_ ne "EOD";
$_=<I>,chomp() ) {
if ( /^([^ ]+) (.*)$/ ) {
if ( $1 eq "text" ) {
&tabletext($2);
} else {
&table($1,$2);
}
}
}
}
} elsif ( /^D / ) {
print STDERR "Unmatched document entry: (($_))\n";
}
}
close(I);
close(DOCS);
print STDERR "Processed $line documents\n";
$doccount = $line;
# we have insured that any hash that belongs to a document
# is reserved for links
open(TMP,">$stem.tokens.tmp");
foreach my $t ( keys(%token) ) {
if ( $docmap{$t} ) {
$token{$t} =~ /^([^ ]+) (.*)/;
if ( $1 ne "link" ) {
print STDERR "Dropped token $t '$token{$t}', should be 'link'\n";
} else {
print TMP "doc $t $tokencnt{$t} $2\n";
}
} elsif ( $tokencnt{$t}>= $MINCOUNT ) {
$token{$t} =~ /^([^ ]+) (.*)/;
print TMP "$1 $t $tokencnt{$t} $2\n";
}
}
close(TMP);
# discard unused tables
%token = ();
%tokencnt = ();
# now sort by type, docs first, and add line number
# also print doc mappings, i.e., feature to original document
my @typecnt = ();
my @typename = ();
my $types = 0;
open(TMP,"$SORTCODE ( grep '^doc ' $stem.tokens.tmp | sort ) |");
open(TOKENS,">$stem.tokens");
open(TOKENMAP,">$stem.words");
open(DOCMAP,">$stem.docfeats");
$line = 0;
while ( ($_=<TMP>) ) {
chomp();
my $tok = $_;
$tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
print TOKENS "$tok\n";
print TOKENMAP "$line $_\n";
$_ =~ /^doc ([^ ]+) /;
my $h = $1;
foreach my $id ( split(/ /,$docmap{$h}) ) {
if ( !defined($docid{$id}) ) {
print STDERR "Lost doc sequence number for docID $id\n";
}
print DOCMAP "$line $docid{$id}\n";
}
$line++;
}
if ( $line>0 ) {
# keep track of type details
$typename[0] = "doc";
$typecnt[0] = $line;
$types++;
}
close(TMP);
open(TMP,"$SORTCODE ( grep -v '^doc ' $stem.tokens.tmp | sort ) |");
my $type = "";
my $type_start = $line;
while ( ($_=<TMP>) ) {
chomp();
my $tok = $_;
$tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
print TOKENS "$tok\n";
print TOKENMAP "$line $_\n";
my $ntype = $_;
$ntype =~ s/ .*//;
if ( $ntype ne $type ) {
if ( $type ) {
$typename[$types] = $type;
$typecnt[$types] = $line - $type_start;
$type_start = $line;
$types ++;
}
$type = $ntype;
}
$line++;
}
$typename[$types] = $type;
$typecnt[$types] = $line - $type_start;
$type_start = $line;
$types ++;
close(TOKENS);
close(TOKENMAP);
close(DOCMAP);
unlink("$stem.tokens.tmp");
$featcount = $line;
# now create some dimensions in .srcpar
open(SRCPAR,">$stem.srcpar");
print SRCPAR "datastem=\"$stem\"\n";
print SRCPAR "linkstem=\"$stem\"\n";
print SRCPAR "maxdoc=$doccount\n";
print SRCPAR "maxfeat=$featcount\n";
print SRCPAR "maxcomp=1\n";
print SRCPAR "dims.n_dims=$types\n";
print SRCPAR "dims.tot=$featcount\n";
print SRCPAR "dims.names=" . join(",",@typename) . "\n";
print SRCPAR "dims.dims=" . join(",",@typecnt) . "\n";
close(SRCPAR);
exit 0;
__END__
=head1 NAME
( run in 0.501 second using v1.01-cache-2.11-cpan-97f6503c9c8 )