Alvis-Bags
view release on metacpan or search on metacpan
bin/linkMpca view on Meta::CPAN
print STDERR "\nNow building single rank\n";
print STDERR "========================\n";
# list form: $stem is handed to mprank as one argument, with no shell
# interpretation of spaces or metacharacters in it
system("mprank", "-u", "-h", $stem);
# turn off character processing on this since am using unpack()
open(INF,'<:bytes',"$stem.onerank")
    or die "Cannot open $stem.onerank: $!\n";
# three-argument open so the file name is never parsed for a mode
open(INH,'<',"$stem.dochash")
    or die "Cannot open $stem.dochash: $!\n";
my $buf = "";
# the file starts with a 4-byte native signed integer; it is used below
# as the number of entries to read, so a short read is fatal
my $got = read(INF,$buf,4);
if ( !defined($got) || $got != 4 ) {
    die "Cannot read entry count from $stem.onerank\n";
}
my $I = unpack("i",$buf);
my $dhash = "";
my $docid = 0;
#  the binary read and unpack() combo reads
#  entries from an MPCA Vec_t file
my %hashrank = ();
for (my $i=0; $i<$I; $i++) {
read(INF,$buf,4);
if ( eof(INH) ) {
print STDERR "Too few entries in $stem.dochash for $stem.onerank\n";
exit(1);
}
$_ = <INH>;
bin/linkTables view on Meta::CPAN
if ( $#ARGV != 1 );
# the first two remaining arguments, in order
my $file = shift;
my $stem = shift;
# counters, both initialised to zero
my ( $doccount, $featcount ) = ( 0, 0 );
# hash of a cleaned URL  =>  docID
my %docmap;
# docID  =>  sequence number
my %docid;
# token value, and its count
my %token;
my %tokencnt;
if ( $stopfile ) {
open(S,"<$stopfile");
while ( ($_=<S>) ) {
chomp();
$stops{lc($_)} = 1;
}
bin/linkTables view on Meta::CPAN
while (($_=<I>) ) {
chomp();
if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
my $inu = &Alvis::URLs::StandardURL($1);
my $id = uc($2);
my $titles = $3;
my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
# print STDERR "DOCS > $line $hash $inu $id $titles\n";
print DOCS "$line $inu $id $hash $titles\n";
# notice we overwrite any previous docID
$docid{$id} = $line;
if ( defined($docmap{$hash}) ) {
$docmap{$hash} .= " $id";
} else {
$docmap{$hash} = $id;
}
$line ++;
if ( $titletext ) {
&tabletext($titles);
}
# now process links
bin/linkTables view on Meta::CPAN
# Pass over the records on TMP.  Each record is written to TOKENS with
# its first three space-separated fields removed, and to TOKENMAP in
# full, prefixed by its 0-based sequence number.  A record of the form
# "doc <hash> ..." additionally yields one DOCMAP line for every docID
# stored under <hash> in %docmap.
$line = 0;
while ( defined( my $rec = <TMP> ) ) {
    chomp($rec);
    my $tok = $rec;
    $tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
    print TOKENS "$tok\n";
    print TOKENMAP "$line $rec\n";
    # Read the hash only from a successful match on this record; after a
    # failed match $1 would still hold the capture of some earlier
    # match and DOCMAP entries could be written for the wrong hash.
    if ( $rec =~ /^doc ([^ ]+) / ) {
        my $h = $1;
        # a hash with no entry in %docmap has no docIDs to emit
        if ( defined($docmap{$h}) ) {
            foreach my $id ( split(/ /,$docmap{$h}) ) {
                if ( !defined($docid{$id}) ) {
                    print STDERR "Lost doc sequence number for docID $id\n";
                }
                # as before, the line is written even when the lookup
                # above failed (the sequence number is then empty)
                print DOCMAP "$line $docid{$id}\n";
            }
        }
    }
    $line++;
}
if ( $line>0 ) {
    #  keep track of type details
    $typename[0] = "doc";
    $typecnt[0] = $line;
    $types++;
}
close(TMP);
( run in 2.756 seconds using v1.01-cache-2.11-cpan-4505f990765 )