Alvis-Bags

 view release on metacpan or  search on metacpan

bin/linkMpca  view on Meta::CPAN

  print STDERR "\nNow building single rank\n";
  print STDERR "========================\n";
  #  list form keeps the shell away from $stem, so a stem containing
  #  spaces or shell metacharacters reaches mprank as one argument
  if ( system("mprank","-u","-h",$stem) != 0 ) {
    print STDERR "mprank -u -h $stem failed (status $?)\n";
  }
  #  turn off character processing on this since we are using unpack()
  if ( !open(INF,'<:bytes',"$stem.onerank") ) {
    print STDERR "Cannot open $stem.onerank: $!\n";
    exit(1);
  }
  #  without this check a missing file would only show up later as a
  #  misleading end-of-file on INH
  if ( !open(INH,'<',"$stem.dochash") ) {
    print STDERR "Cannot open $stem.dochash: $!\n";
    exit(1);
  }
  my $buf = ""; 
  #  the leading 4 bytes are decoded with unpack("i") and give the number
  #  of entries that follow; a short read means the file is unusable
  if ( (read(INF,$buf,4) || 0) != 4 ) {
    print STDERR "Cannot read entry count from $stem.onerank\n";
    exit(1);
  }
  my $I = unpack("i",$buf);
  my $dhash = "";
  my $docid = 0;
  #   the binary read and unpack() combo reads
  #   entries from an MPCA Vec_t file
  my %hashrank = ();
  for (my $i=0; $i<$I; $i++) {
    read(INF,$buf,4);
    if ( eof(INH) ) {
      print STDERR "Too few entries in $stem.dochash for $stem.onerank\n";
      exit(1);
    }
    $_ = <INH>;

bin/linkTables  view on Meta::CPAN

      if ( $#ARGV != 1 );

#  both values are taken, in order, from the front of the argument list
my $file = shift;
my $stem = shift;

#  counters, both starting at zero
my ($doccount, $featcount) = (0, 0);

#  hash of a cleaned URL => docID
my %docmap;
#  docID => sequence number
my %docid;
#  token value plus count
my (%token, %tokencnt);

if ( $stopfile ) {
  open(S,"<$stopfile");
  while ( ($_=<S>) ) {
    chomp();
    $stops{lc($_)} = 1;
  }

bin/linkTables  view on Meta::CPAN

while (($_=<I>) ) {
  chomp();
  if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
    my $inu = &Alvis::URLs::StandardURL($1);
    my $id = uc($2);
    my $titles = $3;
    my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
    # print STDERR "DOCS > $line $hash $inu $id $titles\n";
    print DOCS "$line $inu $id $hash $titles\n";
    #   notice we overwrite any previous docID
    $docid{$id} = $line;
    if ( defined($docmap{$hash}) ) {
      $docmap{$hash} .= " $id";
    } else {
      $docmap{$hash} = $id;
    }
    $line ++;	  
    if ( $titletext ) {
      &tabletext($titles);
    }
    #   now process links

bin/linkTables  view on Meta::CPAN

#  Number every record read from TMP and write it out three ways:
#    TOKENS   - the record with its first three space-separated fields removed
#    TOKENMAP - "<record number> <full record>"
#    DOCMAP   - "<record number> <doc sequence number>", one line per docID
#               listed in %docmap under the record's hash field
$line = 0;
while ( defined(my $entry = <TMP>) ) {
  chomp($entry);
  my $tok = $entry;
  $tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
  print TOKENS "$tok\n";
  print TOKENMAP "$line $entry\n";
  #  only use the capture when the match succeeded; otherwise $1 would
  #  still hold the stale value left behind by the substitution above
  if ( $entry =~ /^doc ([^ ]+) / ) {
    my $h = $1;
    #  a hash with no %docmap entry simply produces no DOCMAP lines
    my $ids = defined($docmap{$h}) ? $docmap{$h} : "";
    foreach my $id ( split(/ /,$ids) ) {
      if ( !defined($docid{$id}) ) {
        print STDERR "Lost doc sequence number for docID $id\n";
        #  skip it rather than write a DOCMAP record with an empty
        #  sequence number
        next;
      }
      print DOCMAP "$line $docid{$id}\n";
    }
  } else {
    print STDERR "Malformed doc token record: $entry\n";
  }
  #  the record was written to TOKENS/TOKENMAP either way, so it always
  #  consumes a number
  $line++;
}
if ( $line>0 ) {
  #  keep track of type details
  $typename[0] = "doc";
  $typecnt[0] = $line;
  $types++;
}
close(TMP);



( run in 0.476 second using v1.01-cache-2.11-cpan-4505f990765 )