Alvis-Bags

 view release on metacpan or  search on metacpan

bin/linkTables  view on Meta::CPAN

#!/usr/bin/perl -w

use strict;
use POSIX;
use HTML::Entities;
use Alvis::URLs;
use Getopt::Long;
use Pod::Usage;


# encoding pragmas follow any includes like "use"
use encoding 'utf8';
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDERR, ":utf8";

#  ensure sort handles UTF8 order
my $SORTCODE = "LC_ALL=en_US.UTF-8; export LC_ALL;" ;

my $MINCOUNT = 1;
my $linktext = 0;
my $titletext = 0;
my $stopfile = "";
my $fixdocs = 0;    # set this to fix everything but .docs 
my %stops = ();


#  check options

GetOptions(
     'man'       => sub {pod2usage(-exitstatus => 0, -verbose => 2)},
      'stopfile=s' => \$stopfile,
      'mincount=i' => \$MINCOUNT,
      'docs' => \$fixdocs,
      'linktext' => \$linktext,
      'titletext' => \$titletext,
      'noclean' => \$Alvis::URLs::noclean,
      'nocase' => \$Alvis::URLs::nocase,
      'h|help'       => sub {pod2usage(1)}
);

pod2usage(-message => "ERROR: need input file and stem")
      if ( $#ARGV != 1 );

my $file = shift();
my $stem = shift();

my $doccount = 0;
my $featcount = 0;
#  maps a cleaned URL's hash to a docID 
my %docmap = ();
#  maps a docID to a sequence number
my %docid = ();
#  token value plus count
my %token = ();
my %tokencnt = ();

if ( $stopfile ) {
  open(S,"<$stopfile");
  while ( ($_=<S>) ) {
    chomp();
    $stops{lc($_)} = 1;
  }
  close(S);
}

sub tabletext() {
  my $tw = $_[0];
  #  strip punctuation
  $tw =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
  #  break at spaces
  $tw =~ s/\s+/ /g; 
  $tw =~ s/^\s//; 
  $tw =~ s/\s$//; 
  foreach my $k ( split(/ /,$tw) ) {
    #  lower case by default
    $k = lc($k);
    if ( ! defined($stops{$k}) ) {
      &table("text",$k);
    }
  }
}

#  ensure to make "link" entries dominate, they should never be
#  dropped in favor of non-link entries
sub table() {
  my $tp = $_[0];
  my $text = $_[1];
  my $code = "$tp $text";
  # print STDERR "Table $code\n";
  my $h = &Alvis::URLs::easyhash64char($code);
  if ( defined($token{$h}) ) {
    if ( $token{$h} ne $code ) {
      if ( defined($docmap{$h}) ) {
	#  documents always override
	if ( $tp eq "link" ) {
	  print STDERR "Dropping token '$token{$h}' with hash $h due to clash\n";
	  $token{$h} = $code;
	} else {
	  print STDERR "Dropping token '$code' with hash $h due to clash\n";
	}
      } else {
	print STDERR "Dropping token '$code' with hash $h due to clash\n";
      }
    } else {
      $tokencnt{$h}++;
    }
  } else {
    if ( $tp eq "link" || ! defined($docmap{$h}) ) {
      $token{$h} = $code;
      $tokencnt{$h}++;
    }
  }

bin/linkTables  view on Meta::CPAN

    $_ = <ND>;
    close(ND);
    if ( /^([0-9]+) / ) {
      $line = int($1) + 1;
    } else {
      print STDERR "Cannot read document index from $stem.docs\n";
      exit(1);
    }
    #  now start from here, update .docs
    open(DOCS,">>$stem.docs");
    open(I,"<$file") or die "Cannot open input linkdata file $file: $!";
    while (($_=<I>) ) {
      chomp();
      if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
	my $inu = &Alvis::URLs::StandardURL($1);
	my $id = uc($2);
	my $titles = $3;
	my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
	print DOCS "$line $inu $id $hash $titles\n";
	$line ++;
	for ( $_=<I>,chomp(); $_ && $_ ne "EOD";
	      $_=<I>,chomp() ) {
	  #  skip to end of record
	}
      }
    }
    close(I);
    close(DOCS);
    #  update .srcpar
    open(SRCPAR,"<$stem.srcpar");
    my $sp = "";
    while ( ($_=<SRCPAR>) ) {
      $sp .= $_;
    }
    close(SRCPAR);
    $sp =~ s/\nmaxdoc=.*/\nmaxdoc=$line/;
    open(SRCPAR,">$stem.srcpar");
    print SRCPAR $sp;
    close(SRCPAR);
    exit(0);
  } else {
    print STDERR "Cannot open $stem.docs\n";
    exit(1);
  }
}

#  one pass fills tables
open(DOCS,">$stem.docs");
open(I,"<$file") or die "Cannot open input linkdata file $file: $!";
my $line = 0;
while (($_=<I>) ) {
  chomp();
  if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
    my $inu = &Alvis::URLs::StandardURL($1);
    my $id = uc($2);
    my $titles = $3;
    my $hash = &Alvis::URLs::easyhash64char("link " .$inu);
    # print STDERR "DOCS > $line $hash $inu $id $titles\n";
    print DOCS "$line $inu $id $hash $titles\n";
    #   notice we overwrite any previous docID
    $docid{$id} = $line;
    if ( defined($docmap{$hash}) ) {
      $docmap{$hash} .= " $id";
    } else {
      $docmap{$hash} = $id;
    }
    $line ++;	  
    if ( $titletext ) {
      &tabletext($titles);
    }
    #   now process links
    for ( $_=<I>,chomp(); $_ && $_ ne "EOD" && $_ ne "EOL";
	  $_=<I>,chomp() ) {
      my $link = $_;
      $link =~ s/ .*//;
      # print STDERR "LINK: $link $_\n";
      $link = &Alvis::URLs::StandardURL($link);
      &table("link",$link);
      if ( $linktext && /^([^ ]+) (.*)$/ ) {
	&tabletext($2);
      }
    }
    if ( $_ eq "EOL" ) {
      #   now process tokens
      for ( $_=<I>,chomp(); $_ && $_ ne "EOD";
	    $_=<I>,chomp() ) {
	if ( /^([^ ]+) (.*)$/ ) {
	  if ( $1 eq "text" ) {
	    &tabletext($2);
	  } else {
	    &table($1,$2);
	  }
	}
      }
    }
  } elsif ( /^D / ) {
    print STDERR "Unmatched document entry: (($_))\n";
  }
}
close(I);
close(DOCS);
print STDERR "Processed $line documents\n";
$doccount = $line;

# we have insured that any hash that belongs to a document
# is reserved for links
open(TMP,">$stem.tokens.tmp");
foreach my $t ( keys(%token) ) {
  if ( $docmap{$t} ) {
    $token{$t} =~ /^([^ ]+) (.*)/;
    if ( $1 ne "link" ) {
      print STDERR "Dropped token $t '$token{$t}', should be 'link'\n";
    } else {
      print TMP "doc $t $tokencnt{$t} $2\n";
    }
  } elsif ( $tokencnt{$t}>= $MINCOUNT ) {
    $token{$t} =~ /^([^ ]+) (.*)/;
    print TMP "$1 $t $tokencnt{$t} $2\n";
  }
}
close(TMP);

#  discard unused tables
%token = ();
%tokencnt = ();

# now sort by type, docs first, and add line number
# also print doc mappings, i.e., feature to original document

my @typecnt = ();
my @typename = ();
my $types = 0;

open(TMP,"$SORTCODE ( grep '^doc ' $stem.tokens.tmp | sort ) |");
open(TOKENS,">$stem.tokens");
open(TOKENMAP,">$stem.words");
open(DOCMAP,">$stem.docfeats");
$line = 0;
while ( ($_=<TMP>) ) {
  chomp();
  my $tok = $_;
  $tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
  print TOKENS "$tok\n";
  print TOKENMAP "$line $_\n";
  $_ =~ /^doc ([^ ]+) /;
  my $h = $1;
  foreach my $id ( split(/ /,$docmap{$h}) ) {
    if ( !defined($docid{$id}) ) {
      print STDERR "Lost doc sequence number for docID $id\n";
    }
    print DOCMAP "$line $docid{$id}\n";
  }
  $line++;
}
if ( $line>0 ) {
  #  keep track of type details
  $typename[0] = "doc";
  $typecnt[0] = $line;
  $types++;
}
close(TMP);
open(TMP,"$SORTCODE ( grep -v '^doc ' $stem.tokens.tmp | sort ) |");
my $type = "";
my $type_start = $line;
while ( ($_=<TMP>) ) {
  chomp();
  my $tok = $_;
  $tok =~ s/^([^ ]+) ([^ ]+) ([^ ]+) //;
  print TOKENS "$tok\n";
  print TOKENMAP "$line $_\n";
  my $ntype = $_;
  $ntype =~ s/ .*//;
  if ( $ntype ne $type ) {
    if ( $type ) {
      $typename[$types] = $type;
      $typecnt[$types] = $line - $type_start;
      $type_start = $line;
      $types ++;
    }
    $type = $ntype;
  }
  $line++;
}
$typename[$types] = $type;
$typecnt[$types] = $line - $type_start;
$type_start = $line;
$types ++;
close(TOKENS);
close(TOKENMAP);
close(DOCMAP);
unlink("$stem.tokens.tmp");
$featcount = $line;

#  now create some dimensions in .srcpar
open(SRCPAR,">$stem.srcpar");
print SRCPAR "datastem=\"$stem\"\n";
print SRCPAR "linkstem=\"$stem\"\n";
print SRCPAR "maxdoc=$doccount\n";
print SRCPAR "maxfeat=$featcount\n";
print SRCPAR "maxcomp=1\n";
print SRCPAR "dims.n_dims=$types\n";
print SRCPAR "dims.tot=$featcount\n";
print SRCPAR "dims.names=" . join(",",@typename) . "\n";
print SRCPAR "dims.dims=" . join(",",@typecnt) . "\n";
close(SRCPAR);

exit 0;

__END__

=head1 NAME



( run in 0.501 second using v1.01-cache-2.11-cpan-97f6503c9c8 )