Alvis-Bags

 view release on metacpan or  search on metacpan

bin/linkBags  view on Meta::CPAN

my %docfeat = ();        #  true if its a doc/internal link


#################################################################
#
#  Build routines
#
#################################################################

#  Count one feature occurrence in a bag, discarding the feature if
#  it doesn't hash to something known.
#
#    $_[0]  - feature type; "link" is special-cased below, and
#             tabletext() passes "text"
#    $_[1]  - feature token
#    $_[2]  - reference to the hash of per-feature counts, keyed by
#             hash code; the matching entry is incremented in place
#
#  A hash code flagged in %docfeat (a doc/internal link) is counted
#  only when the feature really is a "link"; any other feature type
#  whose code lands on a document code is dropped.  Otherwise the
#  feature is counted when its hash code has an entry in %featmap.
#
#  NOTE: the empty prototype is kept so the declaration stays as it
#  was; it is harmless only because callers use the &table(...) form,
#  which bypasses prototype checking.
sub table() {
  my ($tp, $tk, $ft) = @_;
  my $h = &Alvis::URLs::easyhash64char("$tp $tk");
  if ( $docfeat{$h} ) {
    $ft->{$h}++ if $tp eq "link";
  } elsif ( defined($featmap{$h}) ) {
    #  %featmap maps hash codes to feature numbers, so test for
    #  presence rather than truth: feature number 0 is a known
    #  feature too and must not be discarded.
    $ft->{$h}++;
  }
}

#  Adapted from the tabletext() in linkTables.pl.
#  Tokenise a piece of text and tally every word as a "text"
#  feature through table().
#    $_[0]  - the text to tokenise
#    $_[1]  - reference to the feature-count hash, handed on
#             unchanged to table()
#  table() is called with a leading & because it is declared with an
#  empty prototype.
sub tabletext() {
  my ($text, $counts) = @_;
  #  every run of ASCII punctuation becomes a single blank
  $text =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
  #  squeeze whitespace runs, then trim one blank from each end
  $text =~ s/\s+/ /g;
  $text =~ s/^\s//;
  $text =~ s/\s$//;
  #  words are tabled in lower case
  for my $word ( map { lc($_) } split(/\s/, $text) ) {
    &table("text", $word, $counts);
  }
}

#  Processes the file of links read through filehandle IN.
#  Outputs a bag to filehandle OUT.
#  Outputs a bagged version to make it shorter, but it's pretty
#  complex.
sub MakeBags() {
  while (($_=<IN>) ) {
    chomp();
    if ( /^D ([^ ]*) ([^ ]*) (.*)$/ ) {
      my $titles = $3;
      my %feats = ();
      if ( $titletext ) {
	&tabletext($titles);
      }
      #   now process links
      for ( $_=<IN>,chomp(); $_ ne "EOD" && $_ ne "EOL";

bin/linkBags  view on Meta::CPAN

#################################################################
#
#  Load routines
#
#################################################################


#  Load up symbol table info.
#      $doccount, $featcount
#      %featmap  (hashcode to feature number map)
sub LoadTables() {
  open(FEATS,"<$stem.words");
  #  load up the mappings, precomputed
  %featmap = ();
  $featcount = 0;
  while ( ($_=<FEATS>) ) {
    chomp();
    my @a = split();
    $featmap{$a[2]} = $a[0];
    if ( $a[1] eq "doc" ) {
      $docfeat{$a[2]} = 1;

bin/linkRedir  view on Meta::CPAN


my $usezip = "";
my $init = 0;

#  both hash tables only store URLs cleaned with StandardURL()
#  single redirect for a URL
my %redirect = ();
#  space delimited set of entries with same target
my %direct = ();

sub updateMaps() {
  my $inu = shift();
  my $outu = shift();
  my $direct_add = "";
  #  no duplicates
  if ( defined($redirect{$inu}) &&
       $redirect{$inu} ne $outu ) {
    print STDERR "Previous definition '$inu'->'" . $redirect{$inu}
      . "' for line:\n   $_\n";
    # exit(1);
  }

bin/linkTables  view on Meta::CPAN


#  Load the optional stop-word list: one word per line, stored
#  lower-cased as keys of %stops.
if ( $stopfile ) {
  #  Three-argument open with a lexical handle, so the file name can
  #  never be read as a mode or command, and a checked open, so a
  #  missing or unreadable list is reported instead of being silently
  #  treated as "no stop words".
  open(my $stopfh, "<", $stopfile)
    or die "Cannot open stop word file '$stopfile': $!\n";
  #  a named lexical keeps the global $_ untouched
  while ( defined(my $word = <$stopfh>) ) {
    chomp($word);
    $stops{lc($word)} = 1;
  }
  close($stopfh);
}

#  Break a piece of text into lower-cased words and hand every word
#  that is not a stop word to table() as a "text" entry.
#    $_[0]  - the text to tokenise
#  table() is called with a leading & because it is declared with an
#  empty prototype.
sub tabletext() {
  my ($text) = @_;
  #  every run of ASCII punctuation becomes a single blank
  $text =~ s/[!-\/:-@\{\}\|~\[-_\`]+/ /g;
  #  squeeze whitespace runs, then trim one blank from each end
  $text =~ s/\s+/ /g;
  $text =~ s/^\s//;
  $text =~ s/\s$//;
  for my $word ( map { lc($_) } split(/ /, $text) ) {
    #  anything listed in %stops is skipped
    next if defined($stops{$word});
    &table("text", $word);
  }
}

#  ensure to make "link" entries dominate, they should never be
#  dropped in favor of non-link entries
sub table() {
  my $tp = $_[0];
  my $text = $_[1];
  my $code = "$tp $text";
  # print STDERR "Table $code\n";
  my $h = &Alvis::URLs::easyhash64char($code);
  if ( defined($token{$h}) ) {
    if ( $token{$h} ne $code ) {
      if ( defined($docmap{$h}) ) {
	#  documents always override
	if ( $tp eq "link" ) {

lib/Alvis/URLs.pm  view on Meta::CPAN

  my $dig = md5_hex($string);
  # print $dig . " \n";
  return substr($dig,0,16);
}

#  URL switches, all off (0) by default:
#    nocase    - when true, StandardURL() lower-cases the URL
#    noclean   - while 0, StandardURL() runs the URL through CleanURL()
#    keepfrag  - when false, CleanURL() removes the URL fragment
( $Alvis::URLs::nocase,
  $Alvis::URLs::noclean,
  $Alvis::URLs::keepfrag ) = ( 0, 0, 0 );

#  Canonicalise a URL with the URI module.
#    $_[0]  - the URL to clean
#  Returns undef when the argument is empty or otherwise false.
#  Otherwise returns the result of the URI object's canonical()
#  method, with the fragment removed first unless
#  $Alvis::URLs::keepfrag is set.
#  NOTE(review): per the URI documentation canonical() yields a URI
#  object rather than a plain string -- confirm callers rely only on
#  its string form.
#  The empty prototype is kept as declared; callers use the
#  &Alvis::URLs::CleanURL(...) form, which bypasses it.
sub CleanURL() {
  my ($url) = @_;
  if ( !$url ) {
    return undef;
  }
  #  Direct method call rather than the indirect-object form
  #  "new URI(...)", whose parse depends on which subs and packages
  #  happen to be in scope.
  my $uri = URI->new($url);
  if ( ! $Alvis::URLs::keepfrag ) {
    $uri->fragment(undef);
  }
  return $uri->canonical;
}

#  Normalise a URL according to the module's URL switches.
#    $_[0]  - the URL
#  The URL is lower-cased when $Alvis::URLs::nocase is true, and is
#  then passed through CleanURL() as long as $Alvis::URLs::noclean
#  is still 0.  Returns the resulting value.
sub StandardURL() {
  my $url = $_[0];
  $url = lc($url) if $Alvis::URLs::nocase;
  $url = &Alvis::URLs::CleanURL($url) if $Alvis::URLs::noclean == 0;
  return $url;
}



( run in 0.738 second using v1.01-cache-2.11-cpan-65fba6d93b7 )