Freq
view release on metacpan or search on metacpan
bin/tokenize-sb view on Meta::CPAN
#!/usr/bin/perl -w
# This tokenizes by nonword chars. Generates a stream of
# space-delimited, lowercased words, denuded of punctuation and
# suitable for redirecting or piping into something else like so:
# cat *.txt | tokenize_std > tokens.txt
# cat *.txt | tokenize_std | index_this
# This version keeps each document's name and surrounds its text with
# <DOC></DOC> tags, adding "SB" marker tokens from the table in __DATA__.
use strict;
use Text::Scan;
# Help text, printed when the script is invoked with -h or --help.
my $usage = <<"EOF";
Usage: tokenize-sb [-h|--help] [file ...]
Pipe a stream of documents in "Ejemoni" format, that is surrounded
by <DOC></DOC> tags with a <DOCNO></DOCNO> tag in there somewhere.
Outputs a stream of lowercased words split on nonword strings, with
"SB" marker tokens added and each document wrapped in <DOC></DOC> tags.
Examples:
# cat document.txt | tokenize-sb | indexstream corpus_dir
# cat document.txt | tokenize-sb | ngrams 3 11
EOF

# Show the usage text on request instead of letting <> try to open the
# flag as an input file.
if (@ARGV && $ARGV[0] =~ /\A(?:-h|--help)\z/) {
    print $usage;
    exit 0;
}
# Build the scanner dictionary from the __DATA__ table.  Each row is
# tab-separated: a label (unused here), a value, and the key to look for
# in the text.  The value is stored under the key.
my $sb = Text::Scan->new;
while (my $row = <DATA>) {
    chomp $row;
    my (undef, $value, $key) = split /\t/, $row;

    # Skip blank or malformed rows rather than inserting an undefined key.
    next unless defined $key && $key ne '';
    $sb->insert($key, $value);
}

# Two consecutive newlines are a key as well, stored with value 0.
$sb->insert("\n\n", 0);
# Read the input one document at a time: each record ends at a closing
# </DOC> tag, which chomp() strips again.  For every document that has a
# <DOCNO>, print the name followed by its lowercased words, with "SB"
# markers merged in at the offsets reported by the $sb scanner.
$/ = '</DOC>';
DOCUMENT:
while (my $doc = <>) {
    chomp $doc;
    next DOCUMENT if $doc eq '';

    # If no document name, skip it.
    next DOCUMENT unless $doc =~ m|<DOCNO>([^<]+)</DOCNO>|ms;
    print "\n<DOC>\n<DOCNO>$1</DOCNO>\n";

    # Keep only the <TEXT> section when there is one; otherwise the whole
    # record is tokenized.
    $doc = $1 if $doc =~ /<TEXT>(.+?)<\/TEXT>/ms;
    $doc =~ s|<[^>]+>||g;      # Get rid of all other tags.
    #$doc =~ s|(\d)| $1 |g;    # Count each digit.
    $doc =~ s|_| |g;           # underscores bah!

    # NOTE(review): the text is lowercased before it is scanned below, while
    # the visible __DATA__ keys are capitalized ("Aani."); unless Text::Scan
    # is matching case-insensitively those keys cannot hit -- confirm intent.
    $doc = lc $doc;

    # Splitting on \b yields alternating runs of word and nonword characters.
    my @tokens = split m|\b|, $doc;
    my $offset = 0;

    # If the text starts with a nonword run, drop it and start counting
    # after it, so that @tokens alternates word, junk, word, junk ...
    # The @tokens guard avoids "uninitialized value" warnings (and an
    # undefined $offset) when nothing is left after tag stripping.
    if (@tokens && $tokens[0] !~ /\w/) {
        $offset = length shift @tokens;
    }

    # Record every word together with its character offset in $doc.
    my @locations = ();
    while (@tokens) {
        my $word = shift @tokens;
        my $junk = shift @tokens;
        $junk = '' unless defined $junk;    # the last word may have no trailing junk
        push @locations, [ $word, $offset ];
        $offset += length($word) + length($junk);
    }

    # Add one SB marker per dictionary hit.  Each hit is presumably
    # [ key, position, value ] (see the Text::Scan docs -- TODO confirm), so
    # the marker lands `value` characters after the start of the match.
    for my $hit ($sb->multiscan($doc)) {
        push @locations, [ 'SB', $hit->[1] + $hit->[2] ];
    }

    # Emit words and markers in text order; every document ends with an SB.
    @locations = sort { $a->[1] <=> $b->[1] } @locations;
    print join " ", map { $_->[0] } @locations;
    print " SB";
    print "\n</DOC>\n";
}
__DATA__
SB 5 Aani.
SB 5 Aaru.
SB 5 Abba.
SB 5 Abby.
SB 5 Abel.
SB 5 Abie.
SB 5 Absi.
SB 5 Acer.
SB 5 Acis.
SB 5 Acts.
SB 5 Adad.
SB 5 Adai.
SB 5 Adam.
SB 5 Adar.
SB 5 Adda.
SB 5 Addu.
SB 5 Addy.
SB 5 Adib.
SB 5 Adin.
SB 5 Afar.
SB 5 Agag.
SB 5 Agao.
SB 5 Agau.
SB 5 Agaz.
SB 5 Agib.
SB 5 Agra.
SB 5 Ahet.
SB 5 Ahir.
SB 5 Ahom.
SB 5 Aias.
SB 5 Ainu.
SB 5 Aira.
SB 5 Akal.
SB 5 Akan.
SB 5 Akha.
( run in 1.340 second using v1.01-cache-2.11-cpan-71847e10f99 )