Freq

 view release on metacpan or  search on metacpan

bin/tokenize-sb  view on Meta::CPAN

#!/usr/bin/perl -w
# This tokenizes by nonword chars. Generates a stream of 
# space-delimited, lowercased words, denuded of punctuation and
# suitable for redirecting or piping into something else like so:
# cat *.txt | tokenize_std > tokens.txt
# cat *.txt | tokenize_std | index_this
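# This version prints each document's name in a <DOCNO> tag, surrounds the
# tokenized text with <DOC></DOC> tags, and inserts "SB" tokens at the
# sentence boundaries it detects.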

use strict;
use Text::Scan;

# Help text describing the expected input format and typical pipelines.
# It is shown only when the first command-line argument is -h or --help;
# any other arguments stay in @ARGV and are read as input files by <>.
my $usage = <<"EOF";

Usage: tokenize

Pipe a stream of documents in "Ejemoni" format, that is surrounded
by <DOC></DOC> tags with a <DOCNO></DOCNO> tag in there somewhere.
Outputs a stream of lowercased words split on nonword strings.
Examples:

# cat document.txt | tokenize | indexstream corpus_dir
# cat document.txt | tokenize | ngrams 3 11 

EOF

# Previously the usage text was defined but never displayed.
if (@ARGV && $ARGV[0] =~ /\A(?:-h|--help)\z/) {
    print $usage;
    exit 0;
}


# Build the sentence-boundary scanner from the table after __DATA__.
# Each usable row has three tab-separated fields: a label, a number and a
# key string. The key is inserted into the scanner with the number as its
# value.
my $sb = Text::Scan->new;
while (my $row = <DATA>) {
    chomp $row;
    my (undef, $boundary, $key) = split /\t/, $row;

    # Blank or malformed rows (fewer than three fields) would otherwise
    # insert undef keys/values and trigger warnings under -w.
    next unless defined $boundary && defined $key && length $key;

    $sb->insert($key, $boundary);
}

# Two consecutive newlines (a blank line) are also registered, with value 0.
$sb->insert("\n\n", 0);

# Read the input one document at a time: every record ends at a closing
# </DOC> tag, which chomp then strips off.
$/ = '</DOC>';

while (my $doc = <>) {
    chomp $doc;
    next if $doc eq '';

    # If no document name, skip it.
    next unless $doc =~ m|<DOCNO>([^<]+)</DOCNO>|ms;
    print "\n<DOC>\n<DOCNO>$1</DOCNO>\n";

    # Use the <TEXT> body when there is one; otherwise keep the whole record.
    $doc = $1 if $doc =~ /<TEXT>(.+?)<\/TEXT>/ms;
    $doc =~ s|<[^>]+>||g; # Get rid of all other tags.
    #s|(\d)| $1 |g; # Count each digit.
    $doc =~ s|_| |g;      # underscores bah!
    $doc = lc $doc;

    my @locations = _word_locations($doc);

    # Each multiscan hit contributes an 'SB' marker at element 1 + element 2.
    # Presumably the hit is [key, position, value], so the marker lands
    # `value` characters into the match -- TODO confirm against Text::Scan.
    # NOTE(review): the text was lowercased above, while the keys visible
    # under __DATA__ start with a capital letter; confirm that the scanner
    # matches case-insensitively, otherwise those keys can never match.
    my @sbs = $sb->multiscan($doc);
    push @locations, [ 'SB', $_->[1] + $_->[2] ] for @sbs;

    # Interleave words and boundary markers by their offset in the text.
    @locations = sort { $a->[1] <=> $b->[1] } @locations;
    print join " ", map { $_->[0] } @locations;
    print " SB";   # every document ends with a boundary marker
    print "\n</DOC>\n";
}

# Split $text on word boundaries and return a list of [ word, offset ]
# pairs, where offset is the position of the word within $text.
# Returns an empty list when $text contains no tokens at all.
sub _word_locations {
    my ($text) = @_;

    my @tokens = split m|\b|, $text;
    my $offset = 0;

    # If the text starts with a non-word run, drop that run and begin
    # counting after it. The @tokens guard avoids undef warnings when the
    # text is empty (e.g. a <TEXT> body that held nothing but tags).
    if (@tokens && $tokens[0] !~ /\w/) {
        $offset = length(shift @tokens);
    }

    # From here on tokens alternate: word, non-word separator, word, ...
    my @locations;
    while (@tokens) {
        my $word = shift @tokens;
        my $junk = shift @tokens;
        $junk = '' unless defined $junk;   # the last word may have no trailer
        push @locations, [ $word, $offset ];
        $offset += length($word) + length($junk);
    }
    return @locations;
}


__DATA__
SB	5	Aani. 
SB	5	Aaru. 
SB	5	Abba. 
SB	5	Abby. 
SB	5	Abel. 
SB	5	Abie. 
SB	5	Absi. 
SB	5	Acer. 
SB	5	Acis. 
SB	5	Acts. 
SB	5	Adad. 
SB	5	Adai. 
SB	5	Adam. 
SB	5	Adar. 
SB	5	Adda. 
SB	5	Addu. 
SB	5	Addy. 
SB	5	Adib. 
SB	5	Adin. 
SB	5	Afar. 
SB	5	Agag. 
SB	5	Agao. 
SB	5	Agau. 
SB	5	Agaz. 
SB	5	Agib. 
SB	5	Agra. 
SB	5	Ahet. 
SB	5	Ahir. 
SB	5	Ahom. 
SB	5	Aias. 
SB	5	Ainu. 
SB	5	Aira. 
SB	5	Akal. 
SB	5	Akan. 
SB	5	Akha. 



( run in 1.340 second using v1.01-cache-2.11-cpan-71847e10f99 )