WAIT
view release on metacpan or search on metacpan
lib/WAIT/InvertedIndex.pm view on Meta::CPAN
$text =~ s/(^\s+)// and $pos += length($1);
}
@result;
}
# Build the Perl source text of the position-aware filter pipeline.
#
# Arguments: a list of filter names; the first name is applied first
#            (innermost), the last name is applied last (outermost).
# Returns:   a string containing one Perl expression.  parse_pos() wraps
#            that string in "sub {...}" and evals it.
#
# Every item flowing through the generated pipeline is a two-element
# array ref: element 0 is handed to the filter, element 1 is carried
# along unchanged.  The innermost stage seeds the pipeline with
# [$_[0], 0].  A filter whose name starts with "split" is mapped to
# WAIT::Filter::split_pos, which receives the whole pair; trailing
# digits that evaluate as true (e.g. "split2") add a minimum-length
# grep on element 0 of each result.
#
# A shortcut that skipped the 'stop' filter here (looking stopwords up
# afterwards is cheaper) was removed on purpose: the user's filter choice
# must not be overruled, and other filters may alter stopwords first
# (isotr turns "isn't" into "isnt").
sub _xfiltergen {
  my @earlier  = @_;
  my $filter   = pop @earlier;
  my $is_first = !@earlier;     # nothing runs before this filter

  if ($filter =~ /^split(\d*)/) {
    my $min_length = $1;        # capture before recursing
    my $input = $is_first ? '[$_[0], 0]' : _xfiltergen(@earlier);
    my $split = "map(&WAIT::Filter::split_pos(\$_), $input)";

    return $split unless $min_length;
    return "grep(length(\$_->[0])>=$min_length, $split)";
  }

  # Ordinary filter: transform element 0, keep element 1.
  return "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])"
    if $is_first;

  # The chained form has no blank after the comma; kept as is so the
  # generated source stays byte-identical.
  return "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]],"
       . _xfiltergen(@earlier)
       . ')';
}
# Run the position-aware filter pipeline over one piece of text.
#
# Arguments: $self, then the text; only the first argument after $self
#            is passed to the pipeline.
# Returns:   whatever the compiled pipeline returns (per _xfiltergen, a
#            list of two-element array refs).
# Dies:      via croak if the generated pipeline source does not compile.
#
# The pipeline is compiled on first use from $self->{filter} and cached
# in $self->{xfunc}.
sub parse_pos {
  my $self = shift;

  unless (defined $self->{xfunc}) {
    # String eval is intentional: the code is generated from the
    # configured filter names by _xfiltergen.
    my $source = sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
    my $xfunc  = eval $source;

    # Report a broken filter chain with the real compile error instead
    # of caching undef and failing later on the call below.
    croak ref($self) . "::parse_pos: cannot compile filter chain: $@"
      unless ref($xfunc) eq 'CODE';

    $self->{xfunc} = $xfunc;
  }

  $self->{xfunc}->($_[0]);
}
# Build the Perl source text of the plain (position-less) filter pipeline.
#
# Arguments: a list of filter names; the first name is applied first
#            (innermost), the last name is applied last (outermost).
# Returns:   a string containing nested map() calls, one per filter, with
#            the innermost map reading from @_.  open() wraps the string
#            in "sub {grep /./, ...}" and evals it.
sub _filtergen {
  my @earlier   = @_;
  my $outermost = pop @earlier;

  # Input of this stage: the rest of the chain, or the sub's arguments.
  my $input = @earlier ? _filtergen(@earlier) : '@_';

  return "map(&WAIT::Filter::$outermost(\$_), $input)";
}
# Remove the index file.  May only be called from package WAIT::Table;
# any other caller gets an exception.
#
# Returns: false ('') if the file does not exist or was unlinked,
#          true (1) if the file exists but unlink failed.
#          NOTE(review): true-on-failure looks inverted -- confirm what
#          the WAIT::Table caller expects before changing it.
# Dies:    via croak when called from any package but WAIT::Table.
sub drop {
  my $self = shift;
  my ($calling_package) = caller;

  croak ref($self)."::drop called directly"
    unless $calling_package eq 'WAIT::Table';   # Table knows about this

  my $file = $self->{file};

  return !1 unless -e $file;    # nothing to remove
  return !unlink $file;
}
# Tie the index file and prepare the in-memory state.
#
# If $self->{dbh} is already defined it is returned and nothing else
# happens.  Otherwise:
#   - $self->{func} is compiled from the filter chain (_filtergen); the
#     generated sub drops results that do not match /./,
#   - %{$self->{db}} is tied to $self->{file} as a DB_File B-tree using
#     $self->{mode}, and the tie object is stored in $self->{dbh},
#   - when the mode has O_RDWR set, the write caches $self->{cache} and
#     $self->{cdict} are reset to empty hashes,
#   - $self->{cached} is reset to 0.
#
# Returns: the existing handle when already open; otherwise the value of
#          the final assignment, i.e. 0.
#
# NOTE(review): neither the eval nor the tie result is checked here.  If
# tie fails, $self->{db} is still autovivified as a plain hash -- confirm
# whether callers depend on this silent behaviour.
sub open {
  my $self = shift;

  return $self->{dbh} if defined $self->{dbh};

  my $file   = $self->{file};
  my $source = sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));

  $self->{func} = eval $source;
  $self->{dbh}  = tie(%{$self->{db}}, 'DB_File', $file,
                      $self->{mode}, 0664, $DB_BTREE);

  if ($self->{mode} & O_RDWR) {
    $self->{cache} = {};
    $self->{cdict} = {};
  }

  return $self->{cached} = 0;
}
# Add one record to the index.
#
# Arguments: $self, the record key, then the text item(s), which are
#            passed on to the compiled filter sub $self->{func}.
# Returns:   the largest term frequency found in this record (0 when the
#            filters yield no words).
#
# Effects, in order:
#   - opens the index if $self->{db} is not defined yet,
#   - counts how often each filtered word occurs,
#   - increments $self->{records},
#   - per distinct word: appends pack('w2', key, count) to the word's
#     entry in $self->{cache}, updates the word's counter in
#     $self->{cdict} (keyed by $O and the word), and increments
#     $self->{cached},
#   - calls sync once $self->{cached} exceeds 100_000,
#   - stores the maximum term frequency in $self->{db} under ($M, key).
sub insert {
  my $self = shift;
  my $key  = shift;

  $self->open unless defined $self->{db};

  # Term frequencies of this record.  @_ is passed through untouched.
  my %tf_of;
  $tf_of{$_}++ for $self->{func}->(@_);

  $self->{records}++;

  my $maxtf = 0;
  for my $word (keys %tf_of) {
    my $tf = $tf_of{$word};
    $maxtf = $tf if $tf > $maxtf;

    if (!defined $self->{cache}->{$word}) {
      # First posting for this word since the cache was last reset.
      $self->{cdict}->{$O,$word} = 1;
      $self->{cache}->{$word}    = pack 'w2', $key, $tf;
    }
    else {
      $self->{cdict}->{$O,$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $tf;
    }
    $self->{cached}++;
  }

  # This cache limit should be configurable
  if ($self->{cached} > 100_000) {
    $self->sync;
  }

  return ($self->{db}->{$M, $key} = $maxtf);
}
# We sort postings by increasing max term frequency (roughly, by
# increasing document length). This limits the loss of quality when only
# the first part of a posting list is processed.
sub sort_postings {
( run in 0.725 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )