XAO-Indexer
view release on metacpan or search on metacpan
lib/XAO/DO/Indexer/Base.pm view on Meta::CPAN
$value||=$config->get("/indexer/default/$param");
}
return $value || $default;
}
###############################################################################
sub ignore_limit ($) {
my $self=shift;
return $self->config_param('ignore_limit',500);
}
###############################################################################
sub commit_interval ($) {
my $self=shift;
return $self->config_param('commit_interval',0);
}
###############################################################################
sub init ($%) {
my $self=shift;
my $args=get_args(\@_);
my $index_object=$args->{'index_object'} ||
throw $self "init - no 'index_object'";
dprint ref($self)."::init - XXX, nothing's in here yet....";
}
###############################################################################
sub search ($%) {
my $self=shift;
my $args=get_args(\@_);
my $index_object=$args->{'index_object'} ||
throw $self "search - no 'index_object'";
eval 'use Compress::LZO';
dprint "No Compress::LZO, comression won't work" if $@;
my $str=$args->{'search_string'};
$str='' unless defined $str;
$str=~s/^\s*(.*?)\s*$/$1/sg;
if(!length($str)) {
dprint ref($self)."::search - empty search string";
return [ ];
}
my $ordering=$args->{'ordering'} ||
throw $self "search - no 'ordering'";
my $ordering_seq=$self->get_orderings->{$ordering}->{'seq'} ||
throw $self "search - no sequence in '$ordering' ordering";
##
# Optional hash reference to be filled with statistics
#
my $rcdata=$args->{'rcdata'};
$rcdata->{'ignored_words'}={ } if $rcdata;
### dprint "Searching for '$str' (ordering=$ordering, seq=$ordering_seq)";
##
# Preparing spellchecker if needed
#
my $use_spellchecker=$rcdata && $self->config_param('use_spellchecker');
my $spellchecker;
if($use_spellchecker) {
$spellchecker=$self->get_spellchecker;
$spellchecker->switch_index($index_object->container_key);
$rcdata->{'spellchecker_words'}={ };
}
##
# We cache ignored words. Cache returns a hash reference with
# ignored words.
#
my $i_cache=get_current_project()->cache(
name => 'indexer_ignored',
coords => [ 'index_id' ],
expire => 60,
retrieve => sub {
my $index_list=shift;
my $args=get_args(\@_);
my $index_id=$args->{'index_id'};
my $ign_list=$index_list->get($index_id)->get('Ignore');
my %ignored=map {
$ign_list->get($_)->get('keyword','count');
} $ign_list->keys;
return \%ignored;
},
);
my $ignored=$i_cache->get($index_object->container_object, {
index_id => $index_object->container_key,
});
##
# Building multi-word sequences. If some words are in the
# ignore-list we skip them, but assume that there is a word in
# between in word sequences. For instance 'wag the dog' would become
# { wag => 1, dog => 3 }, providing that 'the' is ignored.
#
my @mdata;
$str=~s/"(.*?)"/push(@mdata,$1);" "/sge;
my @multi;
my @simple;
foreach my $elt (@mdata) {
my $s=$self->analyze_text_split(0,$elt);
next unless @$s;
if(@$s==1) {
push(@simple,$s->[0]);
}
else {
if($spellchecker) {
my $pairs=$spellchecker->suggest_replacements(join(' ',@$s));
@{$rcdata->{'spellchecker_words'}}{keys %$pairs}=values %$pairs;
}
my @t=map {
if(exists $ignored->{$_}) {
if($rcdata) {
$rcdata->{'ignored_words'}->{$_}=$ignored->{$_};
}
undef;
}
else {
$_;
}
} @$s;
shift(@t) while @t && !defined($t[0]);
pop(@t) while @t && !defined($t[$#t]);
if(@t==1) {
push(@simple,$t[0]);
}
else {
push(@multi,\@t);
}
}
}
undef @mdata;
##
# Simple words
#
if($spellchecker) {
my $pairs=$spellchecker->suggest_replacements($str);
@{$rcdata->{'spellchecker_words'}}{keys %$pairs}=values %$pairs;
}
push(@simple,map {
if(exists $ignored->{$_}) {
if($rcdata) {
$rcdata->{'ignored_words'}->{$_}=$ignored->{$_};
}
();
}
else {
$_;
}
} @{$self->analyze_text_split(0,$str)});
##
# If we are asked to provide data, storing splitted words.
#
if($rcdata) {
$rcdata->{'words_single'}=\@simple;
$rcdata->{'words_multi'}=\@multi;
$rcdata->{'results_count'}=0;
}
### dprint Dumper(\@multi),Dumper(\@simple);
##
# First we search for multi-words sequences in the assumption that
# they will provide smaller result sets or no results at all.
#
my @results;
my $data_list=$index_object->get('Data');
foreach my $marr (sort { scalar(@$b) <=> scalar(@$a) } @multi) {
my $res=$self->search_multi($data_list,$ordering_seq,$marr);
### dprint "Multi Results: ",Dumper($marr),Dumper($res);
if(!@$res) {
return [ ];
}
push(@results,$res);
}
##
# Searching for simple words, position independent. Longer words first.
#
foreach my $kw (sort { length($b) <=> length($a) } @simple) {
my $res=$self->search_simple($data_list,$ordering_seq,$kw);
### dprint "Simple Results: '$kw' ",Dumper($res);
if(!@$res) {
return [ ];
}
push(@results,$res);
}
##
# Joining all results together
#
if($rcdata) {
my $sr=XAO::IndexerSupport::sorted_intersection(@results);
$rcdata->{'results_count'}=scalar(@$sr);
return $sr;
}
else {
return XAO::IndexerSupport::sorted_intersection(@results);
}
}
###############################################################################
sub search_multi ($$$$) {
my ($self,$data_list,$oseq,$marr)=@_;
##
# Getting results starting from the longest set of words in the hope
# that we get no match at all. On null results exit immediately.
#
my %rawdata;
foreach my $kw (sort { length($b || '') <=> length($a || '') } @$marr) {
last unless defined $kw;
my $sr=$data_list->search('keyword','eq',$kw);
return [ ] unless @$sr;
use bytes;
my $r=$data_list->get($sr->[0])->get("idpos_$oseq");
my $compression_flag=unpack('w',$r);
if(defined $compression_flag && $compression_flag == 0) {
# The temporary variable is required. See the note on the
# other decompress() use.
#
my $c=substr($r,1);
$r=Compress::LZO::decompress($c);
}
$rawdata{$kw}=$r;
}
return XAO::IndexerSupport::sorted_intersection_pos($marr,\%rawdata);
}
###############################################################################
sub search_simple ($$$$) {
my ($self,$data_list,$oseq,$keyword)=@_;
my $sr=$data_list->search('keyword','eq',$keyword);
return [ ] unless @$sr;
my $iddata=$data_list->get($sr->[0])->get("id_$oseq");
##
# Sometimes the data gets damaged or is in the process of being
# updated. This can lead to unparsable results.
#
my $result;
try {
use bytes;
my $compression_flag=unpack('w',$iddata);
if(defined $compression_flag && $compression_flag==0) {
# Directly passing substr(..) into decompress fails in perl
# 5.22 with "buffer parameter is not a SCALAR". Likely this
# is because substr is actually an lvalue and not a true
# scalar. Worked fine in earlier versions of perl. Putting
# it in a temporary variable fixes that.
#
my $c=scalar(substr($iddata,1));
$iddata=Compress::LZO::decompress($c);
defined $iddata ||
throw $self "Can't decompress data";
}
$result=[ unpack('w*',$iddata) ];
}
otherwise {
my $e=shift;
eprint "Bad indexer data for kw='$keyword', sorting='$oseq': $e";
$result=[ ];
};
return $result;
}
###############################################################################
sub suggest_alternative ($%) {
my $self=shift;
my $args=get_args(\@_);
my $index_object=$args->{'index_object'} ||
throw $self "search - no 'index_object'";
my $rcdata=$args->{'rcdata'} ||
throw $self "suggest_alternatives - need rcdata";
my $query=$args->{'search_string'} ||
throw $self "suggest_alternatives - need search_string";
my $spwords=$rcdata->{'spellchecker_words'};
return '' unless $spwords;
##
# This can be used to improve results when using a generic
# dictionary on a small dataset, not a dictionary based on the
# actual index content (which guarantees that we already get only
# valid substitutes).
#
my %wcounts;
my $max_alt_words=$self->config_param('spellchecker/max_alt_words') || 0;
if($max_alt_words) {
dprint "Using 'max_alt_words' slows down suggestion, consider building a custom dictionary";
my $data_list=$index_object->get('Data');
foreach my $word (keys %$spwords) {
my $alist=$spwords->{$word};
my @newlist;
for(my $i=0; $i<$max_alt_words && $i<@$alist; ++$i) {
my $altword=lc($alist->[$i]);
### dprint "Trying word '$word' -> '$altword'";
my @aw=$self->analyze_text_split(0,$altword);
my $found=0;
foreach my $aw (@aw) {
my $sr=$data_list->search('keyword','eq',$aw);
next unless @$sr;
++$found;
$wcounts{$aw}=scalar(@$sr);
}
if($found && $found==scalar(@aw)) {
push(@newlist,$altword);
}
}
$spwords->{$word}=\@newlist;
}
}
##
# Ordering the list of substitutions to try on the original query.
# Fuzzy area, there could be multiple algorithms to do that.
#
my $algorithm=$self->config_param('spellchecker/algorithm') || 'sequential';
my $max_alt_searches=$self->config_param('spellchecker/max_alt_searches') || 15;
my $max_alt_results=$self->config_param('spellchecker/max_alt_results') || 2;
my $max_result_distance=$self->config_param('spellchecker/max_result_distance') || 5;
### dprint ".algorithm=$algorithm max_alt_results=$max_alt_results max_alt_searches=$max_alt_searches";
##
# In 'sequential' we scan through all words from most likely to
# least likely, with all possible variations. If we have these variants:
# A->(B,C) K->(L,M) X->(Y)
# then the following will be tried
# (A-B) (K-L) (X-Y) (A-B,K-L) (A-B,X-Y) (K-L,X-Y) (A-B,K-L,X-Y) (A-B,K-M)
# (A-C,K-L) (A-C,X-Y) (K-M,X-Y) (A-C,K-M) (A-C,K-L,X-Y), etc..
#
# The idea is that we assign weight proportional to the distance
# from the top of the list, make lists and then sort on the weight.
#
my @jobs;
if($algorithm eq 'sequential') {
lib/XAO/DO/Indexer/Base.pm view on Meta::CPAN
my $distance=0;
my $count=0;
foreach my $pair (@{$elt->{'job'}}) {
$distance+=$pair->[2];
++$count;
}
$elt->{'distance'}=$distance+$count-1;
}
@jobs=map {
{
pairs => [ map { ($_->[0],$_->[1]) } @{$_->{'job'}} ],
distance => $_->{'distance'},
}
} sort {
$a->{'distance'} <=> $b->{'distance'} ||
scalar(@{$b->{'job'}}) <=> scalar(@{$a->{'job'}})
} @list;
### use Data::Dumper;
### dprint Dumper(\@jobs);
if(@jobs>$max_alt_searches) {
splice(@jobs,$max_alt_searches);
}
}
##
# In 'bycount' we keep completely dropping least likely words. Does
# not work very well, probably should be removed altogether as unusable.
#
### elsif($algorithm eq 'bycount') {
### for(my $i=0; %words && $i<$max_alt_searches; ++$i) {
### my @wlist=sort { $words{$b}->[0]->[1] <=> $words{$a}->[0]->[1] } keys %words;
### my @pairs;
### my $have_difference;
### foreach my $word (@wlist) {
### my $altword=$words{$word}->[0]->[0];
### $have_difference=1 if $altword ne $word;
### push(@pairs,$word => $altword);
### }
### push(@jobs,{
### pairs => @pairs,
### distance => 1,
### }) if $have_difference;
### my $word=$wlist[$#wlist];
### if(@{$words{$word}}>1) {
### shift(@{$words{$word}});
### }
### else {
### delete $words{$word};
### }
### }
### }
else {
throw $self "suggest_alternative - '$algorithm' algorithm is not supported";
}
# Now building other alternative strings and returning them in order.
#
my $results_count=$rcdata->{'results_count'} || 0;
my @alts;
for(my $i=0; $i<@jobs; ++$i) {
my $distance=$jobs[$i]->{'distance'} || 1;
$distance<=$max_result_distance || next;
my $newq=$query;
my $pairs=$jobs[$i]->{'pairs'};
my @finalpairs;
for(my $j=0; $j<@$pairs; $j+=2) {
my $word=$pairs->[$j];
my $altword=$pairs->[$j+1];
if($word ne $altword) {
$newq=~s/\b$word\b/$altword/ig;
$newq=~s/^\s*(.*?)\s*$/$1/s;
$newq=~s/\s{2,}/ /s;
push(@finalpairs,[ $word, $altword ]);
}
}
next if $newq =~ /^\s+$/s;
dprint "Trying query '$newq' instead of '$query'";
my $sr=$self->search(
index_object => $index_object,
search_string => $newq,
ordering => $args->{'ordering'},
);
my $newcount=@$sr;
if($newcount && $newcount>=$results_count) {
dprint "Got a match on '$newq' ($newcount)";
push(@alts,{
query => $newq,
pairs => \@finalpairs,
results => $args->{'need_results'} ? $sr : undef,
count => $newcount,
distance => $distance,
});
last if scalar(@alts)>=$max_alt_results;
}
}
# Storing all variants and returning the first one.
#
$rcdata->{'spellchecker_alternatives'}=\@alts;
return @alts ? $alts[0]->{'query'} : undef;
}
###############################################################################
sub update ($%) {
my $self=shift;
my $args=get_args(\@_);
my $index_object=$args->{'index_object'} ||
throw $self "update - no 'index_object'";
##
# Checking if we need to compress the data
#
my $compression=$index_object->get('compression');
if($compression) {
eval 'use Compress::LZO';
if($@) {
throw $self "update - need Compress::LZO for compression ($compression)";
}
if($compression<1 || $compression>9) {
throw $self "update - compression level must be between 1 and 9";
}
}
##
# For compatibility reasons we need to call it in array context.
#
my @carr=$self->get_collection($args);
my $cinfo;
if(@carr==2) {
$cinfo={
collection => $carr[0],
ids => $carr[1],
};
}
else {
$cinfo=$carr[0];
}
##
# Maximum number of matches for a word for it to be completely
# ignored.
#
my $ignore_limit=$self->ignore_limit;
##
# Maximum number of keywords updated before a forced transaction
# commit. This helps to deal with otherwise huge transaction that
# have to be cached before being moved into binlogs for slave
# updates in MySQL -- it is my understanding that it will require at
# least tripple write, and at least one atomic huge write. Better
# not do that.
#
my $commit_interval=$self->commit_interval;
##
# If that's a partial update and we don't have any IDs to update
# we return immediately. Otherwise proceed even with empty set to
( run in 0.573 second using v1.01-cache-2.11-cpan-f56aa216473 )