XAO-Indexer

 view release on metacpan or  search on metacpan

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

        $value||=$config->get("/indexer/default/$param");
    }

    return $value || $default;
}

###############################################################################

sub ignore_limit ($) {
    my $self=shift;
    return $self->config_param('ignore_limit',500);
}

###############################################################################

sub commit_interval ($) {
    my $self=shift;
    return $self->config_param('commit_interval',0);
}

###############################################################################

sub init ($%) {
    my $self=shift;
    my $args=get_args(\@_);

    my $index_object=$args->{'index_object'} ||
        throw $self "init - no 'index_object'";

    dprint ref($self)."::init - XXX, nothing's in here yet....";
}

###############################################################################

sub search ($%) {
    my $self=shift;
    my $args=get_args(\@_);

    my $index_object=$args->{'index_object'} ||
        throw $self "search - no 'index_object'";

    eval 'use Compress::LZO';
    dprint "No Compress::LZO, comression won't work" if $@;

    my $str=$args->{'search_string'};
    $str='' unless defined $str;
    $str=~s/^\s*(.*?)\s*$/$1/sg;
    if(!length($str)) {
        dprint ref($self)."::search - empty search string";
        return [ ];
    }

    my $ordering=$args->{'ordering'} ||
        throw $self "search - no 'ordering'";
    my $ordering_seq=$self->get_orderings->{$ordering}->{'seq'} ||
        throw $self "search - no sequence in '$ordering' ordering";

    ##
    # Optional hash reference to be filled with statistics
    #
    my $rcdata=$args->{'rcdata'};
    $rcdata->{'ignored_words'}={ } if $rcdata;

    ### dprint "Searching for '$str' (ordering=$ordering, seq=$ordering_seq)";

    ##
    # Preparing spellchecker if needed
    #
    my $use_spellchecker=$rcdata && $self->config_param('use_spellchecker');
    my $spellchecker;
    if($use_spellchecker) {
        $spellchecker=$self->get_spellchecker;
        $spellchecker->switch_index($index_object->container_key);
        $rcdata->{'spellchecker_words'}={ };
    }

    ##
    # We cache ignored words. Cache returns a hash reference with
    # ignored words.
    #
    my $i_cache=get_current_project()->cache(
        name        => 'indexer_ignored',
        coords      => [ 'index_id' ],
        expire      => 60,
        retrieve    => sub {
            my $index_list=shift;
            my $args=get_args(\@_);
            my $index_id=$args->{'index_id'};
            my $ign_list=$index_list->get($index_id)->get('Ignore');
            my %ignored=map {
                $ign_list->get($_)->get('keyword','count');
            } $ign_list->keys;
            return \%ignored;
        },
    );
    my $ignored=$i_cache->get($index_object->container_object, {
        index_id    => $index_object->container_key,
    });

    ##
    # Building multi-word sequences. If some words are in the
    # ignore-list we skip them, but assume that there is a word in
    # between in word sequences. For instance 'wag the dog' would become
    # { wag => 1, dog => 3 }, providing that 'the' is ignored.
    #
    my @mdata;
    $str=~s/"(.*?)"/push(@mdata,$1);" "/sge;
    my @multi;
    my @simple;
    foreach my $elt (@mdata) {
        my $s=$self->analyze_text_split(0,$elt);
        next unless @$s;
        if(@$s==1) {
            push(@simple,$s->[0]);
        }
        else {
            if($spellchecker) {
                my $pairs=$spellchecker->suggest_replacements(join(' ',@$s));
                @{$rcdata->{'spellchecker_words'}}{keys %$pairs}=values %$pairs;
            }
            my @t=map {
                if(exists $ignored->{$_}) {
                    if($rcdata) {
                        $rcdata->{'ignored_words'}->{$_}=$ignored->{$_};
                    }
                    undef;
                }
                else {
                    $_;
                }
            } @$s;
            shift(@t) while @t && !defined($t[0]);
            pop(@t) while @t && !defined($t[$#t]);
            if(@t==1) {
                push(@simple,$t[0]);
            }
            else {
                push(@multi,\@t);
            }
        }
    }
    undef @mdata;

    ##
    # Simple words
    #
    if($spellchecker) {
        my $pairs=$spellchecker->suggest_replacements($str);
        @{$rcdata->{'spellchecker_words'}}{keys %$pairs}=values %$pairs;
    }
    push(@simple,map {
        if(exists $ignored->{$_}) {
            if($rcdata) {
                $rcdata->{'ignored_words'}->{$_}=$ignored->{$_};
            }
            ();
        }
        else {
            $_;
        }
    } @{$self->analyze_text_split(0,$str)});

    ##
    # If we are asked to provide data, storing splitted words.
    #
    if($rcdata) {
        $rcdata->{'words_single'}=\@simple;
        $rcdata->{'words_multi'}=\@multi;
        $rcdata->{'results_count'}=0;
    }
    ### dprint Dumper(\@multi),Dumper(\@simple);

    ##
    # First we search for multi-words sequences in the assumption that
    # they will provide smaller result sets or no results at all.
    #
    my @results;
    my $data_list=$index_object->get('Data');
    foreach my $marr (sort { scalar(@$b) <=> scalar(@$a) } @multi) {
        my $res=$self->search_multi($data_list,$ordering_seq,$marr);
        ### dprint "Multi Results: ",Dumper($marr),Dumper($res);
        if(!@$res) {
            return [ ];
        }
        push(@results,$res);
    }

    ##
    # Searching for simple words, position independent. Longer words first.
    #
    foreach my $kw (sort { length($b) <=> length($a) } @simple) {
        my $res=$self->search_simple($data_list,$ordering_seq,$kw);
        ### dprint "Simple Results: '$kw' ",Dumper($res);
        if(!@$res) {
            return [ ];
        }
        push(@results,$res);
    }

    ##
    # Joining all results together
    #
    if($rcdata) {
        my $sr=XAO::IndexerSupport::sorted_intersection(@results);
        $rcdata->{'results_count'}=scalar(@$sr);
        return $sr;
    }
    else {
        return XAO::IndexerSupport::sorted_intersection(@results);
    }
}

###############################################################################

sub search_multi ($$$$) {
    my ($self,$data_list,$oseq,$marr)=@_;

    ##
    # Getting results starting from the longest set of words in the hope
    # that we get no match at all. On null results exit immediately.
    #
    my %rawdata;
    foreach my $kw (sort { length($b || '') <=> length($a || '') } @$marr) {
        last unless defined $kw;
        my $sr=$data_list->search('keyword','eq',$kw);
        return [ ] unless @$sr;

        use bytes;

        my $r=$data_list->get($sr->[0])->get("idpos_$oseq");

        my $compression_flag=unpack('w',$r);
        if(defined $compression_flag && $compression_flag == 0) {

            # The temporary variable is required. See the note on the
            # other decompress() use.
            #
            my $c=substr($r,1);
            $r=Compress::LZO::decompress($c);
        }

        $rawdata{$kw}=$r;
    }

    return XAO::IndexerSupport::sorted_intersection_pos($marr,\%rawdata);
}

###############################################################################

sub search_simple ($$$$) {
    my ($self,$data_list,$oseq,$keyword)=@_;

    my $sr=$data_list->search('keyword','eq',$keyword);
    return [ ] unless @$sr;

    my $iddata=$data_list->get($sr->[0])->get("id_$oseq");

    ##
    # Sometimes the data gets damaged or is in the process of being
    # updated. This can lead to unparsable results.
    #
    my $result;
    try {
        use bytes;

        my $compression_flag=unpack('w',$iddata);
        if(defined $compression_flag && $compression_flag==0) {

            # Directly passing substr(..) into decompress fails in perl
            # 5.22 with "buffer parameter is not a SCALAR". Likely this
            # is because substr is actually an lvalue and not a true
            # scalar. Worked fine in earlier versions of perl. Putting
            # it in a temporary variable fixes that.
            #
            my $c=scalar(substr($iddata,1));
            $iddata=Compress::LZO::decompress($c);

            defined $iddata ||
                throw $self "Can't decompress data";
        }

        $result=[ unpack('w*',$iddata) ];
    }
    otherwise {
        my $e=shift;
        eprint "Bad indexer data for kw='$keyword', sorting='$oseq': $e";
        $result=[ ];
    };

    return $result;
}

###############################################################################

sub suggest_alternative ($%) {
    my $self=shift;
    my $args=get_args(\@_);

    my $index_object=$args->{'index_object'} ||
        throw $self "search - no 'index_object'";

    my $rcdata=$args->{'rcdata'} ||
        throw $self "suggest_alternatives - need rcdata";

    my $query=$args->{'search_string'} ||
        throw $self "suggest_alternatives - need search_string";

    my $spwords=$rcdata->{'spellchecker_words'};
    return '' unless $spwords;

    ##
    # This can be used to improve results when using a generic
    # dictionary on a small dataset, not a dictionary based on the
    # actual index content (which guarantees that we already get only
    # valid substitutes).
    #
    my %wcounts;
    my $max_alt_words=$self->config_param('spellchecker/max_alt_words') || 0;
    if($max_alt_words) {
        dprint "Using 'max_alt_words' slows down suggestion, consider building a custom dictionary";
        my $data_list=$index_object->get('Data');
        foreach my $word (keys %$spwords) {
            my $alist=$spwords->{$word};
            my @newlist;
            for(my $i=0; $i<$max_alt_words && $i<@$alist; ++$i) {
                my $altword=lc($alist->[$i]);
                ### dprint "Trying word '$word' -> '$altword'";

                my @aw=$self->analyze_text_split(0,$altword);
                my $found=0;
                foreach my $aw (@aw) {
                    my $sr=$data_list->search('keyword','eq',$aw);
                    next unless @$sr;
                    ++$found;
                    $wcounts{$aw}=scalar(@$sr);
                }

                if($found && $found==scalar(@aw)) {
                    push(@newlist,$altword);
                }
            }
            $spwords->{$word}=\@newlist;
        }
    }

    ##
    # Ordering the list of substitutions to try on the original query.
    # Fuzzy area, there could be multiple algorithms to do that.
    #
    my $algorithm=$self->config_param('spellchecker/algorithm') || 'sequential';
    my $max_alt_searches=$self->config_param('spellchecker/max_alt_searches') || 15;
    my $max_alt_results=$self->config_param('spellchecker/max_alt_results') || 2;
    my $max_result_distance=$self->config_param('spellchecker/max_result_distance') || 5;
    ### dprint ".algorithm=$algorithm max_alt_results=$max_alt_results max_alt_searches=$max_alt_searches";

    ##
    # In 'sequential' we scan through all words from most likely to
    # least likely, with all possible variations. If we have these variants:
    # A->(B,C)  K->(L,M)  X->(Y)
    # then the following will be tried
    # (A-B) (K-L) (X-Y) (A-B,K-L) (A-B,X-Y) (K-L,X-Y) (A-B,K-L,X-Y) (A-B,K-M)
    # (A-C,K-L) (A-C,X-Y) (K-M,X-Y) (A-C,K-M) (A-C,K-L,X-Y), etc..
    #
    # The idea is that we assign weight proportional to the distance
    # from the top of the list, make lists and then sort on the weight.
    #
    my @jobs;
    if($algorithm eq 'sequential') {

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

            my $distance=0;
            my $count=0;
            foreach my $pair (@{$elt->{'job'}}) {
                $distance+=$pair->[2];
                ++$count;
            }
            $elt->{'distance'}=$distance+$count-1;
        }

        @jobs=map {
            {
                pairs       => [ map { ($_->[0],$_->[1]) } @{$_->{'job'}} ],
                distance    => $_->{'distance'},
            }
        } sort {
            $a->{'distance'} <=> $b->{'distance'} ||
            scalar(@{$b->{'job'}}) <=> scalar(@{$a->{'job'}})
        } @list;

        ### use Data::Dumper;
        ### dprint Dumper(\@jobs);

        if(@jobs>$max_alt_searches) {
            splice(@jobs,$max_alt_searches);
        }
    }

    ##
    # In 'bycount' we keep completely dropping least likely words. Does
    # not work very well, probably should be removed altogether as unusable.
    #
    ### elsif($algorithm eq 'bycount') {
    ###     for(my $i=0; %words && $i<$max_alt_searches; ++$i) {
    ###         my @wlist=sort { $words{$b}->[0]->[1] <=> $words{$a}->[0]->[1] } keys %words;
    ###         my @pairs;
    ###         my $have_difference;
    ###         foreach my $word (@wlist) {
    ###             my $altword=$words{$word}->[0]->[0];
    ###             $have_difference=1 if $altword ne $word;
    ###             push(@pairs,$word => $altword);
    ###         }
    ###         push(@jobs,{
    ###             pairs       => @pairs,
    ###             distance    => 1,
    ###         }) if $have_difference;
    ###         my $word=$wlist[$#wlist];
    ###         if(@{$words{$word}}>1) {
    ###             shift(@{$words{$word}});
    ###         }
    ###         else {
    ###             delete $words{$word};
    ###         }
    ###     }
    ### }
    else {
        throw $self "suggest_alternative - '$algorithm' algorithm is not supported";
    }

    # Now building other alternative strings and returning them in order.
    #
    my $results_count=$rcdata->{'results_count'} || 0;
    my @alts;
    for(my $i=0; $i<@jobs; ++$i) {
        my $distance=$jobs[$i]->{'distance'} || 1;
        $distance<=$max_result_distance || next;

        my $newq=$query;
        my $pairs=$jobs[$i]->{'pairs'};
        my @finalpairs;
        for(my $j=0; $j<@$pairs; $j+=2) {
            my $word=$pairs->[$j];
            my $altword=$pairs->[$j+1];
            if($word ne $altword) {
                $newq=~s/\b$word\b/$altword/ig;
                $newq=~s/^\s*(.*?)\s*$/$1/s;
                $newq=~s/\s{2,}/ /s;
                push(@finalpairs,[ $word, $altword ]);
            }
        }
        next if $newq =~ /^\s+$/s;

        dprint "Trying query '$newq' instead of '$query'";
        my $sr=$self->search(
            index_object    => $index_object,
            search_string   => $newq,
            ordering        => $args->{'ordering'},
        );
        my $newcount=@$sr;
        if($newcount && $newcount>=$results_count) {
            dprint "Got a match on '$newq' ($newcount)";
            push(@alts,{
                query       => $newq,
                pairs       => \@finalpairs,
                results     => $args->{'need_results'} ? $sr : undef,
                count       => $newcount,
                distance    => $distance,
            });

            last if scalar(@alts)>=$max_alt_results;
        }
    }

    # Storing all variants and returning the first one.
    #
    $rcdata->{'spellchecker_alternatives'}=\@alts;
    return @alts ? $alts[0]->{'query'} : undef;
}

###############################################################################

sub update ($%) {
    my $self=shift;
    my $args=get_args(\@_);

    my $index_object=$args->{'index_object'} ||
        throw $self "update - no 'index_object'";

    ##
    # Checking if we need to compress the data
    #
    my $compression=$index_object->get('compression');
    if($compression) {
        eval 'use Compress::LZO';
        if($@) {
            throw $self "update - need Compress::LZO for compression ($compression)";
        }
        if($compression<1 || $compression>9) {
            throw $self "update - compression level must be between 1 and 9";
        }
    }

    ##
    # For compatibility reasons we need to call it in array context.
    #
    my @carr=$self->get_collection($args);
    my $cinfo;
    if(@carr==2) {
        $cinfo={
            collection  => $carr[0],
            ids         => $carr[1],
        };
    }
    else {
        $cinfo=$carr[0];
    }

    ##
    # Maximum number of matches for a word for it to be completely
    # ignored.
    #
    my $ignore_limit=$self->ignore_limit;

    ##
    # Maximum number of keywords updated before a forced transaction
    # commit. This helps to deal with otherwise huge transaction that
    # have to be cached before being moved into binlogs for slave
    # updates in MySQL -- it is my understanding that it will require at
    # least tripple write, and at least one atomic huge write. Better
    # not do that.
    #
    my $commit_interval=$self->commit_interval;

    ##
    # If that's a partial update and we don't have any IDs to update
    # we return immediately. Otherwise proceed even with empty set to



( run in 0.573 second using v1.01-cache-2.11-cpan-f56aa216473 )