XAO-Indexer

 view release on metacpan or  search on metacpan

lib/XAO/DO/Data/Index.pm  view on Meta::CPAN

 if(keys %{$sd{ignored_words}}) {
     print "Ignored words:\n";
     foreach my $word (sort keys %{$sd{ignored_words}}) {
         print " * $word ($sd{ignored_words}->{$word})\n";
     }
 }

=cut

sub search_by_string ($$$;$) {
    my $self=shift;
    my ($ordering,$str,$rcdata)=@_;

    ##
    # The index object does not search by itself: the query is handed
    # over to its indexer object together with a reference to this
    # index. $rcdata is optional and is passed through as is (undef
    # when the caller did not supply it). Whatever the indexer's
    # search() returns is returned to the caller unchanged.
    #
    my $indexer=$self->indexer;

    return $indexer->search(
        index_object    => $self,
        search_string   => $str,
        ordering        => $ordering,
        rcdata          => $rcdata,
    );
}

###############################################################################

=item search_by_string_oid ($$;$)

The same as search_by_string() method, but translates results from
collection IDs to object IDs. Use it with care: on large result sets it
may take significant time!

=cut

sub search_by_string_oid ($$$;$) {
    my ($self,$ordering,$str,$rcdata)=@_;

    my $cids=$self->indexer->search(
        index_object    => $self,
        search_string   => $str,
        ordering        => $ordering,
        rcdata          => $rcdata,
    );

    my $coll_obj=$self->get_collection_object;
    my @oids;
    foreach my $cid (@$cids) {
        try {
            push(@oids,$coll_obj->get($cid)->container_key);
        }
        otherwise {
            my $e=shift;

lib/XAO/DO/Data/Index.pm  view on Meta::CPAN

=item suggest_alternative ($$$;$)

Returns an alternative search string by trying words found during
search_by_string and stored in the returned data array.

EXPERIMENTAL UNSTABLE API.

=cut

sub suggest_alternative ($$$$;$) {
    my $self=shift;
    my ($ordering,$str,$rcdata,$need_results)=@_;

    ##
    # All the work is done by the indexer object. It receives this
    # index, the original query and ordering, the data hash given by
    # the caller in $rcdata and the optional $need_results value
    # (undef when omitted). Its return value is handed back as is.
    #
    my $indexer=$self->indexer;

    return $indexer->suggest_alternative(
        index_object    => $self,
        search_string   => $str,
        ordering        => $ordering,
        rcdata          => $rcdata,
        need_results    => $need_results,
    );
}

###############################################################################

=item update ($)

Updates the index with the current data. Exactly what data it is based
on depends entirely on the corresponding indexer object.

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

    }

    my $ordering=$args->{'ordering'} ||
        throw $self "search - no 'ordering'";
    my $ordering_seq=$self->get_orderings->{$ordering}->{'seq'} ||
        throw $self "search - no sequence in '$ordering' ordering";

    ##
    # Optional hash reference to be filled with statistics
    #
    my $rcdata=$args->{'rcdata'};
    $rcdata->{'ignored_words'}={ } if $rcdata;

    ### dprint "Searching for '$str' (ordering=$ordering, seq=$ordering_seq)";

    ##
    # Preparing spellchecker if needed
    #
    my $use_spellchecker=$rcdata && $self->config_param('use_spellchecker');
    my $spellchecker;
    if($use_spellchecker) {
        $spellchecker=$self->get_spellchecker;
        $spellchecker->switch_index($index_object->container_key);
        $rcdata->{'spellchecker_words'}={ };
    }

    ##
    # We cache ignored words. Cache returns a hash reference with
    # ignored words.
    #
    my $i_cache=get_current_project()->cache(
        name        => 'indexer_ignored',
        coords      => [ 'index_id' ],
        expire      => 60,

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

    my @simple;
    foreach my $elt (@mdata) {
        my $s=$self->analyze_text_split(0,$elt);
        next unless @$s;
        if(@$s==1) {
            push(@simple,$s->[0]);
        }
        else {
            if($spellchecker) {
                my $pairs=$spellchecker->suggest_replacements(join(' ',@$s));
                @{$rcdata->{'spellchecker_words'}}{keys %$pairs}=values %$pairs;
            }
            my @t=map {
                if(exists $ignored->{$_}) {
                    if($rcdata) {
                        $rcdata->{'ignored_words'}->{$_}=$ignored->{$_};
                    }
                    undef;
                }
                else {
                    $_;
                }
            } @$s;
            shift(@t) while @t && !defined($t[0]);
            pop(@t) while @t && !defined($t[$#t]);
            if(@t==1) {

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

            }
        }
    }
    undef @mdata;

    ##
    # Simple words
    #
    if($spellchecker) {
        my $pairs=$spellchecker->suggest_replacements($str);
        @{$rcdata->{'spellchecker_words'}}{keys %$pairs}=values %$pairs;
    }
    push(@simple,map {
        if(exists $ignored->{$_}) {
            if($rcdata) {
                $rcdata->{'ignored_words'}->{$_}=$ignored->{$_};
            }
            ();
        }
        else {
            $_;
        }
    } @{$self->analyze_text_split(0,$str)});

    ##
    # If we are asked to provide data, storing split words.
    #
    if($rcdata) {
        $rcdata->{'words_single'}=\@simple;
        $rcdata->{'words_multi'}=\@multi;
        $rcdata->{'results_count'}=0;
    }
    ### dprint Dumper(\@multi),Dumper(\@simple);

    ##
    # First we search for multi-words sequences in the assumption that
    # they will provide smaller result sets or no results at all.
    #
    my @results;
    my $data_list=$index_object->get('Data');
    foreach my $marr (sort { scalar(@$b) <=> scalar(@$a) } @multi) {

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

        ### dprint "Simple Results: '$kw' ",Dumper($res);
        if(!@$res) {
            return [ ];
        }
        push(@results,$res);
    }

    ##
    # Joining all results together
    #
    if($rcdata) {
        my $sr=XAO::IndexerSupport::sorted_intersection(@results);
        $rcdata->{'results_count'}=scalar(@$sr);
        return $sr;
    }
    else {
        return XAO::IndexerSupport::sorted_intersection(@results);
    }
}

###############################################################################

sub search_multi ($$$$) {

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN


###############################################################################

sub suggest_alternative ($%) {
    my $self=shift;
    my $args=get_args(\@_);

    my $index_object=$args->{'index_object'} ||
        throw $self "search - no 'index_object'";

    my $rcdata=$args->{'rcdata'} ||
        throw $self "suggest_alternatives - need rcdata";

    my $query=$args->{'search_string'} ||
        throw $self "suggest_alternatives - need search_string";

    my $spwords=$rcdata->{'spellchecker_words'};
    return '' unless $spwords;

    ##
    # This can be used to improve results when using a generic
    # dictionary on a small dataset, not a dictionary based on the
    # actual index content (which guarantees that we already get only
    # valid substitutes).
    #
    my %wcounts;
    my $max_alt_words=$self->config_param('spellchecker/max_alt_words') || 0;

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

    ###             delete $words{$word};
    ###         }
    ###     }
    ### }
    else {
        throw $self "suggest_alternative - '$algorithm' algorithm is not supported";
    }

    # Now building other alternative strings and returning them in order.
    #
    my $results_count=$rcdata->{'results_count'} || 0;
    my @alts;
    for(my $i=0; $i<@jobs; ++$i) {
        my $distance=$jobs[$i]->{'distance'} || 1;
        $distance<=$max_result_distance || next;

        my $newq=$query;
        my $pairs=$jobs[$i]->{'pairs'};
        my @finalpairs;
        for(my $j=0; $j<@$pairs; $j+=2) {
            my $word=$pairs->[$j];

lib/XAO/DO/Indexer/Base.pm  view on Meta::CPAN

                count       => $newcount,
                distance    => $distance,
            });

            last if scalar(@alts)>=$max_alt_results;
        }
    }

    # Storing all variants and returning the first one.
    #
    $rcdata->{'spellchecker_alternatives'}=\@alts;
    return @alts ? $alts[0]->{'query'} : undef;
}

###############################################################################

sub update ($%) {
    my $self=shift;
    my $args=get_args(\@_);

    my $index_object=$args->{'index_object'} ||

lib/XAO/DO/Web/Indexer.pm  view on Meta::CPAN

    ##
    # Searching. If we have ignored words templates then building the
    # list of ignored words as well.
    #
    my $index_obj=$self->odb->fetch('/Indexes')->get($index_id);
    my $obj_ids;
    my $page=$self->object;
    my $ignored_text='';
    my $spelling_text='';
    if($args->{'ignored.path'} || $args->{'ignored.template'} || $args->{'spelling.path'} || $args->{'spelling.template'}) {
        my %rcdata;
        $obj_ids=$index_obj->search_by_string($orderby,$keywords,\%rcdata);

        if($args->{'ignored.path'} || $args->{'ignored.template'}) {
            my $iw=$rcdata{'ignored_words'};
            my $iw_num=scalar keys %$iw;
            if($iw_num) {
                $ignored_text.=$page->expand($args,{
                    path        => $args->{'ignored.header.path'},
                    template    => $args->{'ignored.header.template'},
                    TOTAL_WORDS => $iw_num,
                }) if $args->{'ignored.header.path'} || $args->{'ignored.header.template'};

                my $first=1;
                foreach my $w (keys %$iw) {

lib/XAO/DO/Web/Indexer.pm  view on Meta::CPAN

                $ignored_text.=$page->expand($args,{
                    path        => $args->{'ignored.footer.path'},
                    template    => $args->{'ignored.footer.template'},
                    TOTAL_WORDS => $iw_num,
                }) if $args->{'ignored.footer.path'} || $args->{'ignored.footer.template'};
            }
        }

        my $trigger=$args->{'spelling.trigger'} || 3;
        if(@$obj_ids<$trigger && $args->{'spelling.path'} || $args->{'spelling.template'}) {
            $index_obj->suggest_alternative($orderby,$keywords,\%rcdata);

            my @alt_kw;
            my @alt_kw_html;
            foreach my $i (0,1) {
                my $spdata=$rcdata{'spellchecker_alternatives'}->[$i];
                last unless $spdata &&
                            $spdata->{'query'} &&
                            $spdata->{'distance'}<=3;

                my $alt_query=$spdata->{'query'};
                my $alt_query_html=t2ht($alt_query);
                foreach my $pair (@{$spdata->{'pairs'}}) {
                    my $altword=t2ht($pair->[1]);
                    next unless length($altword);
                    $alt_query_html=~s/\b($altword)\b/<em><strong>$1<\/em><\/strong>/sg;

scripts/xao-indexer  view on Meta::CPAN

##
# Searching
#
if($search) {
    ##
    # The first two remaining command line arguments are the index
    # name and the ordering name; everything after them is treated as
    # a separate search string.
    #
    my $i_name=shift(@ARGV);
    my $ordering=shift(@ARGV);
    my $index=$index_list->get($i_name);

    for my $query (@ARGV) {
        dprint "Searching '$i_name' for '$query'";

        ##
        # %rcdata is filled by the search and, when alternatives are
        # requested, by suggest_alternative as well. Its content is
        # only shown through dprint.
        #
        my %rcdata;
        my $found=$index->search_by_string($ordering,$query,\%rcdata);
        $index->suggest_alternative($ordering,$query,\%rcdata) if $alternatives;

        dprint Dumper(\%rcdata);

        ##
        # One result per line, unless printing results is turned off.
        #
        next if $no_results;
        print join("\n",@$found),"\n";
    }

    exit 0;
}

##
# Creating new index structure

t/testcases/Indexer/incremental.pm  view on Meta::CPAN

                of          => 1,
            },
        },
    );
    foreach my $test_id (keys %matrix) {
        my $test=$matrix{$test_id};
        my $query=$test->{query};
        foreach my $oname (sort keys %$test) {
            next if $oname eq 'query';
            next if $oname eq 'ignored';
            my %rcdata;
            my $sr;
            if($test->{ignored}) {
                $sr=$foo_index->search_by_string($oname,$query,\%rcdata);
                foreach my $w (keys %{$test->{ignored}}) {
                    my $expect=$test->{ignored}->{$w};
                    my $got=$rcdata{ignored_words}->{$w};
                    if($expect) {
                        $self->assert(defined($got),
                                      "Expected '$w' to be ignored, but it is not");
                    }
                    else {
                        $self->assert(!defined($got),
                                      "Expected '$w' not to be ignored, but it is (count=".($got||'').")");
                    }
                }
            }

t/testcases/Indexer/search.pm  view on Meta::CPAN

        },
    );

    foreach my $test_id (keys %matrix) {
        my $test=$matrix{$test_id};
        my $query=$test->{'query'};
        foreach my $oname (sort keys %$test) {
            next if $oname eq 'query';
            next if $oname eq 'ignored';
            next if $oname eq 'use_oid';
            my %rcdata;
            my $sr;
            if($test->{'ignored'}) {
                $sr=$test->{'use_oid'} ? $foo_index->search_by_string_oid($oname,$query,\%rcdata)
                                       : $foo_index->search_by_string($oname,$query,\%rcdata);
                foreach my $w (keys %{$test->{'ignored'}}) {
                    my $expect=$test->{'ignored'}->{$w};
                    my $got=$rcdata{'ignored_words'}->{$w};
                    if(defined $expect) {
                        $self->assert(defined($got),
                                      "Expected '$w' to be ignored, but it is not");
                        $self->assert($got == $expect,
                                      "Expected count $expect on ignored $w, got $got");
                    }
                    else {
                        $self->assert(!defined($got),
                                      "Expected '$w' not to be ignored, but it is (count=".($got||'').")");
                    }

t/testcases/Indexer/spellchecker.pm  view on Meta::CPAN

    );
    foreach my $test_id (keys %matrix) {
        my $test=$matrix{$test_id};
        my $query=$test->{'query'};
        foreach my $oname (sort keys %$test) {
            next if $oname eq 'query';
            next if $oname eq 'ignored';
            next if $oname eq 'use_oid';
            next if $oname eq 'speller';
            next if $oname eq 'speller_query';
            my %rcdata;
            my $sr;
            if($test->{'ignored'} || $test->{'speller'} || $test->{'speller_query'}) {
                $sr=$test->{'use_oid'} ? $foo_index->search_by_string_oid($oname,$query,\%rcdata)
                                       : $foo_index->search_by_string($oname,$query,\%rcdata);

                if($test->{'speller'}) {
                    my $got=$rcdata{'spellchecker_words'};
                    $self->assert(defined($got) && ref($got) eq 'HASH',
                                  "Expected a spellchecked list");
                    foreach my $word (keys %{$test->{'speller'}}) {
                        $self->assert(exists($got->{$word}),
                                      "Expected a spell-suggestion for $word");
                        my $expect=$test->{'speller'}->{$word};
                        $self->assert(scalar(grep { $_ eq  $expect } @{$got->{$word}}),
                                      "Expected spell-suggestion '".$test->{'speller'}->{$word}."' for '$word'");
                    }
                }

                if($test->{'speller_query'}) {
                    my $got=$foo_index->suggest_alternative($oname,$query,\%rcdata);
                    $self->assert($got eq $test->{'speller_query'},
                                  "Expected alternative query '$test->{'speller_query'}', got '$got'");

                    if($test->{'speller'}) {
                        foreach my $word (keys %{$test->{'speller'}}) {
                            my $altword=$test->{'speller'}->{$word};
                            next if $altword eq $word;
                            my $pairs=$rcdata{'spellchecker_alternatives'}->[0]->{'pairs'};
                            $self->assert(scalar(grep { $_->[0] eq $word && $_->[1] eq $altword } @$pairs),
                                          "Expected to have a replacement pair ($word->$altword) for $query");
                        }
                    }
                }

                if($test->{'ignored'}) {
                    foreach my $w (keys %{$test->{'ignored'}}) {
                        my $expect=$test->{'ignored'}->{$w};
                        my $got=$rcdata{'ignored_words'}->{$w};
                        if(defined $expect) {
                            $self->assert(defined($got),
                                          "Expected '$w' to be ignored, but it is not");
                            $self->assert($got == $expect,
                                          "Expected count $expect on ignored $w, got $got");
                        }
                        else {
                            $self->assert(!defined($got),
                                          "Expected '$w' not to be ignored, but it is (count=".($got||'').")");
                        }



( run in 0.286 second using v1.01-cache-2.11-cpan-454fe037f31 )