endpoint results from the CPAN

endpoint
KSx-Highlight-Summarizer
view release on metacpan or search on metacpan
lib/KSx/Highlight/Summarizer.pm view on Meta::CPAN
package KSx::Highlight::Summarizer;

$VERSION = '0.06';

@ISA = KinoSearch::Highlight::Highlighter;
use KinoSearch::Highlight::Highlighter;

use strict;

use List::Util qw 'min';
use Number::Range;

use Hash::Util::FieldHash::Compat 'fieldhashes';
fieldhashes \my( %ellipsis, %summ_len, %page_h, %encoder );

sub _range_endpoints {
  my $range = shift;
  my @range = $range->range;
  my $previous = shift @range;
  my $subrange = [($previous) x 2];
  my @arrays;
  foreach my $current (@range) {
    if ($current == ($previous + 1)) {
      $subrange->[1] = $current;
    }
    else {
      push @arrays, $subrange;
      $subrange = [($current) x 2];
    }
    $previous = $current;
  }
  return @arrays, $subrange; # Make sure the last subrange isnâ€™t left out!
}

sub new {
	my($pack, %args) = @_;
	my $ellipsis = exists $args{ellipsis} ? delete $args{ellipsis}
		: ' ... ';
	my $summ_len = exists $args{summary_length}
		? delete $args{summary_length} : 0;
	my $page_h = delete $args{page_handler};
	my $encoder   = delete $args{encoder};

	# accept args that the superclass only allows one to set through
	# accessor methods:
        my $pre_tag = delete $args{pre_tag};
        my $post_tag = delete $args{post_tag};

	my $self = SUPER::new $pack %args;

	$ellipsis{$self} = $ellipsis;
	$summ_len{$self} = $summ_len;
	$page_h{$self}   = $page_h;
	$encoder{$self}  = $encoder;

	defined $pre_tag and $self->set_pre_tag($pre_tag);
	defined $post_tag and $self->set_post_tag($post_tag);

	return $self;
}

sub create_excerpt {
    my ($self, $hitdoc) = @_;
	
    my $field = $self->get_field;
    my $x_len = $self->get_excerpt_length;
    my $limit = int($x_len /3 );

    # retrieve the text from the chosen field
    my $text = $hitdoc->{$field};
    return unless defined $text;
    my $text_length = length $text;
    return '' unless $text_length;

    # get offsets and weights of words that match
    my $searcher = $self->get_searchable;
    my $posits = $self->get_compiler->highlight_spans(
        searchable => $searcher,
        field      => $field,
        doc_vec    => $searcher->fetch_doc_vec(
                          $hitdoc->get_doc_id
                      ),
    );
    my @locs = map [$_->get_offset,$_->get_weight], @{
     KinoSearch::Highlight::HeatMap->new(
        spans  => $posits,
        window => $limit*2
     )->get_spans
    };
    @locs = map $$_[0], sort { $$b[1] <=> $$a[1] } @locs;
    
    @locs or @locs = 0;

#warn "@locs" if $summ_len{$self};
    # determine the rough boundaries of the excerpts
    my $range = new Number::Range;
    my $summ_len = $summ_len{$self};
    for(@locs) {
        no warnings; # suppress Number::Rangeâ€™s nasty warnings
	my $start = $_-$limit;
	$start = 0 if $start < 0;
        $range->addrange($start . '..' . min($start+$x_len, $text_length));
	last if !$summ_len || $range->size >= $summ_len;
    }
    my @excerpt_bounds = _range_endpoints($range);
#use DDS; warn Dump \@excerpt_bounds if $summ_len;

    # close small gaps between ranges
    for(my $c = 1; $c < @excerpt_bounds;++$c) {
        $excerpt_bounds[$c][0] - $excerpt_bounds[$c-1][1] <= 10 and
            $excerpt_bounds[$c-1][1] = $excerpt_bounds[$c][1],
            splice(@excerpt_bounds, $c, 1),
            --$c;
    }

    # extract the offsets from the highlight spans
    my(@starts, @ends);
    for(@$posits) {
     push(@starts, my $start = $_->get_offset);
     push(@ends,   $start + $_->get_length);
    }

    # make the summary
    my $summary = '';
    my $ellipsis = $ellipsis{$self};
    my $token_re = qr/\b\w+(?:'\w+)?\b/;
    my $prev_ellipsis; # whether the previous excerpt ended with an ellip.
    my $prev_page = 0; # last page number of previous excerpt
    my $page_h = $page_h{$self};
    for(@excerpt_bounds) {
        # make the excerpt
        my ($start,$end) = @$_;

        # determine the page number that $start falls within
        my $page_no;
        $page_h and $page_no =
            substr($text, 0,$start) =~ y/\014// + 1;

        my $x; # short for x-cerpt
        my $need_ellipsis;

#warn "<<".substr($text,$start,$limit).">>";
        # look for a page break within $limit chars from $start (except we
        # shouldnâ€™t do it if $start is 0 because thereâ€™s  a  good  chance
        # weâ€™ll go past the very word for whose sake this excerpt exists)
	# ~~~ What about a case in which a page break plus maybe a few
	#     spaces occur just *before* $start. That shouldnâ€™t get an
	#     ellipsis  (as in the  elsif  block  below),  should  it?
        if($page_h && $start &&
           substr($text,$start,$limit) =~ /^(.*)\014/s) {
            $start += length($1) + 1;
            $page_no += 1 + $1 =~ y/\014//;
            $x = substr $text, $start;
        }
        elsif( $start ) { # if this is not the beginning of the doc
            my $sb = $self->find_sentences(
                text => $text, offset => $start, length => $limit
            );
            if(@$sb) {
                $start = $$sb[0];
            }
            else { ++ $need_ellipsis }
            $x = substr $text, $start;
            if($need_ellipsis) {                      
                # skip past possible partial tokens, but donâ€™t insert an
( run in 1.547 second using v1.01-cache-2.11-cpan-524268b4103 )