KSx-Highlight-Summarizer
view release on metacpan or search on metacpan
lib/KSx/Highlight/Summarizer.pm view on Meta::CPAN
package KSx::Highlight::Summarizer;
$VERSION = '0.06';
@ISA = KinoSearch::Highlight::Highlighter;
use KinoSearch::Highlight::Highlighter;
use strict;
use List::Util qw 'min';
use Number::Range;
use Hash::Util::FieldHash::Compat 'fieldhashes';
fieldhashes \my( %ellipsis, %summ_len, %page_h, %encoder );
sub _range_endpoints {
my $range = shift;
my @range = $range->range;
my $previous = shift @range;
my $subrange = [($previous) x 2];
my @arrays;
foreach my $current (@range) {
if ($current == ($previous + 1)) {
$subrange->[1] = $current;
}
else {
push @arrays, $subrange;
$subrange = [($current) x 2];
}
$previous = $current;
}
return @arrays, $subrange; # Make sure the last subrange isnât left out!
}
sub new {
my($pack, %args) = @_;
my $ellipsis = exists $args{ellipsis} ? delete $args{ellipsis}
: ' ... ';
my $summ_len = exists $args{summary_length}
? delete $args{summary_length} : 0;
my $page_h = delete $args{page_handler};
my $encoder = delete $args{encoder};
# accept args that the superclass only allows one to set through
# accessor methods:
my $pre_tag = delete $args{pre_tag};
my $post_tag = delete $args{post_tag};
my $self = SUPER::new $pack %args;
$ellipsis{$self} = $ellipsis;
$summ_len{$self} = $summ_len;
$page_h{$self} = $page_h;
$encoder{$self} = $encoder;
defined $pre_tag and $self->set_pre_tag($pre_tag);
defined $post_tag and $self->set_post_tag($post_tag);
return $self;
}
sub create_excerpt {
my ($self, $hitdoc) = @_;
my $field = $self->get_field;
my $x_len = $self->get_excerpt_length;
my $limit = int($x_len /3 );
# retrieve the text from the chosen field
my $text = $hitdoc->{$field};
return unless defined $text;
my $text_length = length $text;
return '' unless $text_length;
# get offsets and weights of words that match
my $searcher = $self->get_searchable;
my $posits = $self->get_compiler->highlight_spans(
searchable => $searcher,
field => $field,
doc_vec => $searcher->fetch_doc_vec(
$hitdoc->get_doc_id
),
);
my @locs = map [$_->get_offset,$_->get_weight], @{
KinoSearch::Highlight::HeatMap->new(
spans => $posits,
window => $limit*2
)->get_spans
};
@locs = map $$_[0], sort { $$b[1] <=> $$a[1] } @locs;
@locs or @locs = 0;
#warn "@locs" if $summ_len{$self};
# determine the rough boundaries of the excerpts
my $range = new Number::Range;
my $summ_len = $summ_len{$self};
for(@locs) {
no warnings; # suppress Number::Rangeâs nasty warnings
my $start = $_-$limit;
$start = 0 if $start < 0;
$range->addrange($start . '..' . min($start+$x_len, $text_length));
last if !$summ_len || $range->size >= $summ_len;
}
my @excerpt_bounds = _range_endpoints($range);
#use DDS; warn Dump \@excerpt_bounds if $summ_len;
# close small gaps between ranges
for(my $c = 1; $c < @excerpt_bounds;++$c) {
$excerpt_bounds[$c][0] - $excerpt_bounds[$c-1][1] <= 10 and
$excerpt_bounds[$c-1][1] = $excerpt_bounds[$c][1],
splice(@excerpt_bounds, $c, 1),
--$c;
}
# extract the offsets from the highlight spans
my(@starts, @ends);
for(@$posits) {
push(@starts, my $start = $_->get_offset);
push(@ends, $start + $_->get_length);
}
# make the summary
my $summary = '';
my $ellipsis = $ellipsis{$self};
my $token_re = qr/\b\w+(?:'\w+)?\b/;
my $prev_ellipsis; # whether the previous excerpt ended with an ellip.
my $prev_page = 0; # last page number of previous excerpt
my $page_h = $page_h{$self};
for(@excerpt_bounds) {
# make the excerpt
my ($start,$end) = @$_;
# determine the page number that $start falls within
my $page_no;
$page_h and $page_no =
substr($text, 0,$start) =~ y/\014// + 1;
my $x; # short for x-cerpt
my $need_ellipsis;
#warn "<<".substr($text,$start,$limit).">>";
# look for a page break within $limit chars from $start (except we
# shouldnât do it if $start is 0 because thereâs a good chance
# weâll go past the very word for whose sake this excerpt exists)
# ~~~ What about a case in which a page break plus maybe a few
# spaces occur just *before* $start. That shouldnât get an
# ellipsis (as in the elsif block below), should it?
if($page_h && $start &&
substr($text,$start,$limit) =~ /^(.*)\014/s) {
$start += length($1) + 1;
$page_no += 1 + $1 =~ y/\014//;
$x = substr $text, $start;
}
elsif( $start ) { # if this is not the beginning of the doc
my $sb = $self->find_sentences(
text => $text, offset => $start, length => $limit
);
if(@$sb) {
$start = $$sb[0];
}
else { ++ $need_ellipsis }
$x = substr $text, $start;
if($need_ellipsis) {
# skip past possible partial tokens, but donât insert an
( run in 1.547 second using v1.01-cache-2.11-cpan-524268b4103 )