Dallycot
view release on metacpan or search on metacpan
lib/Dallycot/Library/Linguistics.pm view on Meta::CPAN
$result -> [0] -> [0]
}
);
}
when ('Dallycot::Value::URI') {
return $text -> resolve_content -> then(
sub {
my ($body) = @_;
my $content = '';
# Pick a text-extraction strategy from the class name that $body is
# blessed into, leaving the extracted plain text in $content.
# Any class not listed below is treated as an error.
given ( blessed $body ) {
when ('HTML') { # content-type: text/html
# we want to strip out the HTML and keep only text in the
# <body /> outside of <script/> tags
# The raw markup is read straight from the object's 'value' slot.
# NOTE(review): calling all_text on the result of find() and the
# exclusion of <script> text both depend on the installed Mojo::DOM
# version -- confirm.
my $dom = Mojo::DOM->new( $body->{'value'} );
$content = $dom->find('body')->all_text;
}
when ('Dallycot::Value::String') { # content-type: text/plain
# Already plain text: take the string through its value accessor.
$content = $body->value;
}
when ('XML') { # content-type: text/xml (TEI, etc.)
# Parse in XML mode and take the text of the whole document rather
# than of a single element.
my $dom = Mojo::DOM->new->xml(1)->parse( $body->{'value'} );
$content = $dom->all_text;
}
default {
# Unsupported body class: report the failure using the 'value' slot
# of the outer $text (presumably the URI being resolved -- confirm).
croak "Unable to extract text from " . $text->{'value'};
}
}
my $worked = eval {
# TODO: make '4096' a tunable parameter
# algorithm takes a *long* time with large strings
( run in 0.371 second using v1.01-cache-2.11-cpan-524268b4103 )