Dallycot
view release on metacpan or search on metacpan
lib/Dallycot/Library/Linguistics.pm view on Meta::CPAN
}
my $d = deferred;
$d->resolve( Dallycot::Value::Vector->new(
map { Dallycot::Value::String->new($_) }
grep { $_ }
map { $language_codes_from_classifier{$_} }
@{Lingua::YALI::LanguageIdentifier -> new -> get_available_languages}
) );
return $d->promise;
};
# Register 'language-classifier-languages'. Its definition is given as the
# expression string 'build-language-classifier-languages()' (presumably
# evaluated by `define` as library source -- confirm against `define`).
define( 'language-classifier-languages', 'build-language-classifier-languages()' );
# classify-text-language
#
# Guess the language of a piece of text using Lingua::YALI.
#
# Arguments received by the implementation:
#   $engine  - the calling engine (not used here)
#   $options - option hash; 'languages' must be a Dallycot::Value::Vector.
#              Only its Dallycot::Value::String members whose value has an
#              entry in %language_codes_for_classifier are considered.
#              Defaults to a vector holding 'en'.
#   $text    - a Dallycot::Value::String, or a Dallycot::Value::URI whose
#              content is obtained through resolve_content.
#
# Returns:
#   - an empty Dallycot::Value::Vector when
#     $Lingua::YALI::LanguageIdentifier::VERSION is undefined (taken here to
#     mean the classifier module is not loaded);
#   - for String input, a Dallycot::Value::String holding the language code
#     looked up in %language_codes_from_classifier;
#   - for URI input, whatever resolve_content->then(...) returns; the
#     callback produces the same kind of Dallycot::Value::String.
#
# Croaks when $text or the 'languages' option has the wrong type, when no
# usable language remains after filtering, when the fetched body is of a
# class we cannot extract text from, or when no language can be identified.
define 'classify-text-language' => (
  hold    => 0,
  arity   => 1,
  options => { 'languages' => Dallycot::Value::Vector->new( Dallycot::Value::String->new('en') ) }
), sub {
  my ( $engine, $options, $text ) = @_;

  if ( !$text->isa('Dallycot::Value::String') && !$text->isa('Dallycot::Value::URI') ) {
    croak "language-classify requires a String or URI as a second argument";
  }

  if ( !$options->{'languages'}->isa('Dallycot::Value::Vector') ) {
    croak "language-classifier's 'languages' option requires a vector of strings";
  }

  # Translate the requested codes into the names the classifier uses,
  # silently dropping non-strings and codes without a mapping.
  my @languages =
    grep { $_ }
    map  { $language_codes_for_classifier{ $_->value } }
    grep { $_->isa('Dallycot::Value::String') }
    $options->{'languages'}->values;

  if ( !@languages ) {
    croak "language-classifier's 'languages' option requires a vector of strings";
  }

  if ( !defined $Lingua::YALI::LanguageIdentifier::VERSION ) {
    return Dallycot::Value::Vector->new();
  }

  my $identifier = Lingua::YALI::LanguageIdentifier->new;
  $identifier->add_language($_) for @languages;

  # TODO: make this a tunable parameter.
  # The algorithm takes a *long* time with large strings, so fetched
  # documents are truncated to this many characters before classification.
  my $max_sample_chars = 4096;

  # Classify one string and wrap the winning language code.  Shared by the
  # String and URI paths so both fail the same way: an empty result list or
  # a classifier name without a mapping is reported instead of being wrapped
  # as an undefined String.
  # NOTE(review): assumes identify_string returns a reference to a list of
  # [ language, score ] pairs, best match first -- confirm against
  # Lingua::YALI.
  my $identify = sub {
    my ($sample) = @_;

    my $ranked = $identifier->identify_string($sample);
    my $best   = $ranked ? $ranked->[0] : undef;
    my $name   = $best   ? $best->[0]   : undef;
    my $code   = defined $name ? $language_codes_from_classifier{$name} : undef;

    croak "Unable to identify language." if !defined $code;

    return Dallycot::Value::String->new($code);
  };

  # Dispatch on the exact class, as the original given/when did.
  my $text_class = blessed($text) // '';

  if ( $text_class eq 'Dallycot::Value::String' ) {
    return $identify->( $text->value );
  }

  if ( $text_class eq 'Dallycot::Value::URI' ) {
    return $text->resolve_content->then(
      sub {
        my ($body) = @_;

        my $body_class = blessed($body) // '';
        my $content    = '';

        if ( $body_class eq 'HTML' ) {    # content-type: text/html
          # we want to strip out the HTML and keep only text in the
          # <body />.  `at` yields a single element (or undef), on which
          # all_text is a real Mojo::DOM method; calling all_text on the
          # collection returned by `find` is not.
          my $dom       = Mojo::DOM->new( $body->{'value'} );
          my $body_node = $dom->at('body');
          $content = $body_node->all_text if $body_node;
        }
        elsif ( $body_class eq 'Dallycot::Value::String' ) {    # content-type: text/plain
          $content = $body->value;
        }
        elsif ( $body_class eq 'XML' ) {    # content-type: text/xml (TEI, etc.)
          my $dom = Mojo::DOM->new->xml(1)->parse( $body->{'value'} );
          $content = $dom->all_text;
        }
        else {
          croak "Unable to extract text from " . $text->{'value'};
        }

        return $identify->( substr( $content, 0, $max_sample_chars ) );
      }
    );
  }

  croak "language-classify requires a String or URI as a second argument";
};
# Register 'stop-word-languages' as the literal list of language codes below
# (the <<...>> text is handed to `define` verbatim as a string).
define( 'stop-word-languages', '<<da nl en fi fr de hu it no pt es sv ru>>' );
1;
( run in 1.587 second using v1.01-cache-2.11-cpan-524268b4103 )