Dallycot

 view release on metacpan or  search on metacpan

lib/Dallycot/Library/Linguistics.pm  view on Meta::CPAN

  }

  my $d = deferred;

  $d->resolve( Dallycot::Value::Vector->new(
    map { Dallycot::Value::String->new($_) }
    grep { $_ }
    map { $language_codes_from_classifier{$_} }
    @{Lingua::YALI::LanguageIdentifier -> new -> get_available_languages}
  ) );

  return $d->promise;
};

# Registers 'language-classifier-languages' with a body given as an expression
# string that invokes build-language-classifier-languages().
# NOTE(review): when that string is parsed/evaluated is decided by `define`,
# which is not visible in this part of the file.
define 'language-classifier-languages' => 'build-language-classifier-languages()';

# classify-text-language
#
# Identifies the language of a piece of text with Lingua::YALI.
#
# Arguments (as received by the implementing sub):
#   $engine  - passed in by the caller; not used here.
#   $options - hash ref; 'languages' must be a Dallycot::Value::Vector.
#              Its Dallycot::Value::String members are translated through
#              %language_codes_for_classifier; anything that is not a String
#              or has no translation is dropped.  Defaults to a one-element
#              vector containing 'en'.
#   $text    - a Dallycot::Value::String (classified directly) or a
#              Dallycot::Value::URI (its resolved content is classified).
#
# Returns:
#   * an empty Dallycot::Value::Vector when Lingua::YALI::LanguageIdentifier
#     has no $VERSION (i.e. the module does not appear to be loaded);
#   * for a String, a Dallycot::Value::String built from the best match
#     translated through %language_codes_from_classifier;
#   * for a URI, whatever `$text->resolve_content->then(...)` returns
#     (presumably a promise that resolves to such a String).
#
# Croaks when $text is neither a String nor a URI, when 'languages' is not a
# Vector or yields no usable language, when fetched content is of an
# unsupported class, or when no language can be identified for URI content.
define 'classify-text-language' => (
  hold    => 0,
  arity   => 1,
  options => { 'languages' => Dallycot::Value::Vector -> new(Dallycot::Value::String->new('en')) }
  ), sub {
  my ( $engine, $options, $text ) = @_;

  if ( !$text -> isa('Dallycot::Value::String') && !$text->isa('Dallycot::Value::URI') ) {
    croak "language-classify requires a String or URI as a second argument";
  }
  if( !$options->{'languages'} -> isa('Dallycot::Value::Vector') ) {
    croak "language-classifier's 'languages' option requires a vector of strings";
  }

  # Keep only String entries that map onto a classifier language name.
  my @languages =
    grep { $_ }
    map { $language_codes_for_classifier{ $_ -> value } }
    grep { $_->isa('Dallycot::Value::String') }
    $options->{'languages'}->values;

  if(!@languages) {
    croak "language-classifier's 'languages' option requires a vector of strings";
  }
  if(!defined $Lingua::YALI::LanguageIdentifier::VERSION) {
    return Dallycot::Value::Vector->new();
  }

  my $identifier = Lingua::YALI::LanguageIdentifier->new;
  $identifier->add_language($_) for @languages;

  # Runs the identifier over $sample and returns the code for the top-ranked
  # language, or undef when there is no ranking or no code for the winner.
  # Checking the shape of the result first avoids autovivifying it and
  # looking up an undefined hash key.
  my $best_code_for = sub {
    my ($sample) = @_;

    my $ranked = $identifier->identify_string($sample);
    return undef if ref($ranked) ne 'ARRAY' || ref( $ranked->[0] ) ne 'ARRAY';

    my $winner = $ranked->[0][0];
    return undef if !defined $winner;

    return $language_codes_from_classifier{$winner};
  };

  # Dispatch on the exact class name (not isa), as the original
  # given/when did; plain string comparison replaces the experimental
  # smartmatch-based switch.
  my $text_class = blessed($text) // '';

  if ( $text_class eq 'Dallycot::Value::String' ) {
    # NOTE(review): an unidentifiable string still yields a String wrapping
    # undef here, unlike the URI path which croaks -- confirm which behaviour
    # callers expect before unifying the two.
    my $code = $best_code_for->( $text->value );
    return Dallycot::Value::String -> new($code);
  }
  elsif ( $text_class eq 'Dallycot::Value::URI' ) {
    return $text -> resolve_content -> then(
      sub {
        my ($body) = @_;
        my $body_class = blessed($body) // '';
        my $content    = '';

        if ( $body_class eq 'HTML' ) {    # content-type: text/html
          # we want to strip out the HTML and keep only text in the
          # <body /> outside of <script/> tags
          my $dom = Mojo::DOM->new( $body->{'value'} );
          $content = $dom->find('body')->all_text;
        }
        elsif ( $body_class eq 'Dallycot::Value::String' ) {    # content-type: text/plain
          $content = $body->value;
        }
        elsif ( $body_class eq 'XML' ) {    # content-type: text/xml (TEI, etc.)
          my $dom = Mojo::DOM->new->xml(1)->parse( $body->{'value'} );
          $content = $dom->all_text;
        }
        else {
          croak "Unable to extract text from " . $text->{'value'};
        }

        my $language;
        my $ok = eval {
          # TODO: make '4096' a tunable parameter
          # algorithm takes a *long* time with large strings
          my $code = $best_code_for->( substr( $content, 0, 4096 ) );
          $language = Dallycot::Value::String -> new($code) if defined $code;
          1;
        };
        if ( !$ok ) {
          croak $@ || "Unable to identify language.";
        }

        # A String object is always true, so test for an identified code
        # rather than for the truthiness of the constructed value.
        if ( !defined $language ) {
          croak "Unable to identify language.";
        }
        return $language;
      }
    );
  }

  croak "language-classify requires a String or URI as a second argument";
};

# Registers 'stop-word-languages' from a literal <<...>> expression holding
# thirteen two-letter codes.
# NOTE(review): presumably these are the languages that have stop-word lists
# available -- confirm against the stop-word implementation.
define 'stop-word-languages' => '<<da nl en fi fr de hu it no pt es sv ru>>';

1;



( run in 1.587 second using v1.01-cache-2.11-cpan-524268b4103 )