Lingua-Identifier

 view release on metacpan or  search on metacpan

lib/Lingua/Identifier.pm  view on Meta::CPAN

use 5.014004;
use strict;
use warnings FATAL => 'all';

use File::ShareDir 'dist_dir';
use File::Spec::Functions;

use Math::Matrix::MaybeGSL 0.006;

use Lingua::Identifier::ForwardProp;
use Lingua::Identifier::Feature::Trigrams;
use Lingua::Identifier::Feature::Alphabet;

=encoding UTF-8

=head1 NAME

Lingua::Identifier - A NN based approach for language identification

=cut

# Directory that File::ShareDir reports for this distribution's shared files.
our $sharedir = dist_dir('Lingua-Identifier');

# Each .dmp file is evaluated with "do FILE" and its value is kept.
# "do FILE" returns undef both when the file cannot be compiled ($@ is set)
# and when it cannot be read ($! is set), so the reason is appended to the
# error message; $@ is checked first, as perlfunc recommends.
our $features = do( catfile($sharedir, "features.dmp"));
die __PACKAGE__ . "- could not load 'features.dmp': "
    . ($@ || $! || 'file did not return a defined value')
    unless defined $features;

our $classes = do( catfile($sharedir, "classes.dmp"));
die __PACKAGE__ . "- could not load 'classes.dmp': "
    . ($@ || $! || 'file did not return a defined value')
    unless defined $classes;

# NOTE(review): presumably filled in by _load_thetas (defined elsewhere in
# this file) -- confirm.
our $thetas;
_load_thetas($sharedir);


=head1 SYNOPSIS

    use Lingua::Identifier;

    my $identifier = Lingua::Identifier->new();

    # identify language on a file
    my $lang = $identifier->identify_file("text.txt");

    # identify language on a string
    $lang = $identifier->identify($string);

=head1 DESCRIPTION

This documentation is not ready yet. These releases are just for
CPANtesters testing.

=head2 C<new>

Constructs a new Language Identification object.

    my $identifier = Lingua::Identifier->new();

=cut

sub new {
    # Accept either a class name or an existing object as the invocant, so
    # that subclasses (and $obj->new) get an object of their own class
    # instead of always being blessed into this package. Fall back to this
    # package when called as a plain function without an invocant.
    my $invocant = shift;
    my $class    = ref($invocant) || $invocant || __PACKAGE__;

    # The object shares the package-level $classes reference (not a copy)
    # under the "languages" key.
    return bless { languages => $classes }, $class;
}

=head2 C<languages>

Returns the list of codes for the active languages.

=cut

sub languages {
    my ($self) = @_;

    # The object keeps its language codes as an array reference; hand back
    # the flattened list (or its element count in scalar context).
    my $codes = $self->{languages};
    return @$codes;
}

=head2 C<identify_file>

This method receives a filename and tries to identify its language.

In scalar context returns the language id. In list context returns an
associative array, with language codes and respective scores.

    my $lang = $identifier->identify_file("sometext.txt");

=cut

sub identify_file {
    my $self = shift;
    my $path = shift;

    # _load_file is defined elsewhere in this file; whatever it returns (in
    # scalar context) is passed on as the text to classify.
    my $contents = _load_file($path);

    # Delegate to identify(), which sees the caller's context.
    return $self->identify($contents);
}

=head2 C<identify>

This method receives a string and tries to identify its language.

In scalar context returns the language id. In list context returns an
associative array, with language codes and respective scores.

    my $lang = $identifier->identify($string);

=cut

sub identify {
    my ($self, $string) = @_;

    my $ngrams = _compute_features($string);

    my $data = Matrix->new(scalar(@$features), 1);

    my $i = 1;
    for my $feature (@$features) {
        if (exists($ngrams->{$feature})) {
            $data->assign($i, 1, $ngrams->{$feature});
        }
        $i++;
    }

    my $ans = Lingua::Identifier::ForwardProp::forward_prop($data, $thetas);

    my ($max, $pos) = $ans->max();

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 1.958 second using v1.00-cache-2.02-grep-82fe00e-cpan-cec75d87357c )