Lingua-Identifier
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/Lingua/Identifier.pm view on Meta::CPAN
use 5.014004;
use strict;
use warnings FATAL => 'all';
use File::ShareDir 'dist_dir';
use File::Spec::Functions;
use Math::Matrix::MaybeGSL 0.006;
use Lingua::Identifier::ForwardProp;
use Lingua::Identifier::Feature::Trigrams;
use Lingua::Identifier::Feature::Alphabet;
=encoding UTF-8
=head1 NAME
Lingua::Identifier - A NN based approach for language identification
=cut
# Module load-time initialisation: locate the distribution's shared data
# directory and evaluate the dumped model files found there.
our $sharedir = dist_dir('Lingua-Identifier');

# `do FILE` returns undef on failure, leaving a compile error in $@ or a
# read error in $!. Report whichever is set so a broken installation can be
# diagnosed, instead of dying with no reason at all.
our $features = do( catfile($sharedir, "features.dmp"));
die __PACKAGE__ . "- could not load 'features.dmp': "
    . ($@ || $! || 'file did not return a defined value')
    unless defined $features;

our $classes = do( catfile($sharedir, "classes.dmp"));
die __PACKAGE__ . "- could not load 'classes.dmp': "
    . ($@ || $! || 'file did not return a defined value')
    unless defined $classes;

# NOTE(review): $thetas is presumably populated by _load_thetas (defined
# elsewhere in this file) -- confirm against that sub.
our $thetas;
_load_thetas($sharedir);
=head1 SYNOPSIS
use Lingua::Identifier;
my $identifier = Lingua::Identifier->new();
# identify language on a file
my $lang = $identifier->identify_file("text.txt");
# identify language on a string
$lang = $identifier->identify($string);
=head1 DESCRIPTION
This documentation is not ready yet. These releases are just for
CPANtesters testing.
=head2 C<new>
Constructs a new Language Identification object.
my $identifier = Lingua::Identifier->new();
=cut
# Constructor. Takes no arguments beyond the invocant and returns an object
# whose 'languages' slot refers to the module-wide $classes data.
#
# The object is blessed into the invoking class (or into the class of an
# invoking object), so subclasses receive instances of their own type.
# When called as a plain function with no invocant it falls back to this
# package, which is what the constructor always used before.
sub new {
    my ($invocant) = @_;
    my $class = ref($invocant) || $invocant || __PACKAGE__;
    return bless { languages => $classes }, $class;
}
=head2 C<languages>
Returns the list of codes for the active languages.
=cut
# Accessor for the language codes held by this object.
#
# Returns the elements of the array referenced by $self->{languages}.
# Because an array is returned directly, calling this in scalar context
# yields the number of codes rather than a code.
sub languages {
    my ($self) = @_;
    my $codes = $self->{languages};
    return @{$codes};
}
=head2 C<identify_file>
This method receives a filename and tries to identify its language.
In scalar context returns the language id. In list context returns an
associative array, with language codes and respective scores.
my $lang = $identifier->identify_file("sometext.txt");
=cut
# Identify the language of a file's contents.
#
# Passes $filename to _load_file and hands the result to identify().
# Whatever identify() returns is passed straight back, evaluated in the
# caller's context.
sub identify_file {
    my $self     = shift;
    my $filename = shift;

    # Keep this as a scalar assignment so _load_file is called in scalar
    # context.
    my $text = _load_file($filename);

    return $self->identify($text);
}
=head2 C<identify>
This method receives a string and tries to identify its language.
In scalar context returns the language id. In list context returns an
associative array, with language codes and respective scores.
my $lang = $identifier->identify($string);
=cut
sub identify {
my ($self, $string) = @_;
my $ngrams = _compute_features($string);
my $data = Matrix->new(scalar(@$features), 1);
my $i = 1;
for my $feature (@$features) {
if (exists($ngrams->{$feature})) {
$data->assign($i, 1, $ngrams->{$feature});
}
$i++;
}
my $ans = Lingua::Identifier::ForwardProp::forward_prop($data, $thetas);
my ($max, $pos) = $ans->max();
view all matches for this distributionview release on metacpan - search on metacpan
( run in 1.958 second using v1.00-cache-2.02-grep-82fe00e-cpan-cec75d87357c )