Lingua-Guess
view release on metacpan or search on metacpan
lib/Lingua/Guess.pm view on Meta::CPAN
$params{modeldir} = $md;
}
if (! -d $params{modeldir}) {
croak "Model directory '$params{modeldir}' does not exist";
}
my $self = bless { %params }, $class;
return $self;
}
sub guess
{
my ($self, $string) = @_;
unless (defined $self->{models}) {
$self->load_models ();
}
my @runs = find_runs($string);
my %scripts;
for my $run (@runs) {
$scripts{$run->[1]}++;
}
return $self->identify ($string, %scripts);
}
sub simple_guess
{
my ($self, $string) = @_;
my $got = $self->guess ($string);
return $got->[0]{name};
}
sub load_models
{
my ($self) = @_;
opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!";
my %models;
while (my $f = readdir $dh) {
unless ($f =~ /\.train$/) {
next;
}
my ($name) = $f =~ m|(.*)\.|;
my $path = catfile ($self->{modeldir}, $f);
open my $fh, "<:encoding(utf8)", $path or die "Failed to open file: $!";
my %model;
while (my $line = <$fh>) {
chomp $line;
my ($k, $v) = $line =~ m|(.{3})\s+(.*)|;
unless (defined $k) {
next;
}
$model{$k} = $v;
}
$models{$name} = \%model;
}
$self->{models} = \%models;
}
sub find_runs
{
my ($raw) = @_;
my @chars = split m//, $raw;
my $prev = '';
my @c;
my @runs;
my @run_types;
my $current_run = -1;
for my $c (@chars) {
my $is_alph = $c =~ /[[:alpha:]]/o;
my $inf = get_charinfo ($c);
if ($is_alph and ! ($inf->{block} eq $prev)) {
$prev = $inf->{block};
@c = ();
$current_run++;
$run_types[$current_run] = $prev;
}
push @c, $c;
if ($current_run > -1) {
push @{ $runs[$current_run] }, $c;
}
}
my ($newruns, $newtypes) = reconcile_latin (\@runs, \@run_types);
my $counter = 0;
my @result;
for my $r (@$newruns) {
push @result, [ $r, $newtypes->[$counter]];
$counter++;
}
return @result;
}
# Cached lookups from charinfo
my %cache;
# Look up characters using charinfo, but with a cache to save repeated
# lookups.
sub get_charinfo
{
my ($char) = @_;
my $known = $cache{$char};
if ($known) {
return $known;
}
my $inf = charinfo (ord ($char));
$cache{$char} = $inf;
return $inf;
}
sub reconcile_latin
{
my ($runs, $types) = @_;
my @types = @$types;
my (@new_runs, @new_types);
my $last_type = '';
my $upgrade;
if (has_supplemental_latin (@$types)) {
$upgrade = 'Accented Latin';
( run in 0.884 second using v1.01-cache-2.11-cpan-71847e10f99 )