Lingua-Guess

 view release on metacpan or  search on metacpan

lib/Lingua/Guess.pm  view on Meta::CPAN

	$params{modeldir} = $md;
    }
    if (! -d $params{modeldir}) {
        croak "Model directory '$params{modeldir}' does not exist";
    }
    my $self = bless { %params }, $class;
    return $self;
}


sub guess 
{
    my ($self, $string) = @_;
    unless (defined $self->{models}) {
        $self->load_models ();
    }
    my @runs = find_runs($string);
    my %scripts;
    for my $run (@runs) {
	$scripts{$run->[1]}++;
    }
    return $self->identify ($string, %scripts);
}

sub simple_guess 
{
    my ($self, $string) = @_;
    my $got = $self->guess ($string);
    return $got->[0]{name};
}

sub load_models 
{
    my ($self) = @_;
    opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!";
    my %models;
    while (my $f = readdir $dh) {
	unless ($f =~ /\.train$/) {
	    next;
	}
	my ($name) = $f =~ m|(.*)\.|;
	my $path = catfile ($self->{modeldir}, $f);
	open my $fh, "<:encoding(utf8)", $path or die "Failed to open file: $!";
	my %model;
	while (my $line = <$fh>) {
	    chomp $line;
	    my ($k, $v) = $line =~ m|(.{3})\s+(.*)|;
	    unless (defined $k) {
	        next;
	    }
	    $model{$k} = $v;
	}
	$models{$name} = \%model;
    }
    $self->{models} = \%models;
}

sub find_runs 
{
    my ($raw) = @_;
    my @chars = split m//, $raw;
    my $prev = '';
    my @c;
    my @runs;
    my @run_types;
    my $current_run = -1;
	
    for my $c (@chars) {
	my $is_alph = $c =~ /[[:alpha:]]/o;
	my $inf = get_charinfo ($c);
	if ($is_alph and ! ($inf->{block} eq $prev)) {
	    $prev = $inf->{block};
	    @c = ();
	    $current_run++;
	    $run_types[$current_run] = $prev;
	}
	push @c, $c;
	if ($current_run > -1) {
	    push @{ $runs[$current_run] }, $c;
	}
    }
	
    my ($newruns, $newtypes) = reconcile_latin (\@runs, \@run_types);
    my $counter = 0;
    my @result;
    for my $r (@$newruns) {
	push @result, [ $r, $newtypes->[$counter]];
	$counter++;
    }
    return @result;
}

# Cached lookups from charinfo

my %cache;

# Look up characters using charinfo, but with a cache to save repeated
# lookups.

sub get_charinfo 
{
    my ($char) = @_;
    my $known = $cache{$char};
    if ($known) {
	return $known;
    }
    my $inf = charinfo (ord ($char));
    $cache{$char} = $inf;
    return $inf;
}

sub reconcile_latin 
{
    my ($runs, $types) = @_;
    my @types = @$types;
    my (@new_runs, @new_types);
    my $last_type = '';
	
    my $upgrade;
    if (has_supplemental_latin (@$types)) {
        $upgrade = 'Accented Latin';



( run in 0.884 second using v1.01-cache-2.11-cpan-71847e10f99 )