DTA-CAB
view release on metacpan or search on metacpan
CAB/Analyzer/Unidecode.pm view on Meta::CPAN
## -*- Mode: CPerl -*-
##
## File: DTA::CAB::Analyzer::Unidecode.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: latin-1 approximator (old)
package DTA::CAB::Analyzer::Unidecode;
use DTA::CAB::Analyzer;
use DTA::CAB::Datum ':all';
use DTA::CAB::Token;
use Unicode::Normalize; ##-- compatibility decomposition 'KD' (see Unicode TR #15)
use Text::Unidecode; ##-- last-ditch effort: transliterate to ASCII
#use Unicode::UCD; ##-- unicode character names, info, etc.
#use Unicode::CharName; ##-- ... faster access to character name, block
use Encode qw(encode decode);
use IO::File;
use Carp;
use strict;
##==============================================================================
## Globals
##==============================================================================
##-- inheritance: this class derives from DTA::CAB::Analyzer (loaded via 'use' above);
##   new() below delegates to the parent constructor through SUPER::new()
our @ISA = qw(DTA::CAB::Analyzer);
##==============================================================================
## Constructors etc.
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure, new:
## label => $key, ##-- token analysis key (default='xlit')
sub new {
  ##-- Constructor: hands construction off to the parent class.
  ##   This class's default option(s) are placed first in the argument list
  ##   and the caller's arguments follow them.
  ##   NOTE(review): presumably the parent treats the list as key=>value pairs,
  ##   so a later user-supplied 'label' would win over the default -- confirm
  ##   against DTA::CAB::Analyzer::new().
  my $that = shift;

  ##-- class defaults: analysis key under which results are stored
  my @defaults = (label => 'xlit');

  return $that->SUPER::new(@defaults, @_);
}
##==============================================================================
## Methods: I/O
##==============================================================================
## $bool = $aut->ensureLoaded()
## + ensures analysis data is loaded
sub ensureLoaded {
  ##-- this implementation performs no loading at all:
  ##   it unconditionally reports success (true value 1) to the caller
  my $ok = 1;
  return $ok;
}
##==============================================================================
## Methods: Analysis: v1.x
##==============================================================================
## $doc = $xlit->analyzeTypes($doc,\%types,\%opts)
## + perform type-wise analysis of all (text) types in values(%types)
## + sets
## $tok->{$xlit->{label}} = { latin1Text=>$latin1Text, isLatin1=>$isLatin1, isLatinExt=>$isLatinExt }
## with:
## $latin1Text = $str ##-- best latin-1 approximation of $token->{text}
## $isLatin1 = $bool ##-- true iff $token->{text} is losslessly encodable as latin1
## $isLatinExt = $bool ##-- true iff $token->{text} is losslessly encodable as latin-extended
sub analyzeTypes {
my ($xlit,$doc,$types,$opts) = @_;
$types = $doc->types if (!$types);
my $akey = $xlit->{label};
my ($tok, $w,$uc, $ld,$l0,$l, $isLatin1,$isLatinExt);
foreach $tok (values(%$types)) {
$w = $tok->{text};
( run in 1.948 second using v1.01-cache-2.11-cpan-39bf76dae61 )