DTA-CAB
view release on metacpan or search on metacpan
CAB/Analyzer/Unicruft.pm view on Meta::CPAN
## -*- Mode: CPerl -*-
##
## File: DTA::CAB::Analyzer::Unicruft.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: latin-1 approximator
package DTA::CAB::Analyzer::Unicruft;
use DTA::CAB::Analyzer;
use DTA::CAB::Datum ':all';
use DTA::CAB::Token;
use Unicruft;
use Unicode::Normalize; ##-- compatibility decomposition 'KD' (see Unicode TR #15)
#use Unicode::UCD; ##-- unicode character names, info, etc.
#use Unicode::CharName; ##-- ... faster access to character name, block
#use Text::Unidecode; ##-- last-ditch effort: transliterate to ASCII
use Encode qw(encode decode);
use IO::File;
use Carp;
use strict;
##==============================================================================
## Globals
##==============================================================================
our @ISA = qw(DTA::CAB::Analyzer);
##==============================================================================
## Constructors etc.
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure, new:
## label => 'xlit', ##-- analyzer label
## + object structure, INHERITED from Analyzer:
## label => $label, ##-- analyzer label (default: from class name)
sub new {
my $that = shift;
return $that->SUPER::new(
##-- options
label => 'xlit',
##-- user args
@_
);
}
##==============================================================================
## Methods: version
## $version_or_undef = $anl->version()
sub version {
return "Unicruft-${Unicruft::VERSION} libunicruft-".Unicruft::library_version();
}
##==============================================================================
## Methods: I/O
##==============================================================================
## $bool = $anl->ensureLoaded()
## + ensures analysis data is loaded
## + always returns 1, but reports Unicruft module + library version if (!$anl->{loaded})
sub ensureLoaded {
my $anl = shift;
$anl->info("using Unicruft.xs v$Unicruft::VERSION; libunicruft v", Unicruft::library_version)
if (!$anl->{loaded});
return $anl->{loaded}=1;
}
##==============================================================================
## Methods: Analysis: v1.x
##==============================================================================
## $doc = $xlit->analyzeTypes($doc,\%types,\%opts)
## + perform type-wise analysis of all (text) types in values(%types)
## + sets:
## $tok->{$anl->{label}} = { latin1Text=>$latin1Text, isLatin1=>$isLatin1, isLatinExt=>$isLatinExt }
## with:
## $latin1Text = $str ##-- best latin-1 approximation of $token->{text}
## $isLatin1 = $bool ##-- true iff $token->{text} is losslessly encodable as latin1
## $isLatinExt = $bool, ##-- true iff $token->{text} is losslessly encodable as latin-extended
sub analyzeTypes {
my ($xlit,$doc,$types,$opts) = @_;
$types = $doc->types if (!$types);
my $akey = $xlit->{label};
my ($tok, $w,$uc, $ld, $isLatin1,$isLatinExt);
foreach $tok (values(%$types)) {
next if (defined($tok->{$akey})); ##-- avoid re-analysis
$w = $tok->{text};
##-- 2010-01-23: Mantis Bug #140: 'µ'="\x{b5}" gets mapped to 'm' rather than
## + (unicruft-v0.07) 'u'
## + (unicruft-v0.08) 'µ' (identity)
## + problem is NFKC-decomposition which maps
## 'µ'="\x{b5}" = Latin1 Supplement / MICRO SIGN
## to
## "\x{03bc}" = Greek and Coptic / GREEK SMALL LETTER MU
## + solution (hack): use NFC (canonical composition only)
## rather than NFKC (compatibility decomposition + canonical composition) here,
## and let Unicruft take care of decomposition
## + potentially problematic cases (from unicode normalization form techreport
## @ http://unicode.org/reports/tr15/ : fi ligature, 2^5, long-S + diacritics)
## are all handled correctly by unicruft
#$uc = Unicode::Normalize::NFKC($w); ##-- compatibility(?) decomposition + canonical composition
$uc = Unicode::Normalize::NFC($w); ##-- canonical composition only
##-- construct latin-1/de approximation
$ld = decode('latin1',Unicruft::utf8_to_latin1_de($uc));
##-- special handling for double-initial-caps, e.g. "AUf", "CHristus", "GOtt", etc.
$ld = ucfirst(lc($ld)) if ($ld =~ /^[[:upper:]]{2}[[:lower:]]+$/);
##-- properties
if (
#$uc !~ m([^\p{inBasicLatin}\p{inLatin1Supplement}]) #)
$uc =~ m(^[\x{00}-\x{ff}]*$) #)
)
{
$isLatin1 = $isLatinExt = 1;
}
elsif ($uc =~ m(^[\x{00}-\x{ff}\p{Latin}\p{IsPunct}\p{IsMark}\x{a75b}\x{fffc}-\x{ffff}]*$))
{
$isLatin1 = 0;
$isLatinExt = 1;
}
else
{
$isLatin1 = $isLatinExt = 0;
}
##-- update token
$tok->{$akey} = { latin1Text=>$ld, isLatin1=>$isLatin1, isLatinExt=>$isLatinExt };
}
return $doc;
}
1; ##-- be happy
__END__
##========================================================================
## POD DOCUMENTATION, auto-generated by podextract.perl, edited
##========================================================================
## NAME
=pod
=head1 NAME
DTA::CAB::Analyzer::Unicruft - latin-1 approximator using libunicruft
=cut
##========================================================================
## SYNOPSIS
=pod
=head1 SYNOPSIS
use DTA::CAB::Analyzer::Unicruft;
( run in 2.294 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )