DTA-CAB

 view release on metacpan or  search on metacpan

CAB/Analyzer/Unidecode.pm  view on Meta::CPAN

## -*- Mode: CPerl -*-
##
## File: DTA::CAB::Analyzer::Unidecode.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: latin-1 approximator (old)

package DTA::CAB::Analyzer::Unidecode;

use DTA::CAB::Analyzer;
use DTA::CAB::Datum ':all';
use DTA::CAB::Token;

use Unicode::Normalize; ##-- compatibility decomposition 'KD' (see Unicode TR #15)
use Text::Unidecode;    ##-- last-ditch effort: transliterate to ASCII
#use Unicode::UCD;       ##-- unicode character names, info, etc.
#use Unicode::CharName;  ##-- ... faster access to character name, block

use Encode qw(encode decode);
use IO::File;
use Carp;

use strict;

##==============================================================================
## Globals
##==============================================================================

our @ISA = qw(DTA::CAB::Analyzer);

##==============================================================================
## Constructors etc.
##==============================================================================

## $obj = CLASS_OR_OBJ->new(%args)
##  + constructor: hands a list of default options followed by the
##    caller's %args to the inherited new()
##  + object structure, new:
##    label => $key,   ##-- token analysis key (default='xlit')
sub new {
  my $that = shift;

  ##-- default options; these come first in the list passed upwards
  my @defaults = (label => 'xlit');

  ##-- user args (remaining @_) are appended after the defaults
  return $that->SUPER::new(@defaults, @_);
}

##==============================================================================
## Methods: I/O
##==============================================================================

## $bool = $aut->ensureLoaded()
##  + ensures analysis data is loaded
##  + this implementation performs no loading and always reports success
sub ensureLoaded {
  ##-- unconditionally true
  return 1;
}

##==============================================================================
## Methods: Analysis: v1.x
##==============================================================================

## $doc = $xlit->analyzeTypes($doc,\%types,\%opts)
##  + perform type-wise analysis of all (text) types in values(%types)
##  + sets
##      $tok->{$anl->{label}} = { latin1Text=>$latin1Text, isLatin1=>$isLatin1, isLatinExt=>$isLatinExt }
##    with:
##      $latin1Text = $str     ##-- best latin-1 approximation of $token->{text}
##      $isLatin1   = $bool    ##-- true iff $token->{text} is losslessly encodable as latin1
##      $isLatinExt = $bool    ##-- true iff $token->{text} is losslessly encodable as latin-extended
sub analyzeTypes {
  my ($xlit,$doc,$types,$opts) = @_;
  $types = $doc->types if (!$types);
  my $akey = $xlit->{label};

  my ($tok, $w,$uc, $ld,$l0,$l, $isLatin1,$isLatinExt);
  foreach $tok (values(%$types)) {
    $w   = $tok->{text};



( run in 1.948 second using v1.01-cache-2.11-cpan-39bf76dae61 )