DTA-CAB
view release on metacpan or search on metacpan
CAB/Analyzer/Automaton/Dyn.pm view on Meta::CPAN
## + code string for {analyzeGet}
## + eval()d in list context, may return multiples
## + available vars:
## $tok => token object being analyzed
## $aut => analyzer (automaton)
#our $DEFAULT_ANALYZE_GET = '$_[0]{xlit} ? $_[0]{xlit}{latin1Text} : $_[0]{text}';
our $DEFAULT_ANALYZE_GET = '$tok->{xlit} ? $tok->{xlit}{latin1Text} : $tok->{text}';
## $DEFAULT_ANALYZE_SET
## + default code string for {analyzeSet}
## + available vars:
## $tok => token object being analyzed
## $wa => analyses (array-ref, maybe blessed)
## $aut => analyzer (automaton)
#our $DEFAULT_ANALYZE_SET = '$_[0]{$anl->{label}}=$_[1]';
our $DEFAULT_ANALYZE_SET = '$tok->{$aut->{label}}=$wa';
##==============================================================================
## Constructors etc.
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure:
## (
## ##-- Filename Options
## fstFile => $filename, ##-- source FST file (default: none)
## labFile => $filename, ##-- source labels file (default: none)
## dictFile => $filename, ##-- source dict file (default: none): clobbers $dict->{dictFile} if defined
##
## ##-- Exception lexicon options
## dict => $dict, ##-- exception lexicon as a DTA::CAB::Analyzer::Dict object or option hash
## ## + default=undef
## dictClass => $class, ##-- fallback class for new dict (default='DTA::CAB::Analyzer::Dict')
##
## ##-- Analysis Output
## analyzeGet => $code, ##-- accessor: coderef or string: source text (default=$DEFAULT_ANALYZE_GET; return undef for no analysis)
## analyzeSet => $code, ##-- accessor: coderef or string: set analyses (default=$DEFAULT_ANALYZE_SET)
## wantAnalysisLo => $bool, ##-- set to true to include 'lo' keys in analyses (default: true)
## wantAnalysisLemma => $bool, ##-- set to true to include 'lemma' keys in analyses (default: false)
##
## ##-- Analysis Options
## eow => $sym, ##-- EOW symbol for analysis FST
## check_symbols => $bool, ##-- check for unknown symbols? (default=1)
## labenc => $enc, ##-- encoding of labels file (default='auto')
## #dictenc => $enc, ##-- dictionary encoding (default='UTF-8') (set $aut->{dict}{encoding} instead)
## auto_connect => $bool, ##-- whether to call $result->_connect() after every lookup (default=0)
## tolower => $bool, ##-- if true, all input words will be bashed to lower-case (default=0)
## tolowerNI => $bool, ##-- if true, all non-initial characters of inputs will be lower-cased (default=0)
## toupperI => $bool, ##-- if true, initial character will be upper-cased (default=0)
## bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
## attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
## allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
## ## : useful: /^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$/
## ## : == DTA::CAB::Analyzer::_am_wordlike_regex()
##
## ##-- Analysis objects
## fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
## lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
## labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
## laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
## labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
## result=>$resultfst, ##-- (child classes only) e.g. result fst
##
## ##-- INHERITED from DTA::CAB::Analyzer
## label => $label, ##-- analyzer label (default: from analyzer class name)
## typeKeys => \@keys, ##-- type-wise keys to expand
## )
sub new {
  my ($that, @user_args) = @_;

  ##-- class-level defaults, kept as an ordered key=>value list;
  ##   caller-supplied pairs are appended afterwards, so they come last in the argument list
  my @defaults = (
    ##-- filenames
    fstFile  => undef,
    labFile  => undef,
    dictFile => undef,

    ##-- analysis objects
    fst       => undef,
    lab       => undef,
    result    => undef,
    labh      => {},
    laba      => [],
    labc      => [],
    dict      => undef,
    dictClass => 'DTA::CAB::Analyzer::Dict',

    ##-- options
    eow            => '',
    check_symbols  => 1,
    labenc         => 'auto',
    #dictenc       => 'utf8',
    auto_connect   => 0,
    tolower        => 0,
    tolowerNI      => 0,
    toupperI       => 0,
    bashWS         => '_',
    attInput       => 0,
    allowTextRegex => undef, #DTA::CAB::Analyzer::_am_wordlike_regex(),

    ##-- analysis I/O
    analyzeSrc        => 'text',
    wantAnalysisLo    => 1,
    wantAnalysisLemma => 0,
  );

  ##-- delegate actual construction to the parent class
  my $aut = $that->SUPER::new(@defaults, @user_args);
  return $aut;
}
## $aut = $aut->clear()
sub clear {
my $aut = shift;
##-- analysis sub(s)
$aut->dropClosures();
##-- analysis objects
delete($aut->{fst});
delete($aut->{lab});
delete($aut->{result});
%{$aut->{labh}} = qw();
CAB/Analyzer/Automaton/Dyn.pm view on Meta::CPAN
sub load {
  ##-- $rc = $aut->load(fst=>$fstFile, lab=>$labFile, dict=>$dictFile)
  ##  + returns 0 if none of the keys (fst,lab,dict) has a defined value
  ##  + otherwise calls the matching loader for each defined key, in the order fst, lab, dict,
  ##    and stops at the first loader returning a false value (which is then returned)
  my ($aut, %args) = @_;

  my @requested = grep { defined($args{$_}) } qw(fst lab dict);
  return 0 if (!@requested);

  ##-- argument key => loader method name
  my %loader = (fst => 'loadFst', lab => 'loadLabels', dict => 'loadDict');

  my $rc = $aut;
  foreach my $key (@requested) {
    last if (!$rc); ##-- an earlier step failed: skip the remaining loaders
    my $method = $loader{$key};
    $rc = $aut->$method($args{$key});
  }
  return $rc;
}
##--------------------------------------------------------------
## Methods: I/O: Input: FST
## $aut = $aut->loadFst($fstfile)
##  + loads $fstfile into $aut->{fst}, creating the FST object via $aut->fstClass if none exists yet
##  + (re-)creates $aut->{result} as a shadow of the loaded FST
##  + drops the cached $aut->{_analyze} entry
sub loadFst {
  my $aut     = shift;
  my $fstfile = shift;
  $aut->info("loading FST file '$fstfile'");

  ##-- get or create the underlying FST object
  my $fst = $aut->{fst};
  if (!defined($fst)) {
    $fst = $aut->{fst} = $aut->fstClass->new();
  }

  ##-- load, complaining loudly on failure
  if (!$fst->load($fstfile)) {
    $aut->logconfess("loadFst(): load failed for '$fstfile': $!");
  }

  ##-- result FST is derived from the freshly loaded FST
  $aut->{result} = $fst->shadow;
  delete $aut->{_analyze};
  return $aut;
}
##--------------------------------------------------------------
## Methods: I/O: Input: Labels
## $aut = $aut->loadLabels($labfile)
##  + loads $labfile into $aut->{lab}, creating the alphabet object via $aut->labClass if none exists yet
##  + if $aut->{labenc} is empty or 'auto', guesses the label encoding ('utf8' if the concatenated
##    labels decode cleanly as UTF-8, 'latin1' otherwise) and stores the guess in $aut->{labenc}
##  + calls $aut->parseLabels() to (re-)build the label lookup tables
##  + drops the cached $aut->{_analyze} entry
sub loadLabels {
  my ($aut,$labfile) = @_;
  $aut->info("loading labels file '$labfile'");
  $aut->{lab} = $aut->labClass->new() if (!defined($aut->{lab}));
  $aut->{lab}->load($labfile)
    or $aut->logconfess("loadLabels(): load failed for '$labfile': $!");

  if (!$aut->{labenc} || $aut->{labenc} eq 'auto') {
    ##-- guess label encoding
    ##  + the label array may be sparse (parseLabels() skips undefined slots too),
    ##    so drop undefined entries before joining to avoid 'uninitialized value' warnings
    my $buf = join('', grep {defined($_)} @{$aut->{lab}->toArray});
    $aut->{labenc} = utf8::decode($buf) ? 'utf8' : 'latin1';
    $aut->debug("loadLabels(): guessed label encoding '$aut->{labenc}'");
  }

  ##-- propagate UTF-8 flag to the alphabet object if it supports one
  $aut->{lab}->utf8(1)
    if ($aut->{lab}->can('utf8') && (($aut->{labenc}||'') =~ /^utf\-?8$/i));

  $aut->parseLabels();
  delete($aut->{_analyze});
  return $aut;
}
## $aut = $aut->parseLabels()
##  + sets up $aut->{labh}, $aut->{laba}, $aut->{labc} from $aut->{lab}
##  + decodes label symbols using $aut->{labenc} (if set)
##  + all three tables are rebuilt from scratch (in place), so mappings left over
##    from a previously loaded label set do not survive a reload
sub parseLabels {
  my $aut = shift;

  ##-- reset lookup tables in place (existing references to them stay valid)
  my $laba = ($aut->{laba} ||= []);
  @$laba             = @{$aut->{lab}->asArray};
  %{$aut->{labh}}    = ();
  @{$aut->{labc}}    = ();
  my $labh = $aut->{labh};
  my $enc  = $aut->{labenc};

  ##-- setup labh: $labId  = $labh->{$labSym}
  ##-- setup laba: $labSym = $laba->[$labId]
  foreach my $i (grep { defined($laba->[$_]) } 0..$#$laba) {
    $laba->[$i] = decode($enc, $laba->[$i]) if ($enc);
    $labh->{$laba->[$i]} = $i;
  }

  ##-- setup labc: $labId = $labc->[ord($c)]; ##-- single unicode character
  ## : @labIds = @$labc[unpack('U0U*',$s)]; ##-- batch lookup for strings (fast)
  my @csyms = grep {defined($_) && length($_)==1} @$laba; ##-- @csyms = ($sym1, ...) s.t. each sym has len==1
  @{$aut->{labc}}[map {ord($_)} @csyms] = @$labh{@csyms};

  return $aut;
}
##--------------------------------------------------------------
## Methods: I/O: Input: Dictionary
## $aut_or_undef = $aut->loadDict()
## $aut_or_undef = $aut->loadDict($dictfile)
##  + loads the exception lexicon into $aut->{dict}
##  + filename precedence: $dictfile argument, then $aut->{dictFile}, then $aut->{dict}{dictFile}
##  + returns $aut unchanged if no filename can be determined
##  + $aut->{dict} may be undef, a plain option hash, or an already-blessed dict object
##  + returns undef if the loaded dict fails its dictOk() check
sub loadDict {
  my ($aut,$dictfile) = @_;
  require Scalar::Util;

  my $olddict = $aut->{dict};
  $dictfile = $aut->{dictFile} if (!defined($dictfile));
  ##-- only peek into an existing hash-based dict:
  ##   a bare $aut->{dict}{dictFile} lookup would autovivify $aut->{dict}={} as a side effect
  $dictfile = $olddict->{dictFile}
    if (!defined($dictfile) && (Scalar::Util::reftype($olddict)||'') eq 'HASH');
  return $aut if (!defined($dictfile)); ##-- no dict file to load
  $aut->info("loading exception lexicon from '$dictfile'");

  ##-- sanitize dict object
  ##  + take the class from a *blessed* dict only: ref() of a plain option hash is 'HASH',
  ##    which is not a usable class name
  my $dclass = (Scalar::Util::blessed($olddict)||$aut->{dictClass}||'DTA::CAB::Analyzer::Dict');
  my $dict = $aut->{dict} = bless(_unifyClobber($dclass->new,$olddict,undef), $dclass);
  $dict->{label} = $aut->{label}."_dict"; ##-- force sub-analyzer label
  $dict->{dictFile} = $dictfile; ##-- clobber sub-analyzer file

  ##-- load dict object
  $dict->ensureLoaded();
  return undef if (!$dict->dictOk);
  return $aut;
}
##==============================================================================
## Methods: Persistence
##==============================================================================
##======================================================================
## Methods: Persistence: Perl
## @keys = $class_or_obj->noSaveKeys()
##  + list of object keys which should not be saved:
##    the inherited keys followed by this class's analysis objects and label tables
sub noSaveKeys {
  my ($that) = @_;
  return (
    $that->SUPER::noSaveKeys,
    'dict',
    'fst',
    'lab',
    'laba',
    'labc',
    'labh',
    'result',
  );
}
## $saveRef = $obj->savePerlRef()
## + inherited from DTA::CAB::Persistent
## $loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref)
##  + delegates to the inherited loadPerlRef(), then calls clear() on the resulting object
##  + returns the (cleared) object
sub loadPerlRef {
  my $that   = shift;
  my $ref    = shift;
  my $loaded = $that->SUPER::loadPerlRef($ref);
  $loaded->clear();
  return $loaded;
}
##==============================================================================
## Methods: Analysis
##==============================================================================
CAB/Analyzer/Automaton/Dyn.pm view on Meta::CPAN
=head2 Globals
=over 4
=item Variable: @ISA
DTA::CAB::Analyzer::Automaton::Dyn
inherits from
L<DTA::CAB::Analyzer|DTA::CAB::Analyzer>.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Constructors etc.
=pod
=head2 Constructors etc.
=over 4
=item new
$aut = CLASS_OR_OBJ->new(%args);
Constructor.
%args, %$aut:
##-- Filename Options
fstFile => $filename, ##-- default: none
labFile => $filename, ##-- default: none
dictFile=> $filename, ##-- default: none (clobbers $aut->{dict}{dictFile} if defined)
##
##-- Analysis Output
analysisClass => $class, ##-- default: none (ARRAY)
analyzeSrc => $key, ##-- source key for analysis (default: 'text')
analyzeDst => $key, ##-- token output key (default: from __PACKAGE__)
wantAnalysisLo => $bool, ##-- set to true to include 'lo' keys in analyses (default: true)
##
##-- Analysis Options
eow => $sym, ##-- EOW symbol for analysis FST
check_symbols => $bool, ##-- check for unknown symbols? (default=1)
labenc => $enc, ##-- encoding of labels file (default='auto': utf8 > latin1)
#dictenc => $enc, ##-- dictionary encoding (default='utf8') : prefer $aut->{dict}{encoding}
auto_connect => $bool, ##-- whether to call $result->_connect() after every lookup (default=0)
tolower => $bool, ##-- if true, all input words will be bashed to lower-case (default=0)
tolowerNI => $bool, ##-- if true, all non-initial characters of inputs will be lower-cased (default=0)
toupperI => $bool, ##-- if true, initial character will be upper-cased (default=0)
bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
##
##-- Analysis objects
fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
result=>$resultfst, ##-- (child classes only) e.g. result fst
dict => $dict, ##-- exception lexicon / static cache as DTA::CAB::Analyzer::Dict object
=item clear
$aut = $aut->clear();
Clears the object.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Methods: Generic
=pod
=head2 Methods: Generic
=over 4
=item fstClass
$class = $aut->fstClass();
Returns default FST class for L</loadFst>() method.
Used by sub-classes.
=item labClass
$class = $aut->labClass();
Returns default alphabet class for L</loadLabels>() method.
Used by sub-classes.
=item fstOk
$bool = $aut->fstOk();
Should return false iff fst is undefined or "empty".
=item labOk
$bool = $aut->labOk();
Should return false iff alphabet (label-set) is undefined or "empty".
=item dictOk
$bool = $aut->dictOk();
Should return false iff dict is undefined or "empty".
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Methods: I/O
=pod
( run in 2.718 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )