DTA-CAB
view release on metacpan or search on metacpan
CAB/Analyzer/Automaton.pm view on Meta::CPAN
## $DEFAULT_ANALYZE_GET
## + default eval-able string for {analyzeGet}
## + as written: yields the empty list (qw()) if the token $_ already has a true
##   value under $_->{$lab}; otherwise evaluates the code appended from _am_xlit
## + NOTE(review): _am_xlit is not defined in this part of the file; presumably it
##   returns a code string accessing the token's source text -- confirm
## + vars (as referenced by the string itself):
## $lab => analyzer label
## $_ => token
our $DEFAULT_ANALYZE_GET = '$_->{$lab} ? qw() : '._am_xlit;
## $DEFAULT_ANALYZE_SET
## + default coderef or eval-able string for {analyzeSet}
## + as written: stores a shallow copy of @$wa under $_->{$lab} if $wa is a
##   non-empty array-ref, and stores undef otherwise
## + vars:
## $anl => analyzer (automaton)
## $lab => analyzer label
## $_ => token
## $wa => analyses (array-ref)
our $DEFAULT_ANALYZE_SET = '$_->{$lab} = ($wa && @$wa ? [@$wa] : undef);';
##==============================================================================
## Constructors etc.
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure:
## (
## ##-- Filename Options
## fstFile => $filename, ##-- source FST file (default: none)
## labFile => $filename, ##-- source labels file (default: none)
##
## ##-- Exception lexicon options (OBSOLETE for CAB >=v1.15)
## #dictFile => $filename, ##-- source dict file (default: none): clobbers $dict->{dictFile} if defined
## #dict => $dict, ##-- exception lexicon as a DTA::CAB::Analyzer::Dict object or option hash
## # ## + default=undef
## #dictClass => $class, ##-- fallback class for new dict (default='DTA::CAB::Analyzer::Dict')
##
## ##-- Analysis Output
## analyzeGet => $code, ##-- accessor: coderef or string: source text (default=$DEFAULT_ANALYZE_GET; return undef for no analysis)
## analyzeSet => $code, ##-- accessor: coderef or string: set analyses (default=$DEFAULT_ANALYZE_SET)
## analyzePre => $code, ##-- coderef or string: pre-analysis code to call (called as $code->($word))
## analyzePost => $code, ##-- coderef or string: post-analysis code to call (called as $code->($word))
## wantAnalysisLo => $bool, ##-- set to true to include 'lo' keys in analyses (default: true)
## #wantAnalysisFst => $bool, ##-- set to true to include 'fst' key in analyses (default: false)
## wantAnalysisLemma => $bool, ##-- set to true to include 'lemma' keys in analyses (default: false)
##
## ##-- Analysis Options
## eow => $sym, ##-- EOW symbol for analysis FST
## check_symbols => $bool, ##-- check for unknown symbols? (default=1)
## labenc => $enc, ##-- encoding of labels file (default='auto' [utf8 if applicable, otherwise latin1])
## #dictenc => $enc, ##-- dictionary encoding (default='UTF-8') (set $aut->{dict}{encoding} instead)
## auto_connect => $bool, ##-- whether to call $result->_connect() after every lookup (default=0)
## tolower => $bool, ##-- if true, all input words will be bashed to lower-case (default=0)
## tolowerNI => $bool, ##-- if true, all non-initial characters of inputs will be lower-cased (default=0)
## toupperI => $bool, ##-- if true, initial character will be upper-cased (default=0)
## capsFallback => $bool, ##-- if true, all-caps words will also be analyzed as ucfirst(lc($w)) (default=0)
## qsFallback => $bool, ##-- if true, "-'s" suffixes will also be analyzed as "-s" (default=0)
## bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
## attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
## attOutput => $bool, ##-- if true, generate AT&T escapes in output (default=1)
## allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
## ## : useful: /^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$/
## ## : == DTA::CAB::Analyzer::_am_wordlike_regex()
##
## ##-- Analysis objects
## fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton object (default=new)
## lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
## labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
## laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
## labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
## warned_symbols => \%sym2undef, ##-- tracks unknown symbols we've already warned about (for check_symbols != 0)
##
## ##-- INHERITED from DTA::CAB::Analyzer
## label => $label, ##-- analyzer label (default: from analyzer class name)
## typeKeys => \@keys, ##-- type-wise keys to expand
## )
sub new {
  my $that = shift;

  ##-- built-in defaults, listed in the order they are handed to the parent
  ##   constructor; caller-supplied (key=>value) pairs are appended after them
  my @defaults =
    (
     ##-- filenames
     fstFile => undef,
     labFile => undef,

     ##-- analysis objects
     fst    => undef,
     lab    => undef,
     result => undef,
     labh   => {},
     laba   => [],
     labc   => [],
     warned_symbols => {},

     ##-- options
     eow            => '',
     check_symbols  => 1,
     labenc         => 'auto',
     auto_connect   => 0,
     tolower        => 0,
     tolowerNI      => 0,
     toupperI       => 0,
     capsFallback   => 0,
     qsFallback     => 0,
     bashWS         => '_',
     attInput       => 0,
     attOutput      => 1,
     ##-- no text filter by default; a word-like pattern formerly suggested here:
     ##   '^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$'
     ##   (noted in the file as == DTA::CAB::Analyzer::_am_wordlike_regex())
     allowTextRegex => undef,

     ##-- analysis I/O
     analyzeSrc        => 'text',
     wantAnalysisLo    => 1,
     wantAnalysisLemma => 0,
    );

  ##-- delegate construction to the parent class: defaults first, user args last
  my $aut = $that->SUPER::new(@defaults, @_);
  return $aut;
}
## $aut = $aut->clear()
sub clear {
my $aut = shift;
##-- analysis sub(s)
$aut->dropClosures();
##-- analysis objects
delete($aut->{fst});
delete($aut->{lab});
delete($aut->{result}); ##-- should be undef anyways
CAB/Analyzer/Automaton.pm view on Meta::CPAN
return $rc;
}
##--------------------------------------------------------------
## Methods: I/O: Input: FST
## $aut = $aut->loadFst($fstfile)
sub loadFst {
  my $aut     = shift;
  my $fstfile = shift;

  $aut->info("loading FST file '$fstfile'");

  ##-- lazily create the FST object using the class reported by fstClass()
  unless (defined($aut->{fst})) {
    $aut->{fst} = $aut->fstClass->new();
  }

  ##-- load the file; a false return value is treated as fatal
  my $fst = $aut->{fst};
  if (!$fst->load($fstfile)) {
    $aut->logconfess("loadFst(): load failed for '$fstfile': $!");
  }

  ##-- drop the cached result FST and the cached {_analyze} entry
  delete(@$aut{qw(result _analyze)});

  return $aut;
}
## $result = $aut->resultFst()
## + returns empty result FST
sub resultFst {
  my ($aut) = @_;
  ##-- a shadow of the loaded FST if there is one, otherwise undef
  my $fst = $aut->{fst};
  return $fst ? $fst->shadow : undef;
}
##--------------------------------------------------------------
## Methods: I/O: Input: Labels
## $aut = $aut->loadLabels($labfile)
sub loadLabels {
  my ($aut,$labfile) = @_;
  $aut->info("loading labels file '$labfile'");

  ##-- lazily create the alphabet object using the class reported by labClass()
  $aut->{lab} = $aut->labClass->new() if (!defined($aut->{lab}));
  $aut->{lab}->load($labfile)
    or $aut->logconfess("loadLabels(): load failed for '$labfile': $!");

  if (!$aut->{labenc} || $aut->{labenc} eq 'auto') {
    ##-- guess label encoding: utf8 if all symbols together form valid UTF-8, else latin1
    ##   + the label array may contain undefined slots (parseLabels() skips them too),
    ##     so filter them here to avoid "uninitialized value in join" warnings
    my $buf = join('', grep {defined($_)} @{$aut->{lab}->toArray});
    $aut->{labenc} = utf8::decode($buf) ? 'utf8' : 'latin1';
    $aut->debug("loadLabels(): guessed label encoding '$aut->{labenc}'");
  }

  ##-- switch the alphabet object itself to utf8 mode if it supports that
  $aut->{lab}->utf8(1)
    if ($aut->{lab}->can('utf8') && (($aut->{labenc}||'') =~ /^utf\-?8$/i));

  ##-- (re-)build the perl-side lookup tables and drop the cached {_analyze} entry
  $aut->parseLabels();
  delete($aut->{_analyze});
  return $aut;
}
## $aut = $aut->parseLabels()
## + sets up $aut->{labh}, $aut->{laba}, $aut->{labc}
## + fixes encoding difficulties in $aut->{labh}, $aut->{laba}
sub parseLabels {
  my $aut = shift;

  ##-- laba: $sym = $laba->[$labId]
  ##   + make sure the array lives in the object, not just in a local copy
  my $laba = ($aut->{laba} ||= []);
  @$laba = @{$aut->{lab}->asArray};

  ##-- reset derived tables so that symbols from a previously parsed alphabet do not linger
  %{$aut->{labh}} = ();
  @{$aut->{labc}} = ();

  ##-- labh: $labId = $labh->{$sym}
  ##   + decode raw byte strings according to {labenc}
  ##   + symbols which are already perl character strings (utf8 flag set, e.g. from an
  ##     alphabet object in utf8 mode) must not be decoded a second time
  my $labenc = $aut->{labenc};
  foreach my $i (grep { defined($laba->[$_]) } 0..$#$laba) {
    $laba->[$i] = decode($labenc, $laba->[$i])
      if ($labenc && !utf8::is_utf8($laba->[$i]));
    $aut->{labh}{$laba->[$i]} = $i;
  }

  ##-- setup labc: $labId = $labc->[ord($c)]; ##-- single unicode character
  ## : @labIds = @$labc[unpack('U0U*',$s)]; ##-- batch lookup for strings (fast)
  my @csyms = grep {defined($_) && length($_)==1} @$laba; ##-- @csyms = ($sym1, ...) s.t. each sym has len==1
  @{$aut->{labc}}[map {ord($_)} @csyms] = @{$aut->{labh}}{@csyms};

  return $aut;
}
##==============================================================================
## Methods: Persistence
##==============================================================================
##======================================================================
## Methods: Persistence: Perl
## @keys = $class_or_obj->noSaveKeys()
## + returns list of keys not to be saved
sub noSaveKeys {
  my ($that) = @_;
  ##-- inherited keys first, then the automaton's own run-time-only keys
  return (
          $that->SUPER::noSaveKeys,
          qw(dict fst lab laba labc labh result),
         );
}
## $saveRef = $obj->savePerlRef()
## + inherited from DTA::CAB::Persistent
## $loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref)
## + implicitly calls $obj->clear()
sub loadPerlRef {
  my $that = shift;
  my $ref  = shift;

  ##-- let the parent class build the object, then clear() it before handing it back
  my $obj = $that->SUPER::loadPerlRef($ref);
  $obj->clear();

  return $obj;
}
##==============================================================================
## Methods: Analysis
##==============================================================================
##------------------------------------------------------------------------
## Methods: Analysis: Generic
## $bool = $anl->canAnalyze()
## + returns true if analyzer can perform its function (e.g. data is loaded & non-empty)
sub canAnalyze {
  my ($anl) = @_;
  ##-- both the label set and the FST must report ok; fstOk() is only asked if labOk() is true
  return ($anl->labOk && $anl->fstOk);
}
##==============================================================================
## Methods: Analysis: v1.x
##==============================================================================
## $doc = $anl->analyzeTypes($doc,\%types,\%opts)
## + perform type-wise analysis of all (text) types in %types (= %{$doc->{types}})
sub analyzeTypes {
my ($aut,$doc,$types,$opts) = @_;
return if (!$aut->canAnalyze);
$types = $doc->types if (!$types);
##-- setup common variables
CAB/Analyzer/Automaton.pm view on Meta::CPAN
=pod
=head2 Globals
=over 4
=item Variable: @ISA
DTA::CAB::Analyzer::Automaton
inherits from
L<DTA::CAB::Analyzer|DTA::CAB::Analyzer>.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::Automaton: Constructors etc.
=pod
=head2 Constructors etc.
=over 4
=item new
$aut = CLASS_OR_OBJ->new(%args);
Constructor.
%args, %$aut:
##-- Filename Options
fstFile => $filename, ##-- source FST file (default: none)
labFile => $filename, ##-- source labels file (default: none)
##
##-- Analysis Output
analyzeGet => $code, ##-- accessor: coderef or string: source text (default=$DEFAULT_ANALYZE_GET; return undef for no analysis)
analyzeSet => $code, ##-- accessor: coderef or string: set analyses (default=$DEFAULT_ANALYZE_SET)
wantAnalysisLo => $bool, ##-- set to true to include 'lo' keys in analyses (default: true)
wantAnalysisLemma => $bool, ##-- set to true to include 'lemma' keys in analyses (default: false)
##
##-- Analysis Options
eow => $sym, ##-- EOW symbol for analysis FST
check_symbols => $bool, ##-- check for unknown symbols? (default=1)
labenc => $enc, ##-- encoding of labels file (default='auto': utf8 if valid, else latin1)
auto_connect => $bool, ##-- whether to call $result->_connect() after every lookup (default=0)
tolower => $bool, ##-- if true, all input words will be bashed to lower-case (default=0)
tolowerNI => $bool, ##-- if true, all non-initial characters of inputs will be lower-cased (default=0)
toupperI => $bool, ##-- if true, initial character will be upper-cased (default=0)
bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
attOutput => $bool, ##-- if true, generate AT&T escapes in output (default=1)
allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
## : useful: /(?:^[[:alpha:]\-\x{ac}]*[[:alpha:]]+$)|(?:^[[:alpha:]]+[[:alpha:]\-\x{ac}]+$)/
##-- Analysis objects
fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton object (default=new)
lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
##
##-- INHERITED from DTA::CAB::Analyzer
label => $label, ##-- analyzer label (default: from analyzer class name)
typeKeys => \@keys, ##-- type-wise keys to expand
=item clear
$aut = $aut->clear();
Clears the object.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::Automaton: Methods: Generic
=pod
=head2 Methods: Generic
=over 4
=item fstClass
$class = $aut->fstClass();
Returns default FST class for L</loadFst>() method.
Used by sub-classes.
=item labClass
$class = $aut->labClass();
Returns default alphabet class for L</loadLabels>() method.
Used by sub-classes.
=item fstOk
$bool = $aut->fstOk();
Should return false iff fst is undefined or "empty".
=item labOk
$bool = $aut->labOk();
Should return false iff alphabet (label-set) is undefined or "empty".
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::Automaton: Methods: I/O
=pod
=head2 Methods: I/O
=over 4
( run in 2.423 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )