DTA-CAB
view release on metacpan or search on metacpan
CAB/Chain/EN.pm view on Meta::CPAN
## -*- Mode: CPerl -*-
## File: DTA::CAB::Chain::EN.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: robust analysis: default chain (english)
package DTA::CAB::Chain::EN;
use DTA::CAB::Datum ':all';
use DTA::CAB::Chain::Multi;
##-- sub-analyzers
use DTA::CAB::Analyzer::TokPP;
use DTA::CAB::Analyzer::Morph::Helsinki::EN;
use DTA::CAB::Analyzer::Morph::Latin;
use DTA::CAB::Analyzer::MorphSafe;
use DTA::CAB::Analyzer::Moot;
use DTA::CAB::Analyzer::MootSub;
use DTA::CAB::Analyzer::LangId::Simple;
use DTA::CAB::Analyzer::DTAClean;
use DTA::CAB::Analyzer::Null;
use IO::File;
use Carp;
use strict;
##==============================================================================
## Constants
##==============================================================================
our @ISA = qw(DTA::CAB::Chain::Multi);
##==============================================================================
## Constructors etc.
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
##  + object structure: HASH
##  + instantiates the default sub-analyzers and hands them, together with the
##    caller's %args, to the superclass constructor
##  + %args are placed after the defaults, so they may replace any analyzer or
##    option set below; {chains} and {chain} come last and are therefore always
##    reset to undef here (they are populated by setupChains())
sub new {
  my $that = shift;

  ##-- the transliterator is the only sub-analyzer class which this file does
  ##   not load via a top-level 'use': load it explicitly instead of relying
  ##   on some other module having pulled it in (no-op if already loaded)
  require DTA::CAB::Analyzer::Unicruft;

  return $that->SUPER::new
    (
     ##-- analyzers: preprocessing
     tokpp => DTA::CAB::Analyzer::TokPP->new(),
     xlit  => DTA::CAB::Analyzer::Unicruft->new(),

     ##-- analyzers: morphology
     morph => DTA::CAB::Analyzer::Morph::Helsinki::EN->new(),
     mlatin=> DTA::CAB::Analyzer::Morph::Latin->new(),
     msafe => DTA::CAB::Analyzer::MorphSafe->new(), ##-- remove this for en-chain?

     ##-- analyzers: tagging
     ##   + 'moot' and 'moot1' are constructed with identical arguments here;
     ##     NOTE(review): original notes describe 'moot1' as the 1-gram-only
     ##     variant -- presumably configured elsewhere, confirm
     moot    => DTA::CAB::Analyzer::Moot->new(lang=>'en'),
     moot1   => DTA::CAB::Analyzer::Moot->new(lang=>'en'),
     mootsub => DTA::CAB::Analyzer::MootSub->new(ucTags=>[],stts=>0,wMorph=>.2), ##-- moot post-processing

     ##-- analyzers: language guesser (runs after msafe in the default.* chains)
     langid => DTA::CAB::Analyzer::LangId::Simple->new(defaultLang=>'en'),

     ##-- analyzers: cleanup
     clean => DTA::CAB::Analyzer::DTAClean->new(),

     ##-- analyzers: null analyzer (for 'null' chain)
     null => DTA::CAB::Analyzer::Null->new(),

     ##-- security
     autoClean => 0,            ##-- if true, doAnalyze() always enables the 'Clean' step
     defaultChain => 'default', ##-- key into {chains} used by setupChains() to select {chain}

     ##-- user args
     @_,

     ##-- overrides
     chains => undef, ##-- see setupChains() method
     chain => undef,  ##-- see setupChains() method
    );
}
##==============================================================================
## Methods: Chain selection
##==============================================================================
## $ach = $ach->setupChains()
##  + sets up default named sub-chains in $ach->{chains}
##    - one singleton chain "sub.KEY" for every key of $ach whose value
##      is a DTA::CAB::Analyzer
##    - the fixed "sub.*", "default.*", 'norm', 'norm1', 'all', 'clean'
##      and 'null' chains listed below
##    - aliases 'default','lemma' => 'norm' and 'default1','lemma1' => 'norm1'
##  + removes non-reference (e.g. missing) entries from every chain
##  + sets $ach->{chain} to the chain named by $ach->{defaultChain}
##  + sets {label} of every sub-analyzer except 'langid' to its key, with a
##    trailing '1' removed
##  + returns $ach
##  + override
##  + adapted from Chain::DTA
sub setupChains {
  my $ach = shift;

  ##-- keys of all sub-analyzer slots in the chain object
  my @akeys = grep {UNIVERSAL::isa($ach->{$_},'DTA::CAB::Analyzer')} keys(%$ach);

  my $chains = $ach->{chains} =
    {
     (map {("sub.$_"=>[$ach->{$_}])} @akeys), ##-- sub.xlit, sub.morph, ...
     #(map {("$_"=>[$ach->{$_}])} @akeys), ##-- xlit, morph, ...
     ##
     'sub.sent'         =>[@$ach{qw(moot mootsub)}],
     'sub.sent1'        =>[@$ach{qw(moot1 mootsub)}],
     ##
     'default.tokpp'    =>[@$ach{qw(tokpp)}],
     'default.xlit'     =>[@$ach{qw(xlit)}],
     'default.morph'    =>[@$ach{qw(tokpp xlit morph)}],
     'default.mlatin'   =>[@$ach{qw(tokpp xlit mlatin)}],
     'default.msafe'    =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
     ##-- exactly one 'default.langid' entry: a second entry with the same key
     ##   would silently replace this one in the hash constructor
     'default.langid'   =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
     'default.moot'     =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot)}],
     'default.moot1'    =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1)}],
     'default.lemma'    =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
     'default.lemma1'   =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
     'default.base'     =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
     'default.type'     =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
     ##
     'norm'             =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
     'norm1'            =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
     'all'              =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}], ##-- old dta clients use 'all'!
     'clean'            =>[@$ach{qw(clean)}],
     'null'             =>[$ach->{null}],
    };
  #$chains->{'default'} = [map {@{$chains->{$_}}} qw(default.type sub.sent)];

  ##-- chain aliases (these share the underlying array with 'norm' / 'norm1')
  $chains->{'default'}  = $chains->{lemma}  = $chains->{'norm'};
  $chains->{'default1'} = $chains->{lemma1} = $chains->{'norm1'};

  ##-- sanitize chains: drop anything which is not a reference (e.g. undef
  ##   slots for analyzers which are absent from $ach), modifying in place
  foreach my $chain (values %$chains) {
    @$chain = grep {ref($_)} @$chain;
  }

  ##-- set default chain
  $ach->{chain} = $chains->{$ach->{defaultChain}};

  ##-- force default labels
  foreach my $key (@akeys) {
    next if ($key =~ /^(?:langid)$/); ##-- keep these labels
    ($ach->{$key}{label} = $key) =~ s/1$//; ##-- truncate '1' suffix for label (e.g. moot1 -> moot)
  }

  return $ach;
}
## \@analyzers = $ach->chain()
## \@analyzers = $ach->chain(\%opts)
## + get selected analyzer chain
## + inherited from DTA::CAB::Chain::Multi
## - calls setupChains() if $ach->{chain} is empty
## - checks for $opts{chain} and returns $ach->{chains}{ $opts{chain} } if available
## $ach = $ach->ensureChain()
## + checks for $ach->{chain}, calls $ach->setupChains() if needed
## + inherited from DTA::CAB::Chain::Multi
##==============================================================================
## Methods: I/O
##==============================================================================
##==============================================================================
## Methods: Persistence
##==============================================================================
##======================================================================
## Methods: Persistence: Perl
## @keys = $class_or_obj->noSaveKeys()
## + returns list of keys not to be saved
## + default just greps for CODE-refs
## + inherited from DTA::CAB::Chain::Multi: override appends {chain},{chains}
## $saveRef = $obj->savePerlRef()
## + return reference to be saved (top-level objects only)
## + inherited from DTA::CAB::Persistent
## $loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref)
## + default implementation just clobbers $CLASS_OR_OBJ with $ref and blesses
## + inherited from DTA::CAB::Persistent
##==============================================================================
## Methods: Analysis
##==============================================================================
##------------------------------------------------------------------------
## Methods: Analysis: v1.x: Utils
## $bool = $anl->doAnalyze(\%opts, $name)
##  + decides whether the analysis step called $name should be run
##  + override: returns 1 for $name 'Clean' whenever $anl->{autoClean} is
##    true, without consulting \%opts
##  + in all other cases returns whatever the inherited doAnalyze() returns
##    for the same (\%opts, $name)
sub doAnalyze {
  my $anl = shift;
  my ($opts, $name) = @_;

  ##-- autoClean: the 'Clean' step is unconditionally enabled
  if ($anl->{autoClean} && $name eq 'Clean') {
    return 1;
  }

  ##-- everything else: defer to the superclass
  return $anl->SUPER::doAnalyze($opts, $name);
}
##------------------------------------------------------------------------
## Methods: Analysis: v1.x: API
CAB/Chain/EN.pm view on Meta::CPAN
Token preprocessor,
a L<DTA::CAB::Analyzer::TokPP|DTA::CAB::Analyzer::TokPP> object.
=item xlit
Transliterator,
a L<DTA::CAB::Analyzer::Unicruft|DTA::CAB::Analyzer::Unicruft> object.
=item morph
Morphological analyzer (Helsinki-style with TAGH emulation hacks),
a L<DTA::CAB::Analyzer::Morph::Helsinki::EN|DTA::CAB::Analyzer::Morph::Helsinki::EN> object.
=item mlatin
Latin pseudo-morphology,
a L<DTA::CAB::Analyzer::Morph::Latin|DTA::CAB::Analyzer::Morph::Latin> object.
=item msafe
Morphological security heuristics,
a L<DTA::CAB::Analyzer::MorphSafe|DTA::CAB::Analyzer::MorphSafe> object.
=item moot
HMM part-of-speech tagger,
a L<DTA::CAB::Analyzer::Moot|DTA::CAB::Analyzer::Moot> object.
=item mootsub
Post-processing for L</moot> tagger,
a L<DTA::CAB::Analyzer::MootSub|DTA::CAB::Analyzer::MootSub> object.
=item clean
Janitor (paranoid removal of internal temporary data),
a L<DTA::CAB::Analyzer::DTAClean|DTA::CAB::Analyzer::DTAClean> object.
=back
=back
=item setupChains
$ach = $ach->setupChains();
Sets up default named sub-chains in $ach-E<gt>{chains}.
Currently defines a singleton chain C<sub.NAME>
for each analyzer key in keys(%$ach), as well as the following
non-trivial chains:

 'sub.sent'         =>[@$ach{qw(moot mootsub)}],
 'sub.sent1'        =>[@$ach{qw(moot1 mootsub)}],
 ##
 'default.tokpp'    =>[@$ach{qw(tokpp)}],
 'default.xlit'     =>[@$ach{qw(xlit)}],
 'default.morph'    =>[@$ach{qw(tokpp xlit morph)}],
 'default.mlatin'   =>[@$ach{qw(tokpp xlit mlatin)}],
 'default.msafe'    =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
 'default.langid'   =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
 'default.moot'     =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot)}],
 'default.moot1'    =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1)}],
 'default.lemma'    =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
 'default.lemma1'   =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
 'default.base'     =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
 'default.type'     =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
 ##
 'norm'             =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
 'norm1'            =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
 'all'              =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}], ##-- old dta clients use 'all'!
 'clean'            =>[@$ach{qw(clean)}],
 'null'             =>[$ach->{null}],

=item ensureLoaded
$bool = $ach->ensureLoaded();
Ensures analysis data is loaded from default files.
Inherited DTA::CAB::Chain::Multi override calls ensureChain() before inherited method.
Hack copies chain sub-analyzers (rwsub, dmootsub) AFTER loading their own sub-analyzers,
setting 'enabled' only then if appropriate.
=item doAnalyze
$bool = $anl->doAnalyze(\%opts, $name);
Alias for $anl-E<gt>can("analyze${name}") && (!exists($opts{"doAnalyze${name}"}) || $opts{"doAnalyze${name}"}).
Override checks $anl-E<gt>{autoClean} flag.
=item analyzeClean
$doc = $ach->analyzeClean($doc,\%opts);
Cleanup any temporary data associated with $doc.
Chain default calls $a-E<gt>analyzeClean for each analyzer $a in the chain,
then superclass Analyzer-E<gt>analyzeClean.
Local override checks $ach-E<gt>{autoClean}.
=back
=cut
##========================================================================
## END POD DOCUMENTATION, auto-generated by podextract.perl
##======================================================================
## Footer
##======================================================================
=pod
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2016-2019 by Bryan Jurish
This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.
=head1 SEE ALSO
L<dta-cab-analyze.perl(1)|dta-cab-analyze.perl>,
L<DTA::CAB::Chain::Multi(3pm)|DTA::CAB::Chain::Multi>,
L<DTA::CAB::Chain(3pm)|DTA::CAB::Chain>,
L<DTA::CAB::Analyzer(3pm)|DTA::CAB::Analyzer>,
( run in 0.981 second using v1.01-cache-2.11-cpan-5735350b133 )