DTA-CAB
view release on metacpan or search on metacpan
CAB/Chain/DTA.pm view on Meta::CPAN
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.rw' =>[@$ach{qw(tokpp xlit rw)}],
'default.rw.safe'=>[@$ach{qw(tokpp xlit morph mlatin msafe langid rw)}],
'default.dmoot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot)}],
'default.dmoot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1)}],
'default.moot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1 mootsub)}],
'default.ner' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot mootsub ner)}],
'default.base' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub)}],
##
'expand.old' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw)}],
'expand.ext' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox)}],
'expand.all' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.eqpho' =>[@$ach{qw(static exlex xlit lts eqpho)}],
'expand.eqrw' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqrw)}],
'expand.eqlemma' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.gn-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn)}],
'expand.gn-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-isa)}],
'expand.gn-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-asi)}],
'expand.gn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn gn-isa gn-asi)}],
'expand.ot-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn)}],
'expand.ot-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-isa)}],
'expand.ot-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-asi)}],
'expand.ot' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn ot-isa ot-asi)}],
##
'norm' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub)}],
'norm1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub)}],
'ner' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub ner)}],
'caberr' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub mapclass)}],
'caberr1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub mapclass)}],
'all' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub eqpho eqrw eqphox dmoot dmootsub moot mootsub eqlemma)}], ##-- old dta clients use 'all'!
'clean' =>[@$ach{qw(clean)}],
'null' =>[$ach->{null}],
};
#$chains->{'default'} = [map {@{$chains->{$_}}} qw(default.type sub.sent)];
##-- chain aliases
$chains->{'default'} = $chains->{lemma} = $chains->{'norm'};
$chains->{'default1'} = $chains->{lemma1} = $chains->{'norm1'};
$chains->{'expand'} = $chains->{'expand.all'};
##-- BEGIN TEMPORARY custom chain(s): "hlgl": place-name recognition Hessen
$chains->{"norm.hlgl"} = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessen} : qw())} @{$chains->{norm}}];
$chains->{"norm1.hlgl"} = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessen} : qw())} @{$chains->{norm1}}];
##
$chains->{"norm.hlgl.geo"} = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessengeo} : qw())} @{$chains->{norm}}];
$chains->{"norm1.hlgl.geo"} = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessengeo} : qw())} @{$chains->{norm1}}];
##-- END TEMPORARY custom chain(s)
##-- date-dependent chains
foreach my $rng (@RW_RANGES) {
if ($ach->{"rw.$rng"} && ($ach->{"rw.$rng"}{enabled}//1)) {
foreach my $key (qw(norm norm1 lemma lemma1 default default1 expand)) {
$chains->{"$key.$rng"} = [map {$_ eq $ach->{rw} ? $ach->{"rw.$rng"} : $_} @{$chains->{$key}}];
}
} else {
$ach->warn("optimized rewrite cascade rw.$rng not available: disabling derived chains for range $rng");
delete $ach->{"rw.$rng"};
delete $chains->{"sub.rw.$rng"};
}
}
##-- sanitize chains
foreach (values %{$ach->{chains}}) {
@$_ = grep {ref($_)} @$_;
}
##-- set default chain
$ach->{chain} = $ach->{chains}{$ach->{defaultChain}};
##-- force default labels
foreach (grep {UNIVERSAL::isa($ach->{$_},'DTA::CAB::Analyzer')} keys(%$ach)) {
next if ($_ =~ /^(?:langid|rw\.[0-9\-]+|mhessen(?:geo)?)$/); ##-- keep labels for these analyzers
($ach->{$_}{label} = $_) =~ s/1$//; ##-- truncate '1' suffix for label (e.g. dmoot1, moot1)
}
return $ach;
}
## \@analyzers = $ach->chain()
## \@analyzers = $ach->chain(\%opts)
## + get selected analyzer chain
## + inherited from DTA::CAB::Chain::Multi
## - calls setupChains() if $ach->{chain} is empty
## - checks for $opts{chain} and returns $ach->{chains}{ $opts{chain} } if available
## $ach = $ach->ensureChain()
## + checks for $ach->{chain}, calls $ach->setupChains() if needed
## + inherited from DTA::CAB::Chain::Multi
##==============================================================================
## Methods: I/O
##==============================================================================
## $bool = $ach->ensureLoaded()
## + ensures analysis data is loaded from default files
## + inherited DTA::CAB::Chain::Multi override calls ensureChain() before inherited method
sub ensureLoaded {
  ##-- $bool = $ach->ensureLoaded(@args)
  ##   + delegates to the superclass ensureLoaded(); returns 0 if that call is false, 1 otherwise
  ##   + afterwards replaces every not-yet-marked member of the 'rwsub' and 'dmootsub'
  ##     chains by a shallow copy carrying its own label and 'enabled' flag
  my $ach = shift;
  return 0 unless $ach->SUPER::ensureLoaded(@_);

  for my $key (qw(rwsub dmootsub)) {
    my $sub = $ach->{$key};
    next unless ref($sub);

    my $mark = "_$key";   ##-- per-member flag: member was already copied for this sub-analyzer

    ##-- $member aliases the chain slot, so assigning to it replaces the chain entry in place
    for my $member (@{$sub->{chain}}) {
      next if $member->{$mark};   ##-- already processed by an earlier call: leave untouched

      my $copy = bless({ %$member }, ref($member));   ##-- shallow copy, same class
      $copy->{label}   = $key . '_' . $copy->{label};
      $copy->{enabled} = $sub->{enabled};
      $copy->{$mark}   = 1;
      $member = $copy;
    }
  }
  return 1;
}
##==============================================================================
CAB/Chain/DTA.pm view on Meta::CPAN
## $tok = $ach->analyzeSentence($sent_or_array,\%opts)
## + perform type-, token-, and sentence-analyses on $sent_or_array
## + wrapper for $ach->analyzeDocument()
## + INHERITED from DTA::CAB::Analyzer
## $rpc_xml_base64 = $anl->analyzeData($data_str,\%opts)
## + analyze a raw (formatted) data string $data_str with internal parsing & formatting
## + wrapper for $anl->analyzeDocument()
## + INHERITED from DTA::CAB::Analyzer
##==============================================================================
## Methods: XML-RPC
## + INHERITED from DTA::CAB::Chain::Multi
1; ##-- be happy
__END__
##========================================================================
## POD DOCUMENTATION, auto-generated by podextract.perl, edited
##========================================================================
## NAME
=pod
=head1 NAME
DTA::CAB::Chain::DTA - Deutsches Textarchiv canonicalization chain class
=cut
##========================================================================
## SYNOPSIS
=pod
=head1 SYNOPSIS
use DTA::CAB::Chain::DTA;
##========================================================================
## Methods
$obj = CLASS_OR_OBJ->new(%args);
$ach = $ach->setupChains();
$bool = $ach->ensureLoaded();
$bool = $anl->doAnalyze(\%opts, $name);
$doc = $ach->analyzeClean($doc,\%opts);
=cut
##========================================================================
## DESCRIPTION
=pod
=head1 DESCRIPTION
DTA::CAB::Chain::DTA
is the L<DTA::CAB::Analyzer|DTA::CAB::Analyzer> subclass implementing
the robust orthographic canonicalization cascade used in the
I<Deutsches Textarchiv> project. This class inherits from
L<DTA::CAB::Chain::Multi|DTA::CAB::Chain::Multi>.
See the L</setupChains> method for a list of supported sub-chains
and the corresponding analyzers.
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Chain::DTA: Methods
=pod
=head2 Methods
=over 4
=item new
$obj = CLASS_OR_OBJ->new(%args);
%$obj, %args:
##-- paranoia
autoClean => 0, ##-- always run 'clean' analyzer regardless of options; checked in both doAnalyze(), analyzeClean()
defaultChain => 'default',
##
##-- overrides
chains => undef, ##-- see setupChains() method
chain => undef, ##-- see setupChains() method
Additionally, the following sub-analyzers are defined
as fields of %$obj:
=over 4
=item tokpp
Token preprocessor,
a L<DTA::CAB::Analyzer::TokPP|DTA::CAB::Analyzer::TokPP> object.
=item xlit
Transliterator,
a L<DTA::CAB::Analyzer::Unicruft|DTA::CAB::Analyzer::Unicruft> object.
=item lts
Phonetizer (Letter-to-Sound mapper),
a L<DTA::CAB::Analyzer::LTS|DTA::CAB::Analyzer::LTS> object.
=item morph
Morphological analyzer (TAGH),
a L<DTA::CAB::Analyzer::Morph|DTA::CAB::Analyzer::Morph> object.
=item mlatin
Latin pseudo-morphology,
a L<DTA::CAB::Analyzer::Morph::Latin|DTA::CAB::Analyzer::Morph::Latin> object.
=item msafe
Morphological security heuristics,
a L<DTA::CAB::Analyzer::MorphSafe|DTA::CAB::Analyzer::MorphSafe> object.
=item rw
Weighted finite-state rewrite cascade,
a L<DTA::CAB::Analyzer::Rewrite|DTA::CAB::Analyzer::Rewrite> object.
Date-optimized variants C<rw.1600-1700>, C<rw.1700-1800>, and C<rw.1800-1900> may also be included.
=item rwsub
Post-processing for rewrite cascade,
a L<DTA::CAB::Analyzer::RewriteSub|DTA::CAB::Analyzer::RewriteSub> object.
=item eqphox
Intensional (TAGH-based) phonetic equivalence expander,
a L<DTA::CAB::Analyzer::EqPhoX|DTA::CAB::Analyzer::EqPhoX> object.
=item eqpho
Extensional (corpus-based) phonetic equivalence expander,
a L<DTA::CAB::Analyzer::EqPho|DTA::CAB::Analyzer::EqPho> object.
=item eqrw
Extensional rewrite-equivalence expander,
a L<DTA::CAB::Analyzer::EqRW|DTA::CAB::Analyzer::EqRW> object.
=item dmoot
Token-level dynamic HMM conflation disambiguator,
a L<DTA::CAB::Analyzer::Moot::DynLex|DTA::CAB::Analyzer::Moot::DynLex> object.
=item dmootsub
Post-processing for L</dmoot> analyzer,
a L<DTA::CAB::Analyzer::DmootSub|DTA::CAB::Analyzer::DmootSub> object.
=item moot
HMM part-of-speech tagger,
a L<DTA::CAB::Analyzer::Moot|DTA::CAB::Analyzer::Moot> object.
=item mootsub
Post-processing for L</moot> tagger,
a L<DTA::CAB::Analyzer::MootSub|DTA::CAB::Analyzer::MootSub> object.
=item eqlemma
Extensional (corpus-based) lemma-equivalence class expander,
a L<DTA::CAB::Analyzer::EqLemma|DTA::CAB::Analyzer::EqLemma> object.
=item clean
Janitor (paranoid removal of internal temporary data),
a L<DTA::CAB::Analyzer::DTAClean|DTA::CAB::Analyzer::DTAClean> object.
=back
=item setupChains
$ach = $ach->setupChains();
Setup default named sub-chains in $ach-E<gt>{chains}.
Currently defines a singleton chain C<sub.NAME>
for each analyzer key in keys(%$ach), as well as the following
non-trivial chains:
'sub.expand' =>[@$ach{qw(eqpho eqrw eqlemma)}],
'sub.sent' =>[@$ach{qw(dmoot dmootsub moot mootsub)}],
'sub.sent1' =>[@$ach{qw(dmoot1 dmootsub moot1 mootsub)}],
'sub.gn' =>[@$ach{qw(gn-syn gn-isa gn-asi)}],
'sub.ot' =>[@$ach{qw(ot-syn ot-isa ot-asi)}],
##
'default.static' =>[@$ach{qw(static)}],
'default.exlex' =>[@$ach{qw(exlex)}],
'default.tokpp' =>[@$ach{qw(tokpp)}],
'default.xlit' =>[@$ach{qw(xlit)}],
'default.lts' =>[@$ach{qw(xlit lts)}],
'default.eqphox' =>[@$ach{qw(tokpp xlit lts eqphox)}],
'default.morph' =>[@$ach{qw(tokpp xlit morph)}],
'default.mlatin' =>[@$ach{qw(tokpp xlit mlatin)}],
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.rw' =>[@$ach{qw(tokpp xlit rw)}],
'default.rw.safe'=>[@$ach{qw(tokpp xlit morph mlatin msafe langid rw)}],
'default.dmoot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot)}],
'default.dmoot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1)}],
'default.moot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1 mootsub)}],
'default.ner' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot mootsub ner)}],
'default.base' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub)}],
##
'expand.old' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw)}],
'expand.ext' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox)}],
'expand.all' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.eqpho' =>[@$ach{qw(static exlex xlit lts eqpho)}],
'expand.eqrw' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqrw)}],
'expand.eqlemma' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.gn-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn)}],
'expand.gn-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-isa)}],
'expand.gn-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-asi)}],
'expand.gn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn gn-isa gn-asi)}],
'expand.ot-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn)}],
'expand.ot-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-isa)}],
'expand.ot-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-asi)}],
'expand.ot' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn ot-isa ot-asi)}],
##
'norm' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub)}],
'norm1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub)}],
'ner' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub ner)}],
'caberr' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub mapclass)}],
'caberr1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub mapclass)}],
'all' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub eqpho eqrw eqphox dmoot dmootsub moot mootsub eqlemma)}],
'clean' =>[@$ach{qw(clean)}],
##
'null' =>[$ach->{null}],
High-level date-optimized chains C<norm.RNG>, C<norm1.RNG>, C<lemma.RNG>, C<lemma1.RNG>, C<default.RNG>, and C<expand.RNG>
are also defined using the date-optimized rewrite cascade C<rw.RNG> in place of the default "generic" cascade C<rw>
for each range I<RNG> in C<1600-1700>, C<1700-1800>, and C<1800-1900>.
=item ensureLoaded
$bool = $ach->ensureLoaded();
Ensures analysis data is loaded from default files.
Inherited DTA::CAB::Chain::Multi override calls ensureChain() before inherited method.
Hack: AFTER loading, each member of the C<rwsub> and C<dmootsub> sub-analyzer chains is replaced by a shallow copy
whose label is prefixed with the sub-analyzer key and whose 'enabled' flag is taken from that sub-analyzer.
=item doAnalyze
$bool = $anl->doAnalyze(\%opts, $name);
Alias for $anl-E<gt>can("analyze${name}") && (!exists($opts{"doAnalyze${name}"}) || $opts{"doAnalyze${name}"}).
Override checks $anl-E<gt>{autoClean} flag.
=item analyzeClean
$doc = $ach->analyzeClean($doc,\%opts);
Cleanup any temporary data associated with $doc.
Chain default calls $a-E<gt>analyzeClean for each analyzer $a in the chain,
then superclass Analyzer-E<gt>analyzeClean.
Local override checks $ach-E<gt>{autoClean}.
=back
=cut
##========================================================================
## END POD DOCUMENTATION, auto-generated by podextract.perl
##======================================================================
## Footer
##======================================================================
=pod
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2010-2019 by Bryan Jurish
This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.
=head1 SEE ALSO
L<dta-cab-analyze.perl(1)|dta-cab-analyze.perl>,
L<DTA::CAB::Chain::Multi(3pm)|DTA::CAB::Chain::Multi>,
L<DTA::CAB::Chain(3pm)|DTA::CAB::Chain>,
L<DTA::CAB::Analyzer(3pm)|DTA::CAB::Analyzer>,
L<DTA::CAB(3pm)|DTA::CAB>,
( run in 0.814 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )