DTA-CAB

 view release on metacpan or  search on metacpan

CAB/Chain/DTA.pm  view on Meta::CPAN

     'default.msafe'  =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
     'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
     'default.rw'     =>[@$ach{qw(tokpp xlit rw)}],
     'default.rw.safe'=>[@$ach{qw(tokpp xlit                         morph mlatin msafe langid rw)}],
     'default.dmoot'  =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot)}],
     'default.dmoot1' =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot1)}],
     'default.moot'   =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot  dmootsub moot)}],
     'default.moot1'  =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot1 dmootsub moot1)}],
     'default.lemma'  =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw        dmoot1 dmootsub moot  mootsub)}],
     'default.lemma1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw        dmoot1 dmootsub moot1 mootsub)}],
     'default.ner'    =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot  dmootsub moot mootsub ner)}],
     'default.base'   =>[@$ach{qw(static exlex tokpp xlit lts        morph mlatin msafe langid)}],
     'default.type'   =>[@$ach{qw(static exlex tokpp xlit lts        morph mlatin msafe langid rw rwsub)}],
     ##
     'expand.old'     =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw       eqpho eqrw)}],
     'expand.ext'     =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw       eqpho eqrw eqphox)}],
     'expand.all'     =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw       eqpho eqrw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
     'expand.eqpho'   =>[@$ach{qw(static exlex       xlit lts                             eqpho)}],
     'expand.eqrw'    =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw             eqrw)}],
     'expand.eqlemma' =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
     'expand.gn-syn'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-syn)}],
     'expand.gn-isa'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-isa)}],
     'expand.gn-asi'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-asi)}],
     'expand.gn'      =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-syn gn-isa gn-asi)}],
     'expand.ot-syn'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-syn)}],
     'expand.ot-isa'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-isa)}],
     'expand.ot-asi'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-asi)}],
     'expand.ot'      =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-syn ot-isa ot-asi)}],
     ##
     'norm'           =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot  dmootsub moot  mootsub)}],
     'norm1'          =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot1 dmootsub moot1 mootsub)}],
     'ner'            =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot  dmootsub moot  mootsub ner)}],
     'caberr'         =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot  dmootsub moot  mootsub mapclass)}],
     'caberr1'        =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot1 dmootsub moot1 mootsub mapclass)}],
     'all'            =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub eqpho eqrw eqphox dmoot  dmootsub moot  mootsub eqlemma)}], ##-- old dta clients use 'all'!
     'clean'          =>[@$ach{qw(clean)}],
     'null'           =>[$ach->{null}],
    };
  #$chains->{'default'} = [map {@{$chains->{$_}}} qw(default.type sub.sent)];

  ##-- chain aliases
  $chains->{'default'}  = $chains->{lemma}  = $chains->{'norm'};
  $chains->{'default1'} = $chains->{lemma1} = $chains->{'norm1'};
  $chains->{'expand'}   = $chains->{'expand.all'};

  ##-- BEGIN TEMPORARY custom chain(s): "hlgl": place-name recognition Hessen
  $chains->{"norm.hlgl"}  = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessen} : qw())} @{$chains->{norm}}];
  $chains->{"norm1.hlgl"} = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessen} : qw())} @{$chains->{norm1}}];
  ##
  $chains->{"norm.hlgl.geo"}  = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessengeo} : qw())} @{$chains->{norm}}];
  $chains->{"norm1.hlgl.geo"} = [map {($_,$_ eq $ach->{mlatin} ? $ach->{mhessengeo} : qw())} @{$chains->{norm1}}];
  ##-- END TEMPORARY custom chain(s)

  ##-- date-dependent chains
  foreach my $rng (@RW_RANGES) {
    if ($ach->{"rw.$rng"} && ($ach->{"rw.$rng"}{enabled}//1)) {
      foreach my $key (qw(norm norm1 lemma lemma1 default default1 expand)) {
	$chains->{"$key.$rng"} = [map {$_ eq $ach->{rw} ? $ach->{"rw.$rng"} : $_} @{$chains->{$key}}];
      }
    } else {
      $ach->warn("optimized rewrite cascade rw.$rng not available: disabling derived chains for range $rng");
      delete $ach->{"rw.$rng"};
      delete $chains->{"sub.rw.$rng"};
    }
  }

  ##-- sanitize chains
  foreach (values %{$ach->{chains}}) {
    @$_ = grep {ref($_)} @$_;
  }

  ##-- set default chain
  $ach->{chain} = $ach->{chains}{$ach->{defaultChain}};

  ##-- force default labels
  foreach (grep {UNIVERSAL::isa($ach->{$_},'DTA::CAB::Analyzer')} keys(%$ach)) {
    next if ($_ =~ /^(?:langid|rw\.[0-9\-]+|mhessen(?:geo)?)$/);    ##-- keep labels for these analyzers
    ($ach->{$_}{label} = $_) =~ s/1$//;   ##-- truncate '1' suffix for label (e.g. dmoot1, moot1)
  }
  return $ach;
}

## \@analyzers = $ach->chain()
## \@analyzers = $ach->chain(\%opts)
##  + get selected analyzer chain
##  + inherited from DTA::CAB::Chain::Multi
##    - calls setupChains() if $ach->{chain} is empty
##    - checks for $opts{chain} and returns $ach->{chains}{ $opts{chain} } if available

## $ach = $ach->ensureChain()
##  + checks for $ach->{chain}, calls $ach->setupChains() if needed
##  + inherited from DTA::CAB::Chain::Multi

##==============================================================================
## Methods: I/O
##==============================================================================

## $bool = $ach->ensureLoaded()
##  + ensures analysis data is loaded from default files; returns 0 if the inherited method fails, 1 otherwise
##  + inherited DTA::CAB::Chain::Multi override calls ensureChain() before inherited method
##  + after loading, each not-yet-copied member of the 'rwsub' and 'dmootsub' chains is replaced by a shallow copy
sub ensureLoaded {
  my $ach = shift;
  return 0 unless $ach->SUPER::ensureLoaded(@_);

  ##-- hack: chain members of the sub-analyzers are copied only AFTER loading
  foreach my $subkey (qw(rwsub dmootsub)) {
    my $sub = $ach->{$subkey};
    next unless ref($sub);

    my $seen = "_${subkey}";   ##-- marker key: set on members which have already been copied
    foreach my $member (@{$sub->{chain}}) {
      next if $member->{$seen};

      ##-- shallow copy, re-blessed into the member's own class
      my $copy = bless( {%$member}, ref($member) );
      $copy->{label}   = $subkey.'_'.$copy->{label};   ##-- prefix label with the sub-analyzer key
      $copy->{enabled} = $sub->{enabled};              ##-- 'enabled' comes from the sub-analyzer
      $copy->{$seen}   = 1;

      ##-- $member aliases the chain element, so this replaces it in-place
      $member = $copy;
    }
  }

  return 1;
}

##==============================================================================

CAB/Chain/DTA.pm  view on Meta::CPAN


## $tok = $ach->analyzeSentence($sent_or_array,\%opts)
##  + perform type-, token-, and sentence-analyses on $sent_or_array
##  + wrapper for $ach->analyzeDocument()
##  + INHERITED from DTA::CAB::Analyzer

## $rpc_xml_base64 = $anl->analyzeData($data_str,\%opts)
##  + analyze a raw (formatted) data string $data_str with internal parsing & formatting
##  + wrapper for $anl->analyzeDocument()
##  + INHERITED from DTA::CAB::Analyzer

##==============================================================================
## Methods: XML-RPC
##  + INHERITED from DTA::CAB::Chain::Multi

1; ##-- be happy

__END__

##========================================================================
## POD DOCUMENTATION, auto-generated by podextract.perl, edited

##========================================================================
## NAME
=pod

=head1 NAME

DTA::CAB::Chain::DTA - Deutsches Textarchiv canonicalization chain class

=cut

##========================================================================
## SYNOPSIS
=pod

=head1 SYNOPSIS

 use DTA::CAB::Chain::DTA;
 
 ##========================================================================
 ## Methods
 
 $obj = CLASS_OR_OBJ->new(%args);
 $ach = $ach->setupChains();
 $bool = $ach->ensureLoaded();
 $bool = $anl->doAnalyze(\%opts, $name);
 $doc = $ach->analyzeClean($doc,\%opts);
 

=cut

##========================================================================
## DESCRIPTION
=pod

=head1 DESCRIPTION

DTA::CAB::Chain::DTA
is the L<DTA::CAB::Analyzer|DTA::CAB::Analyzer> subclass implementing
the robust orthographic canonicalization cascade used in the
I<Deutsches Textarchiv> project.  This class inherits from
L<DTA::CAB::Chain::Multi|DTA::CAB::Chain::Multi>.
See the L</setupChains> method for a list of supported sub-chains
and the corresponding analyzers.

=cut

##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Chain::DTA: Methods
=pod

=head2 Methods

=over 4

=item new

 $obj = CLASS_OR_OBJ->new(%args);

%$obj, %args:

 ##-- paranoia
 autoClean => 0,  ##-- always run 'clean' analyzer regardless of options; checked in both doAnalyze(), analyzeClean()
 defaultChain => 'default',
 ##
 ##-- overrides
 chains => undef, ##-- see setupChains() method
 chain => undef, ##-- see setupChains() method

Additionally, the following sub-analyzers are defined
as fields of %$obj:

=over 4

=item tokpp

Token preprocessor,
a L<DTA::CAB::Analyzer::TokPP|DTA::CAB::Analyzer::TokPP> object.

=item xlit

Transliterator,
a L<DTA::CAB::Analyzer::Unicruft|DTA::CAB::Analyzer::Unicruft> object.

=item lts

Phonetizer (Letter-to-Sound mapper),
a L<DTA::CAB::Analyzer::LTS|DTA::CAB::Analyzer::LTS> object.

=item morph

Morphological analyzer (TAGH),
a L<DTA::CAB::Analyzer::Morph|DTA::CAB::Analyzer::Morph> object.

=item mlatin

Latin pseudo-morphology,
a L<DTA::CAB::Analyzer::Morph::Latin|DTA::CAB::Analyzer::Morph::Latin> object.

=item msafe

Morphological security heuristics,
a L<DTA::CAB::Analyzer::MorphSafe|DTA::CAB::Analyzer::MorphSafe> object.

=item rw

Weighted finite-state rewrite cascade,
a L<DTA::CAB::Analyzer::Rewrite|DTA::CAB::Analyzer::Rewrite> object.

Date-optimized variants C<rw.1600-1700>, C<rw.1700-1800>, and C<rw.1800-1900> may also be included.

=item rwsub

Post-processing for rewrite cascade,
a L<DTA::CAB::Analyzer::RewriteSub|DTA::CAB::Analyzer::RewriteSub> object.

=item eqphox

Intensional (TAGH-based) phonetic equivalence expander,
a L<DTA::CAB::Analyzer::EqPhoX|DTA::CAB::Analyzer::EqPhoX> object.

=item eqpho

Extensional (corpus-based) phonetic equivalence expander,
a L<DTA::CAB::Analyzer::EqPho|DTA::CAB::Analyzer::EqPho> object.

=item eqrw

Extensional rewrite-equivalence expander,
a L<DTA::CAB::Analyzer::EqRW|DTA::CAB::Analyzer::EqRW> object.

=item dmoot

Token-level dynamic HMM conflation disambiguator,
a L<DTA::CAB::Analyzer::Moot::DynLex|DTA::CAB::Analyzer::Moot::DynLex> object.

=item dmootsub

Post-processing for L</dmoot> analyzer,
a L<DTA::CAB::Analyzer::DmootSub|DTA::CAB::Analyzer::DmootSub> object.

=item moot

HMM part-of-speech tagger,
a L<DTA::CAB::Analyzer::Moot|DTA::CAB::Analyzer::Moot> object.

=item mootsub

Post-processing for L</moot> tagger,
a L<DTA::CAB::Analyzer::MootSub|DTA::CAB::Analyzer::MootSub> object.

=item eqlemma

Extensional (corpus-based) lemma-equivalence class expander,
a L<DTA::CAB::Analyzer::EqLemma|DTA::CAB::Analyzer::EqLemma> object.

=item clean

Janitor (paranoid removal of internal temporary data),
a L<DTA::CAB::Analyzer::DTAClean|DTA::CAB::Analyzer::DTAClean> object.

=back

=item setupChains

 $ach = $ach->setupChains();

Setup default named sub-chains in $ach-E<gt>{chains}.
Currently defines a singleton chain C<sub.NAME>
for each analyzer key in keys(%$ach), as well as the following
non-trivial chains:

 'sub.expand'     =>[@$ach{qw(eqpho eqrw eqlemma)}],
 'sub.sent'       =>[@$ach{qw(dmoot  dmootsub moot  mootsub)}],
 'sub.sent1'      =>[@$ach{qw(dmoot1 dmootsub moot1 mootsub)}],
 'sub.gn'         =>[@$ach{qw(gn-syn gn-isa gn-asi)}],
 'sub.ot'         =>[@$ach{qw(ot-syn ot-isa ot-asi)}],
 ##
 'default.static' =>[@$ach{qw(static)}],
 'default.exlex'  =>[@$ach{qw(exlex)}],
 'default.tokpp'  =>[@$ach{qw(tokpp)}],
 'default.xlit'   =>[@$ach{qw(xlit)}],
 'default.lts'    =>[@$ach{qw(xlit lts)}],
 'default.eqphox' =>[@$ach{qw(tokpp xlit lts eqphox)}],
 'default.morph'  =>[@$ach{qw(tokpp xlit morph)}],
 'default.mlatin' =>[@$ach{qw(tokpp xlit       mlatin)}],
 'default.msafe'  =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
 'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
 'default.rw'     =>[@$ach{qw(tokpp xlit rw)}],
 'default.rw.safe'=>[@$ach{qw(tokpp xlit                         morph mlatin msafe langid rw)}],
 'default.dmoot'  =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot)}],
 'default.dmoot1' =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot1)}],
 'default.moot'   =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot  dmootsub moot)}],
 'default.moot1'  =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot1 dmootsub moot1)}],
 'default.lemma'  =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw        dmoot1 dmootsub moot  mootsub)}],
 'default.lemma1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw        dmoot1 dmootsub moot1 mootsub)}],
 'default.ner'    =>[@$ach{qw(tokpp xlit              lts eqphox morph mlatin msafe langid rw        dmoot  dmootsub moot mootsub ner)}],
 'default.base'   =>[@$ach{qw(static exlex tokpp xlit lts        morph mlatin msafe langid)}],
 'default.type'   =>[@$ach{qw(static exlex tokpp xlit lts        morph mlatin msafe langid rw rwsub)}],
 ##
 'expand.old'     =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw       eqpho eqrw)}],
 'expand.ext'     =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw       eqpho eqrw eqphox)}],
 'expand.all'     =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw       eqpho eqrw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
 'expand.eqpho'   =>[@$ach{qw(static exlex       xlit lts                             eqpho)}],
 'expand.eqrw'    =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw             eqrw)}],
 'expand.eqlemma' =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
 'expand.gn-syn'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-syn)}],
 'expand.gn-isa'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-isa)}],
 'expand.gn-asi'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-asi)}],
 'expand.gn'      =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub gn-syn gn-isa gn-asi)}],
 'expand.ot-syn'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-syn)}],
 'expand.ot-isa'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-isa)}],
 'expand.ot-asi'  =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-asi)}],
 'expand.ot'      =>[@$ach{qw(static exlex       xlit lts morph mlatin msafe rw                  eqphox dmoot1 dmootsub moot1 mootsub ot-syn ot-isa ot-asi)}],
 ##
 'norm'           =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot  dmootsub moot  mootsub)}],
 'norm1'          =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot1 dmootsub moot1 mootsub)}],
 'ner'            =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot  dmootsub moot  mootsub ner)}],
 'caberr'         =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot  dmootsub moot  mootsub mapclass)}],
 'caberr1'        =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw                  eqphox dmoot1 dmootsub moot1 mootsub mapclass)}],
 'all'            =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub eqpho eqrw eqphox dmoot  dmootsub moot  mootsub eqlemma)}],
 'clean'          =>[@$ach{qw(clean)}],
 ##
 'null'           =>[$ach->{null}],

High-level date-optimized chains C<norm.RNG>, C<norm1.RNG>, C<lemma.RNG>, C<lemma1.RNG>, C<default.RNG>, and C<expand.RNG>
are also defined using the date-optimized rewrite cascade C<rw.RNG> in place of the default "generic" cascade C<rw>
for each range I<RNG> in C<1600-1700>, C<1700-1800>, and C<1800-1900>.

=item ensureLoaded

 $bool = $ach->ensureLoaded();

Ensures analysis data is loaded from default files.
Inherited DTA::CAB::Chain::Multi override calls ensureChain() before inherited method.
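As a hack, the members of the C<rwsub> and C<dmootsub> chains are replaced by shallow copies
AFTER loading: each copy's label is prefixed with the sub-analyzer key (e.g. C<rwsub_>),
and its 'enabled' flag is set from that of the corresponding sub-analyzer.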


=item doAnalyze

 $bool = $anl->doAnalyze(\%opts, $name);

Alias for $anl-E<gt>can("analyze${name}") && (!exists($opts{"doAnalyze${name}"}) || $opts{"doAnalyze${name}"}).
Override checks $anl-E<gt>{autoClean} flag.


=item analyzeClean

 $doc = $ach->analyzeClean($doc,\%opts);

Cleanup any temporary data associated with $doc.
Chain default calls $a-E<gt>analyzeClean for each analyzer $a in the chain,
then superclass Analyzer-E<gt>analyzeClean.
Local override checks $ach-E<gt>{autoClean}.

=back

=cut

##========================================================================
## END POD DOCUMENTATION, auto-generated by podextract.perl

##======================================================================
## Footer
##======================================================================
=pod

=head1 AUTHOR

Bryan Jurish E<lt>moocow@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2010-2019 by Bryan Jurish

This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.

=head1 SEE ALSO

L<dta-cab-analyze.perl(1)|dta-cab-analyze.perl>,
L<DTA::CAB::Chain::Multi(3pm)|DTA::CAB::Chain::Multi>,
L<DTA::CAB::Chain(3pm)|DTA::CAB::Chain>,
L<DTA::CAB::Analyzer(3pm)|DTA::CAB::Analyzer>,
L<DTA::CAB(3pm)|DTA::CAB>,



( run in 0.814 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )