view release on metacpan or search on metacpan
CAB/Analyzer/LangId.pm view on Meta::CPAN
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure:
## (
## ##-- Filename Options
## mapFile => $filename, ##-- default: none (REQUIRED)
##
## ##-- Analysis Options
## analyzeWhich => $which, ##-- one of 'token', 'sentence', 'document'; default='document'
## vlabel => $label, ##-- verbose destination key (default='langid')
## label => $label, ##-- simple destination key (default='lang')
##
##
## ##-- Analysis Objects
## map => $map, ##-- a Lingua::LangId::Map object
## )
sub new {
  my ($that, @userArgs) = @_;

  ##-- class defaults; see the object structure comment above for key semantics
  my @defaults = (
    ##-- filenames
    mapFile => undef,

    ##-- options
    analyzeWhich => 'document',
    vlabel => 'langid',
    label => 'lang',

    ##-- analysis objects
    #map => undef,
  );

  ##-- user args are passed last, after the defaults
  ##   (presumably so that they take precedence in SUPER::new() -- TODO confirm)
  my $obj = $that->SUPER::new(@defaults, @userArgs);
  return $obj;
}
CAB/Analyzer/LangId.pm view on Meta::CPAN
$obj = CLASS_OR_OBJ->new(%args);
object structure:
(
##-- Filename Options
mapFile => $filename, ##-- default: none (REQUIRED)
##-- Analysis Options
analyzeWhich => $which, ##-- one of 'token', 'sentence', 'document'; default='document'
vlabel => $label, ##-- verbose destination key (default='langid')
label => $label, ##-- simple destination key (default='lang')
##-- Analysis Objects
map => $map, ##-- a Lingua::LangId::Map object
)
=item clear
$lid = $lid->clear();
(undocumented)
CAB/Chain/DE_free.pm view on Meta::CPAN
xlit => DTA::CAB::Analyzer::Unicruft->new(),
##
morph => DTA::CAB::Analyzer::Morph::Helsinki::DE->new(),
mlatin=> DTA::CAB::Analyzer::Morph::Latin->new(),
msafe => DTA::CAB::Analyzer::MorphSafe->new(), ##-- remove this for DE_free chain?
##
moot => DTA::CAB::Analyzer::Moot->new(lang=>'de'), ##-- moot tagger (on dmoot output; (n>1)-grams)
moot1 => DTA::CAB::Analyzer::Moot->new(lang=>'de'), ##-- moot tagger (on dmoot output; 1-grams only)
mootsub => DTA::CAB::Analyzer::MootSub->new(ucTags=>[qw(NN NE)],stts=>1,wMorph=>.2), ##-- moot tagger, post-processing hacks
##
langid => DTA::CAB::Analyzer::LangId::Simple->new(defaultLang=>'de'), ##-- language-guesser (stopword-based; between msafe and rw)
##
clean => DTA::CAB::Analyzer::DTAClean->new(),
##
null => DTA::CAB::Analyzer::Null->new(), ##-- null analyzer (for 'null' chain)
##-- security
autoClean => 0, ##-- always run 'clean' analyzer regardless of options; checked in both doAnalyze(), analyzeClean()
defaultChain => 'default',
##-- user args
CAB/Chain/DE_free.pm view on Meta::CPAN
#(map {("$_"=>[$ach->{$_}])} @akeys), ##-- xlit, lts, ...
##
'sub.sent' =>[@$ach{qw(moot mootsub)}],
'sub.sent1' =>[@$ach{qw(moot1 mootsub)}],
##
'default.tokpp' =>[@$ach{qw(tokpp)}],
'default.xlit' =>[@$ach{qw(xlit)}],
'default.morph' =>[@$ach{qw(tokpp xlit morph)}],
'default.mlatin' =>[@$ach{qw(tokpp xlit mlatin)}],
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
##-- langid runs after msafe; define this key exactly once (a repeated hash key silently overrides the earlier one)
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.moot' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'default.base' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
##
'norm' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'norm1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'all' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}], ##-- old dta clients use 'all'!
'clean' =>[@$ach{qw(clean)}],
'null' =>[$ach->{null}],
};
#$chains->{'default'} = [map {@{$chains->{$_}}} qw(default.type sub.sent)];
##-- chain aliases
$chains->{'default'} = $chains->{lemma} = $chains->{'norm'};
$chains->{'default1'} = $chains->{lemma1} = $chains->{'norm1'};
##-- sanitize chains
foreach (values %{$ach->{chains}}) {
@$_ = grep {ref($_)} @$_;
}
##-- set default chain
$ach->{chain} = $ach->{chains}{$ach->{defaultChain}};
##-- force default labels
foreach (grep {UNIVERSAL::isa($ach->{$_},'DTA::CAB::Analyzer')} keys(%$ach)) {
next if ($_ =~ /^(?:langid)$/); ##-- keep these labels
($ach->{$_}{label} = $_) =~ s/1$//; ##-- truncate '1' suffix for label (e.g. dmoot1, moot1)
}
return $ach;
}
## \@analyzers = $ach->chain()
## \@analyzers = $ach->chain(\%opts)
## + get selected analyzer chain
## + inherited from DTA::CAB::Chain::Multi
## - calls setupChains() if $ach->{chain} is empty
CAB/Chain/DE_free.pm view on Meta::CPAN
non-trivial chains:
'sub.sent' =>[@$ach{qw(moot mootsub)}],
'sub.sent1' =>[@$ach{qw(moot1 mootsub)}],
##
'default.tokpp' =>[@$ach{qw(tokpp)}],
'default.xlit' =>[@$ach{qw(xlit)}],
'default.morph' =>[@$ach{qw(tokpp xlit morph)}],
'default.mlatin' =>[@$ach{qw(tokpp xlit mlatin)}],
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.moot' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'default.base' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
##
'norm' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'norm1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'all' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}], ##-- old dta clients use 'all'!
'clean' =>[@$ach{qw(clean)}],
'null' =>[$ach->{null}],
=item ensureLoaded
$bool = $ach->ensureLoaded();
Ensures analysis data is loaded from default files.
Inherited DTA::CAB::Chain::Multi override calls ensureChain() before inherited method.
Hack copies chain sub-analyzers (rwsub, dmootsub) AFTER loading their own sub-analyzers,
CAB/Chain/DTA.pm view on Meta::CPAN
##
##
dmoot => DTA::CAB::Analyzer::Moot::Boltzmann->new(), ##-- moot n-gram disambiguator ((n>=1)-grams)
dmoot1 => DTA::CAB::Analyzer::Moot::Boltzmann->new(), ##-- moot n-gram disambiguator (1-grams only)
dmootsub => DTA::CAB::Analyzer::DmootSub->new(), ##-- moot n-gram disambiguator: sub-morph
moot => DTA::CAB::Analyzer::Moot->new(), ##-- moot tagger (on dmoot output; (n>1)-grams)
moot1 => DTA::CAB::Analyzer::Moot->new(), ##-- moot tagger (on dmoot output; 1-grams only)
mootsub => DTA::CAB::Analyzer::MootSub->new(), ##-- moot tagger, post-processing hacks
mapclass => DTA::CAB::Analyzer::DTAMapClass->new(), ##-- mapping class (post-moot)
##
langid => DTA::CAB::Analyzer::LangId::Simple->new(), ##-- language-guesser (stopword-based; between msafe and rw)
##
ner => DTA::CAB::Analyzer::SynCoPe::NER->new(), ##-- ne-recognizer (post-moot)
##
eqlemma => DTA::CAB::Analyzer::EqLemma->new(), ##-- eqlemma (best only)
##
'gn-syn' => DTA::CAB::Analyzer::GermaNet::Synonyms->new(), ##-- GermaNet synonyms
'gn-isa' => DTA::CAB::Analyzer::GermaNet::Hypernyms->new(), ##-- GermaNet hyperyms (superclasses)
'gn-asi' => DTA::CAB::Analyzer::GermaNet::Hyponyms->new(), ##-- GermaNet hyponyms (subclasses)
##
'ot-syn' => DTA::CAB::Analyzer::GermaNet::Synonyms->new(label=>'ot-syn'), ##-- OpenThesaurus synonyms
CAB/Chain/DTA.pm view on Meta::CPAN
'default.exlex' =>[@$ach{qw(exlex)}],
'default.tokpp' =>[@$ach{qw(tokpp)}],
'default.xlit' =>[@$ach{qw(xlit)}],
'default.lts' =>[@$ach{qw(xlit lts)}],
'default.eqphox' =>[@$ach{qw(tokpp xlit lts eqphox)}],
'default.morph' =>[@$ach{qw(tokpp xlit morph)}],
'default.mlatin' =>[@$ach{qw(tokpp xlit mlatin)}],
'default.mhessen'=>[@$ach{qw(tokpp xlit mhessen)}],
'default.mhessengeo'=>[@$ach{qw(tokpp xlit mhessengeo)}],
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.rw' =>[@$ach{qw(tokpp xlit rw)}],
'default.rw.safe'=>[@$ach{qw(tokpp xlit morph mlatin msafe langid rw)}],
'default.dmoot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot)}],
'default.dmoot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1)}],
'default.moot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1 mootsub)}],
'default.ner' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot mootsub ner)}],
'default.base' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub)}],
##
'expand.old' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw)}],
'expand.ext' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox)}],
'expand.all' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.eqpho' =>[@$ach{qw(static exlex xlit lts eqpho)}],
'expand.eqrw' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqrw)}],
'expand.eqlemma' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.gn-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn)}],
'expand.gn-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-isa)}],
'expand.gn-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-asi)}],
'expand.gn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn gn-isa gn-asi)}],
'expand.ot-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn)}],
'expand.ot-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-isa)}],
'expand.ot-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-asi)}],
'expand.ot' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn ot-isa ot-asi)}],
##
'norm' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub)}],
'norm1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub)}],
'ner' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub ner)}],
'caberr' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub mapclass)}],
'caberr1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub mapclass)}],
'all' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub eqpho eqrw eqphox dmoot dmootsub moot mootsub eqlemma)}], ##-- old dta clients use 'all'!
'clean' =>[@$ach{qw(clean)}],
'null' =>[$ach->{null}],
};
#$chains->{'default'} = [map {@{$chains->{$_}}} qw(default.type sub.sent)];
##-- chain aliases
$chains->{'default'} = $chains->{lemma} = $chains->{'norm'};
$chains->{'default1'} = $chains->{lemma1} = $chains->{'norm1'};
$chains->{'expand'} = $chains->{'expand.all'};
CAB/Chain/DTA.pm view on Meta::CPAN
##-- sanitize chains
foreach (values %{$ach->{chains}}) {
@$_ = grep {ref($_)} @$_;
}
##-- set default chain
$ach->{chain} = $ach->{chains}{$ach->{defaultChain}};
##-- force default labels
foreach (grep {UNIVERSAL::isa($ach->{$_},'DTA::CAB::Analyzer')} keys(%$ach)) {
next if ($_ =~ /^(?:langid|rw\.[0-9\-]+|mhessen(?:geo)?)$/); ##-- keep labels for these analyzers
($ach->{$_}{label} = $_) =~ s/1$//; ##-- truncate '1' suffix for label (e.g. dmoot1, moot1)
}
return $ach;
}
## \@analyzers = $ach->chain()
## \@analyzers = $ach->chain(\%opts)
## + get selected analyzer chain
## + inherited from DTA::CAB::Chain::Multi
## - calls setupChains() if $ach->{chain} is empty
CAB/Chain/DTA.pm view on Meta::CPAN
##
'default.static' =>[@$ach{qw(static)}],
'default.exlex' =>[@$ach{qw(exlex)}],
'default.tokpp' =>[@$ach{qw(tokpp)}],
'default.xlit' =>[@$ach{qw(xlit)}],
'default.lts' =>[@$ach{qw(xlit lts)}],
'default.eqphox' =>[@$ach{qw(tokpp xlit lts eqphox)}],
'default.morph' =>[@$ach{qw(tokpp xlit morph)}],
'default.mlatin' =>[@$ach{qw(tokpp xlit mlatin)}],
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.rw' =>[@$ach{qw(tokpp xlit rw)}],
'default.rw.safe'=>[@$ach{qw(tokpp xlit morph mlatin msafe langid rw)}],
'default.dmoot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot)}],
'default.dmoot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1)}],
'default.moot' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot1 dmootsub moot1 mootsub)}],
'default.ner' =>[@$ach{qw(tokpp xlit lts eqphox morph mlatin msafe langid rw dmoot dmootsub moot mootsub ner)}],
'default.base' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub)}],
##
'expand.old' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw)}],
'expand.ext' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox)}],
'expand.all' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqpho eqrw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.eqpho' =>[@$ach{qw(static exlex xlit lts eqpho)}],
'expand.eqrw' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqrw)}],
'expand.eqlemma' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub eqlemma)}],
'expand.gn-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn)}],
'expand.gn-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-isa)}],
'expand.gn-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-asi)}],
'expand.gn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub gn-syn gn-isa gn-asi)}],
'expand.ot-syn' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn)}],
'expand.ot-isa' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-isa)}],
'expand.ot-asi' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-asi)}],
'expand.ot' =>[@$ach{qw(static exlex xlit lts morph mlatin msafe rw eqphox dmoot1 dmootsub moot1 mootsub ot-syn ot-isa ot-asi)}],
##
'norm' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub)}],
'norm1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub)}],
'ner' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub ner)}],
'caberr' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot dmootsub moot mootsub mapclass)}],
'caberr1' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw eqphox dmoot1 dmootsub moot1 mootsub mapclass)}],
'all' =>[@$ach{qw(static exlex tokpp xlit lts morph mlatin msafe langid rw rwsub eqpho eqrw eqphox dmoot dmootsub moot mootsub eqlemma)}],
'clean' =>[@$ach{qw(clean)}],
##
'null' =>[$ach->{null}],
High-level date-optimized chains C<norm.RNG>, C<norm1.RNG>, C<lemma.RNG>, C<lemma1.RNG>, C<default.RNG>, and C<expand.RNG>
are also defined using the date-optimized rewrite cascade C<rw.RNG> in place of the default "generic" cascade C<rw>
for each range I<RNG> in C<1600-1700>, C<1700-1800>, and C<1800-1900>.
=item ensureLoaded
CAB/Chain/EN.pm view on Meta::CPAN
xlit => DTA::CAB::Analyzer::Unicruft->new(),
##
morph => DTA::CAB::Analyzer::Morph::Helsinki::EN->new(),
mlatin=> DTA::CAB::Analyzer::Morph::Latin->new(),
msafe => DTA::CAB::Analyzer::MorphSafe->new(), ##-- remove this for en-chain?
##
moot => DTA::CAB::Analyzer::Moot->new(lang=>'en'), ##-- moot tagger (on dmoot output; (n>1)-grams)
moot1 => DTA::CAB::Analyzer::Moot->new(lang=>'en'), ##-- moot tagger (on dmoot output; 1-grams only)
mootsub => DTA::CAB::Analyzer::MootSub->new(ucTags=>[],stts=>0,wMorph=>.2), ##-- moot tagger, post-processing hacks
##
langid => DTA::CAB::Analyzer::LangId::Simple->new(defaultLang=>'en'), ##-- language-guesser (stopword-based; between msafe and rw)
##
clean => DTA::CAB::Analyzer::DTAClean->new(),
##
null => DTA::CAB::Analyzer::Null->new(), ##-- null analyzer (for 'null' chain)
##-- security
autoClean => 0, ##-- always run 'clean' analyzer regardless of options; checked in both doAnalyze(), analyzeClean()
defaultChain => 'default',
##-- user args
CAB/Chain/EN.pm view on Meta::CPAN
#(map {("$_"=>[$ach->{$_}])} @akeys), ##-- xlit, lts, ...
##
'sub.sent' =>[@$ach{qw(moot mootsub)}],
'sub.sent1' =>[@$ach{qw(moot1 mootsub)}],
##
'default.tokpp' =>[@$ach{qw(tokpp)}],
'default.xlit' =>[@$ach{qw(xlit)}],
'default.morph' =>[@$ach{qw(tokpp xlit morph)}],
'default.mlatin' =>[@$ach{qw(tokpp xlit mlatin)}],
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
##-- langid runs after msafe; define this key exactly once (a repeated hash key silently overrides the earlier one)
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.moot' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'default.base' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
##
'norm' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'norm1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'all' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}], ##-- old dta clients use 'all'!
'clean' =>[@$ach{qw(clean)}],
'null' =>[$ach->{null}],
};
#$chains->{'default'} = [map {@{$chains->{$_}}} qw(default.type sub.sent)];
##-- chain aliases
$chains->{'default'} = $chains->{lemma} = $chains->{'norm'};
$chains->{'default1'} = $chains->{lemma1} = $chains->{'norm1'};
##-- sanitize chains
foreach (values %{$ach->{chains}}) {
@$_ = grep {ref($_)} @$_;
}
##-- set default chain
$ach->{chain} = $ach->{chains}{$ach->{defaultChain}};
##-- force default labels
foreach (grep {UNIVERSAL::isa($ach->{$_},'DTA::CAB::Analyzer')} keys(%$ach)) {
next if ($_ =~ /^(?:langid)$/); ##-- keep these labels
($ach->{$_}{label} = $_) =~ s/1$//; ##-- truncate '1' suffix for label (e.g. dmoot1, moot1)
}
return $ach;
}
## \@analyzers = $ach->chain()
## \@analyzers = $ach->chain(\%opts)
## + get selected analyzer chain
## + inherited from DTA::CAB::Chain::Multi
## - calls setupChains() if $ach->{chain} is empty
CAB/Chain/EN.pm view on Meta::CPAN
non-trivial chains:
'sub.sent' =>[@$ach{qw(moot mootsub)}],
'sub.sent1' =>[@$ach{qw(moot1 mootsub)}],
##
'default.tokpp' =>[@$ach{qw(tokpp)}],
'default.xlit' =>[@$ach{qw(xlit)}],
'default.morph' =>[@$ach{qw(tokpp xlit morph)}],
'default.mlatin' =>[@$ach{qw(tokpp xlit mlatin)}],
'default.msafe' =>[@$ach{qw(tokpp xlit morph mlatin msafe)}],
'default.langid' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.moot' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot)}],
'default.moot1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1)}],
'default.lemma' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'default.lemma1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'default.base' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
'default.type' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid)}],
##
'norm' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}],
'norm1' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot1 mootsub)}],
'all' =>[@$ach{qw(tokpp xlit morph mlatin msafe langid moot mootsub)}], ##-- old dta clients use 'all'!
'clean' =>[@$ach{qw(clean)}],
'null' =>[$ach->{null}],
=item ensureLoaded
$bool = $ach->ensureLoaded();
Ensures analysis data is loaded from default files.
Inherited DTA::CAB::Chain::Multi override calls ensureChain() before inherited method.
Hack copies chain sub-analyzers (rwsub, dmootsub) AFTER loading their own sub-analyzers,
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
## Constructors etc.
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure: HASH ref
## {
## ##-- DTA::CAB::Server::HTTP::UNIX overrides
## daemonArgs => \%daemonArgs, ##-- overrides for HTTP::Daemon::UNIX->new(); default={Local=>'/tmp/dta-cab.sock'}
## socketPerms => $mode, ##-- socket permissions as an octal string (default='0666')
## socketUser => $user, ##-- socket user or uid (root only; default=undef: current user)
## socketGroup => $group, ##-- socket group or gid (default=undef: current group)
## _socketPath => $path, ##-- bound socket path (for unlink() on destroy)
## #_socketDirs => \@dirs, ##-- auto-created socket directories (DISABLED)
## relayCmd => \@cmd, ##-- TCP relay command-line for exec() (default=[qw(socat ...)], see prepareRelay())
## relayAddr => $addr, ##-- TCP relay address to bind (default=$daemonArgs{LocalAddr}, see prepareRelay())
## relayPort => $port, ##-- TCP relay port to bind (default=$daemonArgs{LocalPort}, see prepareRelay())
## relayPid => $pid, ##-- child PID for TCP relay process (sockrelay.perl / socat; see prepareRelay())
##
## ##-- (inherited from DTA::CAB::Server::HTTP): Underlying HTTP::Daemon server
## daemonMode => $daemonMode, ##-- one of 'serial' or 'fork' [default='serial']
## #daemonArgs => \%daemonArgs, ##-- args to HTTP::Daemon->new(); default={LocalAddr=>'0.0.0.0',LocalPort=>8088}
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
$srv->{daemon}->listen( $srv->{daemonArgs}{Listen}||SOMAXCONN ); ##-- workaround for missing option pass-through HTTP::Daemon::UNIX v0.06
##-- get socket path
$sockpath = $srv->{_socketPath} = $srv->{daemon}->hostpath()
or $srv->logconfess("prepareLocal(): daemon returned bad socket path");
##-- setup socket ownership
my $sockuid = (($srv->{socketUser}//'') =~ /^[0-9]+$/
? $srv->{socketUser}
: getpwnam($srv->{socketUser}//''));
my $sockgid = (($srv->{socketGroup}//'') =~ /^[0-9]+$/
? $srv->{socketGroup}
: getgrnam($srv->{socketGroup}//''));
if (defined($sockuid) || defined($sockgid)) {
$sockuid //= $>;
$sockgid //= $);
$srv->vlog('info', "setting socket ownership (".scalar(getpwuid $sockuid).".".scalar(getgrgid $sockgid).") on $sockpath");
chown($sockuid, $sockgid, $sockpath)
or $srv->logconfess("prepareLocal(): failed to set ownership for socket '$sockpath': $!");
foreach my $dir (reverse @{$srv->{_socketDirs}||[]}) {
$srv->vlog('info', "setting directory ownership (".scalar(getpwuid $sockuid).".".scalar(getgrgid $sockgid).") on $dir");
chown($sockuid, $sockgid, $dir)
or $srv->logconfess("prepareLocal(): failed to set ownership for directory '$dir': $!");
}
}
##-- setup socket permissions
if ( ($srv->{socketPerms}//'') ne '' ) {
my $sockperms = oct($srv->{socketPerms});
$srv->vlog('info', sprintf("setting socket permissions (0%03o) on %s", $sockperms, $sockpath));
chmod($sockperms, $sockpath)
or $srv->logconfess("prepareLocal(): failed to set permissions for socket '$sockpath': $!");
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
##-- check whether relay address is already bound
if (!$srv->SUPER::canBindSocket({LocalAddr=>($srv->relayAddr||'0.0.0.0'), LocalPort=>$srv->relayPort})) {
$srv->logwarn("WARNING: cannot bind TCP socket relay on ${addr}:${port} (is there a stale relay still running?): $!");
return -1;
}
$srv->vlog('trace',"starting TCP socket relay on ${addr}:${port}");
$SIG{CHLD} ||= $srv->reaper();
##-- set main server process as group leader (kill whole process group with `pkill -g $SERVER_PID`)
POSIX::setpgid(0,0);
my $pgid = POSIX::getpgrp();
if ( ($srv->{relayPid}=fork()) ) {
##-- parent
$srv->vlog('info', "started TCP socket relay process for ${addr}:${port} on pid=$srv->{relayPid}");
} else {
##-- child (relay)
##-- cleanup: close file descriptors
POSIX::close($_) foreach (3..1024);
##-- join main server's process group
POSIX::setpgid($$, $pgid);
##-- cleanup: environment
#delete @ENV{grep {$_ !~ /^(?:PATH|PERL|LANG|L[CD]_)/} keys %ENV};
##-- get relay command
my $cmd = ($srv->{relayCmd}
|| [
#qw(env -i), ##-- be paranoid
#qw(sockrelay.perl -syslog), "-label=dta-cab-relay/$port",
qw(socat -d -ly),
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
##======================================================================
## Methods: Local: error handling: inherited
##==============================================================================
## PACKAGE: DTA::CAB::Server::HTTP::UNIX::ClientConn
package DTA::CAB::Server::HTTP::UNIX::ClientConn;
use File::Basename qw(basename);
use DTA::CAB::Utils qw(:proc);
our @ISA = qw(HTTP::Daemon::ClientConn);
## ($pid,$uid,$gid) = $sock->peercred()
##  + gets peer credentials via the SO_PEERCRED socket option
##  + returns (-1,-1,-1) on failure, i.e. if $sock does not support SO_PEERCRED,
##    or if sockopt() returns undef or a buffer too short to hold 3 integers
sub peercred {
  my $sock = shift;
  if ($sock->can('SO_PEERCRED')) {
    my $buf = $sock->sockopt($sock->SO_PEERCRED);
    ##-- template 'lll' = 3 signed 32-bit integers = 12 bytes;
    ##   an undef or short buffer would make unpack() warn and return fewer than 3 values,
    ##   so fall through to the documented failure value instead
    return unpack('lll',$buf) if (defined($buf) && length($buf) >= 12);
  }
  return (-1,-1,-1);
}
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
close($fh);
}
##-- debug
#print STDERR "PEERENV($sock): $_=$env{$_}\n" foreach (sort keys %env);
${*$sock}{'peerenv'} = \%env;
}
## $str = $sock->peerstr()
## $str = $sock->peerstr($pid,$uid,$gid)
##  + returns stringified unix peer credentials: "${USER}.${GROUP}:${CMD}[${PID}]"
##  + credentials are fetched via $sock->peercred() unless all three are passed in
##  + undefined or unresolvable components are rendered as '?'
sub peerstr {
  my ($sock,$pid,$uid,$gid) = @_;
  ($pid,$uid,$gid) = $sock->peercred() if (@_ < 4);

  ##-- resolve each component separately; '//' forces scalar context on the lookups
  my $user  = defined($uid) ? (getpwuid($uid)//'?') : '?';
  my $group = defined($gid) ? (getgrgid($gid)//'?') : '?';
  my $proc  = defined($pid) ? (basename(pid_cmd($pid)//'?')."[$pid]") : '?[?]';

  return $user.'.'.$group.':'.$proc;
}
## $host = $sock->peerhost()
##  + for relayed connections (peer command is 'socat'), returns the underlying
##    TCP peer address from the relay's environment (DTA_CAB_RELAY_PEERADDR), if set
##  + otherwise returns UNIX credentials as for peerstr()
sub peerhost {
  my ($sock) = @_;

  ##-- get UNIX socket credentials
  my ($pid,$uid,$gid) = $sock->peercred();

  ##-- only consult the peer environment if the peer process is a socat relay
  my $viaSocat = defined($pid) && basename(pid_cmd($pid)//'?') eq 'socat';
  if ($viaSocat) {
    my $env  = $sock->peerenv();
    my $addr = $env ? $env->{DTA_CAB_RELAY_PEERADDR} : undef;
    return $addr if ($addr);
  }

  ##-- fallback: stringified UNIX socket credentials
  return $sock->peerstr($pid,$uid,$gid);
}
## $port = $sock->peerport()
##  + for relayed connections (peer command is 'socat'), returns the underlying
##    TCP port from the relay's environment (DTA_CAB_RELAY_PEERPORT), if set
##  + otherwise returns the socket path as given by $sock->peerpath()
sub peerport {
  my ($sock) = @_;

  ##-- get UNIX socket credentials (only the pid is needed here)
  my ($pid) = $sock->peercred();

  ##-- only consult the peer environment if the peer process is a socat relay
  my $viaSocat = defined($pid) && basename(pid_cmd($pid)//'?') eq 'socat';
  if ($viaSocat) {
    my $env  = $sock->peerenv();
    my $port = $env ? $env->{DTA_CAB_RELAY_PEERPORT} : undef;
    return $port if ($port);
  }

  ##-- fallback: UNIX socket path
  return $sock->peerpath();
}
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
=cut
##------------------------------------------------------------------------
## SYNOPSIS: DTA::CAB::Server::HTTP::UNIX::ClientConn
=pod
=head2 DTA::CAB::Server::HTTP::UNIX::ClientConn Synopsis
($pid,$uid,$gid) = $sock->peercred();
\%env = $sock->peerenv();
$str = $sock->peerstr();
$host = $sock->peerhost();
$port = $sock->peerport();
=cut
##========================================================================
## DESCRIPTION
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
Arguments and object structure are mostly inherited from L<DTA::CAB::Server::HTTP|DTA::CAB::Server::HTTP>.
Local overrides and extensions:
(
##-- DTA::CAB::Server::HTTP overrides
daemonArgs => \%daemonArgs, ##-- overrides for HTTP::Daemon::UNIX->new(); default={Local=>'/tmp/dta-cab.sock'}
##
##-- DTA::CAB::Server::HTTP::UNIX extensions
socketPerms => $mode, ##-- socket permissions as an octal string (default='0666')
socketUser => $user, ##-- socket user or uid (root only; default=undef: current user)
socketGroup => $group, ##-- socket group or gid (default=undef: current group)
_socketPath => $path, ##-- bound socket path (for unlink() on destroy)
relayCmd => \@cmd, ##-- TCP relay command-line for exec() (default=[qw(socat ...)], see prepareRelay())
relayAddr => $addr, ##-- TCP relay address to bind (default=$daemonArgs{LocalAddr}, see prepareRelay())
relayPort => $port, ##-- TCP relay port to bind (default=$daemonArgs{LocalPort}, see prepareRelay())
relayPid => $pid, ##-- child PID for TCP relay process (sockrelay.perl / socat; see prepareRelay())
=item DESTROY
undef = $srv->DESTROY();
CAB/Server/HTTP/UNIX.pm view on Meta::CPAN
=item Variable: @ISA
L<DTA::CAB::Server::HTTP::UNIX::ClientConn|DTA::CAB::Server::HTTP::UNIX>
inherits from
L<HTTP::Daemon::ClientConn|HTTP::Daemon>
and should support most HTTP::Daemon::ClientConn methods.
=item peercred
($pid,$uid,$gid) = $sock->peercred();
Gets UNIX socket peer credentials; returns (-1,-1,-1) on failure.
=item peerenv
\%env = $sock->peerenv();
\%env = $sock->peerenv($pid);
Attempts to retrieve environment variables for peer process, if possible.
Uses cached value in ${*sock}{peerenv} if present,
otherwise attempts to open and parse F</proc/$pid/environ>.
Returns undef on failure.
=item peerstr
$str = $sock->peerstr();
$str = $sock->peerstr($pid,$uid,$gid);
Returns stringified unix peer credentials, "${USER}.${GROUP}:${CMD}[${PID}]".
=item peerhost
$host = $sock->peerhost();
For relayed connections, gets underlying TCP peer via socat environment (INET emulation);
for unix connections, returns UNIX credentials as for peerstr().
CAB/WebServiceHowto.pod view on Meta::CPAN
the
"L<William Whitaker's Words|https://sourceforge.net/projects/wwwords/>"
Latin dictionary.
=item L<msafe|DTA::CAB::Analyzer::MorphSafe>
Heuristics for detecting "suspicious" analyses supplied
by the L</morph> component (L<TAGH|https://www.dwds.de/static/publications/text/Geyken_Hanneforth_fsmnlp.pdf>),
as described in L<Jurish (2012), App. A.4|http://./#jurish2012>.
=item L<langid|DTA::CAB::Analyzer::LangId::Simple>
Simple sentence-wise language guesser based on stopword lists
extracted from the python L<NLTK project|http://www.nltk.org/>.
Also supports the pseudo-language C<XY>, which is typically assigned
for mathematical notation, abbreviations, or other extra-lexical material.
=item L<rw|DTA::CAB::Analyzer::Rewrite>
Type-wise I<k>-best weighted finite-state rewrite cascade conflator ("nearest neighbors")
via L<GfsmXL|http://kaskade.dwds.de/~moocow/mirror/projects/gfsm/#gfsmxl> transducer cascade.
CAB/WebServiceHowto.pod view on Meta::CPAN
L<TAGH|https://www.dwds.de/static/publications/text/Geyken_Hanneforth_fsmnlp.pdf>
morphology for improved recall.
=item L<dmoot|DTA::CAB::Analyzer::Moot::Boltzmann>
Sentence-wise conflation candidate disambiguator as described in
L<Jurish (2012), Ch. 4|http://./#jurish2012>. Attempts to determine
the "best" modern form from the candidate conflations provided by
the L</exlex>, L</xlit>, L</eqphox>, and L</rw> components,
after consideration of the properties provided by the
L</morph>, L</msafe>, L</mlatin>, and L</langid> components
(e.g. sentences already identified as consisting primarily of
foreign-language material will B<not> be "forced" onto contemporary
German).
=item L<dmootsub|DTA::CAB::Analyzer::MootSub>
Sentence-wise post-processing for the L</dmoot> HMM.
Mostly useful for performing L<morphological analysis|/morph> on
non-trivial canonicalizations supplied by L</dmoot>.
* UNIX process group tweaks
* dta-cab-server.sh: kill whole process group on 'stop'
* clean Version.pm
* v1.99: improved handling for pathological Server::HTTP::UNIX conditions
(stale unix socket, stale relay process)
- server now only WARNs for stale relay sockets; dodgy 'fix' for
mantis bug #25326 (should be a valid fix for identical relay
command-lines as in bug #25326)
v1.98 2018-02-21 moocow
* moot langid FM.* pseudo-tags: keep CARD analyses too
* check for undef pid_cmd() output in Server::UNIX -- avoid heinous death in File::Basename::basename()
v1.97 2018-02-12 moocow
* v1.97: peerenv() optimization for DTA::CAB::Server::HTTP::UNIX::ClientConn
- only call peerenv() for peer command 'socat'
+ support http+unix:// scheme in DTA::CAB::Client::HTTP::lwpUrl()
v1.96 2018-02-09 moocow
* check for existing rc-file
* clean Version.pm
* removed bogus debug code from dta-cab-analyze.perl
* cab.plm: moot,dmoot use 'dtiger' infix instead of tiger
- centralized training source in moot-models/dta-dtiger
* Format/Raw.pm : handle U+00AD (SOFT HYPHEN)
* LangId::Simple : don't output lang_counts by default
* cab-rc-update.sh: update from kaskade
* Raw tokenizer: handle '[Formel]'
* improved LangId::Simple
- now counts number of stopword CHARACTERS (vs tokens)
- added better 'xy' rules, also added an xy 'stopword' list in
cab_automata/langid/data/xy.t
v1.45 2013-09-03 moocow
* CAB::Analyzer::LangId : got working again; results not very encouraging
* special handling for double-initial caps in Analyzer::Unicruft: updated version
* special handling for double-initial caps
* re-built logos using inkscape
* added new compatibility symlink cab-favicon.png
* removed old cab-favicon.png
* added new logos
* added caberr-64.png
* updated DTAClean: added 'hasmorph' key
* prune analyzers in cab.perl wrapper
* dingler: try to enable autoclean
* cab-http-9099: auto-clean on
* trimmed cab-http-9099.plm to ignore authentication
* updates from kaskade2 for debian/wheezy
* lang-guesser updates: unicode hacks
* Morph::Latin : only analyze if isLatinExt
* Moot: use FM.$lang as tag for language-guesser hack
* XML formatting woes
* built in langid heuristics to Moot/Boltzmann and Moot
* added LangId::Simple analyzer, built into DTA chain as 'langid'
v1.40 2013-04-30 moocow
* smarter verbosity for cab-rc-update.sh
* updated to use (my own) GermaNet::Flat API module, rather than clunky google code variant
* added -begin and -end CODE options to dta-cab-analyze.perl
* Format::Raw : parse underscores as word-like
v1.39 2013-04-24 moocow
* removed xlemma stuff again
* MootSub: generate moot/xlemma field: raw TAGH segmentation for best lemma
* split ExLex into {BDB,CDB} subclasses: todo: replace BDB by CDB for db-based lookups (ca 25% faster)
* removed stale BDB directory
* added Format::XmlTokWrapFast : quick+dirty fast output for feeding to dtatw-xml2ddc.perl
* more fixes (short format alias 'bin' for Storable)
* kaskade fixes for big dta build
* fixed wide-character bug in tj output
* update script debugging
* added documentation to README.update
* changed alias structure in Chain::DTA (default->norm rather than norm->default)
- no functional difference
* don't start langid server by default
* README: newline at EOF
* fixed CAB_RCDIR
* cab_corpus/ build: fixes & adjustments
* fixed TJ format bug for sentence attributes
* version, analyze verbosity for spawn
* got forked block-processing working
* pre-split blocks in dta-cab-analyze.perl
v1.17 2011-08-12 moocow
* work on new system/resources/ dir (as system/resources.new)