view release on metacpan or search on metacpan
CAB/Analyzer/Automaton.pm view on Meta::CPAN
## attOutput => $bool, ##-- if true, generate AT&T escapes in output (default=1)
## allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
## ## : useful: /^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$/
## ## : == DTA::CAB::Analyzer::_am_wordlike_regex()
##
## ##-- Analysis objects
## fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton object (default=new)
## lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
## labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
## laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
## labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
## warned_symbols => \%sym2undef, ##-- tracks unknown symbols we've already warned about (for check_symbols != 0)
##
## ##-- INHERITED from DTA::CAB::Analyzer
## label => $label, ##-- analyzer label (default: from analyzer class name)
## typeKeys => \@keys, ##-- type-wise keys to expand
## )
sub new {
my $that = shift;
my $aut = $that->SUPER::new(
##-- filenames
CAB/Analyzer/Automaton.pm view on Meta::CPAN
## + fixes encoding difficulties in $aut->{labh}, $aut->{laba}
## $aut = $aut->parseLabels()
##  + rebuilds the label lookup structures from $aut->{lab} (a Gfsm::Alphabet):
##    - $aut->{laba}: array  $laba->[$labId] = $labSym
##    - $aut->{labh}: hash   $labh->{$labSym} = $labId
##    - $aut->{labc}: array  $labc->[ord($chr)] = $labId, single-char symbols only
##  + label symbols are decoded via $aut->{labenc} if that key is set
sub parseLabels {
  my $aut = shift;
  my $lab2sym = $aut->{laba};
  @$lab2sym = @{ $aut->{lab}->asArray };

  ##-- populate sym=>id hash, decoding symbols if an input encoding was given
  my $enc = $aut->{labenc};
  for my $id (0 .. $#$lab2sym) {
    next if (!defined($lab2sym->[$id]));
    $lab2sym->[$id] = decode($enc, $lab2sym->[$id]) if ($enc);
    $aut->{labh}{ $lab2sym->[$id] } = $id;
  }

  ##-- setup labc: $labId = $labc->[ord($c)]; ##-- single unicode character
  ##   : @labIds = @$labc[unpack('U0U*',$s)]; ##-- batch lookup for strings (fast)
  my @single_char = grep { defined($_) && length($_) == 1 } @$lab2sym;
  $aut->{labc}[ord($_)] = $aut->{labh}{$_} foreach (@single_char);

  return $aut;
}
##==============================================================================
## Methods: Persistence
##==============================================================================
CAB/Analyzer/Automaton.pm view on Meta::CPAN
bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
attOutput => $bool, ##-- if true, generate AT&T escapes in output (default=1)
allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
## : useful: /(?:^[[:alpha:]\-\x{ac}]*[[:alpha:]]+$)|(?:^[[:alpha:]]+[[:alpha:]\-\x{ac}]+$)/
##-- Analysis objects
fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton object (default=new)
lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
##
##-- INHERITED from DTA::CAB::Analyzer
label => $label, ##-- analyzer label (default: from analyzer class name)
typeKeys => \@keys, ##-- type-wise keys to expand
=item clear
$aut = $aut->clear();
Clears the object.
CAB/Analyzer/Automaton/Dyn.pm view on Meta::CPAN
## attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
## allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
## ## : useful: /^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$/
## ## : == DTA::CAB::Analyzer::_am_wordlike_regex()
##
## ##-- Analysis objects
## fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
## lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
## labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
## laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
## labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
## result=>$resultfst, ##-- (child classes only) e.g. result fst
##
## ##-- INHERITED from DTA::CAB::Analyzer
## label => $label, ##-- analyzer label (default: from analyzer class name)
## typeKeys => \@keys, ##-- type-wise keys to expand
## )
sub new {
my $that = shift;
my $aut = $that->SUPER::new(
##-- filenames
CAB/Analyzer/Automaton/Dyn.pm view on Meta::CPAN
## + fixes encoding difficulties in $aut->{labh}, $aut->{laba}
## $aut = $aut->parseLabels()
##  + rebuilds label lookup structures (labh, laba, labc) from $aut->{lab};
##    decodes label symbols via $aut->{labenc} if set
sub parseLabels {
my $aut = shift;
my $laba = $aut->{laba};
##-- copy alphabet symbols: $laba->[$labId] = $labSym
@$laba = @{$aut->{lab}->asArray};
my ($i);
##-- iterate over defined label ids only (alphabet arrays may be sparse)
foreach $i (grep { defined($laba->[$_]) } 0..$#$laba) {
##-- decode raw symbol bytes to perl characters if an encoding was specified
$laba->[$i] = decode($aut->{labenc}, $laba->[$i]) if ($aut->{labenc});
##-- reverse mapping: $labh->{$labSym} = $labId
$aut->{labh}{$laba->[$i]} = $i;
}
##-- setup labc: $labId = $labc->[ord($c)]; ##-- single unicode character
## : @labIds = @$labc[unpack('U0U*',$s)]; ##-- batch lookup for strings (fast)
my @csyms = grep {defined($_) && length($_)==1} @$laba; ##-- @csyms = ($sym1, ...) s.t. each sym has len==1
##-- slice-assign char-code => label-id for all single-character symbols
@{$aut->{labc}}[map {ord($_)} @csyms] = @{$aut->{labh}}{@csyms};
##
return $aut;
}
##--------------------------------------------------------------
## Methods: I/O: Input: Dictionary
CAB/Analyzer/Automaton/Dyn.pm view on Meta::CPAN
toupperI => $bool, ##-- if true, initial character will be upper-cased (default=0)
bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
##
##-- Analysis objects
fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
result=>$resultfst, ##-- (child classes only) e.g. result fst
dict => $dict, ##-- exception lexicon / static cache as DTA::CAB::Analyzer::Dict object
=item clear
$aut = $aut->clear();
Clears the object.
=back
CAB/Analyzer/Unicruft.pm view on Meta::CPAN
## Description: latin-1 approximator
package DTA::CAB::Analyzer::Unicruft;
use DTA::CAB::Analyzer;
use DTA::CAB::Datum ':all';
use DTA::CAB::Token;
use Unicruft;
use Unicode::Normalize; ##-- compatibility decomposition 'KD' (see Unicode TR #15)
#use Unicode::UCD; ##-- unicode character names, info, etc.
#use Unicode::CharName; ##-- ... faster access to character name, block
#use Text::Unidecode; ##-- last-ditch effort: transliterate to ASCII
use Encode qw(encode decode);
use IO::File;
use Carp;
use strict;
##==============================================================================
CAB/Analyzer/Unicruft.pm view on Meta::CPAN
##-- 2010-01-23: Mantis Bug #140: 'µ'="\x{b5}" gets mapped to 'm' rather than
## + (unicruft-v0.07) 'u'
## + (unicruft-v0.08) 'µ' (identity)
## + problem is NFKC-decomposition which maps
## 'µ'="\x{b5}" = Latin1 Supplement / MICRO SIGN
## to
## "\x{03bc}" = Greek and Coptic / GREEK SMALL LETTER MU
## + solution (hack): use NFC (canonical composition only)
## rather than NFKC (compatibility decomposition + canonical composition) here,
## and let Unicruft take care of decomposition
## + potentially problematic cases (from unicode normalization form techreport
## @ http://unicode.org/reports/tr15/ : fi ligature, 2^5, long-S + diacritics)
## are all handled correctly by unicruft
#$uc = Unicode::Normalize::NFKC($w); ##-- compatibility(?) decomposition + canonical composition
$uc = Unicode::Normalize::NFC($w); ##-- canonical composition only
##-- construct latin-1/de approximation
$ld = decode('latin1',Unicruft::utf8_to_latin1_de($uc));
##-- special handling for double-initial-caps, e.g. "AUf", "CHristus", "GOtt", etc.
$ld = ucfirst(lc($ld)) if ($ld =~ /^[[:upper:]]{2}[[:lower:]]+$/);
CAB/Analyzer/Unidecode.pm view on Meta::CPAN
## Description: latin-1 approximator (old)
package DTA::CAB::Analyzer::Unidecode;
use DTA::CAB::Analyzer;
use DTA::CAB::Datum ':all';
use DTA::CAB::Token;
use Unicode::Normalize; ##-- compatibility decomposition 'KD' (see Unicode TR #15)
use Text::Unidecode; ##-- last-ditch effort: transliterate to ASCII
#use Unicode::UCD; ##-- unicode character names, info, etc.
#use Unicode::CharName; ##-- ... faster access to character name, block
use Encode qw(encode decode);
use IO::File;
use Carp;
use strict;
##==============================================================================
## Globals
CAB/Format/SQLite.pm view on Meta::CPAN
##==============================================================================
## $fmt = CLASS_OR_OBJ->new(%args)
## + object structure: assumed HASH
## (
## ##---- Input
## doc => $doc, ##-- buffered input document
## db_user => $user, ##-- db user (required?)
## db_pass => $pass, ##-- db password (required?)
## db_dsn => $dsn, ##-- db dsn (set by fromFile())
## db_opts => \%dbopts, ##-- additional options for DBI->connect() ; default={sqlite_unicode=>1}
## f_which => $f_which, ##-- restriction (see fromFile())
## f_where => $f_where, ##-- target value for restriction (see fromFile())
## limit => $limit, ##-- sql limit clause (default: undef: none)
## keep_history => $bool, ##-- if true, parse history as well as raw data (default: 1)
## keep_null => $bool, ##-- if true, NULL values from db will be kept as undef (default: false)
## keep_eps => $bool, ##-- if true, empty-string values from db will be kept as undef (default: false)
## keep_temp => $bool, ##-- if true, temporary tables will be kept (default: false)
##
## ##---- Output
## #(disabled)
CAB/Format/SQLite.pm view on Meta::CPAN
## )
sub new {
my $that = shift;
return $that->SUPER::new(
##-- Input
#doc => undef,
db_user=>undef,
db_pass=>undef,
db_dsn=>undef,
db_opts=>{
sqlite_unicode=>1,
},
f_which=>undef,
f_where=>undef,
limit=>undef,
keep_history=>1,
keep_null=>0,
keep_eps=>0,
keep_temp=>0,
##-- Output
* cabx directory basically in place
* automaton resultfst crashing
* added logos
* cab demo: added logo
* added 48p logo
* tag-hacks: added mathematical operators to 'punctuation-like' class
* MootSub tag-tweaking hacks: avoid 'normal' tags for non-wordlike tokens
v1.60 2014-08-22 moocow
* fixed DTA::CAB::Analyzer::_am_wordlike_regex() to allow combining diacriticals wherever [[:alpha:]] is included
- unicode should really call these things alphabetic, imho, but it doesn't
v1.59 2014-06-24 moocow
* added dta 'lemma', 'lemma1' chains (with exlex)
* sleep between stop and start actions on restart
* allow direct demo-gui display of xml responses
- fixed 'pretty' parameter pass-through bug in DTA::CAB::Format::Registry::newFormat()
- stop tcf format complaining about missing document for spliceback (avoid garbage in apache logs)
v1.58 2014-06-16 moocow
* added example scripts cab-curl-post.sh, cab-curl-xpost.sh
DS-traversal with potential cycles, caused infinite allocation
loop and memory explosion in 'real' CAB servers)
* added /upload and /file paths to cab-http.plm
* demo/upload tweaks (don't call it 'upload')
* file upload updates
* merged in branch htdocs-1.41-upload -r9728:9736
* fixed YAML dispatch
* updated demo.js: make traffic-light frame work in proxy mode
* language guesser tests
* wrap various YAML implementations directly in YAML.pm (rather than subclass hacks)
* LangId::Simple: only use unicode character block hacks for words of length >= 2
* hasmorph for text-mode output
* updated DTAClean: added 'hasmorph' key
* prune analyzers in cab.perl wrapper
* dingler: try to enable autoclean
* cab-http-9099: auto-clean on
* trimmed cab-http-9099.plm to ignore authentication
* updates from kaskade2 for debian/wheezy
* lang-guesser updates: unicode hacks
* Morph::Latin : only analyze if isLatinExt
* Moot: use FM.$lang as tag for language-guesser hack
* XML formatting woes
* built in langid heuristics to Moot/Boltzmann and Moot
* added LangId::Simple analyzer, built into DTA chain as 'langid'
v1.40 2013-04-30 moocow
* smarter verbosity for cab-rc-update.sh
* updated to use (my own) GermaNet::Flat API module, rather than clunky google code variant
* added -begin and -end CODE options to dta-cab-analyze.perl
* v1.27: blockScan fixes for Format::XmlNative (and by inheritance Format::XmlTokWrapFast)
- fixes mantis bug #543 : disappearing pages
- this worked with negative lookahead regexes, but those crash perl on some inputs (grr....)
v1.26 2012-07-06 moocow
* debug
* cab-rc-update.sh: pull from dta2012/cab rather than ddc/cab
* real new DTA-unknown-char U+FFFC (object replacement character), various bugfixes
v1.25 2012-07-04 moocow
* cab improvements for dealing with unicode replacement character (U+FFFD) as unknown-text marker
* workaround for blockScan() segfault: slower but works on plato
* segfault bughunt / kaskade:
- dying at Format/XmlNative.pm line 146 (regex match in blockScanFoot) for
ddc/dta2012/build/xml_tok/campe_robinson02_1780.TEI-P5.chr.ddc.t.xml
in build/cab_corpus
- only dying under make (make -j , -blockSize don't matter)
- segfault backtrace:
0x00002b26f788ef77 in ?? () from /usr/lib/libperl.so.5.10
(gdb) bt
#0 0x00002b26f788ef77 in ?? () from /usr/lib/libperl.so.5.10
* added DTA pseudo-analyzer 'null'
* tei fix
* ner fix
* added NER to DTA chain
* moved nerec/ into tests/
* added nerec/ test directory for syncope ne-recognition
* added Analyzer::SynCoPe::NER : named-entity recognition via SynCoPe XML-RPC server
v1.24 2012-03-28 moocow
* dta-cab-analyze.perl -fo option fix
* even more msafe adaptation; use unicode class \p{Letter}
* more msafe adaptation
* typo fix
* updated MorphSafe:
- all-non-alphabetic tokens are now considered "safe" (replaces /^[[:punct:][:digit:]]*$/ heuristic)
* add U+A75B (r rotunda) to latin1x-safe symbols
* added rudimentary query handling to cab demo.js, demo.html.tpl
* improved lemmatization for XY (no lower-case bashing)
* added canonical option to Format::TJ if level>=0
* hack: remove ge\| prefixes in lemmatizer
* added live javascript demo.js to taghx-http.plm