DTA-CAB


CAB/Analyzer/Automaton.pm

##     attOutput      => $bool, ##-- if true, generate AT&T escapes in output (default=1)
##     allowTextRegex => $re,   ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
##                              ##   : useful: /^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$/
##                              ##   :     ==  DTA::CAB::Analyzer::_am_wordlike_regex()
##
##     ##-- Analysis objects
##     fst  => $gfst,      ##-- (child classes only) e.g. a Gfsm::Automaton object (default=new)
##     lab  => $lab,       ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
##     labh => \%sym2lab,  ##-- (?) label hash:  $sym2lab{$labSym} = $labId;
##     laba => \@lab2sym,  ##-- (?) label array:  $lab2sym[$labId]  = $labSym;
##     labc => \@chr2lab,  ##-- (?) chr-label array: $chr2lab[ord($chr)] = $labId, by unicode char number (e.g. unpack('U0U*'))
##     warned_symbols => \%sym2undef, ##-- tracks unknown symbols we've already warned about (for check_symbols != 0)
##
##     ##-- INHERITED from DTA::CAB::Analyzer
##     label => $label,    ##-- analyzer label (default: from analyzer class name)
##     typeKeys => \@keys, ##-- type-wise keys to expand
##    )
sub new {
  my $that = shift;
  my $aut = $that->SUPER::new(
			      ##-- filenames

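The allowTextRegex default documented above (== DTA::CAB::Analyzer::_am_wordlike_regex()) can be exercised on its own; a minimal sketch with made-up tokens:

```perl
use utf8;
# Sketch: the word-like filter regex documented above, applied to some
# made-up tokens; only matching tokens would be passed to the automaton.
my $re = qr/^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:['\x{2018}\x{2019}]s)?$/;
for my $tok ('Haus', 'Mutter-Kind', "Goethe's", '123', 'foo.bar') {
  printf "%-12s => %s\n", $tok, ($tok =~ $re ? 'analyze' : 'skip');
}
```

Word-like tokens (plain alphabetics, hyphenated compounds, possessive 's forms) pass the filter; purely numeric or punctuated tokens are skipped.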
CAB/Analyzer/Automaton.pm

##  + fixes encoding difficulties in $aut->{labh}, $aut->{laba}
sub parseLabels {
  my $aut = shift;
  my $laba = $aut->{laba};
  @$laba = @{$aut->{lab}->asArray};
  my ($i);
  foreach $i (grep { defined($laba->[$_]) } 0..$#$laba) {
    $laba->[$i] = decode($aut->{labenc}, $laba->[$i]) if ($aut->{labenc});
    $aut->{labh}{$laba->[$i]} = $i;
  }
  ##-- setup labc: $labId  = $labc->[ord($c)];             ##-- single unicode character
  ##             : @labIds = @$labc[unpack('U0U*',$s)];    ##-- batch lookup for strings (fast)
  my @csyms = grep {defined($_) && length($_)==1} @$laba;  ##-- @csyms = ($sym1, ...) s.t. each sym has len==1
  @{$aut->{labc}}[map {ord($_)} @csyms] = @{$aut->{labh}}{@csyms};
  ##
  return $aut;
}
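The labc slice built at the end of parseLabels() can be illustrated with a toy alphabet (hypothetical labels, not the real Gfsm data); the point is that per-character label lookup becomes a plain array index, and whole strings can be labelled with one slice:

```perl
use utf8;
# Minimal sketch of the labc lookup structure set up by parseLabels():
# a toy sym->label hash stands in for $aut->{labh}.
my %sym2lab = ( 'a' => 1, 'b' => 2, "\x{e4}" => 3 );
my @labc;
@labc[ map { ord($_) } keys %sym2lab ] = @sym2lab{ keys %sym2lab };

my $labId  = $labc[ ord('b') ];              # single character: 2
my @labIds = @labc[ unpack('U0U*', 'ab') ];  # batch lookup: (1, 2)
```

This mirrors the slice assignment `@{$aut->{labc}}[map {ord($_)} @csyms] = @{$aut->{labh}}{@csyms}` above: only length-1 symbols get a slot, indexed by codepoint.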

##==============================================================================
## Methods: Persistence
##==============================================================================

CAB/Analyzer/Automaton.pm

 bashWS         => $str,  ##-- if defined, input whitespace will be bashed to '$str' (default='_')
 attInput       => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
 attOutput      => $bool, ##-- if true, generate AT&T escapes in output (default=1)
 allowTextRegex => $re,   ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
                          ##   : useful: /(?:^[[:alpha:]\-\x{ac}]*[[:alpha:]]+$)|(?:^[[:alpha:]]+[[:alpha:]\-\x{ac}]+$)/
 ##-- Analysis objects
 fst  => $gfst,      ##-- (child classes only) e.g. a Gfsm::Automaton object (default=new)
 lab  => $lab,       ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
 labh => \%sym2lab,  ##-- (?) label hash:  $sym2lab{$labSym} = $labId;
 laba => \@lab2sym,  ##-- (?) label array:  $lab2sym[$labId]  = $labSym;
 labc => \@chr2lab,  ##-- (?) chr-label array: $chr2lab[ord($chr)] = $labId, by unicode char number (e.g. unpack('U0U*'))
 ##
 ##-- INHERITED from DTA::CAB::Analyzer
 label => $label,    ##-- analyzer label (default: from analyzer class name)
 typeKeys => \@keys, ##-- type-wise keys to expand

=item clear

 $aut = $aut->clear();

Clears the object.

CAB/Analyzer/Automaton/Dyn.pm

##     attInput       => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
##     allowTextRegex => $re,   ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
##                              ##   : useful: /^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$/
##                              ##   :     ==  DTA::CAB::Analyzer::_am_wordlike_regex()
##
##     ##-- Analysis objects
##     fst  => $gfst,      ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
##     lab  => $lab,       ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
##     labh => \%sym2lab,  ##-- (?) label hash:  $sym2lab{$labSym} = $labId;
##     laba => \@lab2sym,  ##-- (?) label array:  $lab2sym[$labId]  = $labSym;
##     labc => \@chr2lab,  ##-- (?) chr-label array: $chr2lab[ord($chr)] = $labId, by unicode char number (e.g. unpack('U0U*'))
##     result=>$resultfst, ##-- (child classes only) e.g. result fst
##
##     ##-- INHERITED from DTA::CAB::Analyzer
##     label => $label,    ##-- analyzer label (default: from analyzer class name)
##     typeKeys => \@keys, ##-- type-wise keys to expand
##    )
sub new {
  my $that = shift;
  my $aut = $that->SUPER::new(
			      ##-- filenames

CAB/Analyzer/Automaton/Dyn.pm

##  + fixes encoding difficulties in $aut->{labh}, $aut->{laba}
sub parseLabels {
  my $aut = shift;
  my $laba = $aut->{laba};
  @$laba = @{$aut->{lab}->asArray};
  my ($i);
  foreach $i (grep { defined($laba->[$_]) } 0..$#$laba) {
    $laba->[$i] = decode($aut->{labenc}, $laba->[$i]) if ($aut->{labenc});
    $aut->{labh}{$laba->[$i]} = $i;
  }
  ##-- setup labc: $labId  = $labc->[ord($c)];             ##-- single unicode character
  ##             : @labIds = @$labc[unpack('U0U*',$s)];    ##-- batch lookup for strings (fast)
  my @csyms = grep {defined($_) && length($_)==1} @$laba;  ##-- @csyms = ($sym1, ...) s.t. each sym has len==1
  @{$aut->{labc}}[map {ord($_)} @csyms] = @{$aut->{labh}}{@csyms};
  ##
  return $aut;
}

##--------------------------------------------------------------
## Methods: I/O: Input: Dictionary

CAB/Analyzer/Automaton/Dyn.pm

 toupperI       => $bool, ##-- if true, initial character will be upper-cased (default=0)
 bashWS         => $str,  ##-- if defined, input whitespace will be bashed to '$str' (default='_')
 attInput       => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
 allowTextRegex => $re,   ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
 ##
 ##-- Analysis objects
 fst  => $gfst,      ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
 lab  => $lab,       ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
 labh => \%sym2lab,  ##-- (?) label hash:  $sym2lab{$labSym} = $labId;
 laba => \@lab2sym,  ##-- (?) label array:  $lab2sym[$labId]  = $labSym;
 labc => \@chr2lab,  ##-- (?) chr-label array: $chr2lab[ord($chr)] = $labId, by unicode char number (e.g. unpack('U0U*'))
 result=>$resultfst, ##-- (child classes only) e.g. result fst
 dict => $dict,      ##-- exception lexicon / static cache as DTA::CAB::Analyzer::Dict object

=item clear

 $aut = $aut->clear();

Clears the object.

=back

CAB/Analyzer/Unicruft.pm

## Description: latin-1 approximator

package DTA::CAB::Analyzer::Unicruft;

use DTA::CAB::Analyzer;
use DTA::CAB::Datum ':all';
use DTA::CAB::Token;

use Unicruft;
use Unicode::Normalize; ##-- compatibility decomposition 'KD' (see Unicode TR #15)
#use Unicode::UCD;       ##-- unicode character names, info, etc.
#use Unicode::CharName;  ##-- ... faster access to character name, block
#use Text::Unidecode;    ##-- last-ditch effort: transliterate to ASCII

use Encode qw(encode decode);
use IO::File;
use Carp;

use strict;

##==============================================================================

CAB/Analyzer/Unicruft.pm

    ##-- 2010-01-23: Mantis Bug #140: 'µ'="\x{b5}" gets mapped to 'm' rather than
    ##   + (unicruft-v0.07) 'u'
    ##   + (unicruft-v0.08) 'µ' (identity)
    ##   + problem is NFKC-decomposition which maps
    ##       'µ'="\x{b5}" = Latin1 Supplement / MICRO SIGN
    ##     to
    ##       "\x{03bc}" = Greek and Coptic / GREEK SMALL LETTER MU
    ##   + solution (hack): use NFC (canonical composition only)
    ##     rather than NFKC (compatibility decomposition + canonical composition) here,
    ##     and let Unicruft take care of decomposition
    ##   + potentially problematic cases (from unicode normalization form techreport
    ##     @ http://unicode.org/reports/tr15/ : fi ligature, 2^5, long-S + diacritics)
    ##     are all handled correctly by unicruft
    #$uc  = Unicode::Normalize::NFKC($w); ##-- compatibility(?) decomposition + canonical composition
    $uc  = Unicode::Normalize::NFC($w);   ##-- canonical composition only

    ##-- construct latin-1/de approximation
    $ld = decode('latin1',Unicruft::utf8_to_latin1_de($uc));

    ##-- special handling for double-initial-caps, e.g. "AUf", "CHristus", "GOtt", etc.
    $ld = ucfirst(lc($ld)) if ($ld =~ /^[[:upper:]]{2}[[:lower:]]+$/);
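The NFC-vs-NFKC distinction behind Mantis bug #140, and the double-initial-caps fixup, can both be demonstrated standalone:

```perl
use utf8;
use Unicode::Normalize qw(NFC NFKC);

# NFKC compatibility-maps MICRO SIGN into the Greek block, while plain
# NFC (canonical composition only) leaves it alone -- the reason the
# code above uses NFC and leaves decomposition to Unicruft.
my $micro = "\x{b5}";                        # 'µ' = MICRO SIGN
printf "NFC:  U+%04X\n", ord(NFC($micro));   # prints "NFC:  U+00B5"
printf "NFKC: U+%04X\n", ord(NFKC($micro));  # prints "NFKC: U+03BC"

# ... and the double-initial-caps fixup from above ("GOtt", "CHristus"):
my $ld = "GOtt";
$ld = ucfirst(lc($ld)) if ($ld =~ /^[[:upper:]]{2}[[:lower:]]+$/);
print "$ld\n";                               # prints "Gott"
```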

CAB/Analyzer/Unidecode.pm

## Description: latin-1 approximator (old)

package DTA::CAB::Analyzer::Unidecode;

use DTA::CAB::Analyzer;
use DTA::CAB::Datum ':all';
use DTA::CAB::Token;

use Unicode::Normalize; ##-- compatibility decomposition 'KD' (see Unicode TR #15)
use Text::Unidecode;    ##-- last-ditch effort: transliterate to ASCII
#use Unicode::UCD;       ##-- unicode character names, info, etc.
#use Unicode::CharName;  ##-- ... faster access to character name, block

use Encode qw(encode decode);
use IO::File;
use Carp;

use strict;

##==============================================================================
## Globals

CAB/Format/SQLite.pm

##==============================================================================

## $fmt = CLASS_OR_OBJ->new(%args)
##  + object structure: assumed HASH
##    (
##     ##---- Input
##     doc => $doc,                    ##-- buffered input document
##     db_user => $user,	       ##-- db user (required?)
##     db_pass => $pass,	       ##-- db password (required?)
##     db_dsn  => $dsn,		       ##-- db dsn (set by fromFile())
##     db_opts => \%dbopts,	       ##-- additional options for DBI->connect() ; default={sqlite_unicode=>1}
##     f_which => $f_which,            ##-- restriction (see fromFile())
##     f_where => $f_where,            ##-- target value for restriction (see fromFile())
##     limit => $limit,		       ##-- sql limit clause (default: undef: none)
##     keep_history => $bool,	       ##-- if true, parse history as well as raw data (default: 1)
##     keep_null => $bool,	       ##-- if true, NULL values from db will be kept as undef (default: false)
##     keep_eps => $bool,	       ##-- if true, empty-string values from db will be kept as undef (default: false)
##     keep_temp => $bool,	       ##-- if true, temporary tables will be kept (default: false)
##
##     ##---- Output
##     #(disabled)
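The db_* defaults documented above translate directly into a DBI connect call; a minimal sketch (the database path 'corpus.db' is hypothetical):

```perl
use strict;
use warnings;
use DBI;

# Sketch only: connect roughly as DTA::CAB::Format::SQLite would, with
# the documented default db_opts; sqlite_unicode=>1 makes DBD::SQLite
# return decoded (character) strings rather than raw UTF-8 bytes.
my $dbh = DBI->connect("dbi:SQLite:dbname=corpus.db", undef, undef,
                       { sqlite_unicode => 1, RaiseError => 1 })
  or die "connect failed: $DBI::errstr";
```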

CAB/Format/SQLite.pm

##    )
sub new {
  my $that = shift;
  return $that->SUPER::new(
			   ##-- Input
			   #doc => undef,
			   db_user=>undef,
			   db_pass=>undef,
			   db_dsn=>undef,
			   db_opts=>{
				     sqlite_unicode=>1,
				    },
			   f_which=>undef,
			   f_where=>undef,
			   limit=>undef,
			   keep_history=>1,
			   keep_null=>0,
			   keep_eps=>0,
			   keep_temp=>0,

			   ##-- Output

Changes

	* cabx directory basically in place
	* automaton resultfst crashing
	* added logos
	* cab demo: added logo
	* added 48p logo
	* tag-hacks: added mathematical operators to 'punctuation-like' class
	* MootSub tag-tweaking hacks: avoid 'normal' tags for non-wordlike tokens

v1.60 2014-08-22  moocow
	* fixed DTA::CAB::Analyzer::_am_wordlike_regex() to allow combining diacritical marks wherever [[:alpha:]] is included
	  - unicode should really call these things alphabetic, imho, but it doesn't

v1.59 2014-06-24  moocow
	* added dta 'lemma', 'lemma1' chains (with exlex)
	* sleep between stop and start actions on restart
	* allow direct demo-gui display of xml responses
	  - fixed 'pretty' parameter pass-through bug in DTA::CAB::Format::Registry::newFormat()
	  - stop tcf format complaining about missing document for spliceback (avoid garbage in apache logs)

v1.58 2014-06-16  moocow
	* added example scripts cab-curl-post.sh, cab-curl-xpost.sh

Changes

	    DS-traversal with potential cycles, caused infinite allocation
	    loop and memory explosion in 'real' CAB servers)
	* added /upload and /file paths to cab-http.plm
	* demo/upload tweaks (don't call it 'upload')
	* file upload updates
	* merged in branch htdocs-1.41-upload -r9728:9736
	* fixed YAML dispatch
	* updated demo.js: make traffic-light frame work in proxy mode
	* language guesser tests
	* wrap various YAML implementations directly in YAML.pm (rather than subclass hacks)
	* LangId::Simple: only use unicode character block hacks for words of length >= 2
	* hasmorph for text-mode output
	* updated DTAClean: added 'hasmorph' key
	* prune analyzers in cab.perl wrapper
	* dingler: try to enable autoclean
	* cab-http-9099: auto-clean on
	* trimmed cab-http-9099.plm to ignore authentication
	* updates from kaskade2 for debian/wheezy
	* lang-guesser updates: unicode hacks
	* Morph::Latin : only analyze if isLatinExt
	* Moot: use FM.$lang as tag for language-guesser hack
	* XML formatting woes
	* built in langid heuristics to Moot/Boltzmann and Moot
	* added LangId::Simple analyzer, built into DTA chain as 'langid'

v1.40 2013-04-30  moocow
	* smarter verbosity for cab-rc-update.sh
	* updated to use (my own) GermaNet::Flat API module, rather than clunky google code variant
	* added -begin and -end CODE options to dta-cab-analyze.perl

Changes

	* v1.27: blockScan fixes for Format::XmlNative (and by inheritance Format::XmlTokWrapFast)
	  - fixes mantis bug #543 : disappearing pages
	  - this worked with negative lookahead regexes, but those crash perl on some inputs (grr....)

v1.26 2012-07-06  moocow
	* debug
	* cab-rc-update.sh: pull from dta2012/cab rather than ddc/cab
	* real new DTA-unknown-char U+FFFC (object replacement character), various bugfixes

v1.25 2012-07-04  moocow
	* cab improvements for dealing with unicode replacement character (U+FFFD) as unknown-text marker
	* workaround for blockScan() segfault: slower but works on plato
	* segfault bughunt / kaskade:
	  - dying at Format/XmlNative.pm line 146 (regex match in blockScanFoot) for
	    ddc/dta2012/build/xml_tok/campe_robinson02_1780.TEI-P5.chr.ddc.t.xml
	    in build/cab_corpus
	  - only dying under make (make -j , -blockSize don't matter)
	  - segfault backtrace:
	  0x00002b26f788ef77 in ?? () from /usr/lib/libperl.so.5.10
	  (gdb) bt
	  #0 0x00002b26f788ef77 in ?? () from /usr/lib/libperl.so.5.10

Changes

	* added DTA pseudo-analyzer 'null'
	* tei fix
	* ner fix
	* added NER to DTA chain
	* moved nerec/ into tests/
	* added nerec/ test directory for syncope ne-recognition
	* added Analyzer::SynCoPe::NER : named-entity recognition via SynCoPe XML-RPC server

v1.24 2012-03-28  moocow
	* dta-cab-analyze.perl -fo option fix
	* even more msafe adaptation; use unicode class \p{Letter}
	* more msafe adaptation
	* typo fix
	* updated MorphSafe:
	  - all-non-alphabetic tokens are now considered "safe" (replaces /^[[:punct:][:digit:]]*$/ heuristic)
	* add U+A75B (r rotunda) to latin1x-safe symbols
	* added rudimentary query handling to cab demo.js, demo.html.tpl
	* improved lemmatization for XY (no lower-case bashing)
	* added canonical option to Format::TJ if level>=0
	* hack: remove ge\| prefixes in lemmatizer
	* added live javascript demo.js to taghx-http.plm


