DTA-CAB
view release on metacpan or search on metacpan
CAB/Analyzer.pm view on Meta::CPAN
## -*- Mode: CPerl -*-
##
## File: DTA::CAB::Analyzer.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: generic analyzer API
package DTA::CAB::Analyzer;
use DTA::CAB::Persistent;
use DTA::CAB::Logger;
use DTA::CAB::Datum ':all';
use DTA::CAB::Utils ':minmax', ':files', ':time';
use File::Basename qw(basename dirname);
use Scalar::Util qw(weaken);
use Exporter;
use Carp;
use strict;
##==============================================================================
## Globals
##==============================================================================
our @ISA = qw(Exporter DTA::CAB::Persistent);
our @EXPORT = qw();
our %EXPORT_TAGS =
(
'access' => [qw(_am_xtext _am_xlit _am_lts _am_rw),
qw(_am_tt_list _am_tt_fst),
qw(_am_id_fst _am_xlit_fst),
qw(_am_fst_wcp _am_fst_wcp_list _am_fst_wcp_listref),
qw(_am_tt_fst_list _am_tt_fst_eqlist),
qw(_am_fst_sort _am_fst_rsort _am_fst_uniq _am_fst_usort),
qw(_am_clean),
qw(_am_tag _am_word _am_lemma),
qw(_am_tagh_fst2moota _am_tagh_list2moota _am_tagh_moota_uniq _am_tagh_list2moota_uniq),
qw(_am_dmoot_fst2moota _am_dmoot_list2moota),
qw(_am_wordlike_regex),
qw(parseFstString),
],
);
our @EXPORT_OK = map {@$_} values %EXPORT_TAGS;
$EXPORT_TAGS{all} = [@EXPORT_OK];
$EXPORT_TAGS{child} = [@EXPORT_OK];
## %CLOSURE_CACHE = ("$object" => { "$closureCode" => \&closure, ... }, ... }
## + cache for accessClosure() to avoid unnecessary re-compilation
our (%CLOSURE_CACHE);
##==============================================================================
## Constructors etc.
##==============================================================================
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure:
## (
## label => $label, ##-- analyzer label (default: from class name)
## aclass => $class, ##-- analysis class (optional; see $anl->analysisClass() method; default=undef)
## typeKeys => \@keys, ##-- analyzer type keys for $anl->typeKeys()
## enabled => $bool, ##-- set to false, non-undef value to disable this analyzer
## initQuiet => $bool, ##-- if true, initInfo() will not print any output
## traceLevel => $level, ##-- log-level for trace messages (default=undef: none)
## )
sub new {
my $that = shift;
my $anl = bless({
##-- user args
@_
}, ref($that)||$that);
$anl->initialize();
$anl->{label} = $anl->defaultLabel() if (!defined($anl->{label})); ##-- get label
return $anl;
}
CAB/Analyzer.pm view on Meta::CPAN
}
## $tok = $anl->analyzeSentence($sent_or_array,\%opts)
## + perform type- and token-, and sentence- analyses on $sent_or_array
## + wrapper for $anl->analyzeDocument()
sub analyzeSentence {
my ($anl,$sent,$opts) = @_;
$sent = [$sent] if (!UNIVERSAL::isa($sent,'ARRAY'));
@$sent = map {toToken($_)} @$sent;
my $doc = toDocument([toSentence($sent)]);
$anl->analyzeDocument($doc, { ($opts ? %$opts : qw()), doAnalyzeLocal=>0});
return $doc->{body}[0];
}
## $rpc_xml_base64 = $anl->analyzeData($data_str,\%opts)
## + analyze a raw (formatted) data string $data_str with internal parsing & formatting
## + wrapper for $anl->analyzeDocument()
sub analyzeData {
require RPC::XML;
my ($anl,$doc0,$opts) = @_;
##-- parsing & formatting options
my $reader = $opts && $opts->{reader} ? $opts->{reader} : {}; ##-- reader options
my $writer = $opts && $opts->{writer} ? $opts->{writer} : {}; ##-- writer options
##-- get format reader,writer
my $ifmt = DTA::CAB::Format->newReader(%$reader);
my $ofmt = DTA::CAB::Format->newWriter(class=>ref($ifmt), %$writer);
##-- parse, analyze, format
my $doc = $ifmt->parseString($doc0);
#$doc = DTA::CAB::Utils::deep_decode('UTF-8', $doc); ##-- this should NOT be necessary!
$doc = $anl->analyzeDocument($doc,$opts);
my $str = $ofmt->flush->putDocument($doc)->toString;
$ofmt->flush;
return RPC::XML::base64->new($str);
}
##------------------------------------------------------------------------
## Methods: Analysis: Closure Utilities (optional)
## \&closure = $anl->analyzeClosure($which)
## + returns cached $anl->{"_analyze${which}"} if present
## + otherwise calls $anl->getAnalyzeClosure($which) & caches result
## + optional utility for closure-based analysis
sub analyzeClosure {
my ($anl,$which) = @_;
return $anl->{"_analyze${which}"} if (defined($anl->{"_analyze${which}"}));
return $anl->{"_analyze${which}"} = $anl->getAnalyzeClosure($which);
}
## \&closure = $anl->getAnalyzeClosure($which)
## + returns closure \&closure for analyzing data of type "$which"
## (e.g. Word, Type, Token, Sentence, Document, ...)
## + default implementation calls $anl->getAnalyze"${which}"Closure() if
## available, otherwise croak()s
sub getAnalyzeClosure {
my ($anl,$which) = @_;
my $getsub = $anl->can("getAnalyze${which}Closure");
weaken($anl);
return $getsub->($anl) if ($getsub);
$anl->logconfess("getAnalyzeClosure('$which'): no getAnalyze${which}Closure() method defined!");
}
##------------------------------------------------------------------------
## Methods: Analysis: (Token-)Accessor Closures
## $closure = $anl->accessClosure( $methodName, %opts);
## $closure = $anl->accessClosure(\&codeRef, %opts);
## $closure = $anl->accessClosure( $codeString, %opts);
## $closure = $anl->accessClosure(\%opts );
## + returns accessor-closure $closure for $anl
## + passed argument can be one of the following:
## - a CODE ref resolves to itself
## - a method name resolves to $anl->can($methodName)
## - anything else resolves to a string passed to eval()
## + if the string contains no /\bsub\b/, it will be wrapped
## as "sub {$codeString}"
## + $codeString may reference the closure variable $anl
## (and maybe others; see 'pre' and 'vars' options)
## + %opts
## code => $codeRefOrMethodNameOrCodeString, ##-- clobbers first argument
## pre => $code_str, ##-- for $codeString accessors, prefix for eval (e.g. 'my ($lexVar);')
## vars => \@vars, ##-- adds lexical vars 'my ('.join(',',@varNames).');'
## cache => $bool, ##-- enable/disable use of %CLOSURE_CACHE (default=enabled)
sub accessClosure {
my ($anl,$code,%opts) = @_;
if (UNIVERSAL::isa($code,'HASH')) {
%opts = (%$code,%opts);
$code = undef;
}
$code = $opts{code} if (defined($opts{code}));
$code = ';' if (!defined($code));
return $code if (UNIVERSAL::isa($code,'CODE'));
return $anl->can($code) if ($anl->can($code));
$code = (''
.($opts{pre} ? "$opts{pre}; " : '')
.($opts{vars} ? ('my ('.join(',',@{$opts{vars}}).'); ') : '')
.($code =~ /\bsub\b/ ? $code : "sub { $code }")
);
print STDERR
((ref($anl)||$anl), "->accessClosure():\n$code\n") if (0 || (ref($anl) && $anl->{debugAccessClosure}));
my $do_cache = !exists($opts{cache}) || $opts{cache};
my $sub = ($do_cache ? $CLOSURE_CACHE{$anl}{$code} : undef);
my $cached = $sub ? 1 : 0;
weaken($anl);
$sub ||= eval $code;
$anl->logcluck("accessClosure(): could not compile closure {$code}: $@") if (!$sub);
$CLOSURE_CACHE{$anl}{$code} = $sub if ($do_cache && !$cached);
return $sub;
}
## PACKAGE::_am_xlit($tokvar='$_')
## + access-closure macro (EXPR): get text (xlit.latin1Text << text) for token $$tokvar
## + evaluates to a string:
## ($$tokvar->{xlit} ? $$tokvar->{xlit}{latin1Text} : $$tokvar->{text})
sub _am_xlit {
my $tokvar = shift || '$_';
return "($tokvar\->{xlit} ? $tokvar\->{xlit}{latin1Text} : $tokvar\->{text}) ##== _am_xlit\n";
}
## PACKAGE::_am_xtext($tokvar='$_')
## + access-closure macro (EXPR): get (exlex << xlit.latin1Text << text) for token $$tokvar
## + evaluates to a string:
## (defined($$tokvar->{exlex}) ? $$tokvar->{exlex} : $${am_xlit($tokvar)})
sub _am_xtext {
my $tokvar = shift || '$_';
return ("(defined($tokvar\->{exlex})"
." ? $tokvar\->{exlex}"
." : ($tokvar\->{xlit} ? $tokvar\->{xlit}{latin1Text} : $tokvar\->{text})"
.") ##== _am_xtext\n"
);
}
## PACKAGE::_am_lts($tokvar='$_')
## + access-closure macro (EXPR) for first LTS analysis of token $$tokvar
## + evaluates to string:
## ($$tokvar->{lts} && @{$$tokvar->{lts}} ? $$tokvar->{lts}[0]{hi} : $$tokvar->{text})
sub _am_lts {
my $tokvar = shift || '$_';
return "($tokvar\->{lts} && \@{$tokvar\->{lts}} ? $tokvar\->{lts}[0]{hi} : $tokvar\->{text}) ##== _am_lts\n";
}
## PACKAGE::_am_rw($tokvar='$_')
## + access-closure macro (EXPR) for rw output(s) for token $$tokvar
## + evaluates to string:
## ($$tokvar->{rw} ? (map {$_->{hi}} @{$$tokvar->{rw}}) : qw())
sub _am_rw {
my $tokvar = shift || '$_';
return "($tokvar\->{rw} ? (map {\$_->{hi}} \@{$tokvar\->{rw}}) : qw()) ##== _am_rw\n";
}
## PACKAGE::_am_tt_list($ttvar='$_')
## + access-closure macro (EXPR) for a TT-style list of strings $$ttvar
## + evaluatees to a list: "split(/\\t/,$$ttvar)"
sub _am_tt_list {
my $ttvar = shift || '$_';
return "split(/\\t/,$ttvar) ##== _am_tt_list\n";
}
## PACKAGE::_am_tt_fst($ttvar='$_')
## + access-closure macro (EXPR) for a single TT-style FST analysis $$ttvar
## + formerly mutliply defined in sub-packages as SUBPACKAGE::parseFstString()
## + evaluates to a FST-analysis hash {hi=>$hi,w=>$w,lo=>$lo,lemma=>$lemma}:
## (
( run in 2.659 seconds using v1.01-cache-2.11-cpan-8f98c5d2c55 )