DTA-CAB

 view release on metacpan or  search on metacpan

CAB/Analyzer.pm  view on Meta::CPAN

## -*- Mode: CPerl -*-
##
## File: DTA::CAB::Analyzer.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: generic analyzer API

package DTA::CAB::Analyzer;
use DTA::CAB::Persistent;
use DTA::CAB::Logger;
use DTA::CAB::Datum ':all';
use DTA::CAB::Utils ':minmax', ':files', ':time';
use File::Basename qw(basename dirname);
use Scalar::Util qw(weaken);
use Exporter;
use Carp;
use strict;

##==============================================================================
## Globals
##==============================================================================

our @ISA = qw(Exporter DTA::CAB::Persistent);

our @EXPORT = qw();
our %EXPORT_TAGS =
  (
   'access' => [qw(_am_xtext _am_xlit _am_lts _am_rw),
		qw(_am_tt_list _am_tt_fst),
		qw(_am_id_fst _am_xlit_fst),
		qw(_am_fst_wcp _am_fst_wcp_list _am_fst_wcp_listref),
		qw(_am_tt_fst_list _am_tt_fst_eqlist),
		qw(_am_fst_sort _am_fst_rsort _am_fst_uniq _am_fst_usort),
		qw(_am_clean),
		qw(_am_tag _am_word _am_lemma),
		qw(_am_tagh_fst2moota _am_tagh_list2moota _am_tagh_moota_uniq _am_tagh_list2moota_uniq),
		qw(_am_dmoot_fst2moota _am_dmoot_list2moota),
		qw(_am_wordlike_regex),
		qw(parseFstString),
	       ],
  );
our @EXPORT_OK = map {@$_} values %EXPORT_TAGS;
$EXPORT_TAGS{all}   = [@EXPORT_OK];
$EXPORT_TAGS{child} = [@EXPORT_OK];

## %CLOSURE_CACHE = ("$object" => { "$closureCode" => \&closure, ... }, ... }
##  + cache for accessClosure() to avoid unnecessary re-compilation
our (%CLOSURE_CACHE);

##==============================================================================
## Constructors etc.
##==============================================================================

## $obj = CLASS_OR_OBJ->new(%args)
##  + object structure:
##    (
##     label => $label,    ##-- analyzer label (default: from class name)
##     aclass => $class,   ##-- analysis class (optional; see $anl->analysisClass() method; default=undef)
##     typeKeys => \@keys, ##-- analyzer type keys for $anl->typeKeys()
##     enabled => $bool,   ##-- set to false, non-undef value to disable this analyzer
##     initQuiet => $bool, ##-- if true, initInfo() will not print any output
##     traceLevel => $level, ##-- log-level for trace messages (default=undef: none)
##    )
sub new {
  my $that = shift;
  my $anl = bless({
		   ##-- user args
		   @_
		  }, ref($that)||$that);
  $anl->initialize();
  $anl->{label} = $anl->defaultLabel() if (!defined($anl->{label})); ##-- get label
  return $anl;
}

CAB/Analyzer.pm  view on Meta::CPAN

}

## $tok = $anl->analyzeSentence($sent_or_array,\%opts)
##  + perform type- and token-, and sentence- analyses on $sent_or_array
##  + wrapper for $anl->analyzeDocument()
sub analyzeSentence {
  my ($anl,$sent,$opts) = @_;
  $sent = [$sent] if (!UNIVERSAL::isa($sent,'ARRAY'));
  @$sent = map {toToken($_)} @$sent;
  my $doc = toDocument([toSentence($sent)]);
  $anl->analyzeDocument($doc, { ($opts ? %$opts : qw()), doAnalyzeLocal=>0});
  return $doc->{body}[0];
}

## $rpc_xml_base64 = $anl->analyzeData($data_str,\%opts)
##  + analyze a raw (formatted) data string $data_str with internal parsing & formatting
##  + wrapper for $anl->analyzeDocument()
sub analyzeData {
  require RPC::XML;
  my ($anl,$doc0,$opts) = @_;

  ##-- parsing & formatting options
  my $reader = $opts && $opts->{reader} ? $opts->{reader} : {}; ##-- reader options
  my $writer = $opts && $opts->{writer} ? $opts->{writer} : {}; ##-- writer options

  ##-- get format reader,writer
  my $ifmt = DTA::CAB::Format->newReader(%$reader);
  my $ofmt = DTA::CAB::Format->newWriter(class=>ref($ifmt), %$writer);

  ##-- parse, analyze, format
  my $doc = $ifmt->parseString($doc0);
  #$doc = DTA::CAB::Utils::deep_decode('UTF-8', $doc); ##-- this should NOT be necessary!
  $doc = $anl->analyzeDocument($doc,$opts);
  my $str = $ofmt->flush->putDocument($doc)->toString;
  $ofmt->flush;

  return RPC::XML::base64->new($str);
}

##------------------------------------------------------------------------
## Methods: Analysis: Closure Utilities (optional)

## \&closure = $anl->analyzeClosure($which)
##  + returns cached $anl->{"_analyze${which}"} if present
##  + otherwise calls $anl->getAnalyzeClosure($which) & caches result
##  + optional utility for closure-based analysis
sub analyzeClosure {
  my ($anl,$which) = @_;
  return $anl->{"_analyze${which}"} if (defined($anl->{"_analyze${which}"}));
  return $anl->{"_analyze${which}"} = $anl->getAnalyzeClosure($which);
}

## \&closure = $anl->getAnalyzeClosure($which)
##  + returns closure \&closure for analyzing data of type "$which"
##    (e.g. Word, Type, Token, Sentence, Document, ...)
##  + default implementation calls $anl->getAnalyze"${which}"Closure() if
##    available, otherwise croak()s
sub getAnalyzeClosure {
  my ($anl,$which) = @_;
  my $getsub = $anl->can("getAnalyze${which}Closure");
  weaken($anl);
  return $getsub->($anl) if ($getsub);
  $anl->logconfess("getAnalyzeClosure('$which'): no getAnalyze${which}Closure() method defined!");
}

##------------------------------------------------------------------------
## Methods: Analysis: (Token-)Accessor Closures

## $closure = $anl->accessClosure( $methodName, %opts);
## $closure = $anl->accessClosure(\&codeRef,    %opts);
## $closure = $anl->accessClosure( $codeString, %opts);
## $closure = $anl->accessClosure(\%opts );
##  + returns accessor-closure $closure for $anl
##  + passed argument can be one of the following:
##    - a CODE ref resolves to itself
##    - a method name resolves to $anl->can($methodName)
##    - anything else resolves to a string passed to eval()
##      + if the string contains no /\bsub\b/, it will be wrapped
##        as "sub {$codeString}"
##      + $codeString may reference the closure variable $anl
##        (and maybe others; see 'pre' and 'vars' options)
##  + %opts
##     code => $codeRefOrMethodNameOrCodeString, ##-- clobbers first argument
##     pre => $code_str,   ##-- for $codeString accessors, prefix for eval (e.g. 'my ($lexVar);')
##     vars => \@vars,     ##-- adds lexical vars 'my ('.join(',',@varNames).');'
##     cache => $bool,     ##-- enable/disable use of %CLOSURE_CACHE (default=enabled)
sub accessClosure {
  my ($anl,$code,%opts) = @_;
  if (UNIVERSAL::isa($code,'HASH')) {
    %opts = (%$code,%opts);
    $code = undef;
  }
  $code = $opts{code} if (defined($opts{code}));
  $code = ';' if (!defined($code));
  return $code if (UNIVERSAL::isa($code,'CODE'));
  return $anl->can($code) if ($anl->can($code));
  $code = (''
	   .($opts{pre}  ? "$opts{pre}; " : '')
	   .($opts{vars} ? ('my ('.join(',',@{$opts{vars}}).'); ') : '')
	   .($code =~ /\bsub\b/ ? $code : "sub { $code }")
	  );

  print STDERR
    ((ref($anl)||$anl), "->accessClosure():\n$code\n") if (0 || (ref($anl) && $anl->{debugAccessClosure}));

  my $do_cache = !exists($opts{cache}) || $opts{cache};
  my $sub      = ($do_cache ? $CLOSURE_CACHE{$anl}{$code} : undef);
  my $cached   = $sub ? 1 : 0;

  weaken($anl);
  $sub       ||= eval $code;
  $anl->logcluck("accessClosure(): could not compile closure {$code}: $@") if (!$sub);
  $CLOSURE_CACHE{$anl}{$code} = $sub if ($do_cache && !$cached);

  return $sub;
}

## PACKAGE::_am_xlit($tokvar='$_')
##  + access-closure macro (EXPR): get text (xlit.latin1Text << text) for token $$tokvar
##  + evaluates to a string:
##    ($$tokvar->{xlit} ? $$tokvar->{xlit}{latin1Text} : $$tokvar->{text})
sub _am_xlit {
  my $tokvar = shift || '$_';
  return "($tokvar\->{xlit} ? $tokvar\->{xlit}{latin1Text} : $tokvar\->{text}) ##== _am_xlit\n";
}

## PACKAGE::_am_xtext($tokvar='$_')
##  + access-closure macro (EXPR): get (exlex << xlit.latin1Text << text) for token $$tokvar
##  + evaluates to a string:
##    (defined($$tokvar->{exlex}) ? $$tokvar->{exlex} : $${am_xlit($tokvar)})
sub _am_xtext {
  my $tokvar = shift || '$_';
  return ("(defined($tokvar\->{exlex})"
	  ." ? $tokvar\->{exlex}"
	  ." : ($tokvar\->{xlit} ? $tokvar\->{xlit}{latin1Text} : $tokvar\->{text})"
	  .") ##== _am_xtext\n"
	 );
}

## PACKAGE::_am_lts($tokvar='$_')
##  + access-closure macro (EXPR) for first LTS analysis of token $$tokvar
##  + evaluates to string:
##    ($$tokvar->{lts} && @{$$tokvar->{lts}} ? $$tokvar->{lts}[0]{hi} : $$tokvar->{text})
sub _am_lts {
  my $tokvar = shift || '$_';
  return "($tokvar\->{lts} && \@{$tokvar\->{lts}} ? $tokvar\->{lts}[0]{hi} : $tokvar\->{text}) ##== _am_lts\n";
}

## PACKAGE::_am_rw($tokvar='$_')
##  + access-closure macro (EXPR) for rw output(s) for token $$tokvar
##  + evaluates to string:
##    ($$tokvar->{rw} ? (map {$_->{hi}} @{$$tokvar->{rw}}) : qw())
sub _am_rw {
  my $tokvar = shift || '$_';
  return "($tokvar\->{rw} ? (map {\$_->{hi}} \@{$tokvar\->{rw}}) : qw()) ##== _am_rw\n";
}

## PACKAGE::_am_tt_list($ttvar='$_')
##  + access-closure macro (EXPR) for a TT-style list of strings $$ttvar
##  + evaluatees to a list: "split(/\\t/,$$ttvar)"
sub _am_tt_list {
  my $ttvar = shift || '$_';
  return "split(/\\t/,$ttvar) ##== _am_tt_list\n";
}

## PACKAGE::_am_tt_fst($ttvar='$_')
##  + access-closure macro (EXPR) for a single TT-style FST analysis $$ttvar
##  + formerly mutliply defined in sub-packages as SUBPACKAGE::parseFstString()
##  + evaluates to a FST-analysis hash {hi=>$hi,w=>$w,lo=>$lo,lemma=>$lemma}:
##    (



( run in 2.659 seconds using v1.01-cache-2.11-cpan-8f98c5d2c55 )