Alt-CWB-CL-ambs
view release on metacpan or search on metacpan
lib/CWB/CL.pm view on Meta::CPAN
qw(CDA_EFSETINV CDA_EIDORNG CDA_EIDXORNG CDA_EINTERNAL),
qw(CDA_ENODATA CDA_ENOMEM CDA_ENOSTRING CDA_ENULLATT CDA_ENYI CDA_EOTHER),
qw(CDA_EPATTERN CDA_EPOSORNG CDA_EREMOTE CDA_ESTRUC),
);
# error symbols (indexed by <negative> error code)
our @ErrorSymbol = sort {(-$ErrorCode{$a}) <=> (-$ErrorCode{$b})} keys %ErrorCode;
# regex flags (for cl_regex2id())
our %RegexFlags = (
'' => 0,
'c' => constant('IGNORE_CASE'), # ignore case
'd' => constant('IGNORE_DIAC'), # ignore diacritics
'cd' => constant('IGNORE_CASE') | constant('IGNORE_DIAC'), # nice short-cut trick ...
'dc' => constant('IGNORE_CASE') | constant('IGNORE_DIAC'),
);
# structure boundary flags
our %Boundary = (
'inside' => constant('STRUC_INSIDE'),
'left' => constant('STRUC_LBOUND'),
'right' => constant('STRUC_RBOUND'),
'outside' => 0, # for completeness
'i' => constant('STRUC_INSIDE'),
'l' => constant('STRUC_LBOUND'),
'r' => constant('STRUC_RBOUND'),
'o' => 0,
'lr' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'), # these are all reasonable flag combinations
'rl' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'),
'leftright' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'),
'rightleft' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'),
);
#
# ------------ CWB::CL global variables ------------
#
# registry directory
our $Registry = cl_standard_registry();
#
# ------------ CWB::CL package functions ------------
#
# return error message for last error encountered during last method call (or "" if last call was successful)
# -- CWB::CL::error_message(); [exported by XS code]
# access error messages for CL (and internal) error codes
# -- CWB::CL::cwb_cl_error_message($code); [exported by XS code]
# set strictness (in strict mode, every CL or argument error aborts the script with croak())
sub strict ( ; $ ) {
my $current_mode = get_strict_mode();
if (@_) {
my $on_off = shift;
set_strict_mode($on_off ? 1 : 0);
}
return $current_mode;
}
# set CL debugging level (0=no, 1=some, 2=all debugging messages)
sub set_debug_level ( $ ) {
my $lvl = shift;
$lvl = 0 if (lc $lvl) eq "none";
$lvl = 1 if (lc $lvl) eq "some";
$lvl = 2 if (lc $lvl) eq "all";
croak "Usage: CWB::CL::set_debug_level('none' | 'some' | 'all');"
unless $lvl =~ /^[012]$/;
CWB::CL::cl_set_debug_level($lvl);
}
# set CL memory limit (used only by makeall so far, so no point in setting it here)
sub set_memory_limit ( $ ) {
my $mb = shift;
croak "Usage: CWB::CL::set_memory_limit(\$megabytes);"
unless $mb =~ /^[0-9]+$/;
croak "CWB::CL: invalid memory limit ${mb} MB (must be >= 42 MB)"
unless $mb >= 42;
CWB::CL::cl_set_memory_limit($mb);
}
# convert '|'-delimited string into proper (sorted) feature set value
# (if 's' or 'split' is given, splits string on whitespace; returns undef if there is a syntax error)
*make_set = \&cl_make_set; # now implemented in pure XS for better efficiency
# compute intersection of two feature sets (CQP's 'unify()' function)
# (returns undef if there is a syntax error)
*set_intersection = \&cl_set_intersection;
# compute cardinality of feature set (= "size", i.e. number of elements)
# (returns undef if there is a syntax error)
*set_size = \&cl_set_size;
# convert feature set value into hashref
sub set2hash ( $ ) {
my $set = shift;
my $is_ok = defined set_size($set); # easy & fast way of validating feature set format
if ($is_ok) {
my @items = split /\|/, $set; # returns empty field before leading |
shift @items;
return { map {$_ => 1} @items };
}
else {
return undef;
}
}
#
# ------------ CWB::CL::PosAttrib objects ------------
#
package CWB::CL::PosAttrib;
use Carp;
sub new {
my $class = shift;
my $corpus = shift; # corpus object (provided by CWB::CL::Corpus->attribute)
my $name = shift; # attribute name (provided by CWB::CL::Corpus->attribute)
my $self = {};
my $corpusPtr = $corpus->{'ptr'};
my $ptr = CWB::CL::cl_new_attribute($corpusPtr, $name, $CWB::CL::AttType{'ATT_POS'});
unless (defined $ptr) {
my $corpusName = $corpus->{'name'};
local($Carp::CarpLevel) = 1; # call has been delegated from attribute() method of CWB::CL::Corpus object
croak("Can't access p-attribute $corpusName.$name (aborted)")
if CWB::CL::strict(); # CL library doesn't set error code in cl_new_attribute() function
return undef;
lib/CWB/CL.pm view on Meta::CPAN
(defined $CWB::CL::Registry) ? $CWB::CL::Registry : CWB::CL::cl_standard_registry(),
lc($corpusname) # ... but CL API requires corpus name in lowercase
);
unless (defined $ptr) {
croak("Can't access corpus $corpusname (aborted)")
if CWB::CL::strict(); # CL library doesn't set error code in cl_new_corpus() function
return undef;
}
$self->{'ptr'} = $ptr;
$self->{'name'} = $corpusname;
return bless($self, $class);
}
sub DESTROY {
my $self = shift;
# disabled because of buggy nature of CL interface
# CWB::CL::cl_delete_corpus($self->{'ptr'});
}
sub attribute {
my $self = shift;
my $name = shift;
my $type = shift;
if ($type eq 'p') {
return (new CWB::CL::PosAttrib $self, $name);
}
elsif ($type eq 's') {
return (new CWB::CL::StrucAttrib $self, $name);
}
elsif ($type eq 'a') {
return (new CWB::CL::AlignAttrib $self, $name);
}
else {
croak "USAGE: \$corpus->attribute(\$name, 'p' | 's' | 'a')";
}
}
package CWB::CL; # back to main package for autosplitter's sake
1;
__END__
=head1 NAME
CWB::CL - Perl interface to the low-level Corpus Library of the IMS Open CWB
=head1 SYNOPSIS
use CWB::CL;
print "Registry path = ", $CWB::CL::Registry, "\n";
$CWB::CL::Registry .= ":/home/my_registry"; # add your own registry directory
# "strict" mode aborts if any error occurs (convenient in one-off scripts)
CWB::CL::strict(1); # or simply load CWB::CL::Strict module
CWB::CL::set_debug_level('some'); # 'some', 'all' or 'none' (default)
# CWB::CL::Corpus objects
$corpus = new CWB::CL::Corpus "HANSARD-EN"; # name of corpus can be upper or lower case
die "Error: can't access corpus HANSARD-EN" # all error conditions return undef
unless defined $corpus; # (checks are not needed in "strict" mode)
undef $corpus; # currently, mapped memory cannot be freed
# CWB::CL::Attribute objects (positional attributes)
$lemma = $corpus->attribute("lemma", 'p'); # returns CWB::CL::Attribute object
$corpus_length = $lemma->max_cpos; # valid cpos values are 0 .. $corpus_length-1
$lexicon_size = $lemma->max_id; # valid id values are 0 .. $lexicon_size-1
$id = $lemma->str2id($string);
@idlist = $lemma->str2id(@strlist); # all scalar functions map to lists in list context
$str = $lemma->id2str($id);
$len = $lemma->id2strlen($id);
$f = $lemma->id2freq($id);
$id = $lemma->cpos2id($cpos);
$str = $lemma->cpos2str($cpos);
@idlist = $lemma->regex2id($re); # regular expression matching
@cpos = $lemma->idlist2cpos(@idlist); # accessing the index (occurrences of given IDs)
$total_freq = $lemma->idlist2freq(@idlist); # better check the list size first on large corpora
# CWB::CL::AttStruc objects (structural attributes)
$chapter = $corpus->attribute("chapter", 's'); # returns CWB::CL::AttStruc object
$number_of_regions = $chapter->max_struc; # valid region numbers are 0 .. $number_of_regions-1
$has_values = $chapter->struc_values; # are regions annotated with strings?
$struc = $chapter->cpos2struc($cpos); # returns undef if not $cpos is not in <chapter> region
($start, $end) = $chapter->struc2cpos($struc); # returns empty list on error -> $start is undefined
($start, $end) = $chapter->cpos2struc2cpos($struc); # returns empty list if not in <chapter> region
# returns 2 * <n> values (= <n> start/end pairs) if called with <n> arguments
$str = $chapter->struc2str($struc); # always returns undef if not $chapter->struc_values
$str = $chapter->cpos2str($cpos); # combines cpos2struc() and struc2str()
# check whether corpus position is at boundary (l, r, lr) or inside/outside (i/o) of region
if ($chapter->cpos2boundary($cpos) & $CWB::CL::Boundary{'l'}) { ... }
if ($chapter->cpos2is_boundary('l', $cpos)) { ... }
# CWB::CL::AttAlign objects (alignment attributes)
$french = $corpus->attribute("hansard-fr", 'a'); # returns CWB::CL::AttAlign object
$nr_of_alignments = $french->max_alg; # alignment block numbers are 0 .. $nr_of_alignments-1
$extended = $french->has_extended_alignment; # extended alignment allows gaps & crossing alignments
$alg = $french->cpos2alg($cpos); # returns undef if no alignment was found
($src_start, $src_end, $target_start, $target_end)
= $french->alg2cpos($alg); # returns empty list on error
# or use convenience function $french->cpos2alg2cpos($cpos);
# Feature sets (used as values of CWB::CL::Attribute and CWB::CL::AttStruc)
$np_f = $corpus->attribute("np_feat", 's'); # p- and s-attributes can store feature sets
$fs_string = $np_f->cpos2str($cpos); # feature sets are encoded as strings
$fs = CL::set2hash($fs_string); # expand feature set into hash (reference)
if (exists $fs->{"paren"}) { ... {}
$fs1 = CWB::CL::make_set("|proper|nogen|"); # validate feature set or construct from string
( run in 1.609 second using v1.01-cache-2.11-cpan-39bf76dae61 )