Alt-CWB-CL-ambs

 view release on metacpan or  search on metacpan

lib/CWB/CL.pm  view on Meta::CPAN

                                 qw(CDA_EFSETINV CDA_EIDORNG CDA_EIDXORNG CDA_EINTERNAL),
                                 qw(CDA_ENODATA CDA_ENOMEM CDA_ENOSTRING CDA_ENULLATT CDA_ENYI CDA_EOTHER),
                                 qw(CDA_EPATTERN CDA_EPOSORNG CDA_EREMOTE CDA_ESTRUC),
                                );

# error symbols (indexed by <negative> error code) 
our @ErrorSymbol = sort {(-$ErrorCode{$a}) <=> (-$ErrorCode{$b})} keys %ErrorCode;

# regex flags (for cl_regex2id())
our %RegexFlags = (
               '' => 0,
               'c' => constant('IGNORE_CASE'), # ignore case
               'd' => constant('IGNORE_DIAC'), # ignore diacritics
               'cd' => constant('IGNORE_CASE') | constant('IGNORE_DIAC'),       # nice short-cut trick ...
               'dc' => constant('IGNORE_CASE') | constant('IGNORE_DIAC'),
              );

# structure boundary flags
our %Boundary = (
    'inside' => constant('STRUC_INSIDE'),
    'left' => constant('STRUC_LBOUND'),
    'right' => constant('STRUC_RBOUND'),
    'outside' => 0,  # for completeness
    'i' => constant('STRUC_INSIDE'),
    'l' => constant('STRUC_LBOUND'),
    'r' => constant('STRUC_RBOUND'),
    'o' => 0,
    'lr' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'), # these are all reasonable flag combinations
    'rl' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'),
    'leftright' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'),
    'rightleft' => constant('STRUC_LBOUND') | constant('STRUC_RBOUND'),    
  );

#
#  ------------  CWB::CL global variables  ------------
#

# registry directory
our $Registry = cl_standard_registry();

#
#  ------------  CWB::CL package functions  ------------
#

# return error message for last error encountered during last method call (or "" if last call was successful)
# -- CWB::CL::error_message(); [exported by XS code]

# access error messages for CL (and internal) error codes
# -- CWB::CL::cwb_cl_error_message($code); [exported by XS code]

# set strictness (in strict mode, every CL or argument error aborts the script with croak())
sub strict ( ; $ ) {
  my $current_mode = get_strict_mode();
  if (@_) {
    my $on_off = shift;
    set_strict_mode($on_off ? 1 : 0);
  }
  return $current_mode;
}

# set CL debugging level (0=no, 1=some, 2=all debugging messages)
sub set_debug_level ( $ ) {
  my $lvl = shift;
  $lvl = 0 if (lc $lvl) eq "none";
  $lvl = 1 if (lc $lvl) eq "some";
  $lvl = 2 if (lc $lvl) eq "all";
  croak "Usage:  CWB::CL::set_debug_level('none' | 'some' | 'all');"
    unless $lvl =~ /^[012]$/;
  CWB::CL::cl_set_debug_level($lvl);
}

# set CL memory limit (used only by makeall so far, so no point in setting it here)
sub set_memory_limit ( $ ) {
  my $mb = shift;
  croak "Usage:  CWB::CL::set_memory_limit(\$megabytes);"
    unless $mb =~ /^[0-9]+$/;
  croak "CWB::CL: invalid memory limit ${mb} MB (must be >= 42 MB)"
    unless $mb >= 42;
  CWB::CL::cl_set_memory_limit($mb);
}

# convert '|'-delimited string into proper (sorted) feature set value
# (if 's' or 'split' is given, splits string on whitespace; returns undef if there is a syntax error)
*make_set = \&cl_make_set;  # now implemented in pure XS for better efficiency
 
# compute intersection of two feature sets (CQP's 'unify()' function)
# (returns undef if there is a syntax error)
*set_intersection = \&cl_set_intersection;

# compute cardinality of feature set (= "size", i.e. number of elements)
# (returns undef if there is a syntax error)
*set_size = \&cl_set_size;

# convert feature set value into hashref
sub set2hash ( $ ) {
  my $set = shift;
  my $is_ok = defined set_size($set); # easy & fast way of validating feature set format
  if ($is_ok) {
    my @items = split /\|/, $set; # returns empty field before leading |
    shift @items;
    return { map {$_ => 1} @items };
  }
  else {
    return undef;
  }
}


#
#  ------------  CWB::CL::PosAttrib objects  ------------
#

package CWB::CL::PosAttrib;
use Carp;

sub new {
  my $class = shift;
  my $corpus = shift;           # corpus object  (provided by CWB::CL::Corpus->attribute)
  my $name = shift;             # attribute name (provided by CWB::CL::Corpus->attribute)
  my $self = {};

  my $corpusPtr = $corpus->{'ptr'};
  my $ptr = CWB::CL::cl_new_attribute($corpusPtr, $name, $CWB::CL::AttType{'ATT_POS'});
  unless (defined $ptr) {
    my $corpusName = $corpus->{'name'};
    local($Carp::CarpLevel) = 1; # call has been delegated from attribute() method of CWB::CL::Corpus object
    croak("Can't access p-attribute $corpusName.$name (aborted)")
      if CWB::CL::strict(); # CL library doesn't set error code in cl_new_attribute() function
    return undef;

lib/CWB/CL.pm  view on Meta::CPAN

      (defined $CWB::CL::Registry) ? $CWB::CL::Registry : CWB::CL::cl_standard_registry(),
      lc($corpusname)  # ... but CL API requires corpus name in lowercase
    );
  unless (defined $ptr) {
    croak("Can't access corpus $corpusname (aborted)")
      if CWB::CL::strict(); # CL library doesn't set error code in cl_new_corpus() function
    return undef;
  }
  $self->{'ptr'} = $ptr;
  $self->{'name'} = $corpusname;
  return bless($self, $class);
}

sub DESTROY {
  my $self = shift;

  # disabled because of buggy nature of CL interface
  #   CWB::CL::cl_delete_corpus($self->{'ptr'});
}

sub attribute {
  my $self = shift;
  my $name = shift;
  my $type = shift;

  if ($type eq 'p') {
    return (new CWB::CL::PosAttrib $self, $name);
  }
  elsif ($type eq 's') {
    return (new CWB::CL::StrucAttrib $self, $name);
  }
  elsif ($type eq 'a') {
    return (new CWB::CL::AlignAttrib $self, $name);
  }
  else {
    croak "USAGE: \$corpus->attribute(\$name, 'p' | 's' | 'a')";
  }
}




package CWB::CL;                        # back to main package for autosplitter's sake
1;
__END__


=head1 NAME

CWB::CL - Perl interface to the low-level Corpus Library of the IMS Open CWB

=head1 SYNOPSIS

  use CWB::CL;

  print "Registry path = ", $CWB::CL::Registry, "\n";
  $CWB::CL::Registry .= ":/home/my_registry";    # add your own registry directory

  # "strict" mode aborts if any error occurs (convenient in one-off scripts)
  CWB::CL::strict(1);                            # or simply load CWB::CL::Strict module
  CWB::CL::set_debug_level('some');              # 'some', 'all' or 'none' (default)

  # CWB::CL::Corpus objects
  $corpus = new CWB::CL::Corpus "HANSARD-EN";    # name of corpus can be upper or lower case
  die "Error: can't access corpus HANSARD-EN"    # all error conditions return undef
    unless defined $corpus;                      #   (checks are not needed in "strict" mode)
  undef $corpus;                                 # currently, mapped memory cannot be freed


  # CWB::CL::Attribute objects (positional attributes)
  $lemma = $corpus->attribute("lemma", 'p');     # returns CWB::CL::Attribute object
  $corpus_length = $lemma->max_cpos;             # valid cpos values are 0 .. $corpus_length-1
  $lexicon_size = $lemma->max_id;                # valid id values are 0 .. $lexicon_size-1

  $id  = $lemma->str2id($string); 
  @idlist = $lemma->str2id(@strlist);            # all scalar functions map to lists in list context
  $str = $lemma->id2str($id);
  $len = $lemma->id2strlen($id);
  $f   = $lemma->id2freq($id);
  $id  = $lemma->cpos2id($cpos);
  $str = $lemma->cpos2str($cpos);

  @idlist = $lemma->regex2id($re);               # regular expression matching
  @cpos = $lemma->idlist2cpos(@idlist);          # accessing the index (occurrences of given IDs)
  $total_freq = $lemma->idlist2freq(@idlist);    # better check the list size first on large corpora


  # CWB::CL::AttStruc objects (structural attributes)
  $chapter = $corpus->attribute("chapter", 's'); # returns CWB::CL::AttStruc object
  $number_of_regions = $chapter->max_struc;      # valid region numbers are 0 .. $number_of_regions-1
  $has_values = $chapter->struc_values;          # are regions annotated with strings?

  $struc = $chapter->cpos2struc($cpos);          # returns undef if not $cpos is not in <chapter> region
  ($start, $end) = $chapter->struc2cpos($struc); # returns empty list on error -> $start is undefined
  ($start, $end) = $chapter->cpos2struc2cpos($struc);  # returns empty list if not in <chapter> region
      # returns 2 * <n> values (= <n> start/end pairs) if called with <n> arguments
  $str  = $chapter->struc2str($struc);           # always returns undef if not $chapter->struc_values
  $str  = $chapter->cpos2str($cpos);             # combines cpos2struc() and struc2str() 

  # check whether corpus position is at boundary (l, r, lr) or inside/outside (i/o) of region
  if ($chapter->cpos2boundary($cpos) & $CWB::CL::Boundary{'l'}) { ... }
  if ($chapter->cpos2is_boundary('l', $cpos)) { ... }


  # CWB::CL::AttAlign objects (alignment attributes)
  $french = $corpus->attribute("hansard-fr", 'a'); # returns CWB::CL::AttAlign object
  $nr_of_alignments = $french->max_alg;          # alignment block numbers are 0 .. $nr_of_alignments-1
  $extended = $french->has_extended_alignment;   # extended alignment allows gaps & crossing alignments
  
  $alg = $french->cpos2alg($cpos);               # returns undef if no alignment was found
  ($src_start, $src_end, $target_start, $target_end) 
      = $french->alg2cpos($alg);                 # returns empty list on error
      # or use convenience function $french->cpos2alg2cpos($cpos);


  # Feature sets (used as values of CWB::CL::Attribute and CWB::CL::AttStruc)
  $np_f = $corpus->attribute("np_feat", 's');    # p- and s-attributes can store feature sets
  $fs_string = $np_f->cpos2str($cpos);           # feature sets are encoded as strings
  $fs = CL::set2hash($fs_string);                # expand feature set into hash (reference)
  if (exists $fs->{"paren"}) { ... {}
  $fs1 = CWB::CL::make_set("|proper|nogen|");    # validate feature set or construct from string



( run in 1.609 second using v1.01-cache-2.11-cpan-39bf76dae61 )