Alt-CWB-ambs

 view release on metacpan or  search on metacpan

lib/CWB/Encoder.pm  view on Meta::CPAN

package CWB::Encoder;
# -*-cperl-*-

use strict;
use warnings;

=head1 NAME

  CWB::Encoder - Perl tools for encoding and indexing CWB corpora

=head1 SYNOPSIS

  use CWB::Encoder;


  $bnc = new CWB::Indexer "BNC";
  $bnc = new CWB::Indexer "/path/to/registry:BNC";

  $bnc->group("corpora");     # optional: group and access
  $bnc->perm("640");          # permissions for newly created files

  $bnc->memory(400);          # use up to 400 MB of RAM (default: 75)
  $bnc->validate(0);          # disable validation for faster indexing
  $bnc->debug(1);             # enable debugging output

  $bnc->make("word", "pos");  # build index & compress
  $bnc->makeall;              # process all p-attributes


  $bnc = new CWB::Encoder "BNC";

  $bnc->registry("/path/to/registry");  # will try to guess otherwise
  $bnc->dir("/path/to/data/directory"); # directory for corpus data files
  $bnc->overwrite(1);         # may overwrite existing files / directories
  
  $bnc->longname("British National Corpus"); # optional
  $bnc->info("Line1.\nLine2.\n...");    # optional multi-line info text
  $bnc->charset("latin1");    # defaults to latin1
  $bnc->language("en");       # defaults to ??
  
  $bnc->group("corpora");     # optional: group and access permissions
  $bnc->perm("640");          # for newly created files & directories

  $bnc->p_attributes("word"); # declare postional atts (no default!)
  $bnc->p_attributes(qw<pos lemma>);  # may be called repeatedly
  $bnc->null_attributes("teiHeader"); # declare null atts (ignored)
  $bnc->s_attributes("s");    # s-attributes in cwb-encode syntax
  $bnc->s_attributes(qw<div0* div1*>);# * = store annotations (-V)
  $bnc->s_attributes("bncDoc:0+id");  # recursion & XML attributes

  $bnc->decode_entities(0);        # don't decode XML entities (with -x flag)
  $bnc->undef_symbol("__UNDEF__"); # mark missing values like cwb-encode

  $bnc->memory(400);          # use up to 400 MB of RAM (default: 75)
  $bnc->validate(0);          # disable validation for faster indexing
  $bnc->verbose(1);           # print some progress information
  $bnc->debug(1);             # enable debugging output

  $bnc->encode(@files);       # encoding, indexing, and compression

  $pipe = $bnc->encode_pipe;  # can also feed input text from Perl script
  while (...) {
    print $pipe "$line\n";
  }
  $bnc->close_pipe;

=head1 DESCRIPTION

This package contains modules for the automatic encoding and indexing
of CWB corpora. 

B<CWB::Indexer> builds indices for some or all positional attributes
of an existing corpus (using the B<cwb-makeall> tool). In addition,
these attributes are automatically compressed (using the
B<cwb-huffcode> and B<cwb-compress-rdx> tools). Compression and
indexing is interleaved to minimise the required amount of temporary
disk space, and a B<make>-like system ensures that old index files are
automatically updated.

B<CWB::Encoder> automates all steps necessary to encode a CWB corpus
(which includes cleaning up old files, running B<cwb-encode>, editing
the registry entry, indexing & compressing positional attributes, and
setting access permissions). Both modules can be set up with a few
simple method calls. Full descriptions are given separately in the
following sections. 

=cut

## ======================================================================
##  automatic creation, compression and updating of CWB index files (for p-attributes)
## ======================================================================

package CWB::Indexer;

use CWB;
use Carp;

# makefile-like rules for creating / updating components

lib/CWB/Encoder.pm  view on Meta::CPAN

              LANG => "??",     # language (corpus property)
              REGISTRY => undef, # registry directory (will be automatically chosen if possible)
              DIR => undef,     # data directory
              PATT => [],       # positional attributes
              SATT => [],       # structural attributes (cwb-encode syntax for recursion and XML atts)
              NATT => [],       # null attributes (tags are ignored)
              GROUP => undef,   # optional: group and access
              PERM => undef,    # permissions for created files
              OVERWRITE => undef, # can I overwrite existing files?
              MEMORY => 75,     # passed to CWB::Indexer
              VALIDATE => 1,    # passed to CWB::Indexer
              ENTITIES => 1,    # whether to decode XML entities (and skip comments etc.)
              UNDEF_SYMBOL => "", # string to insert for missing values of p-attributes
              VERBOSE => 0,     # print some progress information (stdout)
              DEBUG => 0,
              PIPE => undef,    # pipe to cwb-encode (for encode_pipe() method)
             };
  bless($self, $class);
  $self->name(shift)
    if @_;
  return $self;
}

=item $enc->name($corpus);

Change the CWB name of a corpus after the encoder object I<$enc> has been created.
Has to be used if the constructor was called without arguments.

=cut

sub name {
  my ($self, $name) = @_;
  $self->{NAME} = lc($name);
}

=item $enc->longname($descriptive_name);

Optional long, descriptive name for a corpus (single line).

=cut

sub longname {
  my ($self, $longname) = @_;
  carp "CWB::Encoder: long name ($longname) must not contain \" and \\ characters (removed).\n"
    if $longname =~ tr/\"\\//d;
  $self->{LONGNAME} = $longname;
}

=item $enc->info($multiline_text);

Multi-line text that will be written to the C<.info> file of the
corpus.

=cut

sub info {
  my ($self, $info) = @_;
  $self->{INFO} = $info;
}

=item $enc->charset($code);

Set corpus character set (as a corpus property in the registry entry).
So far, only C<latin1> is fully supported. Other valid character sets are
C<latin2>, ..., C<latin9>, and C<utf8> (which will be supported by future
releases of the CWB). Any other I<$code> will raise a warning.

=cut

sub charset {
  my ($self, $charset) = @_;
  carp "CWB::Encoder: character set $charset not supported by CWB (latin1, ..., latin9, utf8).\n"
    unless $charset =~ /^(latin[1-9]|utf8)$/;
  $self->{CHARSET} = $charset;
}

=item $enc->language($code);

Set corpus language (as an informational corpus property in the
registry entry). Use of a two-letter ISO code (C<de>, C<en>, C<fr>,
...) is recommended, and any other formats will raise a warning.

=cut

sub language {
  my ($self, $lang) = @_;
  carp "CWB::Encoder: language ($lang) should be two-letter ISO code.\n"
    unless $lang =~ /^[a-z]{2}$/;
  $self->{LANG} = $lang;
}

=item $enc->registry($registry_dir);

Specify registry directory I<$registry_dir>, which must be a single
directory rather than a path. If the registry directory is not set
explicitly, B<CWB::Encoder> attempts to determine the standard
registry directory, and will fail if there is no unique match
(e.g. when the C<CORPUS_REGISTRY> environment variable specifies
multiple directories).

=cut

sub registry {
  my ($self, $registry) = @_;
  $self->{REGISTRY} = $registry;
}

=item $enc->dir($data_dir);

Specify directory I<$data_dir> for corpus data files. The directory is
automatically created if it does not exist.

=cut

sub dir {
  my ($self, $dir) = @_;
  $self->{DIR} = $dir;
}

=item $enc->p_attributes($att1, $att2, ...);

Declare one or more B<positional attributes>. This method can be
called repeatedly with additional attributes. Note that I<all> 
positional attributes, including C<word>, have to be declared
explicitly.

=cut

sub p_attributes {
  my $self = shift;
  push @{$self->{PATT}}, @_;
}

=item $enc->s_attributes($att1, $att2, ...);

lib/CWB/Encoder.pm  view on Meta::CPAN

  print "Encoding complete.\n"
    if $self->{VERBOSE};
}


# internal method: called _after_ running cwb-encode
sub post_encode {
  my $self = shift;
  my $perm = $self->{PERM};
  my $group = $self->{GROUP};
  my $dir = $self->{DIR};

  print "Setting access permissions ...\n" # set access permissions for created files
    if $self->{VERBOSE};
  foreach my $att (@{$self->{PATT}}) { # positional attributes
    my $pattern = "'$dir'/$att.*";
    print STDERR "CWB::Encoder: processing group $pattern\n"
      if $self->{DEBUG} and ($perm or $group); 
    CWB::Shell::Cmd("chmod $perm $pattern")
      if $perm;
    CWB::Shell::Cmd("chgrp $group $pattern")
      if $group;
  }
  foreach my $attspec (@{$self->{SATT}}) { # structural attributes
    my $temp = $attspec;        # don't modify original list
    my $rec = ($temp =~ s/:([0-9]+)//) ? $1 : 0;   # recursion depth
    my ($att, @xmlatts) = split /\+/, $temp;       # attribute name and XML tag attributes
    foreach my $n ("", 1 .. $rec) {                # indices of embedded regions
      foreach my $ext ("", map {"_$_"} @xmlatts) { # extensions for XML tag attributes
        my $pattern = "'$dir'/$att$ext$n.*";
        print STDERR "CWB::Encoder: processing group $pattern\n"
          if $self->{DEBUG} and ($perm or $group); 
        CWB::Shell::Cmd("chmod $perm $pattern")
          if $perm;
        CWB::Shell::Cmd("chgrp $group $pattern")
          if $group;
      }
    }
  }

  print "Writing .info file ...\n"     # write .info file
    if $self->{VERBOSE};
  my $infofile = "$dir/.info";
  my $fh = CWB::OpenFile "> $infofile";
  print $fh $self->{INFO}, "\n";
  $fh->close;
  CWB::Shell::Cmd("chmod $perm '$infofile'")
    if $perm;
  CWB::Shell::Cmd("chgrp $group '$infofile'")
    if $group;

  print "Editing registry entry ...\n" # edit registry file
    if $self->{VERBOSE};
  my $reg = $self->{REGISTRY};
  my $name = $self->{NAME};
  my $regfile = "$reg/$name";
  my $rf = new CWB::RegistryFile $regfile;
  croak "CWB::Encoder: Syntax error in registry entry $regfile\n"
    unless defined $rf;
  $rf->name($self->{LONGNAME});
  # $rf->property("charset", $self->{CHARSET}); # -- already set by cwb-encode (since v2.2.101)
  $rf->property("language", $self->{LANG});
  $rf->write($regfile);
  print STDERR "CWB::Encoder: registry entry $regfile has been edited\n"
    if $self->{DEBUG};
  print STDERR "CWB::Encoder: setting access permissions for $regfile\n"
    if $self->{DEBUG} and ($perm or $group);
  CWB::Shell::Cmd("chmod $perm '$regfile'")
    if $perm;
  CWB::Shell::Cmd("chgrp $group '$regfile'")
    if $group;

  my $idx = new CWB::Indexer "$reg:".(uc $name); # build indices and compress p-attributes
  $idx->group($group)
    if $group;
  $idx->perm($perm)
    if $perm;
  $idx->memory($self->{MEMORY});
  $idx->validate($self->{VALIDATE});
  $idx->debug($self->{DEBUG});
  print "Building indices and compressing p-attributes ...\n"
    if $self->{VERBOSE};
  $idx->makeall;

}

=back

=cut

## ======================================================================

1;

__END__

=head1 COPYRIGHT

Copyright (C) 2002-2010 Stefan Evert [http::/purl.org/stefan.evert]

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut



( run in 0.570 second using v1.01-cache-2.11-cpan-39bf76dae61 )