Alt-CWB-ambs

 view release on metacpan or  search on metacpan

lib/CWB/Encoder.pm  view on Meta::CPAN

package CWB::Encoder;
# -*-cperl-*-

use strict;
use warnings;

=head1 NAME

  CWB::Encoder - Perl tools for encoding and indexing CWB corpora

=head1 SYNOPSIS

  use CWB::Encoder;


  $bnc = new CWB::Indexer "BNC";
  $bnc = new CWB::Indexer "/path/to/registry:BNC";

  $bnc->group("corpora");     # optional: group and access
  $bnc->perm("640");          # permissions for newly created files

  $bnc->memory(400);          # use up to 400 MB of RAM (default: 75)
  $bnc->validate(0);          # disable validation for faster indexing
  $bnc->debug(1);             # enable debugging output

  $bnc->make("word", "pos");  # build index & compress
  $bnc->makeall;              # process all p-attributes


  $bnc = new CWB::Encoder "BNC";

  $bnc->registry("/path/to/registry");  # will try to guess otherwise
  $bnc->dir("/path/to/data/directory"); # directory for corpus data files
  $bnc->overwrite(1);         # may overwrite existing files / directories
  
  $bnc->longname("British National Corpus"); # optional
  $bnc->info("Line1.\nLine2.\n...");    # optional multi-line info text
  $bnc->charset("latin1");    # defaults to latin1
  $bnc->language("en");       # defaults to ??
  
  $bnc->group("corpora");     # optional: group and access permissions
  $bnc->perm("640");          # for newly created files & directories

  $bnc->p_attributes("word"); # declare postional atts (no default!)
  $bnc->p_attributes(qw<pos lemma>);  # may be called repeatedly
  $bnc->null_attributes("teiHeader"); # declare null atts (ignored)
  $bnc->s_attributes("s");    # s-attributes in cwb-encode syntax
  $bnc->s_attributes(qw<div0* div1*>);# * = store annotations (-V)
  $bnc->s_attributes("bncDoc:0+id");  # recursion & XML attributes

  $bnc->decode_entities(0);        # don't decode XML entities (with -x flag)
  $bnc->undef_symbol("__UNDEF__"); # mark missing values like cwb-encode

  $bnc->memory(400);          # use up to 400 MB of RAM (default: 75)
  $bnc->validate(0);          # disable validation for faster indexing
  $bnc->verbose(1);           # print some progress information
  $bnc->debug(1);             # enable debugging output

  $bnc->encode(@files);       # encoding, indexing, and compression

  $pipe = $bnc->encode_pipe;  # can also feed input text from Perl script
  while (...) {
    print $pipe "$line\n";
  }
  $bnc->close_pipe;

=head1 DESCRIPTION

This package contains modules for the automatic encoding and indexing
of CWB corpora. 

B<CWB::Indexer> builds indices for some or all positional attributes
of an existing corpus (using the B<cwb-makeall> tool). In addition,
these attributes are automatically compressed (using the
B<cwb-huffcode> and B<cwb-compress-rdx> tools). Compression and
indexing is interleaved to minimise the required amount of temporary
disk space, and a B<make>-like system ensures that old index files are
automatically updated.

B<CWB::Encoder> automates all steps necessary to encode a CWB corpus
(which includes cleaning up old files, running B<cwb-encode>, editing
the registry entry, indexing & compressing positional attributes, and
setting access permissions). Both modules can be set up with a few
simple method calls. Full descriptions are given separately in the
following sections. 

=cut

## ======================================================================
##  automatic creation, compression and updating of CWB index files (for p-attributes)
## ======================================================================

package CWB::Indexer;

use CWB;
use Carp;

# makefile-like rules for creating / updating components
#   TRIGGER .. update component when one of these comps exists & is newer
#   NEEDED  .. componentes required by command below
#   CREATES .. these files will be created by COMMAND
#   COMMAND .. shell command to create this component 
#              interpolates '#C' (corpus id), '#A' (attribute name), '#R' (registry flag), 
#                           '#M' (memory limit), '#T' (no validate), '#V' (validate)
#              (issues "can't create" error message if COMMAND starts with "ERROR")
#   DELETE  .. delete these components when target exist or has been created
our %RULES =
  (
   DIR => {
           TRIGGER => [],
           NEEDED  => [],
           CREATES => [],
           COMMAND => "ERROR: Corpus data directory must be created manually.",
           DELETE  => [],
          },
   CORPUS => {
           TRIGGER => [],

lib/CWB/Encoder.pm  view on Meta::CPAN

           COMMAND => "$CWB::Makeall #R #M #V -P #A #C",
           DELETE  => [],
          },
   REVCIDX => {
           TRIGGER => [qw<CORPUS CIS CISCODE CISSYNC LEXICON LEXIDX FREQS LEXSRT>],
           NEEDED  => [qw<CIS CISCODE CISSYNC LEXICON LEXIDX FREQS LEXSRT>],
           CREATES => [qw<REVCORP REVCIDX>],
           COMMAND => "$CWB::Makeall #R #M #V -P #A #C",
           DELETE  => [],
          },
   CRC => {
           TRIGGER => [qw<CORPUS CIS CISCODE CISSYNC REVCORP REVCIDX LEXICON LEXIDX FREQS LEXSRT>],
           NEEDED  => [qw<REVCORP REVCIDX LEXICON LEXIDX FREQS LEXSRT>],
           CREATES => [qw<CRC CRCIDX>],
           COMMAND => "$CWB::CompressRdx #R #T -P #A #C",
           DELETE  => [qw<REVCORP REVCIDX>],
          },
   CRCIDX => {
           TRIGGER => [qw<CORPUS CIS CISCODE CISSYNC REVCORP REVCIDX LEXICON LEXIDX FREQS LEXSRT>],
           NEEDED  => [qw<REVCORP REVCIDX LEXICON LEXIDX FREQS LEXSRT>],
           CREATES => [qw<CRC CRCIDX>],
           COMMAND => "$CWB::CompressRdx #R #T -P #A #C",
           DELETE  => [qw<REVCORP REVCIDX>],
          },
  );

# components that must exist or be created by make() in the specified order
# (note that prerequisites are created recursively, so there mustn't be loops in the rules!)
# (CISCODE, CISSYNC, and CRCIDX should be created automatically by previous rules)
our @NEEDED = qw<LEXICON LEXIDX FREQS LEXSRT CIS CISCODE CISSYNC CRC CRCIDX>;

=head1 CWB::Indexer METHODS

=over 4

=item $idx = new CWB::Indexer $corpus;

=item $idx = new CWB::Indexer "$registry_path:$corpus";

Create a new B<CWB::Indexer> object for the specified corpus. If
I<$corpus> is not registered in the default registry path (the built-in 
default or the C<CORPUS_REGISTRY> environment variable), the registry
directory has to be specified explicitly, separated from the corpus name
by a C<:> character. I<$registry_path> may contain multiple directories
separated by C<:> characters.

=cut

sub new {
  my $class = shift;
  my $self = {
              NAME => undef,    # name of the corpus (CWB corpus ID)
              REGISTRY => "",   # -r flag for non-default registry
              FILES => {},      # lookup hash for component filenames
              # $self->{FILES}->{$att}->{$comp} = $pathname;
              TYPES => {},      # attribute types: P / S
              GROUP => undef,   # optional: set group for new files
              PERM => undef,    # optional: set permissions for new files
              MEMORY => 75,     # memory limit for index creation
              VALIDATE => 1,    # enable/disable validation
              DEBUG => 0,       # enable/disable debugging output
             };
  croak 'USAGE:  $c = new CWB::Indexer $corpus_id;'
    unless @_ == 1;
  my $name = shift;
  if ($name =~ /^\s*(.+)\s*:\s*([^:]+)$/) {
    $self->{REGISTRY} = "-r '$1'";
    $name = $2;
  }
  $self->{NAME} = $name;

  # use cwb-describe-corpus to find out component pathnames
  my @lines = ();
  my $registry = $self->{REGISTRY};
  my $cmd = "'$CWB::DescribeCorpus' $registry -d $name";
  CWB::Shell::Cmd($cmd, \@lines);

  my $comp = "";                # component name
  my $attr = "";                # attribute name
  foreach (@lines) {
    if (/Component\s+([A-Z]+):/) {
      $comp = $1;
    }
    elsif (/Attribute:\s+(\S+)/ or /Attribute\s+(\S+):/) {
      $attr = $1;
    }
    elsif (/Path\/Value:\s+(\S(.*\S)?)/) {
      croak "CWB::Indexer: Can't find component name for file $1 (aborted).\n"
        unless $comp;
      croak "CWB::Indexer: Can't find attribute name for file $1 (aborted).\n"
        unless $attr;
      $self->{FILES}->{$attr}->{$comp} = $1;
      $comp = $attr = "";       # reset to check for syntax errors
    }
    elsif (/Type:\s+([A-Z])/) {
      carp "CWB::Indexer: Missing attribute name in output of cwb-describe-corpus $name (skipped).\n"
        unless $attr;
      $self->{TYPES}->{$attr} = $1;
    }
    # all other lines are ignored
  }

  return bless($self, $class);
}

=item $idx->group($group);

=item $idx->perm($permission);

Optional group membership and access permissions for newly created
files (otherwise, neither B<chgrp> nor B<chmod> will be called). Note
that I<$permission> must be a string rather than an octal number (as
for the built-in B<chmod> function). Indexing will fail if the
specified group and/or permissions cannot be set.

=cut

sub group {
  my ($self, $group) = @_;
  $self->{GROUP} = $group;
}

sub perm {
  my ($self, $perm) = @_;
  $self->{PERM} = $perm;
}

=item $idx->memory($mbytes);

Set approximate memory limit for B<cwb-makeall> command, in MBytes.
The memory limit defaults to 75 MB, which is a reasonable value for
systems with at least 128 MB of RAM. 

=cut

sub memory {
  my ($self, $mem) = @_;
  croak "CWB::Indexer:  memory limit ($mem) must be positive integer number (aborted).\n"
    unless $mem =~ /^[1-9][0-9]*$/;
  $self->{MEMORY} = $mem;
}

=item $idx->validate(0);

Turn off validation of index and compressed files, which may give 
substantial speed improvements for larger corpora.

=cut

sub validate {
  my ($self, $yesno) = @_;
  $self->{VALIDATE} = $yesno;
}

=item $idx->debug(1);

Activate debugging output (on STDERR). 

=cut

sub debug {
  my ($self, $yesno) = @_;
  $self->{DEBUG} = $yesno;
}

# internal method: get full pathname of a component file
sub filename {
  my ($self, $att, $comp) = @_;
  my $path = $self->{FILES}->{$att}->{$comp};
  croak "CWB::Indexer: can't determine filename for component $att/$comp (aborted).\n"
    unless defined $path;
  return $path;
}

# internal method: make single component (recursively builds dependencies)
sub make_comp {
  my ($self, $att, $comp) = @_;
  my $rule = $RULES{$comp};
  croak "CWB::Indexer:  no rule found for component $comp (aborted).\n"
    unless defined $rule;
  my ($trigger, $needed, $creates, $command, $delete) =
    @$rule{qw<TRIGGER NEEDED CREATES COMMAND DELETE>};

  my $update = 0;               # check whether component needs to be created / updated
  my $file = $self->filename($att, $comp);
  if (not -f $file) {
    print STDERR "CWB::Indexer: component $att/$comp does not exist -> create\n"
      if $self->{DEBUG};
    $update = 1;                # file does not exist -> create
  }
  else {
    my $age = -M $file;
    foreach my $t (@$trigger) { # check for triggers that are newer than target
      my $t_file = $self->filename($att, $t);
      if (-f $t_file) {
        my $t_age = -M $t_file;
        if ($t_age < $age) {
          $update = 1;          # trigger is newer -> update
          print STDERR "CWB::Indexer: component $att/$t is newer than $att/$comp -> update\n"
            if $self->{DEBUG};
        }
      }
    }
  }

  if ($update) {                # (re-)create component if necessary
    print STDERR
      "CWB::Indexer: make_comp($att, $comp)\n",
      "CWB::Indexer:   creating component file $file\n"
        if $self->{DEBUG};

    foreach my $c (@$creates) { # delete old target files (first, to make room for intermediate files)
      my $f = $self->filename($att, $c);
      if (-f $f) {
        unlink $f;
        croak "CWB::Indexer: Can't delete file $f (aborted).\n"
          if -e $f;
        print STDERR "CWB::Indexer:   deleting file $f\n"
          if $self->{DEBUG};
      }
    }

lib/CWB/Encoder.pm  view on Meta::CPAN

    unless $mem =~ /^[1-9][0-9]*$/;
  $self->{MEMORY} = $mem;
}

=item $enc->validate(0);

Turn off validation of index and compressed files, which may give 
substantial speed improvements for larger corpora.

=cut

sub validate {
  my ($self, $yesno) = @_;
  $self->{VALIDATE} = $yesno;
}

=item $enc->decode_entities(0);

Whether B<cwb-encode> is allowed to decode XML entities and skip XML 
comments (with the C<-x> option).  Set this option to false if you
want an HTML-compatible encoding of the CWB corpus that does not need
to be converted before display in a Web browser.

=cut

sub decode_entities {
  my ($self, $yesno) = @_;
  $self->{ENTITIES} = $yesno;
}

=item $enc->undef_symbol("__UNDEF__");

Symbol inserted for missing values of positional attributes (either
because there are too few columns in the input or because attribute
values are explicit empty strings).  By default, no special symbol
is inserted (i.e. missing values are encoded as empty strings C<"">).
Use the command shown above to mimic the standard behaviour of
B<cwb-encode>.

=cut

sub undef_symbol {
  my ($self, $symbol) = @_;
  $symbol = "" unless defined $symbol;
  croak "CWB::Indexer: symbol <$symbol> for missing values of p-attributes must not contain single quotes or control characters (aborted).\n"
    if $symbol =~ /[\x{00}-\x{1f}']/;
  $self->{UNDEF_SYMBOL} = $symbol;
}

=item $enc->verbose(1);

Print some progress information (on STDOUT).

=cut

sub verbose {
  my ($self, $yesno) = @_;
  $self->{VERBOSE} = $yesno;
}

=item $enc->debug(1);

Activate debugging output (on STDERR).

=cut

sub debug {
  my ($self, $yesno) = @_;
  $self->{DEBUG} = $yesno;
  $self->{VERBOSE} = 1          # debugging also activates verbose output
    if $yesno;
}

# internal method: called _before_ running cwb-encode
sub prepare_encode {
  my $self = shift;
  my $overwrite = $self->{OVERWRITE};
  
  my $name = $self->{NAME};     # check that setup is complete
  croak "CWB::Encoder: Corpus ID hasn't been specified (with name() method)\n"
    unless $name;
  croak "CWB::Encoder: No positional attributes specified.\n"
    unless @{$self->{PATT}} > 0;

  my $reg = $self->{REGISTRY};
  if (not defined $reg) {
    $reg = CWB::RegistryDirectory(); # try to guess registry if not specified
    $self->{REGISTRY} = $reg;
  }
  croak "CWB::Encoder: Can't determine unique registry directory (path is $reg).\n"
    if $reg =~ /:/;
  croak "CWB::Encoder: Registry directory $reg does not exist.\n"
    unless -d $reg;
  print STDERR "CWB::Encoder: registry directory is $reg\n"
    if $self->{DEBUG};

  my $regfile = "$reg/$name";   # remove registry entry if it exists
  if (-f $regfile) {
    croak "CWB::Encoder: Registry file already exists (overwriting not enabled).\n"
      unless $overwrite;
    print "Removing registry file $reg/$name ...\n"
      if $self->{VERBOSE};
    unlink "$reg/$name";
    croak "CWB::Encoder: Can't delete registry file $reg/$name\n"
      if -f "$reg/$name";
    print STDERR "CWB::Encoder: deleting file $reg/$name\n"
      if $self->{DEBUG};
  }

  my $dir = $self->{DIR};       # check/create data directory
  croak "CWB::Encoder: Data directory has not been set.\n"
    unless $dir;
  if (-d $dir) {
    croak "CWB::Encoder: Data directory already exists (overwriting not enabled).\n"
      unless $overwrite;
    print "Cleaning up data directory $dir ...\n"
      if $self->{VERBOSE};
    my $dh = new DirHandle $dir;
    my @files = grep {-f $_} (glob("$dir/*"), glob("$dir/.*"));
    my ($file, $filename);
    while (defined($filename = $dh->read)) {
      $file = "$dir/$filename";
      next unless -f $file;     # skip subdirectories etc.
      unlink $file;
      carp "CWB::Encoder: Can't delete file $file (trying to continue).\n"
        if -f $file;
      print STDERR "CWB::Encoder: deleting file $file\n"
        if $self->{DEBUG};
    }
    $dh->close;

lib/CWB/Encoder.pm  view on Meta::CPAN

      if $perm;
    CWB::Shell::Cmd("chgrp $group $pattern")
      if $group;
  }
  foreach my $attspec (@{$self->{SATT}}) { # structural attributes
    my $temp = $attspec;        # don't modify original list
    my $rec = ($temp =~ s/:([0-9]+)//) ? $1 : 0;   # recursion depth
    my ($att, @xmlatts) = split /\+/, $temp;       # attribute name and XML tag attributes
    foreach my $n ("", 1 .. $rec) {                # indices of embedded regions
      foreach my $ext ("", map {"_$_"} @xmlatts) { # extensions for XML tag attributes
        my $pattern = "'$dir'/$att$ext$n.*";
        print STDERR "CWB::Encoder: processing group $pattern\n"
          if $self->{DEBUG} and ($perm or $group); 
        CWB::Shell::Cmd("chmod $perm $pattern")
          if $perm;
        CWB::Shell::Cmd("chgrp $group $pattern")
          if $group;
      }
    }
  }

  print "Writing .info file ...\n"     # write .info file
    if $self->{VERBOSE};
  my $infofile = "$dir/.info";
  my $fh = CWB::OpenFile "> $infofile";
  print $fh $self->{INFO}, "\n";
  $fh->close;
  CWB::Shell::Cmd("chmod $perm '$infofile'")
    if $perm;
  CWB::Shell::Cmd("chgrp $group '$infofile'")
    if $group;

  print "Editing registry entry ...\n" # edit registry file
    if $self->{VERBOSE};
  my $reg = $self->{REGISTRY};
  my $name = $self->{NAME};
  my $regfile = "$reg/$name";
  my $rf = new CWB::RegistryFile $regfile;
  croak "CWB::Encoder: Syntax error in registry entry $regfile\n"
    unless defined $rf;
  $rf->name($self->{LONGNAME});
  # $rf->property("charset", $self->{CHARSET}); # -- already set by cwb-encode (since v2.2.101)
  $rf->property("language", $self->{LANG});
  $rf->write($regfile);
  print STDERR "CWB::Encoder: registry entry $regfile has been edited\n"
    if $self->{DEBUG};
  print STDERR "CWB::Encoder: setting access permissions for $regfile\n"
    if $self->{DEBUG} and ($perm or $group);
  CWB::Shell::Cmd("chmod $perm '$regfile'")
    if $perm;
  CWB::Shell::Cmd("chgrp $group '$regfile'")
    if $group;

  my $idx = new CWB::Indexer "$reg:".(uc $name); # build indices and compress p-attributes
  $idx->group($group)
    if $group;
  $idx->perm($perm)
    if $perm;
  $idx->memory($self->{MEMORY});
  $idx->validate($self->{VALIDATE});
  $idx->debug($self->{DEBUG});
  print "Building indices and compressing p-attributes ...\n"
    if $self->{VERBOSE};
  $idx->makeall;

}

=back

=cut

## ======================================================================

1;

__END__

=head1 COPYRIGHT

Copyright (C) 2002-2010 Stefan Evert [http::/purl.org/stefan.evert]

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.

=cut



( run in 0.511 second using v1.01-cache-2.11-cpan-39bf76dae61 )