Alt-CWB-ambs

 view release on metacpan or  search on metacpan

lib/CWB/Encoder.pm  view on Meta::CPAN

directory has to be specified explicitly, separated from the corpus name
by a C<:> character. I<$registry_path> may contain multiple directories
separated by C<:> characters.

=cut

# Constructor for the indexer object.
#
# Takes exactly one argument, the corpus ID.  The argument may be written
# as "registry_path:CORPUS"; the split happens at the last ':' so that the
# registry part may itself contain ':' characters.
#
# The command named by $CWB::DescribeCorpus is run with the -d flag via
# CWB::Shell::Cmd, and its output lines are scanned to fill
#   $self->{FILES}->{$att}->{$comp}  (component pathnames) and
#   $self->{TYPES}->{$att}           (attribute type letter).
#
# Croaks if the argument count is wrong, or if a "Path/Value:" line is seen
# without a preceding component or attribute name.  A "Type:" line without
# a current attribute name produces a warning and is ignored.
sub new {
  my $class = shift;
  my $self = {
              NAME => undef,    # name of the corpus (CWB corpus ID)
              REGISTRY => "",   # -r flag for non-default registry
              FILES => {},      # lookup hash for component filenames
              # $self->{FILES}->{$att}->{$comp} = $pathname;
              TYPES => {},      # attribute types: P / S
              GROUP => undef,   # optional: set group for new files
              PERM => undef,    # optional: set permissions for new files
              MEMORY => 75,     # memory limit for index creation
              VALIDATE => 1,    # enable/disable validation
              DEBUG => 0,       # enable/disable debugging output
             };
  croak 'USAGE:  $c = new CWB::Indexer $corpus_id;'
    unless @_ == 1;
  my $name = shift;
  # "registry:NAME" form: everything before the last ':' is the registry path
  if ($name =~ /^\s*(.+)\s*:\s*([^:]+)$/) {
    $self->{REGISTRY} = "-r '$1'";
    $name = $2;
  }
  $self->{NAME} = $name;

  # use cwb-describe-corpus to find out component pathnames
  my @lines = ();
  my $registry = $self->{REGISTRY};
  my $cmd = "'$CWB::DescribeCorpus' $registry -d $name";
  CWB::Shell::Cmd($cmd, \@lines);

  my $comp = "";                # component name
  my $attr = "";                # attribute name
  foreach my $line (@lines) {
    if ($line =~ /Component\s+([A-Z]+):/) {
      $comp = $1;
    }
    elsif ($line =~ /Attribute:\s+(\S+)/ or $line =~ /Attribute\s+(\S+):/) {
      $attr = $1;
    }
    elsif ($line =~ /Path\/Value:\s+(\S(.*\S)?)/) {
      my $path = $1;            # copy before anything can clobber $1
      croak "CWB::Indexer: Can't find component name for file $path (aborted).\n"
        unless $comp;
      croak "CWB::Indexer: Can't find attribute name for file $path (aborted).\n"
        unless $attr;
      $self->{FILES}->{$attr}->{$comp} = $path;
      $comp = $attr = "";       # reset to check for syntax errors
    }
    elsif ($line =~ /Type:\s+([A-Z])/) {
      my $type = $1;
      if ($attr) {
        $self->{TYPES}->{$attr} = $type;
      }
      else {
        # really skip the entry, so no bogus TYPES->{""} key is created
        carp "CWB::Indexer: Missing attribute name in output of cwb-describe-corpus $name (skipped).\n";
      }
    }
    # all other lines are ignored
  }

  return bless($self, $class);
}

=item $idx->group($group);

=item $idx->perm($permission);

Optional group membership and access permissions for newly created
files (otherwise, neither B<chgrp> nor B<chmod> will be called). Note
that I<$permission> must be a string rather than an octal number (as
for the built-in B<chmod> function). Indexing will fail if the
specified group and/or permissions cannot be set.

=cut

# Store the group to be set for newly created files in the GROUP slot.
# Returns the value that was stored.
sub group {
  my $self = shift;
  my $new_group = shift;
  return $self->{GROUP} = $new_group;
}

# Store the access permissions to be set for newly created files in the
# PERM slot.  Returns the value that was stored.
sub perm {
  my $self = shift;
  my $new_perm = shift;
  return $self->{PERM} = $new_perm;
}

=item $idx->memory($mbytes);

Set approximate memory limit for B<cwb-makeall> command, in MBytes.
The memory limit defaults to 75 MB, which is a reasonable value for
systems with at least 128 MB of RAM. 

=cut

# Set the memory limit (stored in the MEMORY slot).
#
# $mem must be a string of decimal digits without leading zero, i.e. a
# positive integer.  Anything else -- including undef or a value with a
# trailing newline -- makes the method croak.  Returns the stored value.
sub memory {
  my ($self, $mem) = @_;
  my $shown = defined $mem ? $mem : "undef"; # avoid "uninitialized" warnings in the message
  # \A and \z instead of ^ and $: the latter would accept "75\n"
  croak "CWB::Indexer:  memory limit ($shown) must be positive integer number (aborted).\n"
    unless defined $mem and $mem =~ /\A[1-9][0-9]*\z/;
  return $self->{MEMORY} = $mem;
}

=item $idx->validate(0);

Turn off validation of index and compressed files, which may give 
substantial speed improvements for larger corpora.

=cut

# Switch validation on or off by storing the flag in the VALIDATE slot.
# Returns the value that was stored.
sub validate {
  my $self = shift;
  my $flag = shift;
  return $self->{VALIDATE} = $flag;
}

=item $idx->debug(1);

Activate debugging output (on STDERR). 

=cut

sub debug {
  my ($self, $yesno) = @_;
  $self->{DEBUG} = $yesno;

lib/CWB/Encoder.pm  view on Meta::CPAN

=cut

# Call the make() method for every attribute whose recorded type is "P";
# attributes of any other type are left alone.
sub makeall {
  my ($self) = @_;
  my $type_of = $self->{TYPES};
  for my $name (keys %{$type_of}) {
    next unless $type_of->{$name} eq "P";
    $self->make($name);
  }
}

=back

=cut

## ======================================================================
##  automatic encoding, indexing, and compression of corpora
## ======================================================================

package CWB::Encoder;

use CWB;
use Carp;
use DirHandle;

=head1 CWB::Encoder METHODS

=over 4

=item $enc = new CWB::Encoder $corpus;

Create a new B<CWB::Encoder> object for the specified corpus. Note
that the registry directory cannot be passed directly to the
constructor (use the B<registry> method instead).

=cut

# Constructor: builds the object with its default settings and, if a corpus
# name was passed, hands that name to the name() method.
sub new {
  my ($class, @args) = @_;
  my %defaults = (
    NAME         => undef,      # name of corpus (CWB corpus ID)
    LONGNAME     => "",         # long descriptive name
    INFO         => "Indexed with CWB::Encoder.", # contents of .info file
    CHARSET      => "latin1",   # character set (corpus property)
    LANG         => "??",       # language (corpus property)
    REGISTRY     => undef,      # registry directory (will be automatically chosen if possible)
    DIR          => undef,      # data directory
    PATT         => [],         # positional attributes
    SATT         => [],         # structural attributes (cwb-encode syntax for recursion and XML atts)
    NATT         => [],         # null attributes (tags are ignored)
    GROUP        => undef,      # optional: group and access
    PERM         => undef,      # permissions for created files
    OVERWRITE    => undef,      # can I overwrite existing files?
    MEMORY       => 75,         # passed to CWB::Indexer
    VALIDATE     => 1,          # passed to CWB::Indexer
    ENTITIES     => 1,          # whether to decode XML entities (and skip comments etc.)
    UNDEF_SYMBOL => "",         # string to insert for missing values of p-attributes
    VERBOSE      => 0,          # print some progress information (stdout)
    DEBUG        => 0,
    PIPE         => undef,      # pipe to cwb-encode (for encode_pipe() method)
  );
  my $self = bless({ %defaults }, $class);
  if (@args) {
    # only the first argument is used, as the corpus name
    $self->name($args[0]);
  }
  return $self;
}

=item $enc->name($corpus);

Change the CWB name of a corpus after the encoder object I<$enc> has been created.
Has to be used if the constructor was called without arguments.

=cut

# Store the corpus name, converted to lowercase, in the NAME slot.
# Returns the lowercased name.
sub name {
  my $self = shift;
  my $corpus = shift;
  my $lowercased = lc($corpus);
  return $self->{NAME} = $lowercased;
}

=item $enc->longname($descriptive_name);

Optional long, descriptive name for a corpus (single line).

=cut

# Store the long descriptive name in the LONGNAME slot.  Any double quote
# or backslash characters are deleted first; if some were deleted, a warning
# is issued (it shows the name as it looks after the deletion).
# Returns the stored name.
sub longname {
  my $self = shift;
  my $longname = shift;
  my $deleted = ($longname =~ tr/\"\\//d);
  if ($deleted) {
    carp "CWB::Encoder: long name ($longname) must not contain \" and \\ characters (removed).\n";
  }
  return $self->{LONGNAME} = $longname;
}

=item $enc->info($multiline_text);

Multi-line text that will be written to the C<.info> file of the
corpus.

=cut

# Store the text for the corpus .info file in the INFO slot.
# Returns the value that was stored.
sub info {
  my $self = shift;
  my $text = shift;
  return $self->{INFO} = $text;
}

=item $enc->charset($code);

Set corpus character set (as a corpus property in the registry entry).
So far, only C<latin1> is fully supported. Other valid character sets are
C<latin2>, ..., C<latin9>, and C<utf8> (which will be supported by future
releases of the CWB). Any other I<$code> will raise a warning.

=cut

# Store the character set code in the CHARSET slot.
#
# A warning is issued unless $charset is exactly one of latin1 .. latin9 or
# utf8; the value is stored in either case.  Returns the stored value.
sub charset {
  my ($self, $charset) = @_;
  # \A and \z instead of ^ and $: the latter would let "utf8\n" pass unnoticed
  carp "CWB::Encoder: character set $charset not supported by CWB (latin1, ..., latin9, utf8).\n"
    unless $charset =~ /\A(?:latin[1-9]|utf8)\z/;
  return $self->{CHARSET} = $charset;
}

=item $enc->language($code);



( run in 0.709 second using v1.01-cache-2.11-cpan-39bf76dae61 )