Alt-CWB-ambs

 view release on metacpan or  search on metacpan

lib/CWB.pm  view on Meta::CPAN

  # full pathnames of CQP and the CWB tools
  $CWB::CQP;             # cqp
  $CWB::Config;          # cwb-config
  $CWB::Encode;          # cwb-encode
  $CWB::Makeall;         # cwb-makeall
  $CWB::Decode;          # cwb-decode
  $CWB::Lexdecode;       # cwb-lexdecode
  $CWB::DescribeCorpus;  # cwb-describe-corpus
  $CWB::Huffcode;        # cwb-huffcode
  $CWB::CompressRdx;     # cwb-compress-rdx
  $CWB::Itoa;            # cwb-itoa
  $CWB::Atoi;            # cwb-atoi
  $CWB::SEncode;         # cwb-s-encode
  $CWB::SDecode;         # cwb-s-decode
  $CWB::ScanCorpus;      # cwb-scan-corpus
  $CWB::Align;           # cwb-align
  $CWB::AlignEncode;     # cwb-align-encode
  $CWB::CQPserver;       # cqpserver

  # default registry directory and effective registry setting
  $CWB::DefaultRegistry;
  @dirs = CWB::RegistryDirectory(); # may return multiple directories

  # open filehandle for reading or writing
  # automagically compresses/decompresses files and dies on error
  $fh = CWB::OpenFile("> my_file.gz");
  $fh = CWB::OpenFile(">", "my_file.gz"); # as in 3-argument open() call

  # temporary file objects (disk files are automatically removed)
  $t1 = new CWB::TempFile;             # picks a unique filename
  $t2 = new CWB::TempFile "mytemp";    # extends prefix to unique name
  $t3 = new CWB::TempFile "mytemp.gz"; # compressed temporary file
  $filename = $t1->name;        # full pathname of temporary file
  $t1->write(...);              # works like $fh->print()
  $t1->finish;                  # stop writing file
  print $t1->status, "\n";      # WRITING/FINISHED/READING/DELETED
  # main program can read or overwrite file <$filename> now
  $line = $t1->read;            # read one line, like $fh->getline()
  $t1->rewind;                  # re-read from beginning of file
  $line = $t1->read;            # (reads first line again)
  $t1->close;                   # stop reading and delete temporary file
  # other files will be deleted when objects $t2 and $t3 are destroyed

  # execute shell command with automatic error detection
  $cmd = "ls -l";
  $errlevel = CWB::Shell::Cmd($cmd);   # dies with error message if not ok
  # $errlevel: 0 (ok), 1 (minor problems), ..., 6 (fatal error)
  @lines = ();
  CWB::Shell::Cmd($cmd, \@lines);      # capture standard output in array
  CWB::Shell::Cmd($cmd, "files.txt");  # ... or in file (for large amounts of data)
  $CWB::Shell::Paranoid = 1;    # more paranoid checks (-1 for less paranoid)

  # read / modify / write registry files (must be in canonical format)
  $reg = new CWB::RegistryFile; # create new registry file
  $reg = new CWB::RegistryFile "/corpora/c1/registry/dickens";  # load file
  die "failed" unless defined $reg;    # will fail if not in canonical format

  $reg = new CWB::RegistryFile "dickens";       # search in standard registry
  $filename = $reg->filename;                   # retrieve full pathname

  # edit standard fields
  $name = $reg->name;           # read NAME field
  $reg->name("Charles Dickens");# modify NAME field
  $corpus_id = $reg->id;        # same for ID, HOME, INFO
  $home_dir = $reg->home;
  $info_file = $reg->info;
  $reg->delete_info;            # INFO line is optional and may be deleted

  # edit corpus properties
  @properties = $reg->list_properties;
  $value = $reg->property("language");  # get property value
  $reg->property("language", "en");     # set / add property
  $reg->delete_property("language");

  # edit attributes ('p'=positional, 's'=structural, 'a'=alignment)
  @attr = $reg->list_attributes;        # list all attributes
  @s_attr = $reg->list_attributes('a'); # list alignment attributes
  $type = $reg->attribute("word");      # 'p'/'s'/'a' or undef
  $reg->delete_attribute("np");
  $reg->add_attribute("np", 's');       # specify type when adding attribute
  $dir = $reg->attribute_path("lemma"); # may be stored in different directory
  $reg->attribute_path("lemma", $dir);  # set attribute path
  $reg->delete_attribute_path;          # default location is HOME directory

  # comment lines (preceding field/declaration) and inline comments use keys:
  #   ":NAME", ":ID", ... "::$property", ... "$attribute", ...
  @lines = $reg->comments(":HOME");     # comment lines before HOME field
  $reg->set_comments(":INFO", @lines);  # overwrite existing comments
  $reg->add_comments("::language", "", "comment for language property", "");
  $reg->set_comments("::language");     # delete comments before property
  $comment = $reg->line_comment("np");  # inline comment of np attribute
  $reg->line_comment("word", "the required word attribute");  # set comment
  $reg->delete_line_comment("word");    # delete inline comment

  # (over)write registry file (requires full pathname)
  $reg->write("/corpora/c1/registry/dickens");

=head1 DESCRIPTION

This module offers basic support for using the IMS Open Corpus Workbench
(L<http://cwb.sourceforge.net/>) from Perl scripts.
Several additional functions are included to perform tasks
that are often needed by corpus-related scripts.


=head1 CWB PATHNAMES

Package variables give the full pathnames of B<CQP> and the B<CWB tools>, 
so they can be used in shell commands even when they are not
installed in the user's search path. The following variables are available:

  $CWB::CQP;             # cqp
  $CWB::Config;          # cwb-config
  $CWB::Encode;          # cwb-encode
  $CWB::Makeall;         # cwb-makeall
  $CWB::Decode;          # cwb-decode
  $CWB::Lexdecode;       # cwb-lexdecode
  $CWB::DescribeCorpus;  # cwb-describe-corpus
  $CWB::Huffcode;        # cwb-huffcode
  $CWB::CompressRdx;     # cwb-compress-rdx
  $CWB::Itoa;            # cwb-itoa
  $CWB::Atoi;            # cwb-atoi
  $CWB::SEncode;         # cwb-s-encode
  $CWB::SDecode;         # cwb-s-decode
  $CWB::ScanCorpus;      # cwb-scan-corpus
  $CWB::Align;           # cwb-align
  $CWB::AlignEncode;     # cwb-align-encode
  $CWB::CQPserver;       # cqpserver

Other configuration information includes the general installation prefix,
the directory containing CWB binaries (which might be used to install additional
software related to the CWB), and the default registry directory.  B<NB:>
individual install paths may have overridden the general prefix, so the
package variable I<$CWB::Prefix> does not have much practical importance.
Use the B<cwb-config> program to find out the precise installation paths.

lib/CWB.pm  view on Meta::CPAN

    $stdout_tmp = new CWB::TempFile "CWB-Shell-Cmd-STDOUT";
    $stdout_tmp->finish;        # now we're allowed to access the file directly
    $stdout_file = $stdout_tmp->name;
  }
  $stderr_tmp->finish;
  my $stderr_file = $stderr_tmp->name;

  my $status = system "($cmd) 1>$stdout_file 2>$stderr_file";
  my $syscode = $status & 0xff;
  my $exitval = $status >> 8;

  my $fh = CWB::OpenFile $stderr_file;
  @$stderr = <$fh>;
  map {chomp;} @$stderr;
  $fh->close;
  if ($outfile) {
    @$stdout = ();              # don't check STDOUT if caller wants it in file
  }
  else {
    $fh = CWB::OpenFile $stdout_file;
    @$stdout = <$fh>;
    map {chomp;} @$stdout;
    $fh->close;
  }

  $current_cmd = $cmd;          # Error() may want to report the command that failed
  $return_status = 0;           # error level will be increased (but not decreased) by Error() function

  Error 6, "System error: $!", @$stderr
    if $syscode != 0;
  Error 5, "Non-zero exit value $exitval.", @$stderr
    if $exitval != 0;
  Error 5, "Error message on stderr:", @$stderr
    if grep { /error|fail|abnormal|abort/i  } @$stderr;
  Error 3, "Warning on stderr:", @$stderr
    if grep { /warn|problem/i } @$stderr;
  Error 2, "Stderr output:", @$stderr
    if @$stderr;
  Error 1, "Error message on stdout:", @$stdout
    if grep { /error|fail|abnormal|abort/i } @$stdout;

  # return highest error status set by one of the previous commands
  return $return_status;
}

=back 

=cut

## ======================================================================
##  parse, modify and create registry entries (in canonical format)
## ======================================================================

package CWB::RegistryFile;

use Carp;

=head1 REGISTRY FILE EDITING

Registry files in B<canonical format> can be loaded into B<CWB::RegistryFile> objects,
edited using the various access methods detailed below, and written back to disk. It
is also possible to create a registry entry from scratch and save it to a disk file.

Canonical registry files consist of a B<header> and a B<body>. The
B<header> begins with a NAME, ID, PATH, and optional INFO field

  NAME "long descriptive name"
  ID   my-corpus
  PATH /path/to/data/directory
  INFO /path/to/info/file.txt

followed by optional B<corpus property> definitions

  ##:: property1 = "value1"
  ##:: property2 = "value2"

The B<body> declares B<positional>, B<structural>, and B<alignment> attributes in
arbitrary order, using the following keywords

  ATTRIBUTE  word     # positional attribute
  STRUCTURE  np       # structural attribute
  ALIGNED    corpus2  # alignment attribute (CORPUS2 is target corpus)

Each attribute declaration may be followed by an alternative directory path on
the same line, if the attribute data is not stored in the HOME directory of the
corpus:

  ATTRIBUTE  lemma  /path/to/other/data/directory

The header fields, corpus properties, and attribute declarations are jointly
referred to as B<content lines>. Each content line may be preceded by an arbitrary
number of B<comment lines> (starting with a C<#> character) and B<blank lines>.
Trailing comments and blank lines (i.e. after the last content line in a registry
file) are allowed but will be ignored by B<CWB::RegistryFile>. Besides, each 
content line may include an B<in-line comment> which extends from the first C<#>
character to the end of the line (see examples above). Note that lines starting
with the special symbol C<##::> are interpreted as corpus property definitions
rather than comments.

=cut

# internal method: "return $self->error();" prints error message and returns undef
sub error {
  my ($self, @msg) = @_;
  print STDERR "CWB::RegistryFile:\n";
  foreach my $line (@msg) {
    print STDERR "  Error: $line\n";
  }
  return undef;
}

# internal function: read (and parse) one content line from filehandle $fh, including preceding comment lines
#   ($line, @comments) = read_segment($fh);
# $line is "" at end of file (so trailing commments might be salvaged)
sub read_segment {
  my $fh = shift;
  my @comments = ();
  while (<$fh>) {
    chomp;
    s/^\s+//;
    s/\s+$//;
    next if $_ eq ("#".("=" x 72)."#"); # header separator line -- ignore
    last unless /^$/ or (/^\#/ and not /^\#\#::/);
    s/^\#//;                                    # remove (first) comment marker from line
    push @comments, $_;
  }
  if ($_) {
    return ($_, @comments);
  }
  else { 
    return ("", @comments);
  }
}

# internal functions: map between attribute types (p, s, a) and keywords (ATTRIBUTE, STRUCTURE, ALIGNED)
sub type2keyword {
  my $type = shift;
  my %mapping = qw<p ATTRIBUTE  s STRUCTURE  a ALIGNED>;
  die "Internal error: type2keyword('$type') undefined."
    unless defined $mapping{$type};
  return $mapping{$type};
}

sub keyword2type {
  my $key = shift;
  my %mapping = qw<ATTRIBUTE p  STRUCTURE s  ALIGNED a>;
  die "Internal error: keyword2type('$key') undefined."
    unless defined $mapping{$key};
  return $mapping{$key};
}

=over 4

=item $reg = new CWB::RegistryFile;

=item $reg = new CWB::RegistryFile $filename;

The first form of the B<CWB::RegistryFile> constructor creates a new, 
empty registry entry. The mandatory fields have to be filled in by the
Perl script before the I<$reg> object can be saved to disk. It is also highly
advisable to declare at least the C<word> attribute. :-)

The second form attempts to read and parse the registry file I<$filename>. If
successful, a B<CWB::RegistryFile> object storing all relevant information is
returned.  If I<$filename> does not contain the character C</> and cannot be
found in the current directory, the constructor will automatically search the
standard registry directories for it.  The full pathname of the registry file
can later be determined with the B<filename> method.

If the load operation failed (i.e. the file does not exist or is not in the
canonical registry file format), an error message is printed and an undefined
value returned (so this module can be used e.g. to write a robust graphical
registry editor). Always check the return value of the constructor before
proceeding.

=cut

sub new {
  my $class = shift;
  my $filename = shift;
  my $self =                                    # create and initialise object
    {
     NAME => "",                                # name of corpus (defaults to empty string)
     ID => undef,                               # corpus ID (required)
     HOME => undef,                             # home directory (required)
     INFO => undef,                             # info file (optional, but highly recommended)
     PROPERTIES => [],                          # corpus properties ([property, value] pairs)
     ATT => {},                                 # attributes (att => 'p' / 's' / 'a')
     ATT_PATH => {},                            # data paths for attributs
     SERIALIZE => [],                           # order in which attributes are listed in the registry entry
     COMMENTS =>  {},                           # comments and/or blank lines preceding each content line
                                                # (att => [comment1, comment2, ...], ':NAME' => [...], '::property' => ...)
     LINECOMMENT => {},                         # line comments on content lines (att => comment, ':NAME' => comment, ...)
     FILENAME => undef,                         # filename of registry file (if loaded from file)
    };
  bless($self, $class);

  # if filename was specified, try loading registry entry (searches in registry directories if necessary)
  if (defined $filename) {
    if ($filename !~ /\// and not -f $filename) {
      my @dirs = CWB::RegistryDirectory();
      my @files = grep { -f $_ } map { "$_/".lc($filename) } @dirs; # corpus ID may be specified in uppercase
      return $self->error("Found multiple registry entries for corpus ".uc($filename).":", @files)
        if @files > 1;
      $filename = shift @files
        if @files;
    }
    return $self->error("Can't access registry file or corpus $filename")
      unless -r $filename;
    my $fh = CWB::OpenFile $filename;
    $self->{FILENAME} = $filename;
    # NAME (required)
    my ($l, @c) = read_segment($fh);
    return $self->error("Missing or misplaced NAME line in registry file $filename",
                        "  >> $l <<")
      unless $l =~ /^NAME\s+/;
    return $self->error("Syntax error in registry file $filename:",
                        "  >> $l <<",
                        "(expected >> NAME \" ... \" <<)")
      unless $l =~ /^NAME\s+\"([^\"]*)\"\s*(\#.*)?$/;
    my ($v, $lc) = ($1, $2);
    $lc =~ s/^\#// if $lc;
    $self->{NAME} = $v;
    $self->{COMMENTS}->{':NAME'} = [@c];
    $self->{LINECOMMENT}->{':NAME'} = $lc
      if $lc;
    # ID (required)
    ($l, @c) = read_segment($fh);
    return $self->error("Missing or misplaced ID line in registry file $filename",
                        "  >> $l <<")
      unless $l =~ /^ID\s+/;
    return $self->error("Syntax error in registry file $filename:",
                        "  >> $l <<",



( run in 2.109 seconds using v1.01-cache-2.11-cpan-13bb782fe5a )