Alt-CWB-ambs
view release on metacpan or search on metacpan
# full pathnames of CQP and the CWB tools
$CWB::CQP; # cqp
$CWB::Config; # cwb-config
$CWB::Encode; # cwb-encode
$CWB::Makeall; # cwb-makeall
$CWB::Decode; # cwb-decode
$CWB::Lexdecode; # cwb-lexdecode
$CWB::DescribeCorpus; # cwb-describe-corpus
$CWB::Huffcode; # cwb-huffcode
$CWB::CompressRdx; # cwb-compress-rdx
$CWB::Itoa; # cwb-itoa
$CWB::Atoi; # cwb-atoi
$CWB::SEncode; # cwb-s-encode
$CWB::SDecode; # cwb-s-decode
$CWB::ScanCorpus; # cwb-scan-corpus
$CWB::Align; # cwb-align
$CWB::AlignEncode; # cwb-align-encode
$CWB::CQPserver; # cqpserver
# default registry directory and effective registry setting
$CWB::DefaultRegistry;
@dirs = CWB::RegistryDirectory(); # may return multiple directories
# open filehandle for reading or writing
# automagically compresses/decompresses files and dies on error
$fh = CWB::OpenFile("> my_file.gz");
$fh = CWB::OpenFile(">", "my_file.gz"); # as in 3-argument open() call
# temporary file objects (disk files are automatically removed)
$t1 = new CWB::TempFile; # picks a unique filename
$t2 = new CWB::TempFile "mytemp"; # extends prefix to unique name
$t3 = new CWB::TempFile "mytemp.gz"; # compressed temporary file
$filename = $t1->name; # full pathname of temporary file
$t1->write(...); # works like $fh->print()
$t1->finish; # stop writing file
print $t1->status, "\n"; # WRITING/FINISHED/READING/DELETED
# main program can read or overwrite file <$filename> now
$line = $t1->read; # read one line, like $fh->getline()
$t1->rewind; # re-read from beginning of file
$line = $t1->read; # (reads first line again)
$t1->close; # stop reading and delete temporary file
# other files will be deleted when objects $t2 and $t3 are destroyed
# execute shell command with automatic error detection
$cmd = "ls -l";
$errlevel = CWB::Shell::Cmd($cmd); # dies with error message if not ok
# $errlevel: 0 (ok), 1 (minor problems), ..., 6 (fatal error)
@lines = ();
CWB::Shell::Cmd($cmd, \@lines); # capture standard output in array
CWB::Shell::Cmd($cmd, "files.txt"); # ... or in file (for large amounts of data)
$CWB::Shell::Paranoid = 1; # more paranoid checks (-1 for less paranoid)
# read / modify / write registry files (must be in canonical format)
$reg = new CWB::RegistryFile; # create new registry file
$reg = new CWB::RegistryFile "/corpora/c1/registry/dickens"; # load file
die "failed" unless defined $reg; # will fail if not in canonical format
$reg = new CWB::RegistryFile "dickens"; # search in standard registry
$filename = $reg->filename; # retrieve full pathname
# edit standard fields
$name = $reg->name; # read NAME field
$reg->name("Charles Dickens");# modify NAME field
$corpus_id = $reg->id; # same for ID, HOME, INFO
$home_dir = $reg->home;
$info_file = $reg->info;
$reg->delete_info; # INFO line is optional and may be deleted
# edit corpus properties
@properties = $reg->list_properties;
$value = $reg->property("language"); # get property value
$reg->property("language", "en"); # set / add property
$reg->delete_property("language");
# edit attributes ('p'=positional, 's'=structural, 'a'=alignment)
@attr = $reg->list_attributes; # list all attributes
@s_attr = $reg->list_attributes('a'); # list alignment attributes
$type = $reg->attribute("word"); # 'p'/'s'/'a' or undef
$reg->delete_attribute("np");
$reg->add_attribute("np", 's'); # specify type when adding attribute
$dir = $reg->attribute_path("lemma"); # may be stored in different directory
$reg->attribute_path("lemma", $dir); # set attribute path
$reg->delete_attribute_path; # default location is HOME directory
# comment lines (preceding field/declaration) and inline comments use keys:
# ":NAME", ":ID", ... "::$property", ... "$attribute", ...
@lines = $reg->comments(":HOME"); # comment lines before HOME field
$reg->set_comments(":INFO", @lines); # overwrite existing comments
$reg->add_comments("::language", "", "comment for language property", "");
$reg->set_comments("::language"); # delete comments before property
$comment = $reg->line_comment("np"); # inline comment of np attribute
$reg->line_comment("word", "the required word attribute"); # set comment
$reg->delete_line_comment("word"); # delete inline comment
# (over)write registry file (requires full pathname)
$reg->write("/corpora/c1/registry/dickens");
=head1 DESCRIPTION
This module offers basic support for using the IMS Open Corpus Workbench
(L<http://cwb.sourceforge.net/>) from Perl scripts.
Several additional functions are included to perform tasks
that are often needed by corpus-related scripts.
=head1 CWB PATHNAMES
Package variables give the full pathnames of B<CQP> and the B<CWB tools>,
so they can be used in shell commands even when they are not
installed in the user's search path. The following variables are available:
$CWB::CQP; # cqp
$CWB::Config; # cwb-config
$CWB::Encode; # cwb-encode
$CWB::Makeall; # cwb-makeall
$CWB::Decode; # cwb-decode
$CWB::Lexdecode; # cwb-lexdecode
$CWB::DescribeCorpus; # cwb-describe-corpus
$CWB::Huffcode; # cwb-huffcode
$CWB::CompressRdx; # cwb-compress-rdx
$CWB::Itoa; # cwb-itoa
$CWB::Atoi; # cwb-atoi
$CWB::SEncode; # cwb-s-encode
$CWB::SDecode; # cwb-s-decode
$CWB::ScanCorpus; # cwb-scan-corpus
$CWB::Align; # cwb-align
$CWB::AlignEncode; # cwb-align-encode
$CWB::CQPserver; # cqpserver
Other configuration information includes the general installation prefix,
the directory containing CWB binaries (which might be used to install additional
software related to the CWB), and the default registry directory. B<NB:>
individual install paths may have overridden the general prefix, so the
package variable I<$CWB::Prefix> does not have much practical importance.
Use the B<cwb-config> program to find out the precise installation paths.
$stdout_tmp = new CWB::TempFile "CWB-Shell-Cmd-STDOUT";
$stdout_tmp->finish; # now we're allowed to access the file directly
$stdout_file = $stdout_tmp->name;
}
$stderr_tmp->finish;
my $stderr_file = $stderr_tmp->name;
my $status = system "($cmd) 1>$stdout_file 2>$stderr_file";
my $syscode = $status & 0xff;
my $exitval = $status >> 8;
my $fh = CWB::OpenFile $stderr_file;
@$stderr = <$fh>;
map {chomp;} @$stderr;
$fh->close;
if ($outfile) {
@$stdout = (); # don't check STDOUT if caller wants it in file
}
else {
$fh = CWB::OpenFile $stdout_file;
@$stdout = <$fh>;
map {chomp;} @$stdout;
$fh->close;
}
$current_cmd = $cmd; # Error() may want to report the command that failed
$return_status = 0; # error level will be increased (but not decreased) by Error() function
Error 6, "System error: $!", @$stderr
if $syscode != 0;
Error 5, "Non-zero exit value $exitval.", @$stderr
if $exitval != 0;
Error 5, "Error message on stderr:", @$stderr
if grep { /error|fail|abnormal|abort/i } @$stderr;
Error 3, "Warning on stderr:", @$stderr
if grep { /warn|problem/i } @$stderr;
Error 2, "Stderr output:", @$stderr
if @$stderr;
Error 1, "Error message on stdout:", @$stdout
if grep { /error|fail|abnormal|abort/i } @$stdout;
# return highest error status set by one of the previous commands
return $return_status;
}
=back
=cut
## ======================================================================
## parse, modify and create registry entries (in canonical format)
## ======================================================================
package CWB::RegistryFile;
use Carp;
=head1 REGISTRY FILE EDITING
Registry files in B<canonical format> can be loaded into B<CWB::RegistryFile> objects,
edited using the various access methods detailed below, and written back to disk. It
is also possible to create a registry entry from scratch and save it to a disk file.
Canonical registry files consist of a B<header> and a B<body>. The
B<header> begins with a NAME, ID, PATH, and optional INFO field
NAME "long descriptive name"
ID my-corpus
PATH /path/to/data/directory
INFO /path/to/info/file.txt
followed by optional B<corpus property> definitions
##:: property1 = "value1"
##:: property2 = "value2"
The B<body> declares B<positional>, B<structural>, and B<alignment> attributes in
arbitrary order, using the following keywords
ATTRIBUTE word # positional attribute
STRUCTURE np # structural attribute
ALIGNED corpus2 # alignment attribute (CORPUS2 is target corpus)
Each attribute declaration may be followed by an alternative directory path on
the same line, if the attribute data is not stored in the HOME directory of the
corpus:
ATTRIBUTE lemma /path/to/other/data/directory
The header fields, corpus properties, and attribute declarations are jointly
referred to as B<content lines>. Each content line may be preceded by an arbitrary
number of B<comment lines> (starting with a C<#> character) and B<blank lines>.
Trailing comments and blank lines (i.e. after the last content line in a registry
file) are allowed but will be ignored by B<CWB::RegistryFile>. Besides, each
content line may include an B<in-line comment> which extends from the first C<#>
character to the end of the line (see examples above). Note that lines starting
with the special symbol C<##::> are interpreted as corpus property definitions
rather than comments.
=cut
# internal method: "return $self->error();" prints error message and returns undef
sub error {
my ($self, @msg) = @_;
print STDERR "CWB::RegistryFile:\n";
foreach my $line (@msg) {
print STDERR " Error: $line\n";
}
return undef;
}
# internal function: read (and parse) one content line from filehandle $fh, including preceding comment lines
# ($line, @comments) = read_segment($fh);
# $line is "" at end of file (so trailing commments might be salvaged)
sub read_segment {
my $fh = shift;
my @comments = ();
while (<$fh>) {
chomp;
s/^\s+//;
s/\s+$//;
next if $_ eq ("#".("=" x 72)."#"); # header separator line -- ignore
last unless /^$/ or (/^\#/ and not /^\#\#::/);
s/^\#//; # remove (first) comment marker from line
push @comments, $_;
}
if ($_) {
return ($_, @comments);
}
else {
return ("", @comments);
}
}
# internal functions: map between attribute types (p, s, a) and keywords (ATTRIBUTE, STRUCTURE, ALIGNED)
sub type2keyword {
my $type = shift;
my %mapping = qw<p ATTRIBUTE s STRUCTURE a ALIGNED>;
die "Internal error: type2keyword('$type') undefined."
unless defined $mapping{$type};
return $mapping{$type};
}
sub keyword2type {
my $key = shift;
my %mapping = qw<ATTRIBUTE p STRUCTURE s ALIGNED a>;
die "Internal error: keyword2type('$key') undefined."
unless defined $mapping{$key};
return $mapping{$key};
}
=over 4
=item $reg = new CWB::RegistryFile;
=item $reg = new CWB::RegistryFile $filename;
The first form of the B<CWB::RegistryFile> constructor creates a new,
empty registry entry. The mandatory fields have to be filled in by the
Perl script before the I<$reg> object can be saved to disk. It is also highly
advisable to declare at least the C<word> attribute. :-)
The second form attempts to read and parse the registry file I<$filename>. If
successful, a B<CWB::RegistryFile> object storing all relevant information is
returned. If I<$filename> does not contain the character C</> and cannot be
found in the current directory, the constructor will automatically search the
standard registry directories for it. The full pathname of the registry file
can later be determined with the B<filename> method.
If the load operation failed (i.e. the file does not exist or is not in the
canonical registry file format), an error message is printed and an undefined
value returned (so this module can be used e.g. to write a robust graphical
registry editor). Always check the return value of the constructor before
proceeding.
=cut
sub new {
my $class = shift;
my $filename = shift;
my $self = # create and initialise object
{
NAME => "", # name of corpus (defaults to empty string)
ID => undef, # corpus ID (required)
HOME => undef, # home directory (required)
INFO => undef, # info file (optional, but highly recommended)
PROPERTIES => [], # corpus properties ([property, value] pairs)
ATT => {}, # attributes (att => 'p' / 's' / 'a')
ATT_PATH => {}, # data paths for attributs
SERIALIZE => [], # order in which attributes are listed in the registry entry
COMMENTS => {}, # comments and/or blank lines preceding each content line
# (att => [comment1, comment2, ...], ':NAME' => [...], '::property' => ...)
LINECOMMENT => {}, # line comments on content lines (att => comment, ':NAME' => comment, ...)
FILENAME => undef, # filename of registry file (if loaded from file)
};
bless($self, $class);
# if filename was specified, try loading registry entry (searches in registry directories if necessary)
if (defined $filename) {
if ($filename !~ /\// and not -f $filename) {
my @dirs = CWB::RegistryDirectory();
my @files = grep { -f $_ } map { "$_/".lc($filename) } @dirs; # corpus ID may be specified in uppercase
return $self->error("Found multiple registry entries for corpus ".uc($filename).":", @files)
if @files > 1;
$filename = shift @files
if @files;
}
return $self->error("Can't access registry file or corpus $filename")
unless -r $filename;
my $fh = CWB::OpenFile $filename;
$self->{FILENAME} = $filename;
# NAME (required)
my ($l, @c) = read_segment($fh);
return $self->error("Missing or misplaced NAME line in registry file $filename",
" >> $l <<")
unless $l =~ /^NAME\s+/;
return $self->error("Syntax error in registry file $filename:",
" >> $l <<",
"(expected >> NAME \" ... \" <<)")
unless $l =~ /^NAME\s+\"([^\"]*)\"\s*(\#.*)?$/;
my ($v, $lc) = ($1, $2);
$lc =~ s/^\#// if $lc;
$self->{NAME} = $v;
$self->{COMMENTS}->{':NAME'} = [@c];
$self->{LINECOMMENT}->{':NAME'} = $lc
if $lc;
# ID (required)
($l, @c) = read_segment($fh);
return $self->error("Missing or misplaced ID line in registry file $filename",
" >> $l <<")
unless $l =~ /^ID\s+/;
return $self->error("Syntax error in registry file $filename:",
" >> $l <<",
( run in 2.109 seconds using v1.01-cache-2.11-cpan-13bb782fe5a )