Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/Encoder.pm view on Meta::CPAN
package CWB::Encoder;
# -*-cperl-*-
use strict;
use warnings;
=head1 NAME
CWB::Encoder - Perl tools for encoding and indexing CWB corpora
=head1 SYNOPSIS
use CWB::Encoder;
$bnc = new CWB::Indexer "BNC";
$bnc = new CWB::Indexer "/path/to/registry:BNC";
$bnc->group("corpora"); # optional: group and access
$bnc->perm("640"); # permissions for newly created files
$bnc->memory(400); # use up to 400 MB of RAM (default: 75)
$bnc->validate(0); # disable validation for faster indexing
$bnc->debug(1); # enable debugging output
$bnc->make("word", "pos"); # build index & compress
$bnc->makeall; # process all p-attributes
$bnc = new CWB::Encoder "BNC";
$bnc->registry("/path/to/registry"); # will try to guess otherwise
$bnc->dir("/path/to/data/directory"); # directory for corpus data files
$bnc->overwrite(1); # may overwrite existing files / directories
$bnc->longname("British National Corpus"); # optional
$bnc->info("Line1.\nLine2.\n..."); # optional multi-line info text
$bnc->charset("latin1"); # defaults to latin1
$bnc->language("en"); # defaults to ??
$bnc->group("corpora"); # optional: group and access permissions
$bnc->perm("640"); # for newly created files & directories
$bnc->p_attributes("word"); # declare positional atts (no default!)
$bnc->p_attributes(qw<pos lemma>); # may be called repeatedly
$bnc->null_attributes("teiHeader"); # declare null atts (ignored)
$bnc->s_attributes("s"); # s-attributes in cwb-encode syntax
$bnc->s_attributes(qw<div0* div1*>);# * = store annotations (-V)
$bnc->s_attributes("bncDoc:0+id"); # recursion & XML attributes
$bnc->decode_entities(0); # don't decode XML entities (with -x flag)
$bnc->undef_symbol("__UNDEF__"); # mark missing values like cwb-encode
$bnc->memory(400); # use up to 400 MB of RAM (default: 75)
$bnc->validate(0); # disable validation for faster indexing
$bnc->verbose(1); # print some progress information
$bnc->debug(1); # enable debugging output
$bnc->encode(@files); # encoding, indexing, and compression
$pipe = $bnc->encode_pipe; # can also feed input text from Perl script
while (...) {
print $pipe "$line\n";
}
$bnc->close_pipe;
=head1 DESCRIPTION
This package contains modules for the automatic encoding and indexing
of CWB corpora.
B<CWB::Indexer> builds indices for some or all positional attributes
of an existing corpus (using the B<cwb-makeall> tool). In addition,
these attributes are automatically compressed (using the
B<cwb-huffcode> and B<cwb-compress-rdx> tools). Compression and
indexing are interleaved to minimise the required amount of temporary
disk space, and a B<make>-like system ensures that old index files are
automatically updated.
B<CWB::Encoder> automates all steps necessary to encode a CWB corpus
(which includes cleaning up old files, running B<cwb-encode>, editing
the registry entry, indexing & compressing positional attributes, and
setting access permissions). Both modules can be set up with a few
simple method calls. Full descriptions are given separately in the
following sections.
=cut
## ======================================================================
## automatic creation, compression and updating of CWB index files (for p-attributes)
## ======================================================================
lib/CWB/Encoder.pm view on Meta::CPAN
my ($self, $yesno) = @_;
$self->{ENTITIES} = $yesno;
}
=item $enc->undef_symbol("__UNDEF__");
Symbol inserted for missing values of positional attributes (either
because there are too few columns in the input or because attribute
values are explicit empty strings). By default, no special symbol
is inserted (i.e. missing values are encoded as empty strings C<"">).
Use the command shown above to mimic the standard behaviour of
B<cwb-encode>.
=cut
# Set the symbol that is stored for missing values of positional attributes.
#   $symbol - replacement string; undef is treated as the empty string ""
# Returns the symbol that was stored in $self->{UNDEF_SYMBOL}.
# Croaks if the symbol contains a single quote or a control character
# in the range \x00-\x1f.
sub undef_symbol {
  my ($self, $symbol) = @_;
  $symbol = "" unless defined $symbol;
  # NOTE(review): quotes and control characters are presumably rejected because
  # the symbol is later embedded in a quoted command-line argument -- confirm
  # against the code that builds the cwb-encode call.
  # The diagnostic is prefixed with the name of this package (CWB::Encoder),
  # like the other error messages raised by the encoder.
  croak "CWB::Encoder: symbol <$symbol> for missing values of p-attributes must not contain single quotes or control characters (aborted).\n"
    if $symbol =~ /[\x{00}-\x{1f}']/;
  $self->{UNDEF_SYMBOL} = $symbol;
}
=item $enc->verbose(1);
Print some progress information (on STDOUT).
=cut
# Switch progress messages on or off.
#   $flag - true to enable, false to disable; stored as-is in $self->{VERBOSE}
# Returns the value that was stored.
sub verbose {
  my $self = shift;
  my $flag = shift;
  $self->{VERBOSE} = $flag;
}
=item $enc->debug(1);
Activate debugging output (on STDERR).
=cut
# Switch debugging output on or off.
#   $flag - true to enable, false to disable; stored as-is in $self->{DEBUG}
# Enabling debugging also sets $self->{VERBOSE} to 1; disabling it leaves
# the current VERBOSE setting untouched.
sub debug {
  my $self = shift;
  my $flag = shift;
  $self->{DEBUG} = $flag;
  # debugging implies verbose output; short-circuit form of "... if $flag"
  $flag and $self->{VERBOSE} = 1;
}
# internal method: called _before_ running cwb-encode
sub prepare_encode {
my $self = shift;
my $overwrite = $self->{OVERWRITE};
my $name = $self->{NAME}; # check that setup is complete
croak "CWB::Encoder: Corpus ID hasn't been specified (with name() method)\n"
unless $name;
croak "CWB::Encoder: No positional attributes specified.\n"
unless @{$self->{PATT}} > 0;
my $reg = $self->{REGISTRY};
if (not defined $reg) {
$reg = CWB::RegistryDirectory(); # try to guess registry if not specified
$self->{REGISTRY} = $reg;
}
croak "CWB::Encoder: Can't determine unique registry directory (path is $reg).\n"
if $reg =~ /:/;
croak "CWB::Encoder: Registry directory $reg does not exist.\n"
unless -d $reg;
print STDERR "CWB::Encoder: registry directory is $reg\n"
if $self->{DEBUG};
my $regfile = "$reg/$name"; # remove registry entry if it exists
if (-f $regfile) {
croak "CWB::Encoder: Registry file already exists (overwriting not enabled).\n"
unless $overwrite;
print "Removing registry file $reg/$name ...\n"
if $self->{VERBOSE};
unlink "$reg/$name";
croak "CWB::Encoder: Can't delete registry file $reg/$name\n"
if -f "$reg/$name";
print STDERR "CWB::Encoder: deleting file $reg/$name\n"
if $self->{DEBUG};
}
my $dir = $self->{DIR}; # check/create data directory
croak "CWB::Encoder: Data directory has not been set.\n"
unless $dir;
if (-d $dir) {
croak "CWB::Encoder: Data directory already exists (overwriting not enabled).\n"
unless $overwrite;
print "Cleaning up data directory $dir ...\n"
if $self->{VERBOSE};
my $dh = new DirHandle $dir;
my @files = grep {-f $_} (glob("$dir/*"), glob("$dir/.*"));
my ($file, $filename);
while (defined($filename = $dh->read)) {
$file = "$dir/$filename";
next unless -f $file; # skip subdirectories etc.
unlink $file;
carp "CWB::Encoder: Can't delete file $file (trying to continue).\n"
if -f $file;
print STDERR "CWB::Encoder: deleting file $file\n"
if $self->{DEBUG};
}
$dh->close;
}
else {
print "Creating data directory $dir ...\n"
if $self->{VERBOSE};
croak "CWB::Encoder: Can't create data directory $dir\n"
unless mkdir $dir;
my $perm = $self->{PERM};
if ($perm) {
$perm =~ tr[642][753]; # derive directory permissions
CWB::Shell::Cmd("chmod $perm '$dir'");
$perm = "(chmod $perm)";
}
else {
$perm = "";
}
my $group = $self->{GROUP};
if ($group) {
( run in 1.161 second using v1.01-cache-2.11-cpan-e1769b4cff6 )