Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/Encoder.pm view on Meta::CPAN
$bnc->memory(400); # use up to 400 MB of RAM (default: 75)
$bnc->validate(0); # disable validation for faster indexing
$bnc->debug(1); # enable debugging output
$bnc->make("word", "pos"); # build index & compress
$bnc->makeall; # process all p-attributes
$bnc = new CWB::Encoder "BNC";
$bnc->registry("/path/to/registry"); # will try to guess otherwise
$bnc->dir("/path/to/data/directory"); # directory for corpus data files
$bnc->overwrite(1); # may overwrite existing files / directories
$bnc->longname("British National Corpus"); # optional
$bnc->info("Line1.\nLine2.\n..."); # optional multi-line info text
$bnc->charset("latin1"); # defaults to latin1
$bnc->language("en"); # defaults to ??
$bnc->group("corpora"); # optional: group and access permissions
$bnc->perm("640"); # for newly created files & directories
$bnc->p_attributes("word"); # declare postional atts (no default!)
$bnc->p_attributes(qw<pos lemma>); # may be called repeatedly
$bnc->null_attributes("teiHeader"); # declare null atts (ignored)
$bnc->s_attributes("s"); # s-attributes in cwb-encode syntax
$bnc->s_attributes(qw<div0* div1*>);# * = store annotations (-V)
$bnc->s_attributes("bncDoc:0+id"); # recursion & XML attributes
$bnc->decode_entities(0); # don't decode XML entities (with -x flag)
$bnc->undef_symbol("__UNDEF__"); # mark missing values like cwb-encode
$bnc->memory(400); # use up to 400 MB of RAM (default: 75)
$bnc->validate(0); # disable validation for faster indexing
$bnc->verbose(1); # print some progress information
$bnc->debug(1); # enable debugging output
$bnc->encode(@files); # encoding, indexing, and compression
$pipe = $bnc->encode_pipe; # can also feed input text from Perl script
while (...) {
print $pipe "$line\n";
}
$bnc->close_pipe;
=head1 DESCRIPTION
This package contains modules for the automatic encoding and indexing
of CWB corpora.
B<CWB::Indexer> builds indices for some or all positional attributes
of an existing corpus (using the B<cwb-makeall> tool). In addition,
these attributes are automatically compressed (using the
B<cwb-huffcode> and B<cwb-compress-rdx> tools). Compression and
indexing is interleaved to minimise the required amount of temporary
disk space, and a B<make>-like system ensures that old index files are
automatically updated.
B<CWB::Encoder> automates all steps necessary to encode a CWB corpus
(which includes cleaning up old files, running B<cwb-encode>, editing
the registry entry, indexing & compressing positional attributes, and
setting access permissions). Both modules can be set up with a few
simple method calls. Full descriptions are given separately in the
following sections.
=cut
## ======================================================================
## automatic creation, compression and updating of CWB index files (for p-attributes)
## ======================================================================
package CWB::Indexer;
use CWB;
use Carp;
# makefile-like rules for creating / updating components
# TRIGGER .. update component when one of these comps exists & is newer
# NEEDED .. componentes required by command below
# CREATES .. these files will be created by COMMAND
# COMMAND .. shell command to create this component
# interpolates '#C' (corpus id), '#A' (attribute name), '#R' (registry flag),
# '#M' (memory limit), '#T' (no validate), '#V' (validate)
# (issues "can't create" error message if COMMAND starts with "ERROR")
# DELETE .. delete these components when target exist or has been created
our %RULES =
(
DIR => {
TRIGGER => [],
NEEDED => [],
CREATES => [],
COMMAND => "ERROR: Corpus data directory must be created manually.",
DELETE => [],
},
CORPUS => {
TRIGGER => [],
NEEDED => [],
CREATES => [],
COMMAND => "ERROR: You must run the cwb-encode tool first.",
DELETE => [],
},
LEXICON => {
TRIGGER => [],
NEEDED => [],
CREATES => [],
COMMAND => "ERROR: You must run the cwb-encode tool first.",
DELETE => [],
},
LEXIDX => {
TRIGGER => [],
NEEDED => [],
CREATES => [],
COMMAND => "ERROR: You must run the cwb-encode tool first.",
DELETE => [],
},
FREQS => {
TRIGGER => [qw<CORPUS LEXICON LEXIDX>],
NEEDED => [qw<CORPUS LEXICON LEXIDX>],
CREATES => [qw<FREQS>],
COMMAND => "$CWB::Makeall #R -c FREQS -P #A #C",
lib/CWB/Encoder.pm view on Meta::CPAN
unless $pipe;
croak "CWB::Encoder: Error in cwb-encode pipe ($!).\n"
unless $pipe->close;
print STDERR "CWB::Encoder: pipe to cwb-encode program closed\n"
if $self->{DEBUG};
$self->post_encode;
print "Encoding complete.\n"
if $self->{VERBOSE};
}
# internal method: called _after_ running cwb-encode
sub post_encode {
my $self = shift;
my $perm = $self->{PERM};
my $group = $self->{GROUP};
my $dir = $self->{DIR};
print "Setting access permissions ...\n" # set access permissions for created files
if $self->{VERBOSE};
foreach my $att (@{$self->{PATT}}) { # positional attributes
my $pattern = "'$dir'/$att.*";
print STDERR "CWB::Encoder: processing group $pattern\n"
if $self->{DEBUG} and ($perm or $group);
CWB::Shell::Cmd("chmod $perm $pattern")
if $perm;
CWB::Shell::Cmd("chgrp $group $pattern")
if $group;
}
foreach my $attspec (@{$self->{SATT}}) { # structural attributes
my $temp = $attspec; # don't modify original list
my $rec = ($temp =~ s/:([0-9]+)//) ? $1 : 0; # recursion depth
my ($att, @xmlatts) = split /\+/, $temp; # attribute name and XML tag attributes
foreach my $n ("", 1 .. $rec) { # indices of embedded regions
foreach my $ext ("", map {"_$_"} @xmlatts) { # extensions for XML tag attributes
my $pattern = "'$dir'/$att$ext$n.*";
print STDERR "CWB::Encoder: processing group $pattern\n"
if $self->{DEBUG} and ($perm or $group);
CWB::Shell::Cmd("chmod $perm $pattern")
if $perm;
CWB::Shell::Cmd("chgrp $group $pattern")
if $group;
}
}
}
print "Writing .info file ...\n" # write .info file
if $self->{VERBOSE};
my $infofile = "$dir/.info";
my $fh = CWB::OpenFile "> $infofile";
print $fh $self->{INFO}, "\n";
$fh->close;
CWB::Shell::Cmd("chmod $perm '$infofile'")
if $perm;
CWB::Shell::Cmd("chgrp $group '$infofile'")
if $group;
print "Editing registry entry ...\n" # edit registry file
if $self->{VERBOSE};
my $reg = $self->{REGISTRY};
my $name = $self->{NAME};
my $regfile = "$reg/$name";
my $rf = new CWB::RegistryFile $regfile;
croak "CWB::Encoder: Syntax error in registry entry $regfile\n"
unless defined $rf;
$rf->name($self->{LONGNAME});
# $rf->property("charset", $self->{CHARSET}); # -- already set by cwb-encode (since v2.2.101)
$rf->property("language", $self->{LANG});
$rf->write($regfile);
print STDERR "CWB::Encoder: registry entry $regfile has been edited\n"
if $self->{DEBUG};
print STDERR "CWB::Encoder: setting access permissions for $regfile\n"
if $self->{DEBUG} and ($perm or $group);
CWB::Shell::Cmd("chmod $perm '$regfile'")
if $perm;
CWB::Shell::Cmd("chgrp $group '$regfile'")
if $group;
my $idx = new CWB::Indexer "$reg:".(uc $name); # build indices and compress p-attributes
$idx->group($group)
if $group;
$idx->perm($perm)
if $perm;
$idx->memory($self->{MEMORY});
$idx->validate($self->{VALIDATE});
$idx->debug($self->{DEBUG});
print "Building indices and compressing p-attributes ...\n"
if $self->{VERBOSE};
$idx->makeall;
}
=back
=cut
## ======================================================================
1;
__END__
=head1 COPYRIGHT
Copyright (C) 2002-2010 Stefan Evert [http::/purl.org/stefan.evert]
This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.
=cut
( run in 0.857 second using v1.01-cache-2.11-cpan-99c4e6809bf )