Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/Encoder.pm view on Meta::CPAN
my $self = {
NAME => undef, # name of the corpus (CWB corpus ID)
REGISTRY => "", # -r flag for non-default registry
FILES => {}, # lookup hash for component filenames
# $self->{FILES}->{$att}->{$comp} = $pathname;
TYPES => {}, # attribute types: P / S
GROUP => undef, # optional: set group for new files
PERM => undef, # optional: set permissions for new files
MEMORY => 75, # memory limit for index creation
VALIDATE => 1, # enable/disable validation
DEBUG => 0, # enable/disable debugging output
};
croak 'USAGE: $c = new CWB::Indexer $corpus_id;'
unless @_ == 1;
my $name = shift;
if ($name =~ /^\s*(.+)\s*:\s*([^:]+)$/) {
$self->{REGISTRY} = "-r '$1'";
$name = $2;
}
$self->{NAME} = $name;
# use cwb-describe-corpus to find out component pathnames
my @lines = ();
my $registry = $self->{REGISTRY};
my $cmd = "'$CWB::DescribeCorpus' $registry -d $name";
CWB::Shell::Cmd($cmd, \@lines);
my $comp = ""; # component name
my $attr = ""; # attribute name
foreach (@lines) {
if (/Component\s+([A-Z]+):/) {
$comp = $1;
}
elsif (/Attribute:\s+(\S+)/ or /Attribute\s+(\S+):/) {
$attr = $1;
}
elsif (/Path\/Value:\s+(\S(.*\S)?)/) {
croak "CWB::Indexer: Can't find component name for file $1 (aborted).\n"
unless $comp;
croak "CWB::Indexer: Can't find attribute name for file $1 (aborted).\n"
unless $attr;
$self->{FILES}->{$attr}->{$comp} = $1;
$comp = $attr = ""; # reset to check for syntax errors
}
elsif (/Type:\s+([A-Z])/) {
carp "CWB::Indexer: Missing attribute name in output of cwb-describe-corpus $name (skipped).\n"
unless $attr;
$self->{TYPES}->{$attr} = $1;
}
# all other lines are ignored
}
return bless($self, $class);
}
=item $idx->group($group);
=item $idx->perm($permission);
Optional group membership and access permissions for newly created
files (otherwise, neither B<chgrp> nor B<chmod> will be called). Note
that I<$permission> must be a string rather than an octal number (as
for the built-in B<chmod> function). Indexing will fail if the
specified group and/or permissions cannot be set.
=cut
sub group {
my ($self, $group) = @_;
$self->{GROUP} = $group;
}
sub perm {
my ($self, $perm) = @_;
$self->{PERM} = $perm;
}
=item $idx->memory($mbytes);
Set approximate memory limit for B<cwb-makeall> command, in MBytes.
The memory limit defaults to 75 MB, which is a reasonable value for
systems with at least 128 MB of RAM.
=cut
sub memory {
my ($self, $mem) = @_;
croak "CWB::Indexer: memory limit ($mem) must be positive integer number (aborted).\n"
unless $mem =~ /^[1-9][0-9]*$/;
$self->{MEMORY} = $mem;
}
=item $idx->validate(0);
Turn off validation of index and compressed files, which may give
substantial speed improvements for larger corpora.
=cut
sub validate {
my ($self, $yesno) = @_;
$self->{VALIDATE} = $yesno;
}
=item $idx->debug(1);
Activate debugging output (on STDERR).
=cut
sub debug {
my ($self, $yesno) = @_;
$self->{DEBUG} = $yesno;
}
# internal method: get full pathname of a component file
sub filename {
my ($self, $att, $comp) = @_;
my $path = $self->{FILES}->{$att}->{$comp};
croak "CWB::Indexer: can't determine filename for component $att/$comp (aborted).\n"
unless defined $path;
return $path;
}
lib/CWB/Encoder.pm view on Meta::CPAN
else {
my $age = -M $file;
foreach my $t (@$trigger) { # check for triggers that are newer than target
my $t_file = $self->filename($att, $t);
if (-f $t_file) {
my $t_age = -M $t_file;
if ($t_age < $age) {
$update = 1; # trigger is newer -> update
print STDERR "CWB::Indexer: component $att/$t is newer than $att/$comp -> update\n"
if $self->{DEBUG};
}
}
}
}
if ($update) { # (re-)create component if necessary
print STDERR
"CWB::Indexer: make_comp($att, $comp)\n",
"CWB::Indexer: creating component file $file\n"
if $self->{DEBUG};
foreach my $c (@$creates) { # delete old target files (first, to make room for intermediate files)
my $f = $self->filename($att, $c);
if (-f $f) {
unlink $f;
croak "CWB::Indexer: Can't delete file $f (aborted).\n"
if -e $f;
print STDERR "CWB::Indexer: deleting file $f\n"
if $self->{DEBUG};
}
}
foreach my $c (@$needed) { # recursively create/update prerequisites
$self->make_comp($att, $c);
}
if ($command =~ s/^\s*ERROR\s*(:\s*)?//) {
croak
"CWB::Indexer: Can't create component $att/$comp ($file)\n",
" $command\n";
}
$command =~ s/\#C/$self->{NAME}/g; # substitute variables in $command
$command =~ s/\#A/$att/g;
$command =~ s/\#R/$self->{REGISTRY}/g;
$command =~ s/\#M/-M $self->{MEMORY}/g;
$command =~ s/\#T/($self->{VALIDATE}) ? "" : "-T"/eg;
$command =~ s/\#V/($self->{VALIDATE}) ? "-V" : ""/eg;
print STDERR "CWB::Indexer: exec: $command\n"
if $self->{DEBUG};
CWB::Shell::Cmd $command; # execute creation command
my $perm = $self->{PERM}; # check that target file(s) exist and set permissions
my $group = $self->{GROUP};
foreach my $c (@$creates) {
my $f = $self->filename($att, $c);
croak "CWB::Indexer: Creation of component $att/$c ($f) failed (aborted).\n"
unless -s $f;
if ($perm) {
my $cmd = "chmod $perm '$f'";
print STDERR "CWB::Indexer: exec: $cmd\n"
if $self->{DEBUG};
CWB::Shell::Cmd $cmd;
}
if ($group) {
my $cmd = "chgrp $group '$f'";
print STDERR "CWB::Indexer: exec: $cmd\n"
if $self->{DEBUG};
CWB::Shell::Cmd $cmd;
}
}
print STDERR "CWB::Indexer: component $att/$comp has been created successfully\n"
if $self->{DEBUG};
}
# always run the cleanup so that unneccessary files are automatically deleted
foreach my $c (@$delete) { # delete intermediate components that are no longer needed
my $f = $self->filename($att, $c);
if (-f $f) {
print STDERR "CWB::Indexer: deleting file $f\n"
if $self->{DEBUG};
unlink $f;
croak "CWB::Indexer: Can't delete intermediate file $f (aborted).\n"
if -f $f;
}
}
}
=item $idx->make($att1, $att2, ...);
Process one or more positional attributes. An index is built for each
attribute and the data files are compressed. Missing files are
re-created (if possible) and old files are updated automatically.
=cut
sub make {
my $self = shift;
my $corpus = $self->{NAME};
foreach my $att (@_) {
my $type = $self->{TYPES}->{$att};
croak "CWB::Indexer: $corpus.$att is not a positional attribute (aborted).\n"
unless $type and $type eq "P";
print STDERR "CWB::Indexer: make($corpus.$att)\n"
if $self->{DEBUG};
foreach my $comp (@NEEDED) {
$self->make_comp($att, $comp);
}
print STDERR "CWB::Indexer: attribute $corpus.$att was indexed successfully\n"
if $self->{DEBUG};
}
}
=item $idx->makeall;
Process all positional attributes of the corpus.
=cut
lib/CWB/Encoder.pm view on Meta::CPAN
=cut
sub dir {
my ($self, $dir) = @_;
$self->{DIR} = $dir;
}
=item $enc->p_attributes($att1, $att2, ...);
Declare one or more B<positional attributes>. This method can be
called repeatedly with additional attributes. Note that I<all>
positional attributes, including C<word>, have to be declared
explicitly.
=cut
sub p_attributes {
my $self = shift;
push @{$self->{PATT}}, @_;
}
=item $enc->s_attributes($att1, $att2, ...);
Declare one or more B<structural attributes>. I<$att1> etc. are either
simple attribute names or complex declarations using the syntax of the
C<-S> and C<-V> flags in B<cwb-encode>. See the I<CWB Corpus Encoding
Tutorial> for details on the attribute declaration syntax for nesting
depth and XML tag attributes. By default, structural attributes are
encoded without annotation strings (C<-S> flag). In order to store
annotations (C<-V> flag), append an asterisk (C<*>) to the attribute
name or declaration. The I<CWB Corpus Encoding Tutorial> explains when
to use C<-S> and when to use C<-V>. The B<s_attributes> method can
be called repeatedly to add further attributes.
=cut
sub s_attributes {
my $self = shift;
push @{$self->{SATT}}, @_;
}
=item $enc->null_attributes($att1, $att2, ...);
Declare one or more B<null attributes>. XML start and end tags
with these names will be ignored (and not inserted as C<word>
tokens). This method can be called repeatedly.
=cut
sub null_attributes {
my $self = shift;
push @{$self->{NATT}}, @_;
}
=item $enc->group($group);
=item $enc->perm($permission);
Optional group membership and access permissions for newly created
files (otherwise, neither B<chgrp> nor B<chmod> will be called). Note
that I<$permission> must be a string rather than an octal number (as
for the built-in B<chmod> function). Encoding will fail if the
specified group and/or permissions cannot be set. If the data
directory has to be created, its access permissions and group
membership are set accordingly.
=cut
sub group {
my ($self, $group) = @_;
$self->{GROUP} = $group;
}
sub perm {
my ($self, $perm) = @_;
$self->{PERM} = $perm;
}
=item $enc->overwrite(1);
Allow B<CWB::Encoder> to overwrite existing files. This is required
when either the registry entry or the data directory exists already.
When overwriting is enabled, the registry entry and all files in the
data directory are deleted before encoding starts.
=cut
sub overwrite {
my ($self, $yesno) = @_;
$self->{OVERWRITE} = $yesno;
}
=item $enc->memory($mbytes);
Set approximate memory limit for B<cwb-makeall> command, in MBytes.
The memory limit defaults to 75 MB, which is a reasonable value for
systems with at least 128 MB of RAM. The memory setting is only used
when building indices for positional attributes, not during the
initial encoding process.
=cut
sub memory {
my ($self, $mem) = @_;
croak "CWB::Indexer: memory limit ($mem) must be positive integer number (aborted).\n"
unless $mem =~ /^[1-9][0-9]*$/;
$self->{MEMORY} = $mem;
}
=item $enc->validate(0);
Turn off validation of index and compressed files, which may give
substantial speed improvements for larger corpora.
=cut
sub validate {
my ($self, $yesno) = @_;
$self->{VALIDATE} = $yesno;
}
=item $enc->decode_entities(0);
lib/CWB/Encoder.pm view on Meta::CPAN
croak "CWB::Encoder: Corpus ID hasn't been specified (with name() method)\n"
unless $name;
croak "CWB::Encoder: No positional attributes specified.\n"
unless @{$self->{PATT}} > 0;
my $reg = $self->{REGISTRY};
if (not defined $reg) {
$reg = CWB::RegistryDirectory(); # try to guess registry if not specified
$self->{REGISTRY} = $reg;
}
croak "CWB::Encoder: Can't determine unique registry directory (path is $reg).\n"
if $reg =~ /:/;
croak "CWB::Encoder: Registry directory $reg does not exist.\n"
unless -d $reg;
print STDERR "CWB::Encoder: registry directory is $reg\n"
if $self->{DEBUG};
my $regfile = "$reg/$name"; # remove registry entry if it exists
if (-f $regfile) {
croak "CWB::Encoder: Registry file already exists (overwriting not enabled).\n"
unless $overwrite;
print "Removing registry file $reg/$name ...\n"
if $self->{VERBOSE};
unlink "$reg/$name";
croak "CWB::Encoder: Can't delete registry file $reg/$name\n"
if -f "$reg/$name";
print STDERR "CWB::Encoder: deleting file $reg/$name\n"
if $self->{DEBUG};
}
my $dir = $self->{DIR}; # check/create data directory
croak "CWB::Encoder: Data directory has not been set.\n"
unless $dir;
if (-d $dir) {
croak "CWB::Encoder: Data directory already exists (overwriting not enabled).\n"
unless $overwrite;
print "Cleaning up data directory $dir ...\n"
if $self->{VERBOSE};
my $dh = new DirHandle $dir;
my @files = grep {-f $_} (glob("$dir/*"), glob("$dir/.*"));
my ($file, $filename);
while (defined($filename = $dh->read)) {
$file = "$dir/$filename";
next unless -f $file; # skip subdirectories etc.
unlink $file;
carp "CWB::Encoder: Can't delete file $file (trying to continue).\n"
if -f $file;
print STDERR "CWB::Encoder: deleting file $file\n"
if $self->{DEBUG};
}
$dh->close;
}
else {
print "Creating data directory $dir ...\n"
if $self->{VERBOSE};
croak "CWB::Encoder: Can't create data directory $dir\n"
unless mkdir $dir;
my $perm = $self->{PERM};
if ($perm) {
$perm =~ tr[642][753]; # derive directory permissions
CWB::Shell::Cmd("chmod $perm '$dir'");
$perm = "(chmod $perm)";
}
else {
$perm = "";
}
my $group = $self->{GROUP};
if ($group) {
CWB::Shell::Cmd("chgrp $group '$dir'");
$group = "(chgrp $group)";
}
else {
$group = "";
}
print STDERR "CWB::Encoder: created directory $dir $perm $group\n"
if $self->{DEBUG};
}
}
# internal method: used to construct a cwb-encode command line for the specified input files
sub make_encode_cmd {
my $self = shift;
my @files = @_;
my %attr = (); # check for duplicate attributes
my $cmd = "'$CWB::Encode' -s"; # build encode command (-xsB flags are always recommended!)
$cmd .= " -B" unless $self->{UNDEF_SYMBOL} =~ /^\s*$/; # assume that whitespace-only strings are allowed unless undef symbol is set
$cmd .= " -x" if $self->{ENTITIES}; # -x only if we're allowed to decode entities
$cmd .= " -U '".$self->{UNDEF_SYMBOL}."'";
$cmd .= " -R '".$self->{REGISTRY}."/".$self->{NAME}."'"; # has been set and checked by prepare_encode()
$cmd .= " -d '".$self->{DIR}."'";
$cmd .= " -c ".$self->{CHARSET}; # from version 2.2.101, cwb-encode can set character set in registry file
foreach my $file (@files) { # check that all input files exist
croak "CWB::Encoder: Input file $file does not exist.\n"
unless -f $file;
$cmd .= " -f '$file'";
}
$cmd .= " -p -"; # declare all p-attributes explicitly
foreach my $att (@{$self->{PATT}}) { # declare p-attributes
croak "CWB::Encoder: Attribute $att declared twice!\n"
if exists $attr{$att};
$cmd .= " -P $att";
$attr{$att} = 1;
}
foreach my $att (@{$self->{NATT}}) { # declare null attributes
croak "CWB::Encoder: Attribute $att declared twice!\n"
if exists $attr{$att};
$cmd .= " -0 $att";
$attr{$att} = 1;
}
foreach my $attspec (@{$self->{SATT}}) { # declare s-attributes
my $flag = ($attspec =~ s/\*$//) ? "-V" : "-S"; # '*' indicates -V (rather than -S)
croak "CWB::Encoder: Invalid s-attribute specification '$attspec'\n"
unless $attspec =~ /^\S+$/ and $attspec =~ /^[^:+]+(:[0-9]+)?(\+[^:+]+)*$/;
my ($att) = split /:\+/, $attspec; # split attribute specification to get attribute name
croak "CWB::Encoder: Attribute $att declared twice!\n"
if exists $attr{$att};
$cmd .= " $flag $att";
$attr{$att} = 1;
}
lib/CWB/Encoder.pm view on Meta::CPAN
=cut
sub encode_pipe {
my $self = shift;
$self->prepare_encode;
my $cmd = $self->make_encode_cmd(@_);
print "Encoding corpus ".(uc $self->{NAME})." ...\n"
if $self->{VERBOSE};
print STDERR "CWB::Encoder: ... | $cmd\n"
if $self->{DEBUG};
my $pipe = CWB::OpenFile "| $cmd";
$self->{PIPE} = $pipe;
return $pipe;
}
=item $enc->close_pipe;
After opening an encode pipe with the B<encode_pipe> method and
B<print>ing the input text to this pipe, the B<close_pipe> method has
to be called to B<close> the pipe and trigger the post-encoding steps
(indexing, compression, and access permissions). When the
B<close_pipe> method returns, the corpus has been encoded
successfully.
=cut
sub close_pipe {
my $self = shift;
my $pipe = $self->{PIPE};
croak "CWB::Encoder: close_pipe() method only allowed after encode_pipe().\n"
unless $pipe;
croak "CWB::Encoder: Error in cwb-encode pipe ($!).\n"
unless $pipe->close;
print STDERR "CWB::Encoder: pipe to cwb-encode program closed\n"
if $self->{DEBUG};
$self->post_encode;
print "Encoding complete.\n"
if $self->{VERBOSE};
}
# internal method: called _after_ running cwb-encode
sub post_encode {
my $self = shift;
my $perm = $self->{PERM};
my $group = $self->{GROUP};
my $dir = $self->{DIR};
print "Setting access permissions ...\n" # set access permissions for created files
if $self->{VERBOSE};
foreach my $att (@{$self->{PATT}}) { # positional attributes
my $pattern = "'$dir'/$att.*";
print STDERR "CWB::Encoder: processing group $pattern\n"
if $self->{DEBUG} and ($perm or $group);
CWB::Shell::Cmd("chmod $perm $pattern")
if $perm;
CWB::Shell::Cmd("chgrp $group $pattern")
if $group;
}
foreach my $attspec (@{$self->{SATT}}) { # structural attributes
my $temp = $attspec; # don't modify original list
my $rec = ($temp =~ s/:([0-9]+)//) ? $1 : 0; # recursion depth
my ($att, @xmlatts) = split /\+/, $temp; # attribute name and XML tag attributes
foreach my $n ("", 1 .. $rec) { # indices of embedded regions
foreach my $ext ("", map {"_$_"} @xmlatts) { # extensions for XML tag attributes
my $pattern = "'$dir'/$att$ext$n.*";
print STDERR "CWB::Encoder: processing group $pattern\n"
if $self->{DEBUG} and ($perm or $group);
CWB::Shell::Cmd("chmod $perm $pattern")
if $perm;
CWB::Shell::Cmd("chgrp $group $pattern")
if $group;
}
}
}
print "Writing .info file ...\n" # write .info file
if $self->{VERBOSE};
my $infofile = "$dir/.info";
my $fh = CWB::OpenFile "> $infofile";
print $fh $self->{INFO}, "\n";
$fh->close;
CWB::Shell::Cmd("chmod $perm '$infofile'")
if $perm;
CWB::Shell::Cmd("chgrp $group '$infofile'")
if $group;
print "Editing registry entry ...\n" # edit registry file
if $self->{VERBOSE};
my $reg = $self->{REGISTRY};
my $name = $self->{NAME};
my $regfile = "$reg/$name";
my $rf = new CWB::RegistryFile $regfile;
croak "CWB::Encoder: Syntax error in registry entry $regfile\n"
unless defined $rf;
$rf->name($self->{LONGNAME});
# $rf->property("charset", $self->{CHARSET}); # -- already set by cwb-encode (since v2.2.101)
$rf->property("language", $self->{LANG});
$rf->write($regfile);
print STDERR "CWB::Encoder: registry entry $regfile has been edited\n"
if $self->{DEBUG};
print STDERR "CWB::Encoder: setting access permissions for $regfile\n"
if $self->{DEBUG} and ($perm or $group);
CWB::Shell::Cmd("chmod $perm '$regfile'")
if $perm;
CWB::Shell::Cmd("chgrp $group '$regfile'")
if $group;
my $idx = new CWB::Indexer "$reg:".(uc $name); # build indices and compress p-attributes
$idx->group($group)
if $group;
$idx->perm($perm)
if $perm;
$idx->memory($self->{MEMORY});
$idx->validate($self->{VALIDATE});
$idx->debug($self->{DEBUG});
print "Building indices and compressing p-attributes ...\n"
if $self->{VERBOSE};
$idx->makeall;
}
=back
=cut
## ======================================================================
1;
__END__
=head1 COPYRIGHT
Copyright (C) 2002-2010 Stefan Evert [http::/purl.org/stefan.evert]
This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.
=cut
( run in 1.002 second using v1.01-cache-2.11-cpan-39bf76dae61 )