Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/Encoder.pm view on Meta::CPAN
directory has to be specified explicitly, separated from the corpus name
by a C<:> character. I<$registry_path> may contain multiple directories
separated by C<:> characters.
=cut
# Constructor for CWB::Indexer.
#
# Takes exactly one argument, the corpus ID, optionally prefixed by one or
# more registry directories ("<registry_path>:<corpus>").  Runs the program
# named by $CWB::DescribeCorpus with the -d flag through CWB::Shell::Cmd and
# parses the collected output lines to fill in
#   $self->{FILES}->{$attribute}->{$component} = $pathname
#   $self->{TYPES}->{$attribute}               = $type_letter
#
# Croaks on a wrong argument count and when a "Path/Value:" line is seen
# without a preceding component or attribute name.  A "Type:" line without a
# current attribute name only produces a warning and is skipped.
#
# Returns the new blessed object.
sub new {
    my $class = shift;
    my $self = {
        NAME     => undef,  # name of the corpus (CWB corpus ID)
        REGISTRY => "",     # -r flag for non-default registry
        FILES    => {},     # lookup hash for component filenames
                            #   $self->{FILES}->{$att}->{$comp} = $pathname;
        TYPES    => {},     # attribute types: P / S
        GROUP    => undef,  # optional: set group for new files
        PERM     => undef,  # optional: set permissions for new files
        MEMORY   => 75,     # memory limit for index creation
        VALIDATE => 1,      # enable/disable validation
        DEBUG    => 0,      # enable/disable debugging output
    };
    croak 'USAGE: $c = new CWB::Indexer $corpus_id;'
        unless @_ == 1;
    my $name = shift;

    # "<registry_path>:<corpus>" -- everything up to the LAST colon is the
    # registry path (which may itself contain colons), the rest is the corpus
    if ($name =~ /^\s*(.+)\s*:\s*([^:]+)$/) {
        $self->{REGISTRY} = "-r '$1'";
        $name = $2;
    }
    $self->{NAME} = $name;

    # use cwb-describe-corpus to find out component pathnames
    my @lines = ();
    my $registry = $self->{REGISTRY};
    my $cmd = "'$CWB::DescribeCorpus' $registry -d $name";
    CWB::Shell::Cmd($cmd, \@lines);   # expected to fill @lines with the command output

    my $comp = "";   # component name seen most recently
    my $attr = "";   # attribute name seen most recently
    foreach my $line (@lines) {
        if ($line =~ /Component\s+([A-Z]+):/) {
            $comp = $1;
        }
        elsif ($line =~ /Attribute:\s+(\S+)/ or $line =~ /Attribute\s+(\S+):/) {
            $attr = $1;
        }
        elsif ($line =~ /Path\/Value:\s+(\S(.*\S)?)/) {
            my $path = $1;
            croak "CWB::Indexer: Can't find component name for file $path (aborted).\n"
                unless $comp;
            croak "CWB::Indexer: Can't find attribute name for file $path (aborted).\n"
                unless $attr;
            $self->{FILES}->{$attr}->{$comp} = $path;
            $comp = $attr = "";   # reset to check for syntax errors
        }
        elsif ($line =~ /Type:\s+([A-Z])/) {
            if ($attr) {
                $self->{TYPES}->{$attr} = $1;
            }
            else {
                # really skip the line: do not record a type under the
                # empty attribute name, which makeall() would try to index
                carp "CWB::Indexer: Missing attribute name in output of cwb-describe-corpus $name (skipped).\n";
            }
        }
        # all other lines are ignored
    }
    return bless($self, $class);
}
=item $idx->group($group);
=item $idx->perm($permission);
Optional group membership and access permissions for newly created
files (otherwise, neither B<chgrp> nor B<chmod> will be called). Note
that I<$permission> must be a string rather than an octal number (as
for the built-in B<chmod> function). Indexing will fail if the
specified group and/or permissions cannot be set.
=cut
# Store the group name to apply to newly created files in the GROUP slot.
# Returns the stored value.
sub group {
    my $self  = shift;
    my $group = shift;
    return $self->{GROUP} = $group;
}
# Store the access permissions (a string) to apply to newly created files
# in the PERM slot.  Returns the stored value.
sub perm {
    my $self = shift;
    my $perm = shift;
    return $self->{PERM} = $perm;
}
=item $idx->memory($mbytes);
Set approximate memory limit for B<cwb-makeall> command, in MBytes.
The memory limit defaults to 75 MB, which is a reasonable value for
systems with at least 128 MB of RAM.
=cut
# Set the memory limit stored in the MEMORY slot.
# Croaks unless the value is written as a positive decimal integer without
# leading zeroes.  Returns the stored value.
sub memory {
    my $self = shift;
    my ($mem) = @_;
    if ($mem !~ /^[1-9][0-9]*$/) {
        croak "CWB::Indexer: memory limit ($mem) must be positive integer number (aborted).\n";
    }
    return $self->{MEMORY} = $mem;
}
=item $idx->validate(0);
Turn off validation of index and compressed files, which may give
substantial speed improvements for larger corpora.
=cut
# Switch validation on or off by storing the flag in the VALIDATE slot.
# Returns the stored value.
sub validate {
    my $self = shift;
    my $flag = shift;
    return $self->{VALIDATE} = $flag;
}
=item $idx->debug(1);
Activate debugging output (on STDERR).
=cut
sub debug {
my ($self, $yesno) = @_;
$self->{DEBUG} = $yesno;
lib/CWB/Encoder.pm view on Meta::CPAN
=cut
# Call make() once for every attribute whose recorded type is "P";
# attributes of any other type are left alone.
sub makeall {
    my ($self) = @_;
    for my $attribute (keys %{$self->{TYPES}}) {
        next unless $self->{TYPES}->{$attribute} eq "P";
        $self->make($attribute);
    }
}
=back
=cut
## ======================================================================
## automatic encoding, indexing, and compression of corpora
## ======================================================================
package CWB::Encoder;
use CWB;
use Carp;
use DirHandle;
=head1 CWB::Encoder METHODS
=over 4
=item $enc = new CWB::Encoder $corpus;
Create a new B<CWB::Encoder> object for the specified corpus. Note
that the registry directory cannot be passed directly to the
constructor (use the B<registry> method instead).
=cut
# Constructor for CWB::Encoder.
# Creates the object with its default settings and, if an argument is
# given, passes the first one to the name() method.  Returns the object.
sub new {
    my ($class, @args) = @_;

    # create and initialise object
    my %settings = (
        # corpus identification and registry properties
        NAME     => undef,      # name of corpus (CWB corpus ID)
        LONGNAME => "",         # long descriptive name
        INFO     => "Indexed with CWB::Encoder.",  # contents of .info file
        CHARSET  => "latin1",   # character set (corpus property)
        LANG     => "??",       # language (corpus property)
        # locations
        REGISTRY => undef,      # registry directory (will be automatically chosen if possible)
        DIR      => undef,      # data directory
        # attribute declarations
        PATT     => [],         # positional attributes
        SATT     => [],         # structural attributes (cwb-encode syntax for recursion and XML atts)
        NATT     => [],         # null attributes (tags are ignored)
        # file handling
        GROUP     => undef,     # optional: group and access
        PERM      => undef,     # permissions for created files
        OVERWRITE => undef,     # can I overwrite existing files?
        # indexing options
        MEMORY   => 75,         # passed to CWB::Indexer
        VALIDATE => 1,          # passed to CWB::Indexer
        # encoding options
        ENTITIES     => 1,      # whether to decode XML entities (and skip comments etc.)
        UNDEF_SYMBOL => "",     # string to insert for missing values of p-attributes
        # diagnostics
        VERBOSE  => 0,          # print some progress information (stdout)
        DEBUG    => 0,
        # runtime state
        PIPE     => undef,      # pipe to cwb-encode (for encode_pipe() method)
    );
    my $self = bless({ %settings }, $class);

    if (@args) {
        $self->name($args[0]);
    }
    return $self;
}
=item $enc->name($corpus);
Change the CWB name of a corpus after the encoder object I<$enc> has been created.
Has to be used if the constructor was called without arguments.
=cut
# Store the corpus name, converted to lower case, in the NAME slot.
# Returns the stored value.
sub name {
    my $self   = shift;
    my $corpus = shift;
    my $lowercased = lc($corpus);
    return $self->{NAME} = $lowercased;
}
=item $enc->longname($descriptive_name);
Optional long, descriptive name for a corpus (single line).
=cut
# Store the long descriptive name in the LONGNAME slot.
# Double quotes and backslashes are deleted from the name first; if any
# were found, a warning (showing the cleaned name) is issued.
# Returns the stored value.
sub longname {
    my $self = shift;
    my ($longname) = @_;
    my $deleted = ($longname =~ tr/\"\\//d);   # number of characters removed
    if ($deleted) {
        carp "CWB::Encoder: long name ($longname) must not contain \" and \\ characters (removed).\n";
    }
    return $self->{LONGNAME} = $longname;
}
=item $enc->info($multiline_text);
Multi-line text that will be written to the C<.info> file of the
corpus.
=cut
# Store the text for the corpus .info file in the INFO slot.
# Returns the stored value.
sub info {
    my $self = shift;
    my $text = shift;
    return $self->{INFO} = $text;
}
=item $enc->charset($code);
Set corpus character set (as a corpus property in the registry entry).
So far, only C<latin1> is fully supported. Other valid character sets are
C<latin2>, ..., C<latin9>, and C<utf8> (which will be supported by future
releases of the CWB). Any other I<$code> will raise a warning.
=cut
# Store the corpus character set in the CHARSET slot.
# Codes other than latin1 .. latin9 and utf8 trigger a warning but are
# stored all the same.  Returns the stored value.
sub charset {
    my $self = shift;
    my ($charset) = @_;
    if ($charset !~ /^(latin[1-9]|utf8)$/) {
        carp "CWB::Encoder: character set $charset not supported by CWB (latin1, ..., latin9, utf8).\n";
    }
    return $self->{CHARSET} = $charset;
}
=item $enc->language($code);
( run in 0.709 second using v1.01-cache-2.11-cpan-39bf76dae61 )