Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/Encoder.pm view on Meta::CPAN
SATT => [], # structural attributes (cwb-encode syntax for recursion and XML atts)
NATT => [], # null attributes (tags are ignored)
GROUP => undef, # optional: group and access
PERM => undef, # permissions for created files
OVERWRITE => undef, # can I overwrite existing files?
MEMORY => 75, # passed to CWB::Indexer
VALIDATE => 1, # passed to CWB::Indexer
ENTITIES => 1, # whether to decode XML entities (and skip comments etc.)
UNDEF_SYMBOL => "", # string to insert for missing values of p-attributes
VERBOSE => 0, # print some progress information (stdout)
DEBUG => 0,
PIPE => undef, # pipe to cwb-encode (for encode_pipe() method)
};
bless($self, $class);
$self->name(shift)
if @_;
return $self;
}
=item $enc->name($corpus);
Change the CWB name of a corpus after the encoder object I<$enc> has been created.
Has to be used if the constructor was called without arguments.
=cut
sub name {
  # Setter for the corpus name: the value is folded to lowercase before it
  # is stored in the NAME slot of the encoder object.
  # Returns the stored (lowercased) name.
  my $self = shift;
  my $name = shift;
  return $self->{NAME} = lc $name;
}
=item $enc->longname($descriptive_name);
Optional long, descriptive name for a corpus (single line).
=cut
sub longname {
  # Setter for the descriptive corpus name (LONGNAME slot).
  # Double quotes and backslashes are deleted from the name; if any were
  # found, a warning is issued from the caller's perspective.  The warning
  # shows the name as it looks after the offending characters were removed.
  # Returns the stored name.
  my ($self, $longname) = @_;
  # tr///d deletes the listed characters in place and yields how many it hit
  my $removed = ($longname =~ tr/\"\\//d);
  if ($removed) {
    carp "CWB::Encoder: long name ($longname) must not contain \" and \\ characters (removed).\n";
  }
  return $self->{LONGNAME} = $longname;
}
=item $enc->info($multiline_text);
Multi-line text that will be written to the C<.info> file of the
corpus.
=cut
sub info {
  # Setter for the multi-line description text; the value is stored
  # verbatim in the INFO slot of the encoder object and returned.
  my $self = shift;
  my ($info) = @_;
  return $self->{INFO} = $info;
}
=item $enc->charset($code);
Set corpus character set (as a corpus property in the registry entry).
So far, only C<latin1> is fully supported. Other valid character sets are
C<latin2>, ..., C<latin9>, and C<utf8> (which will be supported by future
releases of the CWB). Any other I<$code> will raise a warning.
=cut
sub charset {
  # Setter for the corpus character set (CHARSET slot).
  #   $charset - character set code; latin1 .. latin9 and utf8 are accepted
  # Any other code triggers a warning (reported from the caller's
  # perspective) but is still stored, so this method never fails.
  # Returns the stored code.
  my ($self, $charset) = @_;
  # Anchor with \A ... \z rather than ^ ... $: "$" also matches just before
  # a trailing newline, which let values such as "utf8\n" pass unnoticed.
  carp "CWB::Encoder: character set $charset not supported by CWB (latin1, ..., latin9, utf8).\n"
    unless $charset =~ /\A(?:latin[1-9]|utf8)\z/;
  $self->{CHARSET} = $charset;
}
=item $enc->language($code);
Set corpus language (as an informational corpus property in the
registry entry). Use of a two-letter ISO code (C<de>, C<en>, C<fr>,
...) is recommended, and any other formats will raise a warning.
=cut
sub language {
  # Setter for the corpus language (LANG slot).
  #   $lang - language code; exactly two lowercase ASCII letters expected
  # Any other format triggers a warning (reported from the caller's
  # perspective) but is still stored, so this method never fails.
  # Returns the stored code.
  my ($self, $lang) = @_;
  # Anchor with \A ... \z rather than ^ ... $: "$" also matches just before
  # a trailing newline, which let values such as "en\n" pass unnoticed.
  carp "CWB::Encoder: language ($lang) should be two-letter ISO code.\n"
    unless $lang =~ /\A[a-z]{2}\z/;
  $self->{LANG} = $lang;
}
=item $enc->registry($registry_dir);
Specify registry directory I<$registry_dir>, which must be a single
directory rather than a search path listing several directories. If the registry directory is not set
explicitly, B<CWB::Encoder> attempts to determine the standard
registry directory, and will fail if there is no unique match
(e.g. when the C<CORPUS_REGISTRY> environment variable specifies
multiple directories).
=cut
sub registry {
  # Setter for the registry directory; the value is stored as given in the
  # REGISTRY slot of the encoder object (no checks are made here) and
  # returned.
  my $self = shift;
  my ($registry) = @_;
  return $self->{REGISTRY} = $registry;
}
=item $enc->dir($data_dir);
Specify directory I<$data_dir> for corpus data files. The directory is
automatically created if it does not exist.
=cut
sub dir {
  # Setter for the corpus data directory; the value is stored as given in
  # the DIR slot of the encoder object and returned.  Nothing is created
  # on disk by this method itself.
  my $self = shift;
  my ($dir) = @_;
  return $self->{DIR} = $dir;
}
=item $enc->p_attributes($att1, $att2, ...);
Declare one or more B<positional attributes>. This method can be
called repeatedly with additional attributes. Note that I<all>
positional attributes, including C<word>, have to be declared
explicitly.
=cut
sub p_attributes {
  # Append the given attribute names to the list kept in the PATT slot.
  # Repeated calls accumulate; nothing is de-duplicated or validated here.
  # Returns the number of entries in the list after the append.
  my ($self, @attributes) = @_;
  # dereference the slot in place so that a missing list is autovivified
  return push @{ $self->{PATT} }, @attributes;
}
( run in 0.518 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )