Alt-CWB-ambs
view release on metacpan or search on metacpan
data/registry/dickens view on Meta::CPAN
# - must be identical to filename of registry entry
ID dickens
# data file directory (relative or absolute path)
HOME "/corpora/Registry/DemoCorpus/data"
# optional info file (displayed by "info" command in CQP)
INFO "/corpora/Corpus Data/DemoCorpus/data/.info"
# corpus properties provide additional information about the corpus:
##:: charset = "latin1" # change if your corpus uses different charset
##:: language = en # insert ISO code for language (de, en, fr, ...)
##
## p-attributes (token annotations)
##
ATTRIBUTE word
ATTRIBUTE pos
data/registry/dickens.ref view on Meta::CPAN
# CWB name of the corpus
# - must be lowercase (ID for corpus DICKENS is "dickens")
# - must be identical to filename of registry entry
ID test # name modified by CWB::RegistryFile module
# data file directory (relative or absolute path)
HOME /corpora/Registry/DemoCorpus/data
# corpus properties provide additional information about the corpus:
##:: charset = "latin1" # change if your corpus uses different charset
##:: language = "en" # insert ISO code for language (de, en, fr, ...)
##:: valid = "FALSE"
#========================================================================#
# POSITIONAL ATTRIBUTES
ATTRIBUTE word
ATTRIBUTE pos
ATTRIBUTE lemma
data/registry/vss view on Meta::CPAN
# long descriptive name for the corpus
NAME "Very Short Stories"
# corpus ID (must be lowercase in registry!)
ID vss
# path to binary data files
HOME data/vss
# optional info file (displayed by "info;" command in CQP)
INFO data/vss/.info
# corpus properties provide additional information about the corpus:
##:: charset = "latin1" # character encoding of corpus data
##:: language = "en" # insert ISO code for language (de, en, fr, ...)
#========================================================================#
##
## p-attributes (token annotations)
##
ATTRIBUTE word
ATTRIBUTE pos
lib/CWB/CEQL/String.pm view on Meta::CPAN
## SAME AS: $op = new CWB::CEQL::String ">=", "Operator";
print "42 $op 0\n"; # prints "42 >= 0"
if ($op->type eq "Operator") { ... }
$string = new CWB::CEQL::String "my string", "String";
$string .= " is beautiful"; # changes string, but not its type
$string->value("another string"); # $string = "..."; would replace with ordinary string
print $string->value, "\n"; # access string value explicitly
$string->attribute("charset", "ascii"); # declare and/or set user-defined attribute
if ($string->attribute("charset") eq "utf8") { ... }
$new_string = $string->copy; # $new_string = $string; would point to same object
=head1
=head1 DESCRIPTION
B<** TODO **>
Note: automatic conversion to a number in numerical expressions usually does not work -- use the value() method explicitly in this case.
lib/CWB/Encoder.pm view on Meta::CPAN
$bnc = new CWB::Encoder "BNC";
$bnc->registry("/path/to/registry"); # will try to guess otherwise
$bnc->dir("/path/to/data/directory"); # directory for corpus data files
$bnc->overwrite(1); # may overwrite existing files / directories
$bnc->longname("British National Corpus"); # optional
$bnc->info("Line1.\nLine2.\n..."); # optional multi-line info text
$bnc->charset("latin1"); # defaults to latin1
$bnc->language("en"); # defaults to ??
$bnc->group("corpora"); # optional: group and access permissions
$bnc->perm("640"); # for newly created files & directories
$bnc->p_attributes("word"); # declare positional atts (no default!)
$bnc->p_attributes(qw<pos lemma>); # may be called repeatedly
$bnc->null_attributes("teiHeader"); # declare null atts (ignored)
$bnc->s_attributes("s"); # s-attributes in cwb-encode syntax
$bnc->s_attributes(qw<div0* div1*>);# * = store annotations (-V)
lib/CWB/Encoder.pm view on Meta::CPAN
Multi-line text that will be written to the C<.info> file of the
corpus.
=cut
sub info {
  # Store the given text in the object's INFO slot and hand the
  # stored value back to the caller.
  my $self = shift;
  my $text = shift;
  return $self->{INFO} = $text;
}
=item $enc->charset($code);
Set corpus character set (as a corpus property in the registry entry).
So far, only C<latin1> is fully supported. Other valid character sets are
C<latin2>, ..., C<latin9>, and C<utf8> (which will be supported by future
releases of the CWB). Any other I<$code> will raise a warning.
=cut
sub charset {
  # Set the corpus character set code.
  #   $charset -- character set code; latin1 .. latin9 and utf8 are accepted silently
  # Any other code triggers a warning (via carp) but is still stored in
  # the CHARSET slot.  Returns the stored value.
  my ($self, $charset) = @_;
  # Anchor with \A and \z rather than ^ and $: a trailing "$" also matches
  # before a final newline, so "latin1\n" would slip through unnoticed.
  carp "CWB::Encoder: character set $charset not supported by CWB (latin1, ..., latin9, utf8).\n"
    unless $charset =~ /\A(?:latin[1-9]|utf8)\z/;
  $self->{CHARSET} = $charset;
}
=item $enc->language($code);
Set corpus language (as an informational corpus property in the
registry entry). Use of a two-letter ISO code (C<de>, C<en>, C<fr>,
...) is recommended, and any other formats will raise a warning.
=cut
lib/CWB/Encoder.pm view on Meta::CPAN
print "Editing registry entry ...\n" # edit registry file
if $self->{VERBOSE};
my $reg = $self->{REGISTRY};
my $name = $self->{NAME};
my $regfile = "$reg/$name";
my $rf = new CWB::RegistryFile $regfile;
croak "CWB::Encoder: Syntax error in registry entry $regfile\n"
unless defined $rf;
$rf->name($self->{LONGNAME});
# $rf->property("charset", $self->{CHARSET}); # -- already set by cwb-encode (since v2.2.101)
$rf->property("language", $self->{LANG});
$rf->write($regfile);
print STDERR "CWB::Encoder: registry entry $regfile has been edited\n"
if $self->{DEBUG};
print STDERR "CWB::Encoder: setting access permissions for $regfile\n"
if $self->{DEBUG} and ($perm or $group);
CWB::Shell::Cmd("chmod $perm '$regfile'")
if $perm;
CWB::Shell::Cmd("chgrp $group '$regfile'")
if $group;
t/14_cwb_registry.t view on Meta::CPAN
# Load the sample registry entry and check its header fields, corpus
# properties and attribute lists.
# Uses direct method-call syntax and a quoted class name instead of
# indirect object syntax and a bareword.
our $dickens = CWB::RegistryFile->new("data/registry/dickens");
isa_ok($dickens, "CWB::RegistryFile", "load registry entry into RegistryFile object"); # T1
is($dickens->id, "dickens", "ID field"); # T2
like($dickens->name, qr/Charles Dickens/, "NAME field");
is($dickens->home, "/corpora/Registry/DemoCorpus/data", "HOME field");
is($dickens->info, "/corpora/Corpus Data/DemoCorpus/data/.info", "INFO field");
our @properties = $dickens->list_properties;
# exactly two properties are expected: charset and language
$ok = (@properties == 2) && (grep {/^charset$/} @properties) && (grep {/^language$/} @properties);
ok($ok, "list of corpus properties"); # T6
is($dickens->property("charset"), "latin1", "'charset' property");
is($dickens->property("language"), "en", "'language' property");
our @p_attr = $dickens->list_attributes("p"); # positional attributes
our @s_attr = $dickens->list_attributes("s"); # structural attributes
our @a_attr = $dickens->list_attributes("a"); # alignment attributes
our $N_attr = $dickens->list_attributes;
# the count stored in $N_attr must equal the sum of the three per-type lists
ok($N_attr == @p_attr + @s_attr + @a_attr, "consistent attribute counts from list_attributes()"); # T9
is(@p_attr+0, 4, "4 positional attributes");
ok((grep {/^word$/} @p_attr), "default p-attribute (word) is listed");
t/20_encode_vss.t view on Meta::CPAN
# Create an encoder for the VSS test corpus and configure it.
# Uses direct method-call syntax and a quoted class name instead of
# indirect object syntax and a bareword.
our $enc = CWB::Encoder->new("VSS");
isa_ok($enc, "CWB::Encoder", "create CWB::Encoder object"); # T1
$enc->registry($reg_dir); # set up paths and allow encoder to overwrite existing files
$enc->dir($data_dir);
$enc->overwrite(1);
$enc->longname("Very Short Stories"); # set up basic information
$enc->info("Info file for corpus VSS (Very Short Stories)\n");
$enc->charset("latin1");
$enc->language("en");
$enc->perm("640"); # set non-standard access permissions (but not group)
$enc->p_attributes(qw(word pos lemma)); # declare attributes
$enc->null_attributes("collection");
$enc->s_attributes(qw(story:0+num+title+author+year chapter:0+num p:0 s:0));
$enc->memory(100); # corpus is very small and should use little memory
$enc->validate(1); # validate all generated files
( run in 0.372 second using v1.01-cache-2.11-cpan-4d50c553e7e )