charset results from the CPAN

Alt-CWB-ambs

# - must be identical to filename of registry entry
ID dickens

# data file directory (relative or absolute path)
HOME "/corpora/Registry/DemoCorpus/data"

# optional info file (displayed by "info" command in CQP)
INFO "/corpora/Corpus Data/DemoCorpus/data/.info"

# corpus properties provide additional information about the corpus:
##:: charset  = "latin1" # change if your corpus uses different charset
##:: language = en       # insert ISO code for language (de, en, fr, ...)



##
## p-attributes (token annotations)
##

ATTRIBUTE word
ATTRIBUTE pos

data/registry/dickens.ref view on Meta::CPAN


# CWB name of the corpus
# - must be lowercase (ID for corpus DICKENS is "dickens")
# - must be identical to filename of registry entry
ID   test	# name modified by CWB::RegistryFile module

# data file directory (relative or absolute path)
HOME /corpora/Registry/DemoCorpus/data

# corpus properties provide additional information about the corpus:
##:: charset = "latin1"	# change if your corpus uses different charset
##:: language = "en"	# insert ISO code for language (de, en, fr, ...)
##:: valid = "FALSE"
#========================================================================#

# POSITIONAL ATTRIBUTES

ATTRIBUTE word
ATTRIBUTE pos
ATTRIBUTE lemma

data/registry/vss view on Meta::CPAN

# long descriptive name for the corpus
NAME "Very Short Stories"
# corpus ID (must be lowercase in registry!)
ID   vss
# path to binary data files
HOME data/vss
# optional info file (displayed by "info;" command in CQP)
INFO data/vss/.info

# corpus properties provide additional information about the corpus:
##:: charset = "latin1"	# character encoding of corpus data
##:: language = "en"	# insert ISO code for language (de, en, fr, ...)
#========================================================================#


##
## p-attributes (token annotations)
##

ATTRIBUTE word
ATTRIBUTE pos

lib/CWB/CEQL/String.pm view on Meta::CPAN

  ## SAME AS: $op = new CWB::CEQL::String ">=", "Operator";

  print "42 $op 0\n"; # prints "42 >= 0"
  if ($op->type eq "Operator") { ... }

  $string = new CWB::CEQL::String "my string", "String";
  $string .= " is beautiful";       # changes string, but not its type
  $string->value("another string"); # $string = "..."; would replace with ordinary string
  print $string->value, "\n";       # access string value explicitly

  $string->attribute("charset", "ascii"); # declare and/or set user-defined attribute
  if ($string->attribute("charset") eq "utf8") { ... }

  $new_string = $string->copy;      # $new_string = $string; would point to same object

=head1

=head1 DESCRIPTION

B<** TODO **>

Note: automatic conversion to a number in numerical expressions usually does not work -- use the value() method explicitly in this case

lib/CWB/Encoder.pm view on Meta::CPAN



  $bnc = new CWB::Encoder "BNC";

  $bnc->registry("/path/to/registry");  # will try to guess otherwise
  $bnc->dir("/path/to/data/directory"); # directory for corpus data files
  $bnc->overwrite(1);         # may overwrite existing files / directories
  
  $bnc->longname("British National Corpus"); # optional
  $bnc->info("Line1.\nLine2.\n...");    # optional multi-line info text
  $bnc->charset("latin1");    # defaults to latin1
  $bnc->language("en");       # defaults to ??
  
  $bnc->group("corpora");     # optional: group and access permissions
  $bnc->perm("640");          # for newly created files & directories

  $bnc->p_attributes("word"); # declare positional atts (no default!)
  $bnc->p_attributes(qw<pos lemma>);  # may be called repeatedly
  $bnc->null_attributes("teiHeader"); # declare null atts (ignored)
  $bnc->s_attributes("s");    # s-attributes in cwb-encode syntax
  $bnc->s_attributes(qw<div0* div1*>);# * = store annotations (-V)

lib/CWB/Encoder.pm view on Meta::CPAN

Multi-line text that will be written to the C<.info> file of the
corpus.

=cut

sub info {
  my $self = shift;
  my $text = shift;       # info text, kept exactly as passed in
  # Remember the text on the encoder object under the INFO key.
  $self->{INFO} = $text;
}

=item $enc->charset($code);

Set corpus character set (as a corpus property in the registry entry).
So far, only C<latin1> is fully supported. Other valid character sets are
C<latin2>, ..., C<latin9>, and C<utf8> (which will be supported by future
releases of the CWB). Any other I<$code> will raise a warning.

=cut

sub charset {
  my ($self, $charset) = @_;
  # Warn (but still store the value) when $charset is not one of the codes
  # named in the message: latin1 .. latin9 or utf8.
  # \A and \z anchor the complete string.  With ^...$ a value such as
  # "latin1\n" would be accepted silently, because $ also matches just
  # before a trailing newline.
  carp "CWB::Encoder: character set $charset not supported by CWB (latin1, ..., latin9, utf8).\n"
    unless $charset =~ /\A(?:latin[1-9]|utf8)\z/;
  # The value is recorded on the object regardless of the check above.
  $self->{CHARSET} = $charset;
}

=item $enc->language($code);

Set corpus language (as an informational corpus property in the
registry entry). Use of a two-letter ISO code (C<de>, C<en>, C<fr>,
...) is recommended; any other format will raise a warning.

=cut

lib/CWB/Encoder.pm view on Meta::CPAN


  print "Editing registry entry ...\n" # edit registry file
    if $self->{VERBOSE};
  my $reg = $self->{REGISTRY};
  my $name = $self->{NAME};
  my $regfile = "$reg/$name";
  my $rf = new CWB::RegistryFile $regfile;
  croak "CWB::Encoder: Syntax error in registry entry $regfile\n"
    unless defined $rf;
  $rf->name($self->{LONGNAME});
  # $rf->property("charset", $self->{CHARSET}); # -- already set by cwb-encode (since v2.2.101)
  $rf->property("language", $self->{LANG});
  $rf->write($regfile);
  print STDERR "CWB::Encoder: registry entry $regfile has been edited\n"
    if $self->{DEBUG};
  print STDERR "CWB::Encoder: setting access permissions for $regfile\n"
    if $self->{DEBUG} and ($perm or $group);
  CWB::Shell::Cmd("chmod $perm '$regfile'")
    if $perm;
  CWB::Shell::Cmd("chgrp $group '$regfile'")
    if $group;

t/14_cwb_registry.t view on Meta::CPAN


our $dickens = new CWB::RegistryFile "data/registry/dickens";
isa_ok($dickens, CWB::RegistryFile, "load registry entry into RegistryFile object"); # T1

is($dickens->id, "dickens", "ID field"); # T2
like($dickens->name, qr/Charles Dickens/, "NAME field");
is($dickens->home, "/corpora/Registry/DemoCorpus/data", "HOME field");
is($dickens->info, "/corpora/Corpus Data/DemoCorpus/data/.info", "INFO field");

our @properties = $dickens->list_properties;
$ok = (@properties == 2) && (grep {/^charset$/} @properties) && (grep {/^language$/} @properties);
ok($ok, "list of corpus properties"); # T6
is($dickens->property("charset"), "latin1", "'charset' property");
is($dickens->property("language"), "en", "'language' property");

our @p_attr = $dickens->list_attributes("p"); # positional attributes
our @s_attr = $dickens->list_attributes("s"); # structural attributes
our @a_attr = $dickens->list_attributes("a"); # alignment attributes
our $N_attr = $dickens->list_attributes;

ok($N_attr == @p_attr + @s_attr + @a_attr, "consistent attribute counts from list_attributes()"); # T9
is(@p_attr+0, 4, "4 positional attributes");
ok((grep {/^word$/} @p_attr), "default p-attribute (word) is listed");

t/20_encode_vss.t view on Meta::CPAN


our $enc = new CWB::Encoder "VSS";
isa_ok($enc, CWB::Encoder, "create CWB::Encoder object"); # T1

$enc->registry($reg_dir);       # set up paths and allow encoder to overwrite existing files
$enc->dir($data_dir);
$enc->overwrite(1);

$enc->longname("Very Short Stories"); # set up basic information
$enc->info("Info file for corpus VSS (Very Short Stories)\n");
$enc->charset("latin1");
$enc->language("en");

$enc->perm("640");              # set non-standard access permissions (but not group)

$enc->p_attributes(qw(word pos lemma)); # declare attributes
$enc->null_attributes("collection");
$enc->s_attributes(qw(story:0+num+title+author+year chapter:0+num p:0 s:0));

$enc->memory(100);              # corpus is very small and should use little memory
$enc->validate(1);              # validate all generated files

( run in 0.372 second using v1.01-cache-2.11-cpan-4d50c553e7e )