Alt-CWB-ambs
view release on metacpan or search on metacpan
lib/CWB/Encoder.pm view on Meta::CPAN
}
=item $enc->debug(1);
Activate debugging output (on STDERR).
=cut
sub debug {
my ($self, $yesno) = @_;
$self->{DEBUG} = $yesno;
$self->{VERBOSE} = 1 # debugging also activates verbose output
if $yesno;
}
# internal method: called _before_ running cwb-encode
sub prepare_encode {
my $self = shift;
my $overwrite = $self->{OVERWRITE};
my $name = $self->{NAME}; # check that setup is complete
croak "CWB::Encoder: Corpus ID hasn't been specified (with name() method)\n"
unless $name;
croak "CWB::Encoder: No positional attributes specified.\n"
unless @{$self->{PATT}} > 0;
my $reg = $self->{REGISTRY};
if (not defined $reg) {
$reg = CWB::RegistryDirectory(); # try to guess registry if not specified
$self->{REGISTRY} = $reg;
}
croak "CWB::Encoder: Can't determine unique registry directory (path is $reg).\n"
if $reg =~ /:/;
croak "CWB::Encoder: Registry directory $reg does not exist.\n"
unless -d $reg;
print STDERR "CWB::Encoder: registry directory is $reg\n"
if $self->{DEBUG};
my $regfile = "$reg/$name"; # remove registry entry if it exists
if (-f $regfile) {
croak "CWB::Encoder: Registry file already exists (overwriting not enabled).\n"
unless $overwrite;
print "Removing registry file $reg/$name ...\n"
if $self->{VERBOSE};
unlink "$reg/$name";
croak "CWB::Encoder: Can't delete registry file $reg/$name\n"
if -f "$reg/$name";
print STDERR "CWB::Encoder: deleting file $reg/$name\n"
if $self->{DEBUG};
}
my $dir = $self->{DIR}; # check/create data directory
croak "CWB::Encoder: Data directory has not been set.\n"
unless $dir;
if (-d $dir) {
croak "CWB::Encoder: Data directory already exists (overwriting not enabled).\n"
unless $overwrite;
print "Cleaning up data directory $dir ...\n"
if $self->{VERBOSE};
my $dh = new DirHandle $dir;
my @files = grep {-f $_} (glob("$dir/*"), glob("$dir/.*"));
my ($file, $filename);
while (defined($filename = $dh->read)) {
$file = "$dir/$filename";
next unless -f $file; # skip subdirectories etc.
unlink $file;
carp "CWB::Encoder: Can't delete file $file (trying to continue).\n"
if -f $file;
print STDERR "CWB::Encoder: deleting file $file\n"
if $self->{DEBUG};
}
$dh->close;
}
else {
print "Creating data directory $dir ...\n"
if $self->{VERBOSE};
croak "CWB::Encoder: Can't create data directory $dir\n"
unless mkdir $dir;
my $perm = $self->{PERM};
if ($perm) {
$perm =~ tr[642][753]; # derive directory permissions
CWB::Shell::Cmd("chmod $perm '$dir'");
$perm = "(chmod $perm)";
}
else {
$perm = "";
}
my $group = $self->{GROUP};
if ($group) {
CWB::Shell::Cmd("chgrp $group '$dir'");
$group = "(chgrp $group)";
}
else {
$group = "";
}
print STDERR "CWB::Encoder: created directory $dir $perm $group\n"
if $self->{DEBUG};
}
}
# internal method: used to construct a cwb-encode command line for the specified input files
sub make_encode_cmd {
my $self = shift;
my @files = @_;
my %attr = (); # check for duplicate attributes
my $cmd = "'$CWB::Encode' -s"; # build encode command (-xsB flags are always recommended!)
$cmd .= " -B" unless $self->{UNDEF_SYMBOL} =~ /^\s*$/; # assume that whitespace-only strings are allowed unless undef symbol is set
$cmd .= " -x" if $self->{ENTITIES}; # -x only if we're allowed to decode entities
$cmd .= " -U '".$self->{UNDEF_SYMBOL}."'";
$cmd .= " -R '".$self->{REGISTRY}."/".$self->{NAME}."'"; # has been set and checked by prepare_encode()
$cmd .= " -d '".$self->{DIR}."'";
$cmd .= " -c ".$self->{CHARSET}; # from version 2.2.101, cwb-encode can set character set in registry file
foreach my $file (@files) { # check that all input files exist
croak "CWB::Encoder: Input file $file does not exist.\n"
unless -f $file;
$cmd .= " -f '$file'";
}
$cmd .= " -p -"; # declare all p-attributes explicitly
foreach my $att (@{$self->{PATT}}) { # declare p-attributes
( run in 1.014 second using v1.01-cache-2.11-cpan-ceb78f64989 )