Alt-CWB-ambs

 view release on metacpan or  search on metacpan

t/20_encode_vss.t  view on Meta::CPAN

# -*-cperl-*-
## Test automatic corpus encoding and indexing with CWB::Encoder

use Test::More tests => 6;
use File::Path qw.make_path remove_tree.;
use CWB;
use CWB::Encoder;
use File::Compare qw(compare compare_text);
use DirHandle;
use Time::HiRes qw(time);

## Locations used by this test: the registry and data directories are created
## under tmp/, the input text comes from the distribution's data/ directory.
our $reg_dir = "tmp/registry";
our $data_dir = "tmp/vss";
our $vrt_file = "data/vrt/VeryShortStories.vrt";

make_path($reg_dir) unless -d $reg_dir;

## Use the explicit Class->new(...) form rather than indirect-object syntax
## ("new CWB::Encoder ..."), which is ambiguous to parse, and pass the class
## name to isa_ok() as a quoted string rather than a bareword.
our $enc = CWB::Encoder->new("VSS");
isa_ok($enc, "CWB::Encoder", "create CWB::Encoder object"); # T1

## Configure the encoder object.  Each call below hands one setting to $enc;
## the calls are kept in their original order because it is not visible from
## this file whether CWB::Encoder cares about the order.
$enc->registry($reg_dir);       # set up paths and allow encoder to overwrite existing files
$enc->dir($data_dir);
$enc->overwrite(1);

$enc->longname("Very Short Stories"); # set up basic information
$enc->info("Info file for corpus VSS (Very Short Stories)\n"); # the .info file is checked for "Very Short Stories" by T6
$enc->charset("latin1");
$enc->language("en");

$enc->perm("640");              # set non-standard access permissions (but not group); verified on word.huf by T5

## NOTE(review): specs such as "story:0+num+title+author+year" presumably follow
## the cwb-encode <name>:<nesting depth>+<annotation>... convention -- confirm
## against the CWB::Encoder documentation.
$enc->p_attributes(qw(word pos lemma)); # declare attributes
$enc->null_attributes("collection");
$enc->s_attributes(qw(story:0+num+title+author+year chapter:0+num p:0 s:0));

$enc->memory(100);              # corpus is very small and should use little memory (unit presumably MB -- TODO confirm)
$enc->validate(1);              # validate all generated files
$enc->verbose(0);               # don't show any progress messages when running as self test
$enc->debug(0);

## Run the encoder on the input file and time the encode() call itself.
## The eval block returns 1 on success, so failure is detected from its return
## value rather than from $@ alone; $@ is copied immediately because it is a
## global that any later eval or destructor may overwrite.
our $T0 = time;
my $encode_ok = eval { $enc->encode($vrt_file); 1 };
my $encode_error = $@;
our $elapsed = time - $T0;      # measured before any test reporting takes place
ok($encode_ok, "corpus encoding and indexing") # T2
  or diag("encode() failed: $encode_error"); # show why encoding failed instead of a bare 'not ok'
diag(sprintf "VSS corpus encoded in %.1f seconds", $elapsed);

## now compare all created data files against reference corpus
## Every plain file in $ref_dir must exist in $data_dir with identical contents.
## $ok ends up false if the reference directory cannot be read, contains no
## files at all, or any file is missing, unreadable or different.
our $ref_dir = "data/vss";
our $ref_regfile = "data/registry/vss";

our $dh = DirHandle->new($ref_dir); # undef if the directory cannot be opened
my $ok = 1;
if (defined $dh) {
  my $n_checked = 0;
  while (defined (my $filename = $dh->read)) {
    my $ref_file = "$ref_dir/$filename";
    my $new_file = "$data_dir/$filename";
    next unless -f $ref_file;     # skip directories
    $n_checked++;
    if (-f $new_file) {
      # File::Compare::compare() returns 0 = equal, 1 = different, -1 = error
      my $status = compare($new_file, $ref_file);
      if ($status < 0) {
        diag("data file '$filename' could not be compared (read error)");
        $ok = 0;
      }
      elsif ($status > 0) {
        diag("data file '$filename' is corrupt");
        $ok = 0;
      }
    }
    else {
      diag("failed to create data file '$filename'");
      $ok = 0;
    }
  }
  $dh->close;
  unless ($n_checked) {
    # without this check an empty reference directory would pass vacuously
    diag("no reference data files found in '$ref_dir'");
    $ok = 0;
  }
}
else {
  diag("cannot open reference directory '$ref_dir': $!");
  $ok = 0;
}
ok($ok, "validation of created data files"); # T3

## compare generated registry entry against reference
## $my_cmp is a line comparator in the form expected by
## File::Compare::compare_text(): it is called with one line from each file
## and returns 0 if the lines match, non-zero otherwise.  Occurrences of
## "tmp/vss" and "data/vss" are masked as "*/vss" first, so that the differing
## data directories of the new and the reference corpus do not count as a
## difference.  A mismatch is reported with diag().
our $my_cmp = sub {
  my ($new_line, $ref_line) = @_; # copies: @_ aliases the caller's variables, which must stay untouched
  s{(?:tmp|data)/vss}{*/vss}g for $new_line, $ref_line; # ignore different data paths
  my $cmp = $new_line cmp $ref_line;
  if ($cmp) {
    # lines still carry their trailing newline, hence no separator before "REF"
    diag("Difference detected in registry entry:\nNEW = ${new_line}REF = $ref_line");
  }
  return $cmp;
};
## compare_text() returns 0 = equal, 1 = different, -1 = error.  In the error
## case the comparison callback never runs and so prints nothing; report it
## here so that the failure is not silent.
my $reg_status = compare_text("$reg_dir/vss", $ref_regfile, $my_cmp);
diag("cannot read '$reg_dir/vss' or '$ref_regfile' for comparison") if $reg_status < 0;
$ok = ($reg_status == 0);
ok($ok, "validation of generated registry entry"); # T4

## check file permissions and contents of .info file
## Both checks always emit exactly one test result, so the plan of 6 tests
## holds even when a file is missing.
my $huf_file = "$data_dir/word.huf";
my $mode = (stat $huf_file)[2]; # undef if stat() fails (empty list)
my $stat_error = $!;            # keep the reason before anything else resets $!
if (defined $mode) {
  is((sprintf "%04o", ($mode & 07777)), "0640", "correct file access permissions (word.huf)"); # T5
}
else {
  fail("correct file access permissions (word.huf)"); # T5
  diag("cannot stat '$huf_file': $stat_error");
}

## NOTE(review): CWB::OpenFile presumably dies when the file cannot be opened;
## the eval turns either a die or a false return value into a test failure, so
## the script still reaches the cleanup below.
my $info_file = "$data_dir/.info";
my $fh = eval { CWB::OpenFile($info_file) };
my $open_error = $@ || $!;
if ($fh) {
  my $line = <$fh>;
  $fh->close;
  $line = "" unless defined $line; # an empty file should fail the match, not warn about undef
  like($line, qr/Very Short Stories/, "contents of .info file"); # T6
}
else {
  fail("contents of .info file"); # T6
  diag("cannot open '$info_file': $open_error");
}

## only the temporary registry directory is removed; $data_dir is left in place
remove_tree($reg_dir);



( run in 1.506 second using v1.01-cache-2.11-cpan-39bf76dae61 )