Alt-CWB-ambs
view release on metacpan or search on metacpan
script/cwb-align-import view on Meta::CPAN
#!/usr/bin/perl
## -*-cperl-*-
## Author: Stefan Evert
## Purpose: import existing sentence alignment into CWB
##
$| = 1;
use warnings;
use strict;
use CWB;
use CL;
use Getopt::Long;
use Pod::Usage;
## configuration variables
## Package globals holding the command-line settings; each one is bound to its
## switch by the GetOptions() call in the OPTIONS_AND_USAGE block below.
## Values of undef / 0 mean "option not given".
our $Verbose = 0; # -v ... show progress & status messages
our $Opt_Source = undef; # -l1 <id> ... source corpus (overrides alignment file header)
our $Opt_Target = undef; # -l2 <id> ... target corpus (overrides alignment file header)
our $Opt_Grid = undef; # -s <name> ... alignment grid attribute (usually <s>, overrides header)
our $Opt_Key = undef; # -k <pattern> ... key for identifying grid regions in alignment beads (overrides header)
our $Opt_NH = 0; # -nh ... alignment file does not have header (-l1, -l2, -s, -k are then required)
our $Opt_Inverse = 0; # -i ... encode "inverse" alignment (from target to source language)
our $Opt_Prune = 0; # -p ... automatically delete alignment beads if their keys are not found (implies -e)
our $Opt_Empty = 0; # -e ... allow 1:0 and 0:1 alignment beads (will be skipped)
our $Opt_Registry = undef; # -r <dir> ... use registry directory <dir>
our $Opt_Help = 0; # -h ... show usage message
our $Opt_Test = 0; # -t ... use only first 100000 sentences for testing
OPTIONS_AND_USAGE:
{
  ## Command-line switches, each paired with the configuration variable it
  ## sets.  Kept as an ordered list so the specs reach GetOptions() unchanged.
  my @option_spec = (
    "v|verbose"     => \$Verbose,
    "l1|source=s"   => \$Opt_Source,
    "l2|target=s"   => \$Opt_Target,
    "s|grid=s"      => \$Opt_Grid,
    "k|key=s"       => \$Opt_Key,
    "nh|no-header"  => \$Opt_NH,
    "i|inverse"     => \$Opt_Inverse,
    "p|prune"       => \$Opt_Prune,
    "e|empty"       => \$Opt_Empty,
    "r|registry=s"  => \$Opt_Registry,
    "h|help"        => \$Opt_Help,
    "t|test"        => \$Opt_Test,
  );
  my $parsed_ok = GetOptions(@option_spec);

  ## -h: show the usage message (verbosity 1) and exit with status 0.
  if ($Opt_Help) {
    pod2usage(
      -msg     => "(Type 'perldoc cwb-align-import' for more information.)",
      -exitval => 0,
      -verbose => 1,
    );
  }

  ## Options parsed fine but no alignment file was named: brief usage, status 1.
  if ($parsed_ok and @ARGV == 0) {
    pod2usage(
      -msg     => "(Type 'cwb-align-import -h' for more information.)",
      -exitval => 1,
      -verbose => 0,
    );
  }

  ## Anything other than "valid options plus exactly one argument" is an error.
  if (!$parsed_ok or @ARGV != 1) {
    pod2usage(
      -msg     => "SYNTAX ERROR.",
      -exitval => 2,
      -verbose => 0,
    );
  }

  ## Without a header line, all four header fields must come from the flags.
  if ($Opt_NH) {
    my @unset = grep { not $_ } ($Opt_Source, $Opt_Target, $Opt_Grid, $Opt_Key);
    die "Flags -l1, -l2, -s and -k must be specified if -nh option is used.\n"
      if @unset;
  }

  ## -p implies -e
  if ($Opt_Prune) {
    $Opt_Empty = 1;
  }
}
## global variables
## Shared state of the script: set up in the SETUP block and used by the
## processing blocks that follow it.
our ($C1_id, $C2_id, $C1_lc, $C2_lc, $S_id); # source and target corpus name (upper case, with lowercase variant) and alignment grid
our ($align_file, $FH); # alignment file (name taken from @ARGV) and file handle returned by CWB::OpenFile
our ($key_pattern); # pattern used to generate keys that identify regions in the alignment grid
our (%R1, %R2); # hashes mapping keys to [start, end] regions, in source and target corpus (passed by reference to build_region_keys)
our @Beads; # list of alignment beads, with entries [$l1_start, $l1_end, $l2_start, $l2_end, ($annot)]
SETUP:
{
  ## Open the alignment file named by the single remaining command-line argument.
  $align_file = shift @ARGV;
  $FH = CWB::OpenFile($align_file);

  ## Unless -nh was given, the first line is a header with exactly four
  ## TAB-separated fields: source corpus, target corpus, grid attribute, key pattern.
  unless ($Opt_NH) {
    my $line = <$FH>;
    ## readline yields undef at EOF, i.e. for an empty file; report that
    ## explicitly instead of triggering "uninitialized value" warnings and a
    ## format error that shows an empty header.
    die "Format error in alignment file header: file '$align_file' is empty\n"
      unless defined $line;
    chomp $line;
    my @F = split /\t/, $line;
    die "Format error in alignment file header: ``$line''\n" unless @F == 4;
    ($C1_id, $C2_id, $S_id, $key_pattern) = @F;
  }

  ## Command-line flags take precedence over the values from the header.
  $C1_id = $Opt_Source if $Opt_Source;
  $C2_id = $Opt_Target if $Opt_Target;
  $S_id = $Opt_Grid if $Opt_Grid;
  $key_pattern = $Opt_Key if $Opt_Key;

  if ($Opt_Inverse) {
    ($C1_id, $C2_id) = ($C2_id, $C1_id); # swap source and target language with -i option
  }

  ## Normalise the corpus IDs to upper case and keep lower-case variants alongside.
  $C1_id = uc($C1_id);
  $C2_id = uc($C2_id);
  $C1_lc = lc($C1_id);
  $C2_lc = lc($C2_id);
}
MAKE_KEYS:
{
  print "Generating keys for grid regions:\n" if $Verbose;

  ## Build the key => region lookup for the source corpus first, then the
  ## target corpus; both use the same grid attribute and key pattern.
  for my $task ([\%R1, $C1_id], [\%R2, $C2_id]) {
    my ($region_map, $corpus_id) = @$task;
    if ($Verbose) {
      print " - $corpus_id ";
    }
    build_region_keys($region_map, $corpus_id, $S_id, $key_pattern);
    if ($Verbose) {
      print " ok\n";
    }
  }
}
READ_ALIGNMENT:
{
print "Processing " if $Verbose;
my $beads = 0;
my $lines = 0;
LINE:
while (<$FH>) {
$lines++;
print "." if $Verbose and ($lines & 0xFFFF) == 1; # 16 dots per 1M alignment beads
chomp;
my ($l1_keys, $l2_keys) = split /\t/; # annotations are ignored so far
if ($Opt_Inverse) {
($l1_keys, $l2_keys) = ($l2_keys, $l1_keys); # swap source and target language with -i option
}
my @l1_keys = split " ", $l1_keys;
my @l2_keys = split " ", $l2_keys;
( run in 0.505 second using v1.01-cache-2.11-cpan-ceb78f64989 )