Alt-CWB-ambs

 view release on metacpan or  search on metacpan

script/cwb-align-import  view on Meta::CPAN

#!/usr/bin/perl
## -*-cperl-*-
## Author:  Stefan Evert
## Purpose: import existing sentence alignment into CWB
##
$| = 1;
use warnings;
use strict;

use CWB;
use CL;

use Getopt::Long;
use Pod::Usage;

## configuration variables
## (each one is bound to its command-line switch by GetOptions in the OPTIONS_AND_USAGE
##  block; the values assigned here are the defaults used when a switch is not given)
our $Verbose      = 0;      # -v ... show progress & status messages
our $Opt_Source   = undef;  # -l1 <id> ... source corpus (overrides alignment file header)
our $Opt_Target   = undef;  # -l2 <id> ... target corpus (overrides alignment file header)
our $Opt_Grid     = undef;  # -s <name> ... alignment grid attribute (usually <s>, overrides header)
our $Opt_Key      = undef;  # -k <pattern> ... key for identifying grid regions in alignment beads (overrides header)
our $Opt_NH       = 0;      # -nh ... alignment file does not have header (-l1, -l2, -s, -k are then required)
our $Opt_Inverse  = 0;      # -i ... encode "inverse" alignment (from target to source language)
our $Opt_Prune    = 0;      # -p ... automatically delete alignment beads if their keys are not found (implies -e)
our $Opt_Empty    = 0;      # -e ... allow 1:0 and 0:1 alignment beads (will be skipped)
our $Opt_Registry = undef;  # -r <dir> ... use registry directory <dir>
our $Opt_Help     = 0;      # -h ... show usage message

our $Opt_Test     = 0;      # -t ... use only first 100000 sentences for testing

OPTIONS_AND_USAGE:
{
  ## Parse the command line into the configuration variables and validate the result.
  ## Every pod2usage() call below is given an explicit -exitval and therefore ends the script.

  ## switch specification => configuration variable that receives the value
  my @switches = (
    "v|verbose"    => \$Verbose,
    "l1|source=s"  => \$Opt_Source,
    "l2|target=s"  => \$Opt_Target,
    "s|grid=s"     => \$Opt_Grid,
    "k|key=s"      => \$Opt_Key,
    "nh|no-header" => \$Opt_NH,
    "i|inverse"    => \$Opt_Inverse,
    "p|prune"      => \$Opt_Prune,
    "e|empty"      => \$Opt_Empty,
    "r|registry=s" => \$Opt_Registry,
    "h|help"       => \$Opt_Help,
    "t|test"       => \$Opt_Test,
  );
  my $parsed_ok = GetOptions(@switches);

  ## -h: usage message, exit status 0
  if ($Opt_Help) {
    pod2usage(
      -msg     => "(Type 'perldoc cwb-align-import' for more information.)",
      -exitval => 0,
      -verbose => 1,
    );
  }

  ## valid options but no alignment file named: short usage, exit status 1
  if ($parsed_ok and @ARGV == 0) {
    pod2usage(
      -msg     => "(Type 'cwb-align-import -h' for more information.)",
      -exitval => 1,
      -verbose => 0,
    );
  }

  ## bad option, or not exactly one remaining argument: exit status 2
  if (!$parsed_ok || @ARGV != 1) {
    pod2usage(
      -msg     => "SYNTAX ERROR.",
      -exitval => 2,
      -verbose => 0,
    );
  }

  ## without a header line, all four header fields have to come from the command line
  if ($Opt_NH) {
    my $all_given = ($Opt_Source and $Opt_Target and $Opt_Grid and $Opt_Key);
    die "Flags -l1, -l2, -s and -k must be specified if -nh option is used.\n"
      unless $all_given;
  }

  ## -p implies -e
  if ($Opt_Prune) {
    $Opt_Empty = 1;
  }
}

## global variables
## (the scalar variables are assigned in the SETUP block; %R1 and %R2 are passed by
##  reference to build_region_keys() in the MAKE_KEYS block)
our ($C1_id, $C2_id, $C1_lc, $C2_lc, $S_id); # source and target corpus ID (uppercased), their lowercase variants, and alignment grid attribute
our ($align_file, $FH); # alignment file name and the file handle returned by CWB::OpenFile
our ($key_pattern);     # pattern used to generate keys that identify regions in the alignment grid
our (%R1, %R2);         # hashes mapping keys to [start, end] regions, in source and target corpus
our @Beads;             # list of alignment beads, with entries [$l1_start, $l1_end, $l2_start, $l2_end, ($annot)]

SETUP:
{
  ## Open the alignment file named on the command line and determine source corpus,
  ## target corpus, grid attribute and key pattern from the file header and/or the
  ## overriding command-line options.
  $align_file = shift @ARGV;
  $FH = CWB::OpenFile($align_file);
  unless ($Opt_NH) {
    ## the header is a single line with exactly four TAB-separated fields:
    ## source corpus, target corpus, grid attribute, key pattern
    my $line = <$FH>;
    ## an empty file yields undef here; report that explicitly rather than letting
    ## chomp/split raise "uninitialized value" warnings and printing an empty header
    die "Alignment file '$align_file' is empty (expected a header line).\n"
      unless defined $line;
    chomp $line;
    my @F = split /\t/, $line;
    die "Format error in alignment file header: ``$line''\n" unless @F == 4;
    ($C1_id, $C2_id, $S_id, $key_pattern) = @F;
  }

  ## command-line options take precedence over the header values
  $C1_id = $Opt_Source if $Opt_Source;
  $C2_id = $Opt_Target if $Opt_Target;
  $S_id  = $Opt_Grid if $Opt_Grid;
  $key_pattern = $Opt_Key if $Opt_Key;
  if ($Opt_Inverse) {
    ($C1_id, $C2_id) = ($C2_id, $C1_id); # swap source and target language with -i option
  }

  ## keep an uppercase and a lowercase spelling of each corpus ID
  $C1_id = uc($C1_id);
  $C2_id = uc($C2_id);
  $C1_lc = lc($C1_id);
  $C2_lc = lc($C2_id);
}

MAKE_KEYS:
{
  ## Call build_region_keys() once per corpus, source corpus first, handing it a reference
  ## to the corresponding key hash together with the shared grid attribute and key pattern.
  ## Progress output is printed only with -v.
  if ($Verbose) {
    print "Generating keys for grid regions:\n";
  }
  for my $side ([\%R1, $C1_id], [\%R2, $C2_id]) {
    my ($regions, $corpus_id) = @$side;
    if ($Verbose) {
      print "  - $corpus_id ";
    }
    build_region_keys($regions, $corpus_id, $S_id, $key_pattern);
    if ($Verbose) {
      print " ok\n";
    }
  }
}

READ_ALIGNMENT:
{
  print "Processing " if $Verbose;
  my $beads = 0;
  my $lines = 0;

  LINE:
  while (<$FH>) {
    $lines++;
    print "." if $Verbose and ($lines & 0xFFFF) == 1;  # 16 dots per 1M alignment beads

    chomp;
    my ($l1_keys, $l2_keys) = split /\t/;        # annotations are ignored so far
    if ($Opt_Inverse) {  
      ($l1_keys, $l2_keys) = ($l2_keys, $l1_keys); # swap source and target language with -i option
    }
    my @l1_keys = split " ", $l1_keys;
    my @l2_keys = split " ", $l2_keys;



( run in 0.505 second using v1.01-cache-2.11-cpan-ceb78f64989 )