Text-SRT-Align

 view release on metacpan or  search on metacpan

lib/Text/SRT/Align.pm  view on Meta::CPAN


=cut

sub align{
    my $srcfile = shift;
    my $trgfile = shift;
    my $alignment = shift || [];

    my %options = @_;

    # make sure we print error messages in UTF8 ....
    # TODO: should we change it back after running the alignment procedure?
    binmode(STDERR,":utf8");

    # set some global variables
    # TODO: can we skip all these global settings?

    $VERBOSE        = $options{VERBOSE} if (defined $options{VERBOSE});

    # variables enabling lexical matching for time synchronization
    # - match entries from bilingual dictionaries
    # - match idenical tokens
    # - match "cognates" (using string similarity measures)
    $USE_DICTIONARY = $options{USE_DICTIONARY} if (defined $options{USE_DICTIONARY});
    $USE_IDENTICAL  = $options{USE_IDENTICAL} if (defined $options{USE_IDENTICAL});
    $USE_COGNATES   = $options{USE_COGNATES} if (defined $options{USE_COGNATES});

    # parameters for lexical matching
    $COGNATE_RANGE  = $options{COGNATE_RANGE} if (defined $options{COGNATE_RANGE});
    $UPPER_CASE     = $options{UPPER_CASE} if (defined $options{UPPER_CASE});
    $TOK_LEN        = $options{TOK_LEN} if (defined $options{TOK_LEN});
    $MINLENGTH      = $options{MINLENGTH} if (defined $options{MINLENGTH});
    $USE_WORDFREQ   = $options{USE_WORDFREQ} if (defined $options{USE_WORDFREQ});
    $CHAR_SET       = $options{CHAR_SET} if (defined $options{CHAR_SET});

    # window size for finding lexical matches (beginning and end of file)
    $WINDOW         = $options{WINDOW} if (defined $options{WINDOW});
    $MAX_MATCHES    = $options{MAX_MATCHES} if (defined $options{MAX_MATCHES});

    # scoring function
    $SCORE_PROPORTION = $options{SCORE_PROPORTION} if (defined $options{SCORE_PROPORTION});


    # TODO: is it OK if we never reset the dictionary?
    # (will be used again if align is called multiple times)
    if ($USE_DICTIONARY){
	&ReadDictionary(\%DIC,$USE_DICTIONARY);
    }
    elsif ($options{SOURCE_LANG} && $options{TARGET_LANG}){
	$options{BEST_ALIGN} = 
	    &initialize_dictionary( $options{SOURCE_LANG},
				    $options{TARGET_LANG} );
    }

    if (! -e $srcfile){$srcfile.='.gz';}
    if (! -e $trgfile){$trgfile.='.gz';}

    if (! -e $srcfile){die "$srcfile doesn't exist!\n";}
    if (! -e $trgfile){die "$trgfile doesn't exist!\n";}

    my @srcdata=();
    my @trgdata=();

    $srcfreq=undef;
    $trgfreq=undef;

    my %first=();   # word matches in initial part of the move
    my %last=();    # matches in final part of the movie

    print STDERR "parse '$srcfile' & '$trgfile' ... " if ($VERBOSE);
    &parse_bitext($srcfile,$trgfile,\@srcdata,\@trgdata,\%first,\%last);
    print STDERR "ok!\n" if ($VERBOSE);

    ## fix start and end times (without scaling and offsets)
    &set_sent_times(\@srcdata);
    &set_sent_times(\@trgdata);

    ## sort time frames by their starting time
    ## (strangely enough some subtitles are not chronogologically sorted)
    @srcdata = &sort_time_frames(\@srcdata);
    @trgdata = &sort_time_frames(\@trgdata);

    if (defined $options{HARD_BOUNDARIES}){
	&fit_hard_boundaries($options{HARD_BOUNDARIES},\@srcdata,\@trgdata);
    }

    my $score=undef;
    my $baseScore=undef;

    print STDERR "align sentences ... " if ($VERBOSE);
    if ($COGNATE_RANGE){
	if ($VERBOSE){
	    my $overlap = &time_overlap(\@srcdata,\@trgdata);
	    print STDERR "time overlap before = $overlap\n"
	}
	$score = &cognate_align($srcfile,$trgfile,
				\@srcdata,\@trgdata,
				\%first,\%last,$alignment);
    }
    if ($options{BEST_ALIGN}){
	if ($VERBOSE){
	    my $overlap = &time_overlap(\@srcdata,\@trgdata);
	    print STDERR "time overlap before = $overlap\n";
	}
	($score,$baseScore) = 
	    &best_align($srcfile,$trgfile,
			\@srcdata,\@trgdata,
			\%first,\%last,$alignment);
    }
    else{
	$score = &standard_align(\@srcdata,\@trgdata,\%first,\%last,$alignment);
    }

    my $overlap = &time_overlap(\@srcdata,\@trgdata);
    print STDERR "time overlap = $overlap\n" if ($VERBOSE);

    print STDERR "done!\n" if ($VERBOSE);
    if ($baseScore){
	print STDERR "ratio = $score ($baseScore)\n" if ($VERBOSE);
    }
    else {
	print STDERR "ratio = $score\n" if ($VERBOSE);
    }

    return wantarray ? ($score,$overlap) : $score;
}



=head2 C<initialize_dictionary( $srclang, $trglang )>

Load the provided dictionary if it exists for the given language pair.
Return 1 if it exists and could be loaded. Return 0 otherwise.

=cut


## NOTE: this resets the dictionary and removes existing entries in %DIC
## but only if the shared dic exists!

## alias for initialize_dictionary
sub initialize_lexicon{
    return initialize_dictionary(@_);
}

sub initialize_dictionary{
    my ($srclang,$trglang) = @_;

    # make a three-letter language code
    if (length($srclang) == 2){
	$srclang = language_code2code($srclang, 'alpha-2', 'alpha-3');
    }
    if (length($trglang) == 2){
	$trglang = language_code2code($trglang, 'alpha-2', 'alpha-3');
    }

    my $SharedHome = &dist_dir('Text-SRT-Align');
    if (-e "$SharedHome/dic/$srclang-$trglang"){
	%DIC=();
	%LOADED_DICS=();
	$USE_DICTIONARY = "$SharedHome/dic/$srclang-$trglang";
	&ReadDictionary(\%DIC,$USE_DICTIONARY);
	return 1 if (keys %DIC);
    }
    # inverse dictionary
    if (-e "$SharedHome/dic/$trglang-$srclang"){
	%DIC=();
	%LOADED_DICS=();
	$USE_DICTIONARY = "$SharedHome/dic/$trglang-$srclang";
	&ReadDictionary(\%DIC,$USE_DICTIONARY,1);
	return 1 if (keys %DIC);
    }
}

lib/Text/SRT/Align.pm  view on Meta::CPAN

=cut

sub load_lexicon{
    my ($dicfile,$inverse) = @_;
    &ReadDictionary(\%DIC,$dicfile,$inverse);
    return 1 if (keys %DIC);
}


=head2 C<print_ces( $srcfile, $trgfile, \@alignments )>

Print the sentence alignments in XCES Align format.

=cut


sub print_ces{
    my ($src,$trg,$alg,$meta,$fh)=@_;

    $fh = *STDOUT unless $fh;

    print $fh '<?xml version="1.0" encoding="utf-8"?>'."\n";
    print $fh '<!DOCTYPE cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">'."\n";
    print $fh '<cesAlign version="1.0">'."\n";
    print $fh "<linkGrp targType=\"s\" fromDoc=\"$src\" toDoc=\"$trg\"";
    if (ref($meta) eq 'HASH'){
	foreach my $k (keys %{$meta}){
	    print $fh " $k=\"$$meta{$k}\"";
	}
    }
    print $fh "\">\n";

    foreach my $i (0..$#{$alg}){
	print $fh "<link id=\"SL$i\" xtargets=\"";
	if (ref($alg->[$i]->{src}) eq 'ARRAY'){
	    print $fh join(' ',@{$alg->[$i]->{src}});
	}
	print $fh ';';
	if (ref($alg->[$i]->{trg}) eq 'ARRAY'){
	    print $fh join(' ',@{$alg->[$i]->{trg}});
	}
	print $fh "\" ";
	if (exists $alg->[$i]->{overlap}){
	    printf $fh "overlap=\"%5.3f\" ",$alg->[$i]->{overlap};
	}
	print $fh "/>\n";
    }
    print $fh "</linkGrp>\n</cesAlign>\n";
}




################################################################################





sub best_align{
    my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;

    my %types;
    align_srt($srcdata,$trgdata,$alg,\%types);
    my $bestratio = $SCORE_PROPORTION ?
	($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
	($types{nonempty}+1)/($types{empty}+1);

    print STDERR "\nratio = " if $VERBOSE;
    print STDERR $bestratio if $VERBOSE;
    print STDERR "\n" if $VERBOSE;

    my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
    my @sortlast  = sort {$$last{$b} <=> $$last{$a} } keys %{$last};

    if ($MAX_MATCHES){
	@sortfirst = splice(@sortfirst,0,$MAX_MATCHES) if (@sortfirst > $MAX_MATCHES);
	@sortlast = splice(@sortlast,0,$MAX_MATCHES) if (@sortlast > $MAX_MATCHES);
    }

    my $standard = $bestratio;

    foreach my $sf (@sortfirst){
	foreach my $lf (@sortlast){

	    my @anchor = ($sf,$lf);

	    ## use only the first and the last one
	    if ($VERBOSE){
		print STDERR "use $anchor[0] and $anchor[1] as reference\n";
	    }

	    ## compute slope and offset for this movie
	    my ($slope,$offset) = ComputeOffset(\@anchor,$srcdata,$trgdata);
	    print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
	    if ($slope<=0){
		print STDERR "strange scaling factor -> ignore\n";
		next;
	    }
	    ## re-scale source language subtitles
#	    set_sent_times($srcdata,$slope,$offset);
	    synchronize($srcdata,$slope,$offset);

	    my %types=();
	    my @newalg=();
	    align_srt($srcdata,$trgdata,\@newalg,\%types);
	    my $newratio = $SCORE_PROPORTION ?
		($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
		($types{nonempty}+1)/($types{empty}+1);
	    print STDERR "ratio = " if $VERBOSE;
	    print STDERR $newratio if $VERBOSE;
	    if ($newratio > $bestratio){
		@{$alg} = @newalg;
		$bestratio = $newratio;
		print STDERR " ---> best!" if $VERBOSE;
	    }
	    print STDERR "\n" if $VERBOSE;
	}
    }
    print STDERR "\n" if $VERBOSE;
    if ($bestratio < 2){
	if ($FALLBACK && (-e $UPLUG)){
	    print STDERR "best ratio < 2 --> fall back to $FALLBACK!\n";
	    print `$UPLUG $FALLBACK -src $srcfile -trg $trgfile`;
	    exit;
	}
    }
    return ($bestratio,$standard);
}



sub standard_align{
    my ($srcdata,$trgdata,$first,$last,$alg)=@_;

    my %types;
    align_srt($srcdata,$trgdata,$alg,\%types);

    if ($types{empty}*2 > $types{nonempty}){

	if (keys %{$first} && keys %{$last}){
	    &use_anchor_points($srcdata,$trgdata,$first,$last);
	}
	@{$alg} = ();
	&align_srt($srcdata,$trgdata,$alg);
    }
    my $score = $SCORE_PROPORTION ?
	($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
	($types{nonempty}+1)/($types{empty}+1);
    return $score;
}



sub cognate_align{
    my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;
    my $best;
    for (my $c=1;$c>$COGNATE_RANGE;$c-=0.05){
	$BEST_ALIGN=1;
	$USE_COGNATES=$c;
	print STDERR "use c=$USE_COGNATES";
	&parse_bitext($srcfile,$trgfile,$srcdata,$trgdata,$first,$last);
	##
	## TODO: Do I need to set starting time and to sort tie frames here again?
	##
	my @newalg=();
	my $new=best_align($srcfile,$trgfile,
			   $srcdata,$trgdata,
			   $first,$last,\@newalg);
	if ($new>$best){
	    print STDERR "--> best ($new)";
	    $best=$new;
	    @{$alg}=@newalg;
	}
	print STDERR "\n";
    }
    return $best;
}





sub align_srt{
    my ($src,$trg,$alg,$types)=@_;

    my %srcalign = ();
    my %trgalign = ();

    my %DIST;
    $DIST{0}{0} = 1;
    $DIST{0}{1} = 1;
    $DIST{1}{0} = 1;
#    $DIST{1}{1} = 1;
    $DIST{0}{2} = 1;
    $DIST{2}{0} = 1;
#    $DIST{1}{2} = 1;
#    $DIST{2}{1} = 1;
#    $DIST{0}{3} = 1;
#    $DIST{3}{0} = 1;
#    $DIST{1}{3} = 1;
#    $DIST{3}{1} = 1;
#    $DIST{2}{3} = 1;
#    $DIST{3}{2} = 1;


    my $s = 0;
    my $t = 0;

    while($s<=$#{$src} && $t<=$#{$trg}) {

	my ($srcbefore,$trgbefore,
	    $srcafter,$trgafter,
	    $common,$not_common) = &overlap($src->[$s]->{start},
					    $src->[$s]->{end},
					    $trg->[$t]->{start},
					    $trg->[$t]->{end});

	my $idx=$#{$alg}+1;

	if ($common<=0 && $srcbefore){
#	if ($srcbefore > $common+$srcafter){
	    $alg->[$idx]->{trg}=[];
	    $alg->[$idx]->{src}->[0]=$src->[$s]->{id};
	    $s++;
	    $$types{'1:0'}++;
	    $$types{empty}++;
	    next;

lib/Text/SRT/Align.pm  view on Meta::CPAN


    my $common = $common_end - $common_start;

#    print STDERR "    common: $common\n";
#    print STDERR "not common: $not_common\n";

    return ($srcbefore,$trgbefore,
	    $srcafter,$trgafter,
	    $common,$not_common);

}





sub ReadDictionary{
    my ($dic,$file,$inverse)=@_;
    return 1 if (exists $LOADED_DICS{$file});
    if (-e $file){
	if ($file=~/\.gz$/){
	    open DIC,"gzip -cd < $file |" || 
		die "cannot open dictionary file $file!\n";
	}
	else{
	    open DIC,"< $file " || die "cannot open dictionary file $file!\n";
	}
	binmode(DIC,":utf8");
	while (<DIC>){
	    chomp;

	    ## expect dictionary with only one-word items
	    ## and only two fields (source and target)
	    ##
	    # my ($src,$trg) = split(/\s/);
	    # $inverse ? $$dic{$trg}{$src}++ : $$dic{$src}{$trg}++;

	    ## accept also dic's with initial freq's/prob's
	    ##
	    my @f = split(/\s/);             # split on TAB
	    next unless ($#f==1 || $#f==5);  # expext 2 or 6 fields

	    ## expect src and trg token in certain fields
	    my ($src,$trg) = $#f==1 ? @f : ($f[2],$f[3]);

	    ## TODO: do we want to store prob's or freq's if they exist?
	    ##       (but they are not used at the moment for 
	    ##        ranking lexical matches anyway)

	    ## store lexical items, possibly in reversed order
	    $inverse ? $$dic{$trg}{$src}++ : $$dic{$src}{$trg}++;
	}
	$LOADED_DICS{$file} = 1;
    }
}




sub parse_bitext{
    my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last)=@_;


    ## first and last sentences (size = WINDOW)
    my $srcfirst=[];
    my $srclast=[];
    my $trgfirst=[];
    my $trglast=[];

    print STDERR "\n" if $VERBOSE;

    my $src_count=0;
    my $trg_count=0;

    if ($StoreXML && (exists $StoredXML{$srcfile})){
	# print STDERR "retrieve $srcfile ...\n";
	@{$srcdata} = @{ clone($StoredXML{$srcfile}{data}) };
	@{$srcfirst} = @{ clone($StoredXML{$srcfile}{first}) };
	@{$srclast} = @{ clone($StoredXML{$srcfile}{last}) };
	%{$srcfreq} = %{ clone($StoredXML{$srcfile}{freq}) };
	$src_count = $StoredXML{$srcfile}{count};
    }
    else{
	my ($src_fh,$src_ph) = init_parser($srcfile,$srcdata);

	$srcfreq = $src_ph->{WORDFREQ};

	## parse through source language text
	while (&ReadNextSentence($src_fh,$src_ph)){
#	    next unless (@{$src_ph->{WORDS}});
	    if (@{$srcfirst} < $WINDOW ){
		my $idx = scalar @{$srcfirst};
		if (@{$src_ph->{WORDS}}){
		    @{$srcfirst->[$idx]} = @{$src_ph->{WORDS}->[-1]};
		}
		else{@{$srcfirst->[$idx]}=();}
	    }
	    my $idx = scalar @{$srclast};
	    if (@{$src_ph->{WORDS}}){
		@{$srclast->[$idx]} = @{$src_ph->{WORDS}->[-1]};
		@{$src_ph->{WORDS}->[-1]} = undef;
	    }
	    else{@{$srclast->[$idx]}=();}
	    if (@{$srclast} > $WINDOW ){
		shift (@{$srclast});
	    }
	    $src_count++;
	}
	## store the data if flag is set
	if ($StoreXML){
	    # print STDERR "store $srcfile ...\n";
	    $StoredXML{$srcfile}{data}  = clone($srcdata);
	    $StoredXML{$srcfile}{first} = clone($srcfirst);
	    $StoredXML{$srcfile}{last}  = clone($srclast);
	    $StoredXML{$srcfile}{freq}  = clone($srcfreq);
	    $StoredXML{$srcfile}{count}  = $src_count;
	}
    }

    if ($StoreXML && (exists $StoredXML{$trgfile})){
	# print STDERR "retrieve $trgfile ...\n";
	@{$trgdata} = @{ clone($StoredXML{$trgfile}{data}) };
	@{$trgfirst} = @{ clone($StoredXML{$trgfile}{first}) };
	@{$trglast} = @{ clone($StoredXML{$trgfile}{last}) };
	%{$trgfreq} = %{ clone($StoredXML{$trgfile}{freq}) };
	$trg_count = $StoredXML{$trgfile}{count};
    }
    else{

	my ($trg_fh,$trg_ph) = init_parser($trgfile,$trgdata);

	$trgfreq = $trg_ph->{WORDFREQ};

	## parse through target language text
	while (ReadNextSentence($trg_fh,$trg_ph)){
#	    next unless (@{$trg_ph->{WORDS}});
	    if (@{$trgfirst} < $WINDOW ){
		my $idx = scalar @{$trgfirst};
		if (@{$trg_ph->{WORDS}}){
		    @{$trgfirst->[$idx]} = @{$trg_ph->{WORDS}->[-1]};
		}
		else{@{$trgfirst->[$idx]}=();}
	    }
	    my $idx = scalar @{$trglast};
	    if (@{$trg_ph->{WORDS}}){
		@{$trglast->[$idx]} = @{$trg_ph->{WORDS}->[-1]};
		@{$trg_ph->{WORDS}->[-1]} = undef;
	    }
	    else{@{$trglast->[$idx]}=();}
	    if (@{$trglast} > $WINDOW ){
		shift (@{$trglast});
	    }
	    $trg_count++;
	}
	## store the data if flag is set
	if ($StoreXML){
	    # print STDERR "store $trgfile ...\n";
	    $StoredXML{$trgfile}{data} = clone($trgdata);
	    $StoredXML{$trgfile}{first} = clone($trgfirst);
	    $StoredXML{$trgfile}{last}  = clone($trglast);
	    $StoredXML{$trgfile}{freq}  = clone($trgfreq);
	    $StoredXML{$trgfile}{count}  = $trg_count;
	}
    }

    # find matches in initial windows
#    my %first=();
    foreach my $s (0..$WINDOW-1){
	foreach my $t (0..$WINDOW-1){
	    if (my $score = find_match($srcfirst->[$s],$trgfirst->[$t])){
#		$score/=($s+$t)+2;
		print STDERR "in $s:$t ($score)\n" if $VERBOSE;
#		$$first{"$s:$t"}=$score;
		$$first{"$s:$t"}=1/($s+$t+2);
	    }
	}
    }

    # find matches in final windows
#    my %last=();
    foreach my $s (0..$WINDOW-1){
	foreach my $t (0..$WINDOW-1){
	    if (my $score = find_match($srclast->[$s],$trglast->[$t])){
		my $src = $src_count-$WINDOW+$s;
		my $trg = $trg_count-$WINDOW+$t;
#		$score/=(2*$WINDOW-$s-$t);
		print STDERR "in $src:$trg ($score)\n" if $VERBOSE;
#		$$last{"$src:$trg"}=$score;
		$$last{"$src:$trg"}=1/(2*$WINDOW-$s-$t);
	    }
	}
    }
}




sub use_anchor_points{

    my ($srcdata,$trgdata,$first,$last)=@_;

    my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
    my @sortlast  = sort {$$last{$b} <=> $$last{$a} } keys %{$last};

    ## I need at least 2 reference points!

    if (@sortfirst && @sortlast){
	my @fixpoints = ($sortfirst[0],$sortlast[0]);

	## use only the first and the last one
	if ($VERBOSE){
	    print STDERR "use $fixpoints[0] and $fixpoints[1] as reference\n";
	}

	## compute slope and offset for this movie
	my ($slope,$offset) = ComputeOffset(\@fixpoints,$srcdata,$trgdata);
	print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
	if ($slope<=0){
	    print STDERR "strange scaling factor -> ignore\n";
	    delete $last->{$sortlast[0]};
	    return use_anchor_points($srcdata,$trgdata,$first,$last);
	}
	## re-scale source language subtitles
#	set_sent_times($srcdata,$slope,$offset);
	synchronize($srcdata,$slope,$offset);
    }
}


sub fit_hard_boundaries{
    my ($hardstr,$src,$trg)=@_;
    my @pairs = split(/\+/,$hardstr);

    my %SrcIdx=();
    foreach my $i (0..$#{$src}){
	$SrcIdx{$src->[$i]->{id}}=$i;
    }
    my %TrgIdx=();
    foreach my $i (0..$#{$trg}){
	$TrgIdx{$trg->[$i]->{id}}=$i;
    }

    my @matches=();
    foreach (@pairs){
	my ($src,$trg) = split(/\:/);
	push (@matches,$SrcIdx{$src}.':'.$TrgIdx{$trg});
    }

    if (@matches > 1){

	## use only the first and the last one
	@matches=($matches[0],$matches[-1]);
	if ($VERBOSE){
	    print STDERR "use $matches[0] and $matches[-1] as reference\n";
	}

	## compute slope and offset for this movie
	my ($slope,$offset) = ComputeOffset(\@matches,$src,$trg);
	print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
	while (($slope<=0) && (@matches > 1)){
	    print STDERR "strange scaling factor -> ignore\n";
	    pop(@matches);
	    return 0 if (@matches==0);
	    ($slope,$offset) = ComputeOffset(\@matches,$src,$trg);
	    print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
	}
	## re-scale source language subtitles
#	set_sent_times($src,$slope,$offset);
	synchronize($src,$slope,$offset);
    }
}


sub ComputeOffset{
    my ($matches,$srcdata,$trgdata) = @_;

    my @params=();
    return AverageOffset(\@params) unless (ref($srcdata) eq 'ARRAY');
    return AverageOffset(\@params) unless (ref($trgdata) eq 'ARRAY');

    foreach my $i (0..$#{$matches}){
	foreach my $j ($i+1..$#{$matches}){
	    my ($s1,$t1) = split(/:/,$$matches[$i]);
	    my ($s2,$t2) = split(/:/,$$matches[$j]);

	    next unless (exists $srcdata->[$s1]);
	    next unless (exists $srcdata->[$s2]);
	    next unless (exists $trgdata->[$t1]);
	    next unless (exists $trgdata->[$t2]);

#	    my $x1=$srcdata->[$s1]->{start};
#	    my $y1=$trgdata->[$t1]->{start};
#	    my $x2=$srcdata->[$s2]->{start};
#	    my $y2=$trgdata->[$t2]->{start};

	    my $x1=$srcdata->[$s1]->{end};
	    my $y1=$trgdata->[$t1]->{end};
	    my $x2=$srcdata->[$s2]->{end};
	    my $y2=$trgdata->[$t2]->{end};

#	    print STDERR "fit line from $x1:$y1 to $x2:$y2\n" if $VERBOSE;
	    my ($slope,$offset)=FitLine($x1,$y1,$x2,$y2);
#	    print STDERR "time factor=$slope, offset=$offset\n" if $VERBOSE;
	    push (@params,($slope,$offset));
	}
    }
    return AverageOffset(\@params);
}

sub FitLine{
    my ($x1,$y1,$x2,$y2)=@_;

    if ($x1-$x2 != 0){
	my $a = ($y1-$y2)/($x1-$x2);
	my $b = $y2-$x2*$a;
	return ($a,$b);
    }
    return (1,0);
}


sub AverageOffset{
    my $data=shift;

    my $sum1=0;
    my $sum2=0;

    my $total=($#{$data}+1)/2;

    while (@{$data}){
	$sum1+=shift(@{$data});
	$sum2+=shift(@{$data});
    }
    if ($total>0){
	return ($sum1/$total,$sum2/$total);
    }
    return (1,0);
}




sub FindWordMatches{
    my ($src,$srcstart,$srcend,$trg,$trgstart,$trgend)=@_;

    foreach my $d (0..$WINDOW){
	foreach my $i (0..$WINDOW){
	    my $s = $srcstart+$i;
	    my $t = $trgstart+$i+$d;
	    if ($s <= $srcend && $t <= $trgend){
		if (find_match($src->[$s],$trg->[$t])){
		    foreach ($srcstart..$s){$src->[$_]=undef;}
		    foreach ($trgstart..$t){$trg->[$_]=undef;}
		    return ($s,$t);
		}
	    }
	    $s = $srcstart+$i+$d;
	    $t = $trgstart+$i;

lib/Text/SRT/Align.pm  view on Meta::CPAN

	    else{
		$sent->[$s]->{first} = $sent->[$s]->{first}-0.0000000001;
	    }
	}

	## last time tag is not at sentence end!
	## --> interpolate
	if ($sent->[$s]->{last_pos} != $sent->[$s]->{end_pos}){
	    my $char=$sent->[$s]->{last_pos}-$sent->[$s]->{first_pos};
	    my $time=$sent->[$s]->{last}-$sent->[$s]->{first};

	    if (not $char){
		print STDERR "strange?!?\n";
	    }

	    my $diff=$sent->[$s]->{end_pos}-$sent->[$s]->{last_pos};
	    if ($char*$diff){
		$sent->[$s]->{last} = $sent->[$s]->{last} + $time/$char*$diff;
	    }
	    else{
		$sent->[$s]->{last} = $sent->[$s]->{last} + 0.0000000001;
	    }
	}

	$sent->[$s]->{start} = $scale * $sent->[$s]->{first}+$offset;
	$sent->[$s]->{end} = $scale * $sent->[$s]->{last}+$offset;
    }

    ## take care of some special cases where the time slot is 0
    ## (or even negative)
    ## --> just change the start time to be a milisecond before end time

    foreach my $s (0..$#{$sent}){
	if ($sent->[$s]->{start} >= $sent->[$s]->{end}){
	    $sent->[$s]->{start} = $sent->[$s]->{end} - 0.00000001;
	}
    }

}


=head2 C<@newtimeframes = sort_time_frames( \@oldtimeframes )>

Sort time frames by their starting time.
(This is necessary because some subtitles do not list 
the frames in chronolgical order.)

=cut


sub sort_time_frames{
    my $sent = shift;
    my @sorted = ();
    foreach my $s (sort {$$sent[$a]{start} <=> $$sent[$b]{start}} 
		   0..$#{$sent}){
	push(@sorted,$$sent[$s]);
    }
    return @sorted;
}

=head2 C<time_overlap( \@srcdata, \@trgdata )>

Compute the proportion of overlapping in time between two sets of subtitles.
Returns overlap-ratio = common-time / ( common-time + different-time )

This is similar to time_overlap_ratio but uses the time frames from
subtitle data structures that may be synchronized using lexical anchors.

=cut

sub time_overlap{
    my ($srcdata,$trgdata) = @_;
    my @srctime = ();
    my @trgtime = ();
    foreach (0..$#{$srcdata}){
	push(@srctime,$$srcdata[$_]{start});
	push(@srctime,$$srcdata[$_]{end});
    }
    foreach (0..$#{$trgdata}){
	push(@trgtime,$$trgdata[$_]{start});
	push(@trgtime,$$trgdata[$_]{end});
    }
    return &time_overlap_ratio(\@srctime,\@trgtime);
}

=head2 C<time_overlap_ratio( \@timeframes1, \@timeframes2 )>

Compute the proportion of overlapping in time between two sets of times frames.
Returns overlap-ratio = common-time / ( common-time + different-time )

=cut


sub time_overlap_ratio{
    my ($frames1,$frames2)=@_;
    my $common=0;
    my $diff=0;

    my @time1=@{$frames1};
    my @time2=@{$frames2};

    my $start1=shift(@time1);
    my $end1=shift(@time1);

    my $start2=shift(@time2);
    my $end2=shift(@time2);

    ## TODO: should we skip extra frames in the beginning?
    ## (need to do do that here in that case)

    while ($end1 && $end2){

	# sub1 frame is completely before sub2 frame
	if ($end1 < $start2){
	    $diff+=($end1-$start1);
	    $start1=shift(@time1);
	    $end1=shift(@time1);
	    next;
	}
	# sub2 frame is completely before sub1 frame
	if ($end2 && ($end2 < $start1)){
	    $diff+=($end2-$start2);
	    $start2=shift(@time2);
	    $end2=shift(@time2);
	    next;
	}

	my $CommonStart;
	# sub1 frame starts before sub2 frame
	if ($start1 < $start2){
	    $diff+=($start2-$start1);
	    $CommonStart=$start2;
	}
	# sub2 frame starts before sub1 frame
	else{
	    $diff+=($start1-$start2);
	    $CommonStart=$start1;



( run in 3.164 seconds using v1.01-cache-2.11-cpan-df04353d9ac )