Text-SRT-Align
view release on metacpan or search on metacpan
lib/Text/SRT/Align.pm view on Meta::CPAN
=cut
sub align{
my $srcfile = shift;
my $trgfile = shift;
my $alignment = shift || [];
my %options = @_;
# make sure we print error messages in UTF8 ....
# TODO: should we change it back after running the alignment procedure?
binmode(STDERR,":utf8");
# set some global variables
# TODO: can we skip all these global settings?
$VERBOSE = $options{VERBOSE} if (defined $options{VERBOSE});
# variables enabling lexical matching for time synchronization
# - match entries from bilingual dictionaries
# - match idenical tokens
# - match "cognates" (using string similarity measures)
$USE_DICTIONARY = $options{USE_DICTIONARY} if (defined $options{USE_DICTIONARY});
$USE_IDENTICAL = $options{USE_IDENTICAL} if (defined $options{USE_IDENTICAL});
$USE_COGNATES = $options{USE_COGNATES} if (defined $options{USE_COGNATES});
# parameters for lexical matching
$COGNATE_RANGE = $options{COGNATE_RANGE} if (defined $options{COGNATE_RANGE});
$UPPER_CASE = $options{UPPER_CASE} if (defined $options{UPPER_CASE});
$TOK_LEN = $options{TOK_LEN} if (defined $options{TOK_LEN});
$MINLENGTH = $options{MINLENGTH} if (defined $options{MINLENGTH});
$USE_WORDFREQ = $options{USE_WORDFREQ} if (defined $options{USE_WORDFREQ});
$CHAR_SET = $options{CHAR_SET} if (defined $options{CHAR_SET});
# window size for finding lexical matches (beginning and end of file)
$WINDOW = $options{WINDOW} if (defined $options{WINDOW});
$MAX_MATCHES = $options{MAX_MATCHES} if (defined $options{MAX_MATCHES});
# scoring function
$SCORE_PROPORTION = $options{SCORE_PROPORTION} if (defined $options{SCORE_PROPORTION});
# TODO: is it OK if we never reset the dictionary?
# (will be used again if align is called multiple times)
if ($USE_DICTIONARY){
&ReadDictionary(\%DIC,$USE_DICTIONARY);
}
elsif ($options{SOURCE_LANG} && $options{TARGET_LANG}){
$options{BEST_ALIGN} =
&initialize_dictionary( $options{SOURCE_LANG},
$options{TARGET_LANG} );
}
if (! -e $srcfile){$srcfile.='.gz';}
if (! -e $trgfile){$trgfile.='.gz';}
if (! -e $srcfile){die "$srcfile doesn't exist!\n";}
if (! -e $trgfile){die "$trgfile doesn't exist!\n";}
my @srcdata=();
my @trgdata=();
$srcfreq=undef;
$trgfreq=undef;
my %first=(); # word matches in initial part of the move
my %last=(); # matches in final part of the movie
print STDERR "parse '$srcfile' & '$trgfile' ... " if ($VERBOSE);
&parse_bitext($srcfile,$trgfile,\@srcdata,\@trgdata,\%first,\%last);
print STDERR "ok!\n" if ($VERBOSE);
## fix start and end times (without scaling and offsets)
&set_sent_times(\@srcdata);
&set_sent_times(\@trgdata);
## sort time frames by their starting time
## (strangely enough some subtitles are not chronogologically sorted)
@srcdata = &sort_time_frames(\@srcdata);
@trgdata = &sort_time_frames(\@trgdata);
if (defined $options{HARD_BOUNDARIES}){
&fit_hard_boundaries($options{HARD_BOUNDARIES},\@srcdata,\@trgdata);
}
my $score=undef;
my $baseScore=undef;
print STDERR "align sentences ... " if ($VERBOSE);
if ($COGNATE_RANGE){
if ($VERBOSE){
my $overlap = &time_overlap(\@srcdata,\@trgdata);
print STDERR "time overlap before = $overlap\n"
}
$score = &cognate_align($srcfile,$trgfile,
\@srcdata,\@trgdata,
\%first,\%last,$alignment);
}
if ($options{BEST_ALIGN}){
if ($VERBOSE){
my $overlap = &time_overlap(\@srcdata,\@trgdata);
print STDERR "time overlap before = $overlap\n";
}
($score,$baseScore) =
&best_align($srcfile,$trgfile,
\@srcdata,\@trgdata,
\%first,\%last,$alignment);
}
else{
$score = &standard_align(\@srcdata,\@trgdata,\%first,\%last,$alignment);
}
my $overlap = &time_overlap(\@srcdata,\@trgdata);
print STDERR "time overlap = $overlap\n" if ($VERBOSE);
print STDERR "done!\n" if ($VERBOSE);
if ($baseScore){
print STDERR "ratio = $score ($baseScore)\n" if ($VERBOSE);
}
else {
print STDERR "ratio = $score\n" if ($VERBOSE);
}
return wantarray ? ($score,$overlap) : $score;
}
=head2 C<initialize_dictionary( $srclang, $trglang )>
Load the provided dictionary if it exists for the given language pair.
Return 1 if it exists and could be loaded. Return 0 otherwise.
=cut
## NOTE: this resets the dictionary and removes existing entries in %DIC
## but only if the shared dic exists!
## alias for initialize_dictionary
sub initialize_lexicon{
return initialize_dictionary(@_);
}
sub initialize_dictionary{
my ($srclang,$trglang) = @_;
# make a three-letter language code
if (length($srclang) == 2){
$srclang = language_code2code($srclang, 'alpha-2', 'alpha-3');
}
if (length($trglang) == 2){
$trglang = language_code2code($trglang, 'alpha-2', 'alpha-3');
}
my $SharedHome = &dist_dir('Text-SRT-Align');
if (-e "$SharedHome/dic/$srclang-$trglang"){
%DIC=();
%LOADED_DICS=();
$USE_DICTIONARY = "$SharedHome/dic/$srclang-$trglang";
&ReadDictionary(\%DIC,$USE_DICTIONARY);
return 1 if (keys %DIC);
}
# inverse dictionary
if (-e "$SharedHome/dic/$trglang-$srclang"){
%DIC=();
%LOADED_DICS=();
$USE_DICTIONARY = "$SharedHome/dic/$trglang-$srclang";
&ReadDictionary(\%DIC,$USE_DICTIONARY,1);
return 1 if (keys %DIC);
}
}
lib/Text/SRT/Align.pm view on Meta::CPAN
=cut
sub load_lexicon{
my ($dicfile,$inverse) = @_;
&ReadDictionary(\%DIC,$dicfile,$inverse);
return 1 if (keys %DIC);
}
=head2 C<print_ces( $srcfile, $trgfile, \@alignments )>
Print the sentence alignments in XCES Align format.
=cut
sub print_ces{
my ($src,$trg,$alg,$meta,$fh)=@_;
$fh = *STDOUT unless $fh;
print $fh '<?xml version="1.0" encoding="utf-8"?>'."\n";
print $fh '<!DOCTYPE cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">'."\n";
print $fh '<cesAlign version="1.0">'."\n";
print $fh "<linkGrp targType=\"s\" fromDoc=\"$src\" toDoc=\"$trg\"";
if (ref($meta) eq 'HASH'){
foreach my $k (keys %{$meta}){
print $fh " $k=\"$$meta{$k}\"";
}
}
print $fh "\">\n";
foreach my $i (0..$#{$alg}){
print $fh "<link id=\"SL$i\" xtargets=\"";
if (ref($alg->[$i]->{src}) eq 'ARRAY'){
print $fh join(' ',@{$alg->[$i]->{src}});
}
print $fh ';';
if (ref($alg->[$i]->{trg}) eq 'ARRAY'){
print $fh join(' ',@{$alg->[$i]->{trg}});
}
print $fh "\" ";
if (exists $alg->[$i]->{overlap}){
printf $fh "overlap=\"%5.3f\" ",$alg->[$i]->{overlap};
}
print $fh "/>\n";
}
print $fh "</linkGrp>\n</cesAlign>\n";
}
################################################################################
sub best_align{
my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;
my %types;
align_srt($srcdata,$trgdata,$alg,\%types);
my $bestratio = $SCORE_PROPORTION ?
($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
($types{nonempty}+1)/($types{empty}+1);
print STDERR "\nratio = " if $VERBOSE;
print STDERR $bestratio if $VERBOSE;
print STDERR "\n" if $VERBOSE;
my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
my @sortlast = sort {$$last{$b} <=> $$last{$a} } keys %{$last};
if ($MAX_MATCHES){
@sortfirst = splice(@sortfirst,0,$MAX_MATCHES) if (@sortfirst > $MAX_MATCHES);
@sortlast = splice(@sortlast,0,$MAX_MATCHES) if (@sortlast > $MAX_MATCHES);
}
my $standard = $bestratio;
foreach my $sf (@sortfirst){
foreach my $lf (@sortlast){
my @anchor = ($sf,$lf);
## use only the first and the last one
if ($VERBOSE){
print STDERR "use $anchor[0] and $anchor[1] as reference\n";
}
## compute slope and offset for this movie
my ($slope,$offset) = ComputeOffset(\@anchor,$srcdata,$trgdata);
print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
if ($slope<=0){
print STDERR "strange scaling factor -> ignore\n";
next;
}
## re-scale source language subtitles
# set_sent_times($srcdata,$slope,$offset);
synchronize($srcdata,$slope,$offset);
my %types=();
my @newalg=();
align_srt($srcdata,$trgdata,\@newalg,\%types);
my $newratio = $SCORE_PROPORTION ?
($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
($types{nonempty}+1)/($types{empty}+1);
print STDERR "ratio = " if $VERBOSE;
print STDERR $newratio if $VERBOSE;
if ($newratio > $bestratio){
@{$alg} = @newalg;
$bestratio = $newratio;
print STDERR " ---> best!" if $VERBOSE;
}
print STDERR "\n" if $VERBOSE;
}
}
print STDERR "\n" if $VERBOSE;
if ($bestratio < 2){
if ($FALLBACK && (-e $UPLUG)){
print STDERR "best ratio < 2 --> fall back to $FALLBACK!\n";
print `$UPLUG $FALLBACK -src $srcfile -trg $trgfile`;
exit;
}
}
return ($bestratio,$standard);
}
sub standard_align{
my ($srcdata,$trgdata,$first,$last,$alg)=@_;
my %types;
align_srt($srcdata,$trgdata,$alg,\%types);
if ($types{empty}*2 > $types{nonempty}){
if (keys %{$first} && keys %{$last}){
&use_anchor_points($srcdata,$trgdata,$first,$last);
}
@{$alg} = ();
&align_srt($srcdata,$trgdata,$alg);
}
my $score = $SCORE_PROPORTION ?
($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
($types{nonempty}+1)/($types{empty}+1);
return $score;
}
sub cognate_align{
my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;
my $best;
for (my $c=1;$c>$COGNATE_RANGE;$c-=0.05){
$BEST_ALIGN=1;
$USE_COGNATES=$c;
print STDERR "use c=$USE_COGNATES";
&parse_bitext($srcfile,$trgfile,$srcdata,$trgdata,$first,$last);
##
## TODO: Do I need to set starting time and to sort tie frames here again?
##
my @newalg=();
my $new=best_align($srcfile,$trgfile,
$srcdata,$trgdata,
$first,$last,\@newalg);
if ($new>$best){
print STDERR "--> best ($new)";
$best=$new;
@{$alg}=@newalg;
}
print STDERR "\n";
}
return $best;
}
sub align_srt{
my ($src,$trg,$alg,$types)=@_;
my %srcalign = ();
my %trgalign = ();
my %DIST;
$DIST{0}{0} = 1;
$DIST{0}{1} = 1;
$DIST{1}{0} = 1;
# $DIST{1}{1} = 1;
$DIST{0}{2} = 1;
$DIST{2}{0} = 1;
# $DIST{1}{2} = 1;
# $DIST{2}{1} = 1;
# $DIST{0}{3} = 1;
# $DIST{3}{0} = 1;
# $DIST{1}{3} = 1;
# $DIST{3}{1} = 1;
# $DIST{2}{3} = 1;
# $DIST{3}{2} = 1;
my $s = 0;
my $t = 0;
while($s<=$#{$src} && $t<=$#{$trg}) {
my ($srcbefore,$trgbefore,
$srcafter,$trgafter,
$common,$not_common) = &overlap($src->[$s]->{start},
$src->[$s]->{end},
$trg->[$t]->{start},
$trg->[$t]->{end});
my $idx=$#{$alg}+1;
if ($common<=0 && $srcbefore){
# if ($srcbefore > $common+$srcafter){
$alg->[$idx]->{trg}=[];
$alg->[$idx]->{src}->[0]=$src->[$s]->{id};
$s++;
$$types{'1:0'}++;
$$types{empty}++;
next;
lib/Text/SRT/Align.pm view on Meta::CPAN
my $common = $common_end - $common_start;
# print STDERR " common: $common\n";
# print STDERR "not common: $not_common\n";
return ($srcbefore,$trgbefore,
$srcafter,$trgafter,
$common,$not_common);
}
sub ReadDictionary{
my ($dic,$file,$inverse)=@_;
return 1 if (exists $LOADED_DICS{$file});
if (-e $file){
if ($file=~/\.gz$/){
open DIC,"gzip -cd < $file |" ||
die "cannot open dictionary file $file!\n";
}
else{
open DIC,"< $file " || die "cannot open dictionary file $file!\n";
}
binmode(DIC,":utf8");
while (<DIC>){
chomp;
## expect dictionary with only one-word items
## and only two fields (source and target)
##
# my ($src,$trg) = split(/\s/);
# $inverse ? $$dic{$trg}{$src}++ : $$dic{$src}{$trg}++;
## accept also dic's with initial freq's/prob's
##
my @f = split(/\s/); # split on TAB
next unless ($#f==1 || $#f==5); # expext 2 or 6 fields
## expect src and trg token in certain fields
my ($src,$trg) = $#f==1 ? @f : ($f[2],$f[3]);
## TODO: do we want to store prob's or freq's if they exist?
## (but they are not used at the moment for
## ranking lexical matches anyway)
## store lexical items, possibly in reversed order
$inverse ? $$dic{$trg}{$src}++ : $$dic{$src}{$trg}++;
}
$LOADED_DICS{$file} = 1;
}
}
sub parse_bitext{
my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last)=@_;
## first and last sentences (size = WINDOW)
my $srcfirst=[];
my $srclast=[];
my $trgfirst=[];
my $trglast=[];
print STDERR "\n" if $VERBOSE;
my $src_count=0;
my $trg_count=0;
if ($StoreXML && (exists $StoredXML{$srcfile})){
# print STDERR "retrieve $srcfile ...\n";
@{$srcdata} = @{ clone($StoredXML{$srcfile}{data}) };
@{$srcfirst} = @{ clone($StoredXML{$srcfile}{first}) };
@{$srclast} = @{ clone($StoredXML{$srcfile}{last}) };
%{$srcfreq} = %{ clone($StoredXML{$srcfile}{freq}) };
$src_count = $StoredXML{$srcfile}{count};
}
else{
my ($src_fh,$src_ph) = init_parser($srcfile,$srcdata);
$srcfreq = $src_ph->{WORDFREQ};
## parse through source language text
while (&ReadNextSentence($src_fh,$src_ph)){
# next unless (@{$src_ph->{WORDS}});
if (@{$srcfirst} < $WINDOW ){
my $idx = scalar @{$srcfirst};
if (@{$src_ph->{WORDS}}){
@{$srcfirst->[$idx]} = @{$src_ph->{WORDS}->[-1]};
}
else{@{$srcfirst->[$idx]}=();}
}
my $idx = scalar @{$srclast};
if (@{$src_ph->{WORDS}}){
@{$srclast->[$idx]} = @{$src_ph->{WORDS}->[-1]};
@{$src_ph->{WORDS}->[-1]} = undef;
}
else{@{$srclast->[$idx]}=();}
if (@{$srclast} > $WINDOW ){
shift (@{$srclast});
}
$src_count++;
}
## store the data if flag is set
if ($StoreXML){
# print STDERR "store $srcfile ...\n";
$StoredXML{$srcfile}{data} = clone($srcdata);
$StoredXML{$srcfile}{first} = clone($srcfirst);
$StoredXML{$srcfile}{last} = clone($srclast);
$StoredXML{$srcfile}{freq} = clone($srcfreq);
$StoredXML{$srcfile}{count} = $src_count;
}
}
if ($StoreXML && (exists $StoredXML{$trgfile})){
# print STDERR "retrieve $trgfile ...\n";
@{$trgdata} = @{ clone($StoredXML{$trgfile}{data}) };
@{$trgfirst} = @{ clone($StoredXML{$trgfile}{first}) };
@{$trglast} = @{ clone($StoredXML{$trgfile}{last}) };
%{$trgfreq} = %{ clone($StoredXML{$trgfile}{freq}) };
$trg_count = $StoredXML{$trgfile}{count};
}
else{
my ($trg_fh,$trg_ph) = init_parser($trgfile,$trgdata);
$trgfreq = $trg_ph->{WORDFREQ};
## parse through target language text
while (ReadNextSentence($trg_fh,$trg_ph)){
# next unless (@{$trg_ph->{WORDS}});
if (@{$trgfirst} < $WINDOW ){
my $idx = scalar @{$trgfirst};
if (@{$trg_ph->{WORDS}}){
@{$trgfirst->[$idx]} = @{$trg_ph->{WORDS}->[-1]};
}
else{@{$trgfirst->[$idx]}=();}
}
my $idx = scalar @{$trglast};
if (@{$trg_ph->{WORDS}}){
@{$trglast->[$idx]} = @{$trg_ph->{WORDS}->[-1]};
@{$trg_ph->{WORDS}->[-1]} = undef;
}
else{@{$trglast->[$idx]}=();}
if (@{$trglast} > $WINDOW ){
shift (@{$trglast});
}
$trg_count++;
}
## store the data if flag is set
if ($StoreXML){
# print STDERR "store $trgfile ...\n";
$StoredXML{$trgfile}{data} = clone($trgdata);
$StoredXML{$trgfile}{first} = clone($trgfirst);
$StoredXML{$trgfile}{last} = clone($trglast);
$StoredXML{$trgfile}{freq} = clone($trgfreq);
$StoredXML{$trgfile}{count} = $trg_count;
}
}
# find matches in initial windows
# my %first=();
foreach my $s (0..$WINDOW-1){
foreach my $t (0..$WINDOW-1){
if (my $score = find_match($srcfirst->[$s],$trgfirst->[$t])){
# $score/=($s+$t)+2;
print STDERR "in $s:$t ($score)\n" if $VERBOSE;
# $$first{"$s:$t"}=$score;
$$first{"$s:$t"}=1/($s+$t+2);
}
}
}
# find matches in final windows
# my %last=();
foreach my $s (0..$WINDOW-1){
foreach my $t (0..$WINDOW-1){
if (my $score = find_match($srclast->[$s],$trglast->[$t])){
my $src = $src_count-$WINDOW+$s;
my $trg = $trg_count-$WINDOW+$t;
# $score/=(2*$WINDOW-$s-$t);
print STDERR "in $src:$trg ($score)\n" if $VERBOSE;
# $$last{"$src:$trg"}=$score;
$$last{"$src:$trg"}=1/(2*$WINDOW-$s-$t);
}
}
}
}
sub use_anchor_points{
my ($srcdata,$trgdata,$first,$last)=@_;
my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
my @sortlast = sort {$$last{$b} <=> $$last{$a} } keys %{$last};
## I need at least 2 reference points!
if (@sortfirst && @sortlast){
my @fixpoints = ($sortfirst[0],$sortlast[0]);
## use only the first and the last one
if ($VERBOSE){
print STDERR "use $fixpoints[0] and $fixpoints[1] as reference\n";
}
## compute slope and offset for this movie
my ($slope,$offset) = ComputeOffset(\@fixpoints,$srcdata,$trgdata);
print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
if ($slope<=0){
print STDERR "strange scaling factor -> ignore\n";
delete $last->{$sortlast[0]};
return use_anchor_points($srcdata,$trgdata,$first,$last);
}
## re-scale source language subtitles
# set_sent_times($srcdata,$slope,$offset);
synchronize($srcdata,$slope,$offset);
}
}
sub fit_hard_boundaries{
my ($hardstr,$src,$trg)=@_;
my @pairs = split(/\+/,$hardstr);
my %SrcIdx=();
foreach my $i (0..$#{$src}){
$SrcIdx{$src->[$i]->{id}}=$i;
}
my %TrgIdx=();
foreach my $i (0..$#{$trg}){
$TrgIdx{$trg->[$i]->{id}}=$i;
}
my @matches=();
foreach (@pairs){
my ($src,$trg) = split(/\:/);
push (@matches,$SrcIdx{$src}.':'.$TrgIdx{$trg});
}
if (@matches > 1){
## use only the first and the last one
@matches=($matches[0],$matches[-1]);
if ($VERBOSE){
print STDERR "use $matches[0] and $matches[-1] as reference\n";
}
## compute slope and offset for this movie
my ($slope,$offset) = ComputeOffset(\@matches,$src,$trg);
print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
while (($slope<=0) && (@matches > 1)){
print STDERR "strange scaling factor -> ignore\n";
pop(@matches);
return 0 if (@matches==0);
($slope,$offset) = ComputeOffset(\@matches,$src,$trg);
print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
}
## re-scale source language subtitles
# set_sent_times($src,$slope,$offset);
synchronize($src,$slope,$offset);
}
}
sub ComputeOffset{
my ($matches,$srcdata,$trgdata) = @_;
my @params=();
return AverageOffset(\@params) unless (ref($srcdata) eq 'ARRAY');
return AverageOffset(\@params) unless (ref($trgdata) eq 'ARRAY');
foreach my $i (0..$#{$matches}){
foreach my $j ($i+1..$#{$matches}){
my ($s1,$t1) = split(/:/,$$matches[$i]);
my ($s2,$t2) = split(/:/,$$matches[$j]);
next unless (exists $srcdata->[$s1]);
next unless (exists $srcdata->[$s2]);
next unless (exists $trgdata->[$t1]);
next unless (exists $trgdata->[$t2]);
# my $x1=$srcdata->[$s1]->{start};
# my $y1=$trgdata->[$t1]->{start};
# my $x2=$srcdata->[$s2]->{start};
# my $y2=$trgdata->[$t2]->{start};
my $x1=$srcdata->[$s1]->{end};
my $y1=$trgdata->[$t1]->{end};
my $x2=$srcdata->[$s2]->{end};
my $y2=$trgdata->[$t2]->{end};
# print STDERR "fit line from $x1:$y1 to $x2:$y2\n" if $VERBOSE;
my ($slope,$offset)=FitLine($x1,$y1,$x2,$y2);
# print STDERR "time factor=$slope, offset=$offset\n" if $VERBOSE;
push (@params,($slope,$offset));
}
}
return AverageOffset(\@params);
}
sub FitLine{
my ($x1,$y1,$x2,$y2)=@_;
if ($x1-$x2 != 0){
my $a = ($y1-$y2)/($x1-$x2);
my $b = $y2-$x2*$a;
return ($a,$b);
}
return (1,0);
}
sub AverageOffset{
my $data=shift;
my $sum1=0;
my $sum2=0;
my $total=($#{$data}+1)/2;
while (@{$data}){
$sum1+=shift(@{$data});
$sum2+=shift(@{$data});
}
if ($total>0){
return ($sum1/$total,$sum2/$total);
}
return (1,0);
}
sub FindWordMatches{
my ($src,$srcstart,$srcend,$trg,$trgstart,$trgend)=@_;
foreach my $d (0..$WINDOW){
foreach my $i (0..$WINDOW){
my $s = $srcstart+$i;
my $t = $trgstart+$i+$d;
if ($s <= $srcend && $t <= $trgend){
if (find_match($src->[$s],$trg->[$t])){
foreach ($srcstart..$s){$src->[$_]=undef;}
foreach ($trgstart..$t){$trg->[$_]=undef;}
return ($s,$t);
}
}
$s = $srcstart+$i+$d;
$t = $trgstart+$i;
lib/Text/SRT/Align.pm view on Meta::CPAN
else{
$sent->[$s]->{first} = $sent->[$s]->{first}-0.0000000001;
}
}
## last time tag is not at sentence end!
## --> interpolate
if ($sent->[$s]->{last_pos} != $sent->[$s]->{end_pos}){
my $char=$sent->[$s]->{last_pos}-$sent->[$s]->{first_pos};
my $time=$sent->[$s]->{last}-$sent->[$s]->{first};
if (not $char){
print STDERR "strange?!?\n";
}
my $diff=$sent->[$s]->{end_pos}-$sent->[$s]->{last_pos};
if ($char*$diff){
$sent->[$s]->{last} = $sent->[$s]->{last} + $time/$char*$diff;
}
else{
$sent->[$s]->{last} = $sent->[$s]->{last} + 0.0000000001;
}
}
$sent->[$s]->{start} = $scale * $sent->[$s]->{first}+$offset;
$sent->[$s]->{end} = $scale * $sent->[$s]->{last}+$offset;
}
## take care of some special cases where the time slot is 0
## (or even negative)
## --> just change the start time to be a milisecond before end time
foreach my $s (0..$#{$sent}){
if ($sent->[$s]->{start} >= $sent->[$s]->{end}){
$sent->[$s]->{start} = $sent->[$s]->{end} - 0.00000001;
}
}
}
=head2 C<@newtimeframes = sort_time_frames( \@oldtimeframes )>
Sort time frames by their starting time.
(This is necessary because some subtitles do not list
the frames in chronolgical order.)
=cut
sub sort_time_frames{
my $sent = shift;
my @sorted = ();
foreach my $s (sort {$$sent[$a]{start} <=> $$sent[$b]{start}}
0..$#{$sent}){
push(@sorted,$$sent[$s]);
}
return @sorted;
}
=head2 C<time_overlap( \@srcdata, \@trgdata )>
Compute the proportion of overlapping in time between two sets of subtitles.
Returns overlap-ratio = common-time / ( common-time + different-time )
This is similar to time_overlap_ratio but uses the time frames from
subtitle data structures that may be synchronized using lexical anchors.
=cut
sub time_overlap{
my ($srcdata,$trgdata) = @_;
my @srctime = ();
my @trgtime = ();
foreach (0..$#{$srcdata}){
push(@srctime,$$srcdata[$_]{start});
push(@srctime,$$srcdata[$_]{end});
}
foreach (0..$#{$trgdata}){
push(@trgtime,$$trgdata[$_]{start});
push(@trgtime,$$trgdata[$_]{end});
}
return &time_overlap_ratio(\@srctime,\@trgtime);
}
=head2 C<time_overlap_ratio( \@timeframes1, \@timeframes2 )>
Compute the proportion of overlapping in time between two sets of times frames.
Returns overlap-ratio = common-time / ( common-time + different-time )
=cut
sub time_overlap_ratio{
my ($frames1,$frames2)=@_;
my $common=0;
my $diff=0;
my @time1=@{$frames1};
my @time2=@{$frames2};
my $start1=shift(@time1);
my $end1=shift(@time1);
my $start2=shift(@time2);
my $end2=shift(@time2);
## TODO: should we skip extra frames in the beginning?
## (need to do do that here in that case)
while ($end1 && $end2){
# sub1 frame is completely before sub2 frame
if ($end1 < $start2){
$diff+=($end1-$start1);
$start1=shift(@time1);
$end1=shift(@time1);
next;
}
# sub2 frame is completely before sub1 frame
if ($end2 && ($end2 < $start1)){
$diff+=($end2-$start2);
$start2=shift(@time2);
$end2=shift(@time2);
next;
}
my $CommonStart;
# sub1 frame starts before sub2 frame
if ($start1 < $start2){
$diff+=($start2-$start1);
$CommonStart=$start2;
}
# sub2 frame starts before sub1 frame
else{
$diff+=($start1-$start2);
$CommonStart=$start1;
( run in 3.164 seconds using v1.01-cache-2.11-cpan-df04353d9ac )