Text-SRT-Align
view release on metacpan or search on metacpan
lib/Text/SRT/Align.pm view on Meta::CPAN
&initialize_dictionary( $options{SOURCE_LANG},
$options{TARGET_LANG} );
}
if (! -e $srcfile){$srcfile.='.gz';}
if (! -e $trgfile){$trgfile.='.gz';}
if (! -e $srcfile){die "$srcfile doesn't exist!\n";}
if (! -e $trgfile){die "$trgfile doesn't exist!\n";}
my @srcdata=();
my @trgdata=();
$srcfreq=undef;
$trgfreq=undef;
my %first=(); # word matches in initial part of the move
my %last=(); # matches in final part of the movie
print STDERR "parse '$srcfile' & '$trgfile' ... " if ($VERBOSE);
&parse_bitext($srcfile,$trgfile,\@srcdata,\@trgdata,\%first,\%last);
print STDERR "ok!\n" if ($VERBOSE);
## fix start and end times (without scaling and offsets)
&set_sent_times(\@srcdata);
&set_sent_times(\@trgdata);
## sort time frames by their starting time
## (strangely enough some subtitles are not chronogologically sorted)
@srcdata = &sort_time_frames(\@srcdata);
@trgdata = &sort_time_frames(\@trgdata);
if (defined $options{HARD_BOUNDARIES}){
&fit_hard_boundaries($options{HARD_BOUNDARIES},\@srcdata,\@trgdata);
}
my $score=undef;
my $baseScore=undef;
print STDERR "align sentences ... " if ($VERBOSE);
if ($COGNATE_RANGE){
if ($VERBOSE){
my $overlap = &time_overlap(\@srcdata,\@trgdata);
print STDERR "time overlap before = $overlap\n"
}
$score = &cognate_align($srcfile,$trgfile,
\@srcdata,\@trgdata,
\%first,\%last,$alignment);
}
if ($options{BEST_ALIGN}){
if ($VERBOSE){
my $overlap = &time_overlap(\@srcdata,\@trgdata);
print STDERR "time overlap before = $overlap\n";
}
($score,$baseScore) =
&best_align($srcfile,$trgfile,
\@srcdata,\@trgdata,
\%first,\%last,$alignment);
}
else{
$score = &standard_align(\@srcdata,\@trgdata,\%first,\%last,$alignment);
}
my $overlap = &time_overlap(\@srcdata,\@trgdata);
print STDERR "time overlap = $overlap\n" if ($VERBOSE);
print STDERR "done!\n" if ($VERBOSE);
if ($baseScore){
print STDERR "ratio = $score ($baseScore)\n" if ($VERBOSE);
}
else {
print STDERR "ratio = $score\n" if ($VERBOSE);
}
lib/Text/SRT/Align.pm view on Meta::CPAN
################################################################################
sub best_align{
my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;
my %types;
align_srt($srcdata,$trgdata,$alg,\%types);
my $bestratio = $SCORE_PROPORTION ?
($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
($types{nonempty}+1)/($types{empty}+1);
print STDERR "\nratio = " if $VERBOSE;
print STDERR $bestratio if $VERBOSE;
print STDERR "\n" if $VERBOSE;
my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
my @sortlast = sort {$$last{$b} <=> $$last{$a} } keys %{$last};
lib/Text/SRT/Align.pm view on Meta::CPAN
foreach my $lf (@sortlast){
my @anchor = ($sf,$lf);
## use only the first and the last one
if ($VERBOSE){
print STDERR "use $anchor[0] and $anchor[1] as reference\n";
}
## compute slope and offset for this movie
my ($slope,$offset) = ComputeOffset(\@anchor,$srcdata,$trgdata);
print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
if ($slope<=0){
print STDERR "strange scaling factor -> ignore\n";
next;
}
## re-scale source language subtitles
# set_sent_times($srcdata,$slope,$offset);
synchronize($srcdata,$slope,$offset);
my %types=();
my @newalg=();
align_srt($srcdata,$trgdata,\@newalg,\%types);
my $newratio = $SCORE_PROPORTION ?
($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
($types{nonempty}+1)/($types{empty}+1);
print STDERR "ratio = " if $VERBOSE;
print STDERR $newratio if $VERBOSE;
if ($newratio > $bestratio){
@{$alg} = @newalg;
$bestratio = $newratio;
print STDERR " ---> best!" if $VERBOSE;
}
lib/Text/SRT/Align.pm view on Meta::CPAN
print `$UPLUG $FALLBACK -src $srcfile -trg $trgfile`;
exit;
}
}
return ($bestratio,$standard);
}
sub standard_align{
my ($srcdata,$trgdata,$first,$last,$alg)=@_;
my %types;
align_srt($srcdata,$trgdata,$alg,\%types);
if ($types{empty}*2 > $types{nonempty}){
if (keys %{$first} && keys %{$last}){
&use_anchor_points($srcdata,$trgdata,$first,$last);
}
@{$alg} = ();
&align_srt($srcdata,$trgdata,$alg);
}
my $score = $SCORE_PROPORTION ?
($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
($types{nonempty}+1)/($types{empty}+1);
return $score;
}
sub cognate_align{
my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;
my $best;
for (my $c=1;$c>$COGNATE_RANGE;$c-=0.05){
$BEST_ALIGN=1;
$USE_COGNATES=$c;
print STDERR "use c=$USE_COGNATES";
&parse_bitext($srcfile,$trgfile,$srcdata,$trgdata,$first,$last);
##
## TODO: Do I need to set starting time and to sort tie frames here again?
##
my @newalg=();
my $new=best_align($srcfile,$trgfile,
$srcdata,$trgdata,
$first,$last,\@newalg);
if ($new>$best){
print STDERR "--> best ($new)";
$best=$new;
@{$alg}=@newalg;
}
print STDERR "\n";
}
return $best;
}
lib/Text/SRT/Align.pm view on Meta::CPAN
$inverse ? $$dic{$trg}{$src}++ : $$dic{$src}{$trg}++;
}
$LOADED_DICS{$file} = 1;
}
}
sub parse_bitext{
my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last)=@_;
## first and last sentences (size = WINDOW)
my $srcfirst=[];
my $srclast=[];
my $trgfirst=[];
my $trglast=[];
print STDERR "\n" if $VERBOSE;
my $src_count=0;
my $trg_count=0;
if ($StoreXML && (exists $StoredXML{$srcfile})){
# print STDERR "retrieve $srcfile ...\n";
@{$srcdata} = @{ clone($StoredXML{$srcfile}{data}) };
@{$srcfirst} = @{ clone($StoredXML{$srcfile}{first}) };
@{$srclast} = @{ clone($StoredXML{$srcfile}{last}) };
%{$srcfreq} = %{ clone($StoredXML{$srcfile}{freq}) };
$src_count = $StoredXML{$srcfile}{count};
}
else{
my ($src_fh,$src_ph) = init_parser($srcfile,$srcdata);
$srcfreq = $src_ph->{WORDFREQ};
## parse through source language text
while (&ReadNextSentence($src_fh,$src_ph)){
# next unless (@{$src_ph->{WORDS}});
if (@{$srcfirst} < $WINDOW ){
my $idx = scalar @{$srcfirst};
if (@{$src_ph->{WORDS}}){
@{$srcfirst->[$idx]} = @{$src_ph->{WORDS}->[-1]};
lib/Text/SRT/Align.pm view on Meta::CPAN
}
else{@{$srclast->[$idx]}=();}
if (@{$srclast} > $WINDOW ){
shift (@{$srclast});
}
$src_count++;
}
## store the data if flag is set
if ($StoreXML){
# print STDERR "store $srcfile ...\n";
$StoredXML{$srcfile}{data} = clone($srcdata);
$StoredXML{$srcfile}{first} = clone($srcfirst);
$StoredXML{$srcfile}{last} = clone($srclast);
$StoredXML{$srcfile}{freq} = clone($srcfreq);
$StoredXML{$srcfile}{count} = $src_count;
}
}
if ($StoreXML && (exists $StoredXML{$trgfile})){
# print STDERR "retrieve $trgfile ...\n";
@{$trgdata} = @{ clone($StoredXML{$trgfile}{data}) };
lib/Text/SRT/Align.pm view on Meta::CPAN
}
}
}
}
sub use_anchor_points{
my ($srcdata,$trgdata,$first,$last)=@_;
my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
my @sortlast = sort {$$last{$b} <=> $$last{$a} } keys %{$last};
## I need at least 2 reference points!
if (@sortfirst && @sortlast){
my @fixpoints = ($sortfirst[0],$sortlast[0]);
## use only the first and the last one
if ($VERBOSE){
print STDERR "use $fixpoints[0] and $fixpoints[1] as reference\n";
}
## compute slope and offset for this movie
my ($slope,$offset) = ComputeOffset(\@fixpoints,$srcdata,$trgdata);
print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
if ($slope<=0){
print STDERR "strange scaling factor -> ignore\n";
delete $last->{$sortlast[0]};
return use_anchor_points($srcdata,$trgdata,$first,$last);
}
## re-scale source language subtitles
# set_sent_times($srcdata,$slope,$offset);
synchronize($srcdata,$slope,$offset);
}
}
sub fit_hard_boundaries{
my ($hardstr,$src,$trg)=@_;
my @pairs = split(/\+/,$hardstr);
my %SrcIdx=();
foreach my $i (0..$#{$src}){
lib/Text/SRT/Align.pm view on Meta::CPAN
print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
}
## re-scale source language subtitles
# set_sent_times($src,$slope,$offset);
synchronize($src,$slope,$offset);
}
}
sub ComputeOffset{
my ($matches,$srcdata,$trgdata) = @_;
my @params=();
return AverageOffset(\@params) unless (ref($srcdata) eq 'ARRAY');
return AverageOffset(\@params) unless (ref($trgdata) eq 'ARRAY');
foreach my $i (0..$#{$matches}){
foreach my $j ($i+1..$#{$matches}){
my ($s1,$t1) = split(/:/,$$matches[$i]);
my ($s2,$t2) = split(/:/,$$matches[$j]);
next unless (exists $srcdata->[$s1]);
next unless (exists $srcdata->[$s2]);
next unless (exists $trgdata->[$t1]);
next unless (exists $trgdata->[$t2]);
# my $x1=$srcdata->[$s1]->{start};
# my $y1=$trgdata->[$t1]->{start};
# my $x2=$srcdata->[$s2]->{start};
# my $y2=$trgdata->[$t2]->{start};
my $x1=$srcdata->[$s1]->{end};
my $y1=$trgdata->[$t1]->{end};
my $x2=$srcdata->[$s2]->{end};
my $y2=$trgdata->[$t2]->{end};
# print STDERR "fit line from $x1:$y1 to $x2:$y2\n" if $VERBOSE;
my ($slope,$offset)=FitLine($x1,$y1,$x2,$y2);
# print STDERR "time factor=$slope, offset=$offset\n" if $VERBOSE;
push (@params,($slope,$offset));
}
}
return AverageOffset(\@params);
}
lib/Text/SRT/Align.pm view on Meta::CPAN
sub sort_time_frames{
my $sent = shift;
my @sorted = ();
foreach my $s (sort {$$sent[$a]{start} <=> $$sent[$b]{start}}
0..$#{$sent}){
push(@sorted,$$sent[$s]);
}
return @sorted;
}
=head2 C<time_overlap( \@srcdata, \@trgdata )>
Compute the proportion of overlapping in time between two sets of subtitles.
Returns overlap-ratio = common-time / ( common-time + different-time )
This is similar to time_overlap_ratio but uses the time frames from
subtitle data structures that may be synchronized using lexical anchors.
=cut
sub time_overlap{
my ($srcdata,$trgdata) = @_;
my @srctime = ();
my @trgtime = ();
foreach (0..$#{$srcdata}){
push(@srctime,$$srcdata[$_]{start});
push(@srctime,$$srcdata[$_]{end});
}
foreach (0..$#{$trgdata}){
push(@trgtime,$$trgdata[$_]{start});
push(@trgtime,$$trgdata[$_]{end});
}
return &time_overlap_ratio(\@srctime,\@trgtime);
}
=head2 C<time_overlap_ratio( \@timeframes1, \@timeframes2 )>
( run in 0.790 second using v1.01-cache-2.11-cpan-454fe037f31 )