Text-SRT-Align

 view release on metacpan or  search on metacpan

lib/Text/SRT/Align.pm  view on Meta::CPAN

	    &initialize_dictionary( $options{SOURCE_LANG},
				    $options{TARGET_LANG} );
    }

    if (! -e $srcfile){$srcfile.='.gz';}
    if (! -e $trgfile){$trgfile.='.gz';}

    if (! -e $srcfile){die "$srcfile doesn't exist!\n";}
    if (! -e $trgfile){die "$trgfile doesn't exist!\n";}

    my @srcdata=();
    my @trgdata=();

    $srcfreq=undef;
    $trgfreq=undef;

    my %first=();   # word matches in initial part of the move
    my %last=();    # matches in final part of the movie

    print STDERR "parse '$srcfile' & '$trgfile' ... " if ($VERBOSE);
    &parse_bitext($srcfile,$trgfile,\@srcdata,\@trgdata,\%first,\%last);
    print STDERR "ok!\n" if ($VERBOSE);

    ## fix start and end times (without scaling and offsets)
    &set_sent_times(\@srcdata);
    &set_sent_times(\@trgdata);

    ## sort time frames by their starting time
    ## (strangely enough some subtitles are not chronogologically sorted)
    @srcdata = &sort_time_frames(\@srcdata);
    @trgdata = &sort_time_frames(\@trgdata);

    if (defined $options{HARD_BOUNDARIES}){
	&fit_hard_boundaries($options{HARD_BOUNDARIES},\@srcdata,\@trgdata);
    }

    my $score=undef;
    my $baseScore=undef;

    print STDERR "align sentences ... " if ($VERBOSE);
    if ($COGNATE_RANGE){
	if ($VERBOSE){
	    my $overlap = &time_overlap(\@srcdata,\@trgdata);
	    print STDERR "time overlap before = $overlap\n"
	}
	$score = &cognate_align($srcfile,$trgfile,
				\@srcdata,\@trgdata,
				\%first,\%last,$alignment);
    }
    if ($options{BEST_ALIGN}){
	if ($VERBOSE){
	    my $overlap = &time_overlap(\@srcdata,\@trgdata);
	    print STDERR "time overlap before = $overlap\n";
	}
	($score,$baseScore) = 
	    &best_align($srcfile,$trgfile,
			\@srcdata,\@trgdata,
			\%first,\%last,$alignment);
    }
    else{
	$score = &standard_align(\@srcdata,\@trgdata,\%first,\%last,$alignment);
    }

    my $overlap = &time_overlap(\@srcdata,\@trgdata);
    print STDERR "time overlap = $overlap\n" if ($VERBOSE);

    print STDERR "done!\n" if ($VERBOSE);
    if ($baseScore){
	print STDERR "ratio = $score ($baseScore)\n" if ($VERBOSE);
    }
    else {
	print STDERR "ratio = $score\n" if ($VERBOSE);
    }

lib/Text/SRT/Align.pm  view on Meta::CPAN




################################################################################





sub best_align{
    my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;

    my %types;
    align_srt($srcdata,$trgdata,$alg,\%types);
    my $bestratio = $SCORE_PROPORTION ?
	($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
	($types{nonempty}+1)/($types{empty}+1);

    print STDERR "\nratio = " if $VERBOSE;
    print STDERR $bestratio if $VERBOSE;
    print STDERR "\n" if $VERBOSE;

    my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
    my @sortlast  = sort {$$last{$b} <=> $$last{$a} } keys %{$last};

lib/Text/SRT/Align.pm  view on Meta::CPAN

	foreach my $lf (@sortlast){

	    my @anchor = ($sf,$lf);

	    ## use only the first and the last one
	    if ($VERBOSE){
		print STDERR "use $anchor[0] and $anchor[1] as reference\n";
	    }

	    ## compute slope and offset for this movie
	    my ($slope,$offset) = ComputeOffset(\@anchor,$srcdata,$trgdata);
	    print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
	    if ($slope<=0){
		print STDERR "strange scaling factor -> ignore\n";
		next;
	    }
	    ## re-scale source language subtitles
#	    set_sent_times($srcdata,$slope,$offset);
	    synchronize($srcdata,$slope,$offset);

	    my %types=();
	    my @newalg=();
	    align_srt($srcdata,$trgdata,\@newalg,\%types);
	    my $newratio = $SCORE_PROPORTION ?
		($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
		($types{nonempty}+1)/($types{empty}+1);
	    print STDERR "ratio = " if $VERBOSE;
	    print STDERR $newratio if $VERBOSE;
	    if ($newratio > $bestratio){
		@{$alg} = @newalg;
		$bestratio = $newratio;
		print STDERR " ---> best!" if $VERBOSE;
	    }

lib/Text/SRT/Align.pm  view on Meta::CPAN

	    print `$UPLUG $FALLBACK -src $srcfile -trg $trgfile`;
	    exit;
	}
    }
    return ($bestratio,$standard);
}



sub standard_align{
    my ($srcdata,$trgdata,$first,$last,$alg)=@_;

    my %types;
    align_srt($srcdata,$trgdata,$alg,\%types);

    if ($types{empty}*2 > $types{nonempty}){

	if (keys %{$first} && keys %{$last}){
	    &use_anchor_points($srcdata,$trgdata,$first,$last);
	}
	@{$alg} = ();
	&align_srt($srcdata,$trgdata,$alg);
    }
    my $score = $SCORE_PROPORTION ?
	($types{nonempty}+1) / ($types{nonempty} + $types{empty} +1) :
	($types{nonempty}+1)/($types{empty}+1);
    return $score;
}



sub cognate_align{
    my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last,$alg)=@_;
    my $best;
    for (my $c=1;$c>$COGNATE_RANGE;$c-=0.05){
	$BEST_ALIGN=1;
	$USE_COGNATES=$c;
	print STDERR "use c=$USE_COGNATES";
	&parse_bitext($srcfile,$trgfile,$srcdata,$trgdata,$first,$last);
	##
	## TODO: Do I need to set starting time and to sort tie frames here again?
	##
	my @newalg=();
	my $new=best_align($srcfile,$trgfile,
			   $srcdata,$trgdata,
			   $first,$last,\@newalg);
	if ($new>$best){
	    print STDERR "--> best ($new)";
	    $best=$new;
	    @{$alg}=@newalg;
	}
	print STDERR "\n";
    }
    return $best;
}

lib/Text/SRT/Align.pm  view on Meta::CPAN

	    $inverse ? $$dic{$trg}{$src}++ : $$dic{$src}{$trg}++;
	}
	$LOADED_DICS{$file} = 1;
    }
}




sub parse_bitext{
    my ($srcfile,$trgfile,$srcdata,$trgdata,$first,$last)=@_;


    ## first and last sentences (size = WINDOW)
    my $srcfirst=[];
    my $srclast=[];
    my $trgfirst=[];
    my $trglast=[];

    print STDERR "\n" if $VERBOSE;

    my $src_count=0;
    my $trg_count=0;

    if ($StoreXML && (exists $StoredXML{$srcfile})){
	# print STDERR "retrieve $srcfile ...\n";
	@{$srcdata} = @{ clone($StoredXML{$srcfile}{data}) };
	@{$srcfirst} = @{ clone($StoredXML{$srcfile}{first}) };
	@{$srclast} = @{ clone($StoredXML{$srcfile}{last}) };
	%{$srcfreq} = %{ clone($StoredXML{$srcfile}{freq}) };
	$src_count = $StoredXML{$srcfile}{count};
    }
    else{
	my ($src_fh,$src_ph) = init_parser($srcfile,$srcdata);

	$srcfreq = $src_ph->{WORDFREQ};

	## parse through source language text
	while (&ReadNextSentence($src_fh,$src_ph)){
#	    next unless (@{$src_ph->{WORDS}});
	    if (@{$srcfirst} < $WINDOW ){
		my $idx = scalar @{$srcfirst};
		if (@{$src_ph->{WORDS}}){
		    @{$srcfirst->[$idx]} = @{$src_ph->{WORDS}->[-1]};

lib/Text/SRT/Align.pm  view on Meta::CPAN

	    }
	    else{@{$srclast->[$idx]}=();}
	    if (@{$srclast} > $WINDOW ){
		shift (@{$srclast});
	    }
	    $src_count++;
	}
	## store the data if flag is set
	if ($StoreXML){
	    # print STDERR "store $srcfile ...\n";
	    $StoredXML{$srcfile}{data}  = clone($srcdata);
	    $StoredXML{$srcfile}{first} = clone($srcfirst);
	    $StoredXML{$srcfile}{last}  = clone($srclast);
	    $StoredXML{$srcfile}{freq}  = clone($srcfreq);
	    $StoredXML{$srcfile}{count}  = $src_count;
	}
    }

    if ($StoreXML && (exists $StoredXML{$trgfile})){
	# print STDERR "retrieve $trgfile ...\n";
	@{$trgdata} = @{ clone($StoredXML{$trgfile}{data}) };

lib/Text/SRT/Align.pm  view on Meta::CPAN

	    }
	}
    }
}




sub use_anchor_points{

    my ($srcdata,$trgdata,$first,$last)=@_;

    my @sortfirst = sort {$$first{$b} <=> $$first{$a} } keys %{$first};
    my @sortlast  = sort {$$last{$b} <=> $$last{$a} } keys %{$last};

    ## I need at least 2 reference points!

    if (@sortfirst && @sortlast){
	my @fixpoints = ($sortfirst[0],$sortlast[0]);

	## use only the first and the last one
	if ($VERBOSE){
	    print STDERR "use $fixpoints[0] and $fixpoints[1] as reference\n";
	}

	## compute slope and offset for this movie
	my ($slope,$offset) = ComputeOffset(\@fixpoints,$srcdata,$trgdata);
	print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
	if ($slope<=0){
	    print STDERR "strange scaling factor -> ignore\n";
	    delete $last->{$sortlast[0]};
	    return use_anchor_points($srcdata,$trgdata,$first,$last);
	}
	## re-scale source language subtitles
#	set_sent_times($srcdata,$slope,$offset);
	synchronize($srcdata,$slope,$offset);
    }
}


sub fit_hard_boundaries{
    my ($hardstr,$src,$trg)=@_;
    my @pairs = split(/\+/,$hardstr);

    my %SrcIdx=();
    foreach my $i (0..$#{$src}){

lib/Text/SRT/Align.pm  view on Meta::CPAN

	    print STDERR "time factor: $slope - offset: $offset\n" if $VERBOSE;
	}
	## re-scale source language subtitles
#	set_sent_times($src,$slope,$offset);
	synchronize($src,$slope,$offset);
    }
}


sub ComputeOffset{
    my ($matches,$srcdata,$trgdata) = @_;

    my @params=();
    return AverageOffset(\@params) unless (ref($srcdata) eq 'ARRAY');
    return AverageOffset(\@params) unless (ref($trgdata) eq 'ARRAY');

    foreach my $i (0..$#{$matches}){
	foreach my $j ($i+1..$#{$matches}){
	    my ($s1,$t1) = split(/:/,$$matches[$i]);
	    my ($s2,$t2) = split(/:/,$$matches[$j]);

	    next unless (exists $srcdata->[$s1]);
	    next unless (exists $srcdata->[$s2]);
	    next unless (exists $trgdata->[$t1]);
	    next unless (exists $trgdata->[$t2]);

#	    my $x1=$srcdata->[$s1]->{start};
#	    my $y1=$trgdata->[$t1]->{start};
#	    my $x2=$srcdata->[$s2]->{start};
#	    my $y2=$trgdata->[$t2]->{start};

	    my $x1=$srcdata->[$s1]->{end};
	    my $y1=$trgdata->[$t1]->{end};
	    my $x2=$srcdata->[$s2]->{end};
	    my $y2=$trgdata->[$t2]->{end};

#	    print STDERR "fit line from $x1:$y1 to $x2:$y2\n" if $VERBOSE;
	    my ($slope,$offset)=FitLine($x1,$y1,$x2,$y2);
#	    print STDERR "time factor=$slope, offset=$offset\n" if $VERBOSE;
	    push (@params,($slope,$offset));
	}
    }
    return AverageOffset(\@params);
}

lib/Text/SRT/Align.pm  view on Meta::CPAN

sub sort_time_frames{
    my $sent = shift;
    my @sorted = ();
    foreach my $s (sort {$$sent[$a]{start} <=> $$sent[$b]{start}} 
		   0..$#{$sent}){
	push(@sorted,$$sent[$s]);
    }
    return @sorted;
}

=head2 C<time_overlap( \@srcdata, \@trgdata )>

Compute the proportion of overlapping in time between two sets of subtitles.
Returns overlap-ratio = common-time / ( common-time + different-time )

This is similar to time_overlap_ratio but uses the time frames from
subtitle data structures that may be synchronized using lexical anchors.

=cut

sub time_overlap{
    my ($srcdata,$trgdata) = @_;
    my @srctime = ();
    my @trgtime = ();
    foreach (0..$#{$srcdata}){
	push(@srctime,$$srcdata[$_]{start});
	push(@srctime,$$srcdata[$_]{end});
    }
    foreach (0..$#{$trgdata}){
	push(@trgtime,$$trgdata[$_]{start});
	push(@trgtime,$$trgdata[$_]{end});
    }
    return &time_overlap_ratio(\@srctime,\@trgtime);
}

=head2 C<time_overlap_ratio( \@timeframes1, \@timeframes2 )>



( run in 0.790 second using v1.01-cache-2.11-cpan-454fe037f31 )