Text-SRT-Align

 view release on metacpan or  search on metacpan

srt2xml  view on Meta::CPAN

#!/usr/bin/env perl
#-*-perl-*-
#
# convert srt files (movie subtitles) to tokenized XML (utf8)
# (very simple tokenization & sentence splitting)
#
# usage: ./srt2xml [-l lang-code] srt-file > xmlfile
#
# -e encoding ....
# -r file  ....... raw xml output file (without tokenizations)
# -s ............. always start a new sentence at new time frames

#
# 
#

=encoding UTF-8

=head1 NAME

srt2xml - script for converting SRT-files (subtitles) to tokenized XML

=head1 USAGE

 srt2xml [OPTIONS] < input.srt > output.xml

=head1 OPTIONS

 -e encoding ......... specify the character encoding of the SRT file
 -l lang-id .......... use non-breaking prefixes for the given language
 -r filename ......... save an untokenized version in <filename>
 -s .................. always start a new sentence at each time frame

=head1 DESCRIPTION

This script detects sentence boundaries and tokenizes the text in given SRT movie subtitle files and creates XML output.

=head1 TODO

Automatic detection of character encodings.

=cut 


use strict;
use utf8;

use Getopt::Std;
use IPC::Open3;
use FindBin qw($Bin);
use File::BOM qw( :all );
use File::ShareDir;
use Encode qw(decode encode);
use Locale::Codes::Language 3.26;


use vars qw/$opt_l $opt_e $opt_r $opt_s/;

$opt_l = 'unknown';
getopts('l:e:r:s');

# make a three-letter language code
if (length($opt_l) == 2){
    $opt_l = language_code2code($opt_l, 'alpha-2', 'alpha-3');
}

our $SHARED_HOME = File::ShareDir::dist_dir('Text-SRT-Align'); 

my $PAUSETHR1 = 1;       # > 1 second --> most probably new sentence
my $PAUSETHR2 = 3;       # > 3 second --> definitely new sentence


# for some languages: always split sentences at new time frames
# (because we have no good sentence splitter implemented for them)

my %SPLIT_AT_TIMEFRAME = (
    'heb' => 1,
    'ara' => 1,
    'sin' => 1,
    'tha' => 1,
    'urd' => 1,
    'zho' => 1,
    'chi' => 1,
    'far' => 1,
    'kor' => 1,
    'jpn' => 1
    );


## read (non-breaking) abbreviations (if the file exists)
## check also if Uplug is installed and use abbreviation lists from there

my %NONBREAKING=();
read_non_breaking($SHARED_HOME.'/nonbreaking_prefix.'.$opt_l,\%NONBREAKING);
eval{
    my $SHARED_UPLUG = File::ShareDir::dist_dir('Uplug');
    my $LangID = language_code2code($opt_l, 'alpha-2', 'alpha-3');
    read_non_breaking($SHARED_UPLUG.
		      '/lang/nonbreaking_prefixes/nonbreaking_prefix.'.
		      $LangID,
		      \%NONBREAKING);
};


# For Chinese: need text segmentation
#
# old way: require this module (but it's a non-standard one!)
# use Lingua::ZH::Segmenter qw/:all/;
#
# new way: use this module only if it is available (otherwise split on characters)
if ($opt_l=~/^(chi|zho)$/){

srt2xml  view on Meta::CPAN

use open qw(:std :locale);


my $enc = $opt_e || LangEncoding($opt_l);

# binmode(STDIN,":encoding($enc)");
binmode(STDIN);
binmode(STDOUT,':encoding(utf8)');

# open second output file for raw, untokenized XML

if ($opt_r){
    if ($opt_r=~/\.gz$/){
	open F,"| grep '.' | gzip -c > $opt_r" || warn "cannot open $opt_r";
    }
    else{
	open F,"| grep '.' > $opt_r" || warn "cannot open $opt_r";
    }
    binmode(F,':encoding(utf8)');
}

print_xml_header();

my $sid = 1;
print "  <s id=\"$sid\">\n";
print F "  <s id=\"$sid\">\n" if ($opt_r);
$sid++;
my $s_ended = 0;

##
## these RE's are not used at all ...
##
#my $s_start = '([\"\']?[\¿\¡\p{Lu}])';
#my $s_start_maybe = '(\-?\s*[\"\'\¿\¡]?[\p{N}\p{Ps}])';
#my $s_end = "([^\.]\.[\"\']?|[\.\!\?\:][\"\']?)";
#my $s_end_maybe = "([^\.]\.[\"\'\]\}\)]?\-?\s*|[\.\!\?\:][\"\'\]\}\)]?\-?\s*)";

# Greek: ';' is a question mark!

if ($opt_l eq 'ell'){
    my $s_end = "([^\.]\.[\"\']?|[\.\!\?\:\;][\"\']?)";
}



my $start=undef;
my $end=undef;
my $lastend = undef;
my $id=undef;
my $wid = 0;

my $newchunk = 0;

my @opentags=();
my @closedtags=();

my $first=1;

while (my $line = <>){

    # check if the first line has a BOM
    # --~ try to detect encoding!
    if ($first){
	my $check;
	($line, $enc) = decode_from_bom($line,$enc,$check);
	binmode(STDIN,":encoding($enc)");
	$first=0;
    }

    # remove dos line endings
    $line=~s/\r\n$/\n/;
    # some additional cleanup, see: http://stackoverflow.com/questions/1016910/how-can-i-strip-invalid-xml-characters-from-strings-in-perl
    $line=~tr/\e\x00-\x08\x0A\x0B\x0C\x0E-\x19//d;

    if (not defined $id){
	if ($line=~/^\s*([0-9]+)$/){
	    $id = $1;
	    next;
	}
    }
    elsif (not defined $start){
	if ($line=~/^([0-9:,]+) --> ([0-9:,]+)/){
	    $start = $1;
	    $end = $2;
#	    print "    <time id=\"start$id\" value=\"$start\" />\n";
	    $newchunk = 1;
	    if ($lastend){
		if (time2sec($start)-time2sec($lastend) > $PAUSETHR1){
		    if (not $s_ended){$s_ended = 2;}
		    elsif ($s_ended < 3){$s_ended++;}
#		$s_ended = 2;
		}
		if (time2sec($start)-time2sec($lastend) > $PAUSETHR2){
		    $s_ended = 3;
		}
	    }
	    next;
	}
    }

    if ($line=~/^\s*$/){
	if ($end){
	    # always close all open tags at end of time frame
	    closetags();
	    @closedtags = (); # flush tag-stack ....

	    print "    <time id=\"T${id}E\" value=\"$end\" />\n";
	    print F "\n    <time id=\"T${id}E\" value=\"$end\" />\n" if ($opt_r);
	    $lastend = $end;
	    $id=undef;
	    $start=undef;
	    $end=undef;
	    ## new fragment -> always a possible sentence end!
	    if (not $s_ended){$s_ended = 1;}
	    ## for some languages: always split here!
	    if ($SPLIT_AT_TIMEFRAME{$opt_l} || $opt_s){$s_ended = 3;}
	}
    }
    else{

	# some strange markup in curly brackets in some files



( run in 0.388 second using v1.01-cache-2.11-cpan-fe3c2283af0 )