Text-SRT-Align
view release on metacpan or search on metacpan
#!/usr/bin/env perl
#-*-perl-*-
#
# convert srt files (movie subtitles) to tokenized XML (utf8)
# (very simple tokenization & sentence splitting)
#
# usage: ./srt2xml [-l lang-code] srt-file > xmlfile
#
# -e encoding ....
# -r file ....... raw xml output file (without tokenizations)
# -s ............. always start a new sentence at new time frames
#
#
#
=encoding UTF-8
=head1 NAME
srt2xml - script for converting SRT-files (subtitles) to tokenized XML
=head1 USAGE
srt2xml [OPTIONS] < input.srt > output.xml
=head1 OPTIONS
-e encoding ......... specify the character encoding of the SRT file
-l lang-id .......... use non-breaking prefixes for the given language
-r filename ......... save an untokenized version in <filename>
-s .................. always start a new sentence at each time frame
=head1 DESCRIPTION
This script detects sentence boundaries and tokenizes the text in given SRT movie subtitle files and creates XML output.
=head1 TODO
Automatic detection of character encodings.
=cut
use strict;
use utf8;
use Getopt::Std;
use IPC::Open3;
use FindBin qw($Bin);
use File::BOM qw( :all );
use File::ShareDir;
use Encode qw(decode encode);
use Locale::Codes::Language 3.26;
use vars qw/$opt_l $opt_e $opt_r $opt_s/;
$opt_l = 'unknown';
getopts('l:e:r:s');
# make a three-letter language code
if (length($opt_l) == 2){
$opt_l = language_code2code($opt_l, 'alpha-2', 'alpha-3');
}
our $SHARED_HOME = File::ShareDir::dist_dir('Text-SRT-Align');
my $PAUSETHR1 = 1; # > 1 second --> most probably new sentence
my $PAUSETHR2 = 3; # > 3 second --> definitely new sentence
# for some languages: always split sentences at new time frames
# (because we have no good sentence splitter implemented for them)
my %SPLIT_AT_TIMEFRAME = (
'heb' => 1,
'ara' => 1,
'sin' => 1,
'tha' => 1,
'urd' => 1,
'zho' => 1,
'chi' => 1,
'far' => 1,
'kor' => 1,
'jpn' => 1
);
## read (non-breaking) abbreviations (if the file exists)
## check also if Uplug is installed and use abbreviation lists from there
my %NONBREAKING=();
read_non_breaking($SHARED_HOME.'/nonbreaking_prefix.'.$opt_l,\%NONBREAKING);
eval{
my $SHARED_UPLUG = File::ShareDir::dist_dir('Uplug');
my $LangID = language_code2code($opt_l, 'alpha-2', 'alpha-3');
read_non_breaking($SHARED_UPLUG.
'/lang/nonbreaking_prefixes/nonbreaking_prefix.'.
$LangID,
\%NONBREAKING);
};
# For Chinese: need text segmentation
#
# old way: require this module (but it's a non-standard one!)
# use Lingua::ZH::Segmenter qw/:all/;
#
# new way: use this module only if it is available (otherwise split on characters)
if ($opt_l=~/^(chi|zho)$/){
use open qw(:std :locale);
my $enc = $opt_e || LangEncoding($opt_l);
# binmode(STDIN,":encoding($enc)");
binmode(STDIN);
binmode(STDOUT,':encoding(utf8)');
# open second output file for raw, untokenized XML
if ($opt_r){
if ($opt_r=~/\.gz$/){
open F,"| grep '.' | gzip -c > $opt_r" || warn "cannot open $opt_r";
}
else{
open F,"| grep '.' > $opt_r" || warn "cannot open $opt_r";
}
binmode(F,':encoding(utf8)');
}
print_xml_header();
my $sid = 1;
print " <s id=\"$sid\">\n";
print F " <s id=\"$sid\">\n" if ($opt_r);
$sid++;
my $s_ended = 0;
##
## these RE's are not used at all ...
##
#my $s_start = '([\"\']?[\¿\¡\p{Lu}])';
#my $s_start_maybe = '(\-?\s*[\"\'\¿\¡]?[\p{N}\p{Ps}])';
#my $s_end = "([^\.]\.[\"\']?|[\.\!\?\:][\"\']?)";
#my $s_end_maybe = "([^\.]\.[\"\'\]\}\)]?\-?\s*|[\.\!\?\:][\"\'\]\}\)]?\-?\s*)";
# Greek: ';' is a question mark!
if ($opt_l eq 'ell'){
my $s_end = "([^\.]\.[\"\']?|[\.\!\?\:\;][\"\']?)";
}
my $start=undef;
my $end=undef;
my $lastend = undef;
my $id=undef;
my $wid = 0;
my $newchunk = 0;
my @opentags=();
my @closedtags=();
my $first=1;
while (my $line = <>){
# check if the first line has a BOM
# --~ try to detect encoding!
if ($first){
my $check;
($line, $enc) = decode_from_bom($line,$enc,$check);
binmode(STDIN,":encoding($enc)");
$first=0;
}
# remove dos line endings
$line=~s/\r\n$/\n/;
# some additional cleanup, see: http://stackoverflow.com/questions/1016910/how-can-i-strip-invalid-xml-characters-from-strings-in-perl
$line=~tr/\e\x00-\x08\x0A\x0B\x0C\x0E-\x19//d;
if (not defined $id){
if ($line=~/^\s*([0-9]+)$/){
$id = $1;
next;
}
}
elsif (not defined $start){
if ($line=~/^([0-9:,]+) --> ([0-9:,]+)/){
$start = $1;
$end = $2;
# print " <time id=\"start$id\" value=\"$start\" />\n";
$newchunk = 1;
if ($lastend){
if (time2sec($start)-time2sec($lastend) > $PAUSETHR1){
if (not $s_ended){$s_ended = 2;}
elsif ($s_ended < 3){$s_ended++;}
# $s_ended = 2;
}
if (time2sec($start)-time2sec($lastend) > $PAUSETHR2){
$s_ended = 3;
}
}
next;
}
}
if ($line=~/^\s*$/){
if ($end){
# always close all open tags at end of time frame
closetags();
@closedtags = (); # flush tag-stack ....
print " <time id=\"T${id}E\" value=\"$end\" />\n";
print F "\n <time id=\"T${id}E\" value=\"$end\" />\n" if ($opt_r);
$lastend = $end;
$id=undef;
$start=undef;
$end=undef;
## new fragment -> always a possible sentence end!
if (not $s_ended){$s_ended = 1;}
## for some languages: always split here!
if ($SPLIT_AT_TIMEFRAME{$opt_l} || $opt_s){$s_ended = 3;}
}
}
else{
# some strange markup in curly brackets in some files
( run in 0.388 second using v1.01-cache-2.11-cpan-fe3c2283af0 )