App-get_flash_videos

 view release on metacpan or  search on metacpan

lib/FlashVideo/Utils.pm  view on Meta::CPAN


  die "SAMI subtitles must be provided"      unless $sami_subtitles;
  die "Output SRT filename must be provided" unless $filename;

  # Use regexes to "parse" SAMI since HTML::TokeParser is too awkward. It
  # makes it hard to preserve linebreaks and other formatting in subtitles.
  # It's also quite slow.
  $sami_subtitles =~ s/[\r\n]//g; # flatten

  my @lines = split /<Sync\s/i, $sami_subtitles;
  shift @lines; # Skip headers

  my @subtitles;
  my $count = 0;

  my $last_proper_sub_end_time = '';

  for (@lines) {
    my ($begin, $sub);
    # Remove span elements
    s|<\/?span.*?>| |g;
    
    # replace "&amp;" with "&"
    s|&amp;|&|g;

    # replace "&nbsp;" with " "
    s{&(?:nbsp|#160);}{ }g;

    # Start="2284698"><P Class="ENCC">I won't have to drink it<br />in this crappy warehouse.</P></Sync>
    #($begin, $sub) = ($1, $2) if m{.*Start="(.+?)".+<P.+?>(.+?)<\/p>.*?<\/Sync>}i;

    ($begin, $sub) = ($1, $2) if m{[^>]*Start="(.+?)"[^>]*>(.*?)<\/Sync>}i;

    if (/^\s*Encrypted="true"\s*/i) {
      if ($decrypt_callback and ref($decrypt_callback) eq 'CODE') {
        $sub = $decrypt_callback->($sub);
      }
    }

    $sub =~ s@&amp;@&@g;
    $sub =~ s@(?:</?span[^>]*>|&nbsp;|&#160;)@ @g;

    # Do some tidying up.
    # Note only <P> tags are removed--<i> tags are left in place since VLC
    # and others support this for formatting.
    $sub =~ s{</?P[^>]*?>}{}g;  # remove <P Class="ENCC"> and similar

    # VLC is very sensitive to tag case.
    $sub =~ s{<(/)?([BI])>}{"<$1" . lc($2) . ">"}eg;
    
    decode_entities($sub); # in void context, this works in place

    if ($sub and ($begin or $begin == 0)) {
      # Convert milliseconds into HH:MM:ss,mmm format
      my $seconds = int( $begin / 1000.0 );
      my $ms = $begin - ( $seconds * 1000.0 );
      $begin = sprintf("%02d:%02d:%02d,%03d", (gmtime($seconds))[2,1,0], $ms );

      # Don't strip simple HTML like <i></i> - VLC and other players
      # support basic subtitle styling, see:
      # http://git.videolan.org/?p=vlc.git;a=blob;f=modules/codec/subtitles/subsdec.c

      # Leading/trailing spaces
      $sub =~ s/^\s*(.*?)\s*$/$1/;

      # strip multispaces
      $sub =~ s/\s{2,}/ /g;

      # Replace <br /> (and similar) with \n. VLC handles \n in SubRip files
      # fine. For <br> it is case and slash sensitive.
      $sub =~ s|<br ?\/? ?>|\n|ig;

      $sub =~ s/^\s*|\s*$//mg;

      if ($count and !$subtitles[$count - 1]->{end}) {
        $subtitles[$count - 1]->{end} = $begin;
      }

      # SAMI subtitles are a bit crap. Only a start time is specified for
      # each subtitle. No end time is specified, so the subtitle is displayed
      # until the next subtitle is ready to be shown. This means that if
      # subtitles aren't meant to be shown for part of the video, a dummy
      # subtitle (usually just a space) has to be inserted.
      if (!$sub or $sub =~ /^\s+$/) {
        if ($count) {
          $last_proper_sub_end_time = $subtitles[$count - 1]->{end};
        }

        # Gap in subtitles.
        next; # this is not a meaningful subtitle
      }

      push @subtitles, {
        start => $begin,
        text  => $sub,
      };

      $count++;
    }
  }

  # Ensure the end time for the last subtitle is correct.
  $subtitles[$count - 1]->{end} = $last_proper_sub_end_time;

  # Write subtitles
  open my $subtitle_fh, '>', $filename
    or die "Can't open subtitles file $filename: $!";

  # Set filehandle to UTF-8 to avoid "wide character in print" warnings.
  # Note this does *not* double-encode data as UTF-8 (verify with hexdump).
  # As per the documentation for binmode: ":utf8 just marks the data as
  # UTF-8 without further checking". This will cause mojibake if 
  # ISO-8859-1/Latin1 and UTF-8 and are mixed in the same file though.
  binmode $subtitle_fh, ':utf8';

  $count = 1;

  foreach my $subtitle (@subtitles) {
    print $subtitle_fh "$count\n$subtitle->{start} --> $subtitle->{end}\n" .
                       "$subtitle->{text}\n\n";
    $count++;



( run in 0.724 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )