Pod-Simple

 view release on metacpan or  search on metacpan

lib/Pod/Simple/BlackBox.pm  view on Meta::CPAN

    my $re = eval "no warnings; $use_utf8 qr/$input_re/";
    #print STDERR  __LINE__, ": $input_re: $@\n" if $@;
    return "" if $@;

    my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
    #print STDERR  __LINE__, ": $input_re: $@\n" if $@;
    return "" if $@;

    #print STDERR  __LINE__, ": SUCCESS: $re\n" if $matches;
    return $re if $matches;

    #print STDERR  __LINE__, ": $re: didn't match\n";
    return "";
}

BEGIN {
  require Pod::Simple;
  *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
}

# Matches a character iff the character will have a different meaning
# if we choose CP1252 vs UTF-8 if there is no =encoding line.
# This is broken for early Perls on non-ASCII platforms.
my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
$non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;

# Use patterns understandable by Perl 5.6, if possible
my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
my $cn_re = my_qr('\p{IsCn}', "\x{09E4}");  # <reserved> code point unlikely
                                            # to get assigned
my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
                           "\x{250}");
$rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;

my $script_run_re = eval 'no warnings "experimental::script_run";
                          qr/(*script_run: ^ .* $ )/x';
my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
unless ($latin_re) {
    # This was machine generated to be the ranges of the union of the above
    # three properties, with things that were undefined by Unicode 4.1 filling
    # gaps.  That is the version in use when Perl advanced enough to
    # successfully compile and execute the above pattern.
    $latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\...
}

my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");

# Latin script code points not in the first release of Unicode
my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");

# If this perl doesn't have the Deprecated property, there's only one code
# point in it that we need be concerned with.
my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
$deprecated_re = qr/\x{149}/ unless $deprecated_re;

my $utf8_bom;
if ( do { no integer; "$]" >= 5.007_003 }) {
  $utf8_bom = "\x{FEFF}";
  utf8::encode($utf8_bom);
} else {
  $utf8_bom = "\xEF\xBB\xBF";   # No EBCDIC BOM detection for early Perls.
}

# This is used so that the 'content_seen' method doesn't return true on a
# file that just happens to have a line that matches /^=[a-zA-z]/.  Only if
# there is a valid =foo line will we return that content was seen.
my $seen_legal_directive = 0;

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

sub parse_line { shift->parse_lines(@_) } # alias

# - - -  Turn back now!  Run away!  - - -

sub parse_lines {             # Usage: $parser->parse_lines(@lines)
  # an undef means end-of-stream
  my $self = shift;

  my $code_handler = $self->{'code_handler'};
  my $cut_handler  = $self->{'cut_handler'};
  my $wl_handler   = $self->{'whiteline_handler'};
  $self->{'line_count'} ||= 0;

  my $scratch;

  DEBUG > 4 and
   print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";

  DEBUG > 5 and
   print STDERR "#  About to parse lines: ",
     join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";

  my $paras = ($self->{'paras'} ||= []);
   # paragraph buffer.  Because we need to defer processing of =over
   # directives and verbatim paragraphs.  We call _ponder_paragraph_buffer
   # to process this.

  $self->{'pod_para_count'} ||= 0;

  # An attempt to match the pod portions of a line.  This is not fool proof,
  # but is good enough to serve as part of the heuristic for guessing the pod
  # encoding if not specified.
  my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
  my $pod_chars_re = qr/ ^ = [A-Za-z]+ | [\Q$codes\E] < /x;

  my $line;
  foreach my $source_line (@_) {
    if( $self->{'source_dead'} ) {
      DEBUG > 4 and print STDERR "# Source is dead.\n";
      last;
    }

    unless( defined $source_line ) {
      DEBUG > 4 and print STDERR "# Undef-line seen.\n";

      push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
      push @$paras, $paras->[-1], $paras->[-1];
       # So that it definitely fills the buffer.
      $self->{'source_dead'} = 1;
      $self->_ponder_paragraph_buffer;
      next;
    }


    if( $self->{'line_count'}++ ) {
      ($line = $source_line) =~ tr/\n\r//d;
       # If we don't have two vars, we'll end up with that there
       # tr/// modding the (potentially read-only) original source line!

    } else {
      DEBUG > 2 and print STDERR "First line: [$source_line]\n";

      if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
        DEBUG and print STDERR "UTF-8 BOM seen.  Faking a '=encoding utf8'.\n";
        $self->_handle_encoding_line( "=encoding utf8" );
        delete $self->{'_processed_encoding'};
        $line =~ tr/\n\r//d;

      } elsif( $line =~ s/^\xFE\xFF//s ) {
        DEBUG and print STDERR "Big-endian UTF-16 BOM seen.  Aborting parsing.\n";
        $self->scream(
          $self->{'line_count'},
          "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
        );
        splice @_;
        push @_, undef;
        next;

        # TODO: implement somehow?

      } elsif( $line =~ s/^\xFF\xFE//s ) {
        DEBUG and print STDERR "Little-endian UTF-16 BOM seen.  Aborting parsing.\n";
        $self->scream(
          $self->{'line_count'},
          "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
        );
        splice @_;
        push @_, undef;
        next;

        # TODO: implement somehow?

      } else {
        DEBUG > 2 and print STDERR "First line is BOM-less.\n";
        ($line = $source_line) =~ tr/\n\r//d;
      }
    }

    if(!$self->{'parse_characters'} && !$self->{'encoding'}
      && ($self->{'in_pod'} || $line =~ /^=/s)
      && $line =~ /$non_ascii_re/
    ) {

      my $encoding;

      # No =encoding line, and we are at the first pod line in the input that
      # contains a non-ascii byte, that is, one whose meaning varies depending
      # on whether the file is encoded in UTF-8 or CP1252, which are the two
      # possibilities permitted by the pod spec.  (ASCII is assumed if the
      # file only contains ASCII bytes.)  In order to process this line, we
      # need to figure out what encoding we will use for the file.
      #
      # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
      # 160-255, but it is used here, as it often colloquially is, to refer to
      # the complete set of code points 0-255, including ASCII (0-127), the C1
      # controls (128-159), and strict Latin 1 (160-255).
      #
      # CP1252 is effectively a superset of Latin 1, because it differs only
      # from colloquial 8859-1 in the C1 controls, which are very unlikely to
      # actually be present in 8859-1 files, so can be used for other purposes
      # without conflict.  CP 1252 uses most of them for graphic characters.
      #
      # Note that all ASCII-range bytes represent their corresponding code
      # points in both CP1252 and UTF-8.  In ASCII platform UTF-8, all other
      # code points require multiple (non-ASCII) bytes to represent.  (A
      # separate paragraph for EBCDIC is below.)  The multi-byte
      # representation is quite structured.  If we find an isolated byte that
      # would require multiple bytes to represent in UTF-8, we know that the
      # encoding is not UTF-8.  If we find a sequence of bytes that violates
      # the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
      # hence must be 1252.
      #
      # But there are ambiguous cases where we could guess wrong.  If so, the
      # user will end up having to supply an =encoding line.  We use all
      # readily available information to improve our chances of guessing
      # right.  The odds of something not being UTF-8, but still passing a
      # UTF-8 validity test go down very rapidly with increasing length of the
      # sequence.  Therefore we look at all non-ascii sequences on the line.
      # If any of the sequences can't be UTF-8, we quit there and choose
      # CP1252.  If all could be UTF-8, we see if any of the code points
      # represented are unlikely to be in pod.  If so, we guess CP1252.  If
      # not, we check if the line is all in the same script; if not guess
      # CP1252; otherwise UTF-8.  For perls that don't have convenient script
      # run testing, see if there is both Latin and non-Latin.  If so, CP1252,
      # otherwise UTF-8.
      #
      # On EBCDIC platforms, the situation is somewhat different.  In
      # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
      # but so do the bytes that are for the C1 controls.  Recall that these
      # correspond to the unused portion of 8859-1 that 1252 mostly takes
      # over.  That means that there are fewer code points that are
      # represented by multi-bytes.  But, note that the these controls are
      # very unlikely to be in pod text.  So if we encounter one of them, it
      # means that it is quite likely CP1252 and not UTF-8.  The net result is



( run in 0.529 second using v1.01-cache-2.11-cpan-5a3173703d6 )