HTML-Encoding

 view release on metacpan or  search on metacpan

lib/HTML/Encoding.pm  view on Meta::CPAN

    # in array context return all encodings,
    # in scalar context return best match.
    return wantarray ? @sort : $sort[0];
}

sub encoding_from_xml_declaration
{
    my $decl = shift;

    return unless defined $decl;
    return unless length $decl;

    # todo: move this to some better place...
    my $ws = qr/[\x09\x85\x20\x0d\x0a\x{2028}]*/;
    
    # skip if not an XML declaration
    return unless $decl =~ /^<\?xml$ws/i;

    # attempt to extract encoding pseudo attribute
    return unless $decl =~ /encoding$ws=$ws'([^']+)'/i or
                  $decl =~ /encoding$ws=$ws"([^"]+)"/i;

    # no encoding pseudo-attribute
    return unless defined $1;
    my $enco = $1;

    # strip leading/trailing whitespace/quotes
    $enco =~ s/^[\s'"]+|[\s'"]+$//g;
    
    # collapse white-space
    $enco =~ s/\s+/ /g;
    
    # treat empty charset as if it were unspecified
    return unless length $enco;

    return $enco;
}

sub encoding_from_byte_order_mark
{
    my $text = shift;
    my %o = @_;
    my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
    my %resu;

    return unless defined $text;
    return unless length $text;

    foreach my $e (@$encodings)
    {
        my $map = _get_character_map($e);
        my $bom = $map->{BM};
        
        # encoding cannot encode U+FEFF
        next unless defined $bom;
        
        # remember match length
        $resu{$e} = length $bom if $text =~ /^(\Q$bom\E)/;
    }

    # does not start with BOM
    return unless keys %resu;
    
    # sort by match length, longest match first
    my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
    
    # in array context return all encodings,
    # in scalar context return best match.
    return wantarray ? @sort : $sort[0];
}

sub encoding_from_content_type
{
    my $text = shift;

    # nothing to do...
    return unless defined $text and length $text;
    
    # downgrade Unicode strings
    $text = Encode::encode_utf8($text) if Encode::is_utf8($text);
    
    # split parameters, only look at the first set
    my %data = @{(split_header_words($text))[0]};
    
    # extract first charset parameter if any
    my $char;
    foreach my $param (keys %data) {
      $char = $data{$param} and last if 'charset' eq lc $param;
    }

    # no charset parameter    
    return unless defined $char;
    
    # there are no special escapes so just remove \s
    $char =~ tr/\\//d;
    
    # strip leading/trailing whitespace/quotes
    $char =~ s/^[\s'"]+|[\s'"]+$//g;
    
    # collapse white-space
    $char =~ s/\s+/ /g;
    
    # treat empty charset as if it were unspecified
    return unless length $char;
    
    return $char
}

sub encoding_from_xml_document
{
    my $text = shift;
    my %o = @_;
    my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
    my %resu;
    
    return unless defined $text;
    return unless length $text;
    
    my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);

    # BOM determines encoding
    return wantarray ? (bom => \@boms) : $boms[0] if @boms;
    
    # no BOM
    my @decls = xml_declaration_from_octets($text, encodings => $encodings);
    foreach my $decl (@decls)
    {
        my $enco = encoding_from_xml_declaration($decl);
        $resu{$enco}++ if defined $enco and length $enco;
    }

    return unless keys %resu;
    my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
    
    # in array context return all encodings,
    # in scalar context return best match.
    return wantarray ? (xml => \@sort) : $sort[0];
}

sub encoding_from_html_document
{
    my $text = shift;
    my %o = @_;
    my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
    my $popts = $o{parser_options} || {};
    my $xhtml = exists $o{xhtml} ? $o{xhtml} : 1;
    
    return unless defined $text;
    return unless length $text;
    
    if ($xhtml)
    {
        my @xml = wantarray
                    ? encoding_from_xml_document($text, encodings => $encodings)
                    : scalar encoding_from_xml_document($text, encodings => $encodings);
        
        return wantarray
          ? @xml
          : $xml[0]
            if @xml and defined $xml[0];
    }
    else
    {
        my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);

        # BOM determines encoding
        return wantarray ? (bom => \@boms) : $boms[0] if @boms;
    }

    # no BOM
    my @resu;
    
    # sanity check to exclude e.g. UTF-32
    my @first = encoding_from_first_chars($text, encodings => $encodings);
    
    # fall back to provided encoding list
    @first = @$encodings unless @first;
    
    foreach my $try (@first)
    {
        push @resu, encoding_from_meta_element($text, $try, %$popts);
    }

    return unless @resu;
    return wantarray ? (meta => \@resu) : $resu[0];
}

sub encoding_from_http_message
{
    my $mess      = shift;
    my %o         = @_;

    my $encodings = $o{encodings}        || $DEFAULT_ENCODINGS;
    my $is_html   = $o{is_html}          || qr{^text/html$}i;
    my $is_xml    = $o{is_xml}           || qr{^.+/(?:.+\+)?xml$}i;
    my $is_t_xml  = $o{is_text_xml}      || qr{^text/(?:.+\+)?xml$}i;
    my $html_d    = $o{html_default}     || "ISO-8859-1";
    my $xml_d     = $o{xml_default}      || "UTF-8";
    my $txml      = $o{text_xml_default};
    
    my $xhtml     = exists $o{xhtml}   ? $o{xhtml}   : 1;
    my $default   = exists $o{default} ? $o{default} : 1;
    
    my $type      = $mess->header('Content-Type');
    my $charset   = encoding_from_content_type($type);
    
    if ($mess->content_type =~ $is_xml)
    {
        return wantarray ? (protocol => $charset) : $charset
          if defined $charset;
          
        # special case for text/xml at user option
        return wantarray ? (protocol_default => $txml) : $txml
          if defined $txml and $mess->content_type =~ $is_t_xml;
          
        if (wantarray)
        {
            my @xml = encoding_from_xml_document($mess->content, encodings => $encodings);
            return @xml if @xml;
        }
        else
        {
            my $xml = scalar encoding_from_xml_document($mess->content, encodings => $encodings);
            return $xml if defined $xml;
        }
        
        return wantarray ? (default => $xml_d) : $xml_d if defined $default;
    }
    
    if ($mess->content_type =~ $is_html)

lib/HTML/Encoding.pm  view on Meta::CPAN

=head1 ENCODING SOURCES

C<encoding_from_xml_document>, C<encoding_from_html_document>, and
C<encoding_from_http_message> return in list context the encoding
source and the encoding name, possible encoding sources are

  * protocol         (Content-Type: text/html;charset=encoding)
  * bom              (leading U+FEFF)
  * xml              (<?xml version='1.0' encoding='encoding'?>)
  * meta             (<meta http-equiv=...)
  * default          (default fallback value)
  * protocol_default (protocol default)

=head1 ROUTINES

Routines exported by this module at user option. By default, nothing
is exported.

=over 2

=item encoding_from_content_type($content_type)

Takes a byte string and uses L<HTTP::Headers::Util> to extract the
charset parameter from the C<Content-Type> header value and returns
its value or C<undef> (or an empty list in list context) if there
is no such value. Only the first component will be examined
(HTTP/1.1 only allows for one component), any backslash escapes in
strings will be unescaped, all leading and trailing quote marks
and white-space characters will be removed, all white-space will be
collapsed to a single space, empty charset values will be ignored
and no case folding is performed.

Examples:

  +-----------------------------------------+-----------+
  | encoding_from_content_type(...)         | returns   |
  +-----------------------------------------+-----------+
  | "text/html"                             | undef     |
  | "text/html,text/plain;charset=utf-8"    | undef     |
  | "text/html;charset="                    | undef     |
  | "text/html;charset=\"\\u\\t\\f\\-\\8\"" | 'utf-8'   |
  | "text/html;charset=utf\\-8"             | 'utf\\-8' |
  | "text/html;charset='utf-8'"             | 'utf-8'   |
  | "text/html;charset=\" UTF-8 \""         | 'UTF-8'   |
  +-----------------------------------------+-----------+

If you pass a string with the UTF-8 flag turned on the string will
be converted to bytes before it is passed to L<HTTP::Headers::Util>.
The return value will thus never have the UTF-8 flag turned on (this
might change in future versions).

=item encoding_from_byte_order_mark($octets [, %options])

Takes a sequence of octets and attempts to read a byte order mark
at the beginning of the octet sequence. It will go through the list
of $options{encodings} or the list of default encodings if no
encodings are specified and match the beginning of the string against
any byte order mark octet sequence found.

The result can be ambiguous, for example qq(\xFF\xFE\x00\x00) could
be both, a complete BOM in UTF-32LE or a UTF-16LE BOM followed by a
U+0000 character. It is also possible that C<$octets> starts with
something that looks like a byte order mark but actually is not.

encoding_from_byte_order_mark sorts the list of possible encodings
by the length of their BOM octet sequence and returns in scalar
context only the encoding with the longest match, and all encodings
ordered by length of their BOM octet sequence in list context.

Examples:

  +-------------------------+------------+-----------------------+
  | Input                   | Encodings  | Result                |
  +-------------------------+------------+-----------------------+
  | "\xFF\xFE\x00\x00"      | default    | qw(UTF-32LE)          |
  | "\xFF\xFE\x00\x00"      | default    | qw(UTF-32LE UTF-16LE) |
  | "\xEF\xBB\xBF"          | default    | qw(UTF-8)             |
  | "Hello World!"          | default    | undef                 |
  | "\xDD\x73\x66\x73"      | default    | undef                 |
  | "\xDD\x73\x66\x73"      | UTF-EBCDIC | qw(UTF-EBCDIC)        |
  | "\x2B\x2F\x76\x38\x2D"  | default    | undef                 |
  | "\x2B\x2F\x76\x38\x2D"  | UTF-7      | qw(UTF-7)             |
  +-------------------------+------------+-----------------------+

Note however that for UTF-7 it is in theory possible that the U+FEFF
combines with other characters in which case such detection would fail,
for example consider:

  +--------------------------------------+-----------+-----------+
  | Input                                | Encodings | Result    |
  +--------------------------------------+-----------+-----------+
  | "\x2B\x2F\x76\x38\x41\x39\x67\x2D"   | default   | undef     |
  | "\x2B\x2F\x76\x38\x41\x39\x67\x2D"   | UTF-7     | undef     |
  +--------------------------------------+-----------+-----------+

This might change in future versions, although this is not very
relevant for most applications as there should never be need to use
UTF-7 in the encoding list for existing documents.

If no BOM can be found it returns C<undef> in scalar context and an
empty list in list context. This routine should not be used with
strings with the UTF-8 flag turned on. 

=item encoding_from_xml_declaration($declaration)

Attempts to extract the value of the encoding pseudo-attribute in an XML
declaration or text declaration in the character string $declaration. If
there does not appear to be such a value it returns nothing. This would
typically be used with the return values of xml_declaration_from_octets.
Normalizes whitespaces like encoding_from_content_type.

Examples:

  +-------------------------------------------+---------+
  | encoding_from_xml_declaration(...)        | Result  |
  +-------------------------------------------+---------+
  | "<?xml version='1.0' encoding='utf-8'?>"  | 'utf-8' |
  | "<?xml encoding='utf-8'?>"                | 'utf-8' |
  | "<?xml encoding=\"utf-8\"?>"              | 'utf-8' |
  | "<?xml foo='bar' encoding='utf-8'?>"      | 'utf-8' |
  | "<?xml encoding='a' encoding='b'?>"       | 'a'     |
  | "<?xml encoding=' a    b '?>"             | 'a b'   |
  | "<?xml-stylesheet encoding='utf-8'?>"     | undef   |
  | " <?xml encoding='utf-8'?>"               | undef   |
  | "<?xml encoding =\x{2028}'utf-8'?>"       | 'utf-8' |
  | "<?xml version='1.0' encoding=utf-8?>"    | undef   |
  | "<?xml x='encoding=\"a\"' encoding='b'?>" | 'a'     |
  +-------------------------------------------+---------+

Note that encoding_from_xml_declaration() determines the encoding even
if the XML declaration is not well-formed or violates other requirements
of the relevant XML specification as long as it can find an encoding
pseudo-attribute in the provided string. This means XML processors must
apply further checks to determine whether the entity is well-formed, etc.

=item xml_declaration_from_octets($octets [, %options])

Attempts to find a ">" character in the byte string $octets using the
encodings in $encodings and upon success attempts to find a preceding
"<" character. Returns all the strings found this way in the order of
number of successful matches in list context and the best match in
scalar context. Should probably be combined with the only user of this
routine, encoding_from_xml_declaration... You can modify the list of
suspected encodings using $options{encodings};

=item encoding_from_first_chars($octets [, %options])

Assuming that documents start with "<" optionally preceded by whitespace
characters, encoding_from_first_chars attempts to determine an encoding
by matching $octets against something like /^[@{$options{whitespace}}]*</
in the various suspected $options{encodings}.

This is useful to distinguish e.g. UTF-16LE from UTF-8 if the byte string
does not start with a byte order mark nor an XML declaration (e.g. if the
document is a HTML document) to get at least a base encoding which can be
used to decode enough of the document to find <meta> elements using
encoding_from_meta_element. $options{whitespace} defaults to qw/CR LF SP TB/.
Returns nothing if unsuccessful. Returns the matching encodings in order
of the number of octets matched in list context and the best match in
scalar context.



( run in 0.446 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )