HTML-Encoding
view release on metacpan or search on metacpan
lib/HTML/Encoding.pm view on Meta::CPAN
my $map = _get_character_map($e);
my $bom = $map->{BM};
# encoding cannot encode U+FEFF
next unless defined $bom;
# remember match length
$resu{$e} = length $bom if $text =~ /^(\Q$bom\E)/;
}
# does not start with BOM
return unless keys %resu;
# sort by match length, longest match first
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
# in array context return all encodings,
# in scalar context return best match.
return wantarray ? @sort : $sort[0];
}
lib/HTML/Encoding.pm view on Meta::CPAN
my $text = shift;
my %o = @_;
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
my %resu;
return unless defined $text;
return unless length $text;
my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);
# BOM determines encoding
return wantarray ? (bom => \@boms) : $boms[0] if @boms;
# no BOM
my @decls = xml_declaration_from_octets($text, encodings => $encodings);
foreach my $decl (@decls)
{
my $enco = encoding_from_xml_declaration($decl);
$resu{$enco}++ if defined $enco and length $enco;
}
return unless keys %resu;
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
lib/HTML/Encoding.pm view on Meta::CPAN
return wantarray
? @xml
: $xml[0]
if @xml and defined $xml[0];
}
else
{
my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);
# BOM determines encoding
return wantarray ? (bom => \@boms) : $boms[0] if @boms;
}
# no BOM
my @resu;
# sanity check to exclude e.g. UTF-32
my @first = encoding_from_first_chars($text, encodings => $encodings);
# fall back to provided encoding list
@first = @$encodings unless @first;
foreach my $try (@first)
{
lib/HTML/Encoding.pm view on Meta::CPAN
=item encoding_from_byte_order_mark($octets [, %options])
Takes a sequence of octets and attempts to read a byte order mark
at the beginning of the octet sequence. It will go through the list
of $options{encodings}, or the list of default encodings if no
encodings are specified, and match the beginning of the string against
any byte order mark octet sequence found.
The result can be ambiguous; for example, qq(\xFF\xFE\x00\x00) could
be either a complete BOM in UTF-32LE or a UTF-16LE BOM followed by a
U+0000 character. It is also possible that C<$octets> starts with
something that looks like a byte order mark but actually is not.
encoding_from_byte_order_mark sorts the list of possible encodings
by the length of their BOM octet sequence and returns in scalar
context only the encoding with the longest match, and all encodings
ordered by length of their BOM octet sequence in list context.
Examples:
+-------------------------+------------+-----------------------+
| Input | Encodings | Result |
+-------------------------+------------+-----------------------+
| "\xFF\xFE\x00\x00" | default | qw(UTF-32LE) (scalar context) |
| "\xFF\xFE\x00\x00" | default | qw(UTF-32LE UTF-16LE) (list context) |
| "\xEF\xBB\xBF" | default | qw(UTF-8) |
| "Hello World!" | default | undef |
lib/HTML/Encoding.pm view on Meta::CPAN
| Input | Encodings | Result |
+--------------------------------------+-----------+-----------+
| "\x2B\x2F\x76\x38\x41\x39\x67\x2D" | default | undef |
| "\x2B\x2F\x76\x38\x41\x39\x67\x2D" | UTF-7 | undef |
+--------------------------------------+-----------+-----------+
This might change in future versions, although this is not very
relevant for most applications, as there should never be a need to use
UTF-7 in the encoding list for existing documents.
If no BOM can be found it returns C<undef> in scalar context and an
empty list in list context. This routine should not be used with
strings with the UTF-8 flag turned on.
=item encoding_from_xml_declaration($declaration)
Attempts to extract the value of the encoding pseudo-attribute in an XML
declaration or text declaration in the character string $declaration. If
there does not appear to be such a value it returns nothing. This would
typically be used with the return values of xml_declaration_from_octets.
The value's whitespace is normalized in the same way as in encoding_from_content_type.
( run in 0.427 second using v1.01-cache-2.11-cpan-e9daa2b36ef )