Encode
view release on metacpan or search on metacpan
=head3 decode_utf8
$string = decode_utf8($octets [, CHECK]);
B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
Do not use it for data exchange.
Unless you want Perl's older "lax" mode, prefer
C<$string = decode("UTF-8", $octets [, CHECK])>.
Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by $octets is decoded
from (loose, not strict) utf8 into a sequence of logical characters.
Because not every sequence of octets is valid even under the lax
(non-strict) utf8 rules, it is quite possible for this function to fail.
For CHECK, see L</"Handling Malformed Data">.
B<CAVEAT>: the input I<$octets> might be modified in-place depending on
what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.
=head2 Listing available encodings
=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
=back
For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
When you decode, C<\xI<HH>> is inserted for a malformed character, where
I<HH> is the hex representation of the octet that could not be decoded to
utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
the Unicode code point (in any number of hex digits) of the character that
cannot be found in the character repertoire of the encoding.
The HTML/XML character reference modes are about the same. In place of
C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
=head2 coderef for CHECK
As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
ordinal value of the unmapped character as an argument and returns
octets that represent the fallback character. For instance:
$ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
This acts like C<FB_PERLQQ>, but C<E<lt>U+I<XXXX>E<gt>> is used instead of C<\x{I<XXXX>}>.
A fallback for C<decode> takes a list of ordinal values (the octets of the
malformed sequence) as its arguments and must return a decoded string
(a sequence of characters). So for
example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
a fallback for bytes that are not valid UTF-8, you could write
$str = decode 'UTF-8', $octets, sub {
my $tmp = join '', map chr, @_;
return decode 'ISO-8859-15', $tmp;
};
=head1 Defining Encodings
if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
STRLEN sublen;
char *substr;
SV* subchar =
(fallback_cb != &PL_sv_undef)
? do_fallback_cb(aTHX_ ch, fallback_cb)
: newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04" UVxf "}" :
check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
"&#x%" UVxf ";", (UV)ch);
substr = SvPV(subchar, sublen);
if (SvUTF8(subchar) && sublen && !utf8_to_bytes((U8 *)substr, &sublen)) { /* make sure no decoded string gets in */
SvREFCNT_dec(subchar);
croak("Wide character");
}
sdone += slen + clen;
ddone += dlen + sublen;
sv_catpvn(dst, substr, sublen);
SvREFCNT_dec(subchar);
} else {
/* fallback char */
sdone += slen + clen;
SV* subchar;
if (encode) {
subchar =
(fallback_cb != &PL_sv_undef)
? do_fallback_cb(aTHX_ uv, fallback_cb)
: newSVpvf(check & ENCODE_PERLQQ
? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
: check & ENCODE_HTMLCREF ? "&#%" UVuf ";"
: "&#x%" UVxf ";", uv);
substr = SvPV(subchar, sublen);
if (SvUTF8(subchar) && sublen && !utf8_to_bytes((U8 *)substr, &sublen)) { /* make sure no decoded string gets in */
SvREFCNT_dec(subchar);
croak("Wide character");
}
} else {
if (fallback_cb != &PL_sv_undef) {
/* in decode mode we have sequence of wrong bytes */
subchar = do_bytes_fallback_cb(aTHX_ s, ulen, fallback_cb);
} else {
char *ptr = esc;
/* ENCODE_PERLQQ is already stored in esc */
+-------------- Unicode Character ID in hex
The format is roughly the same as a header section except for the
fallback flag: | followed by 0..3. The meaning of the possible
values is as follows:
=over 4
=item |0
Round trip safe. A character decoded to Unicode encodes back to the
same byte sequence. Most characters have this flag.
=item |1
Fallback for unicode -> encoding. When seen, enc2xs adds this
character for the encode map only.
=item |2
Skip sub-char mapping should there be no code point.
while (<$ifh>) {
Encode::from_to( $_, $from, $to, $Opt{check} );
print;
}
}
}
elsif ( $scheme eq 'decode_encode' ) { # step-by-step
if ($need2slurp){
local $/;
$_ = <$ifh>;
my $decoded = decode( $from, $_, $Opt{check} );
my $encoded = encode( $to, $decoded );
print $encoded;
}else{
while (<$ifh>) {
my $decoded = decode( $from, $_, $Opt{check} );
my $encoded = encode( $to, $decoded );
print $encoded;
}
}
}
else { # won't reach
die "$name: unknown scheme: $scheme";
}
}
}
else {
lib/Encode/MIME/Header.pm view on Meta::CPAN
my $orig = $2;
my $charset = $3;
my ($mime_enc, $text) = split /\?/, $5;
$text =~ tr/\r\n//d;
my $enc = Encode::find_mime_encoding($charset);
# in non strict mode allow also perl encoding aliases
if ( not defined $enc and not $STRICT_DECODE ) {
# make sure that decoded string will be always strict UTF-8
$charset = 'UTF-8' if lc($charset) eq 'utf8';
$enc = Encode::find_encoding($charset);
}
if ( not defined $enc ) {
Carp::croak qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
Carp::carp qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
$stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
$output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
$stop ? $orig : '';
} else {
if ( uc($mime_enc) eq 'B' and $obj->{decode_b} ) {
my $decoded = _decode_b($enc, $text, $chk);
$stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
$output .= (defined $decoded ? $decoded : $text) unless $stop;
$stop ? $orig : '';
} elsif ( uc($mime_enc) eq 'Q' and $obj->{decode_q} ) {
my $decoded = _decode_q($enc, $text, $chk);
$stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
$output .= (defined $decoded ? $decoded : $text) unless $stop;
$stop ? $orig : '';
} else {
Carp::croak qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
Carp::carp qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
$stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
$output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
$stop ? $orig : '';
}
}
lib/Encode/MIME/Header.pm view on Meta::CPAN
encoding names supported by this module: C<MIME-Header>, C<MIME-B> and
C<MIME-Q>.
=head1 DESCRIPTION
The decode method takes the unstructured field body of an email header (or
L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token) as its input and
decodes each MIME encoded-word from input string to a sequence of bytes
according to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Subsequently, each sequence
of bytes with the corresponding MIME charset is decoded with
L<the Encode module|Encode> and finally, one output string is returned. Text
parts of the input string which do not contain MIME encoded-word stay
unmodified in the output string. Folded newlines between two consecutive MIME
encoded-words are discarded, others are preserved in the output string.
C<MIME-B> can decode the Base64 variant, C<MIME-Q> can decode the Quoted-Printable
variant and C<MIME-Header> can decode both of them. If the L<Encode module|Encode>
does not support a particular MIME charset or the chosen variant, then an action based
on the L<CHECK flags|Encode/Handling Malformed Data> is performed (by default, the
MIME encoded-word is not decoded).
The encode method takes a scalar string as its input and uses the
L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder to encode it to UTF-8
bytes. The sequence of UTF-8 bytes is then encoded into MIME encoded-words
(C<MIME-Header> and C<MIME-B> use the Base64 variant while C<MIME-Q> uses the
Quoted-Printable variant) where each MIME encoded-word is limited to 75
characters. The MIME encoded-words are separated by C<CRLF SPACE> and joined into
one output string. The output string is suitable for the unstructured field body of
an email header.
t/mime_header_iso2022jp.t view on Meta::CPAN
"´Á»ú¡¢¥«¥¿¥«¥Ê¡¢¤Ò¤é¤¬¤Ê¤Îº®¤¸¤Ã¤¿Subject Header."
=> "=?ISO-2022-JP?B?GyRCNEE7eiEiJSslPyUrJUohIiRSJGkkLCRKJE46LiQ4JEMkPxsoQlN1?=\n =?ISO-2022-JP?B?YmplY3Q=?= Header.",
);
# For every fixture, also register a newline-terminated variant: the key
# gets a trailing "\n" and so does its expected encoded value.
for my $k (keys %mime){
$mime{"$k\n"} = $mime{$k} . "\n";
}
# Round-trip every fixture (in sorted key order so the run is deterministic).
# Keys of %mime are the euc-jp source strings, values the expected headers.
for my $decoded (sort keys %mime){
my $encoded = $mime{$decoded};
# euc-jp octets -> characters -> MIME-Header-ISO_2022_JP encoded header
my $header = Encode::encode('MIME-Header-ISO_2022_JP', decode('euc-jp', $decoded));
# Decode the freshly produced header back with the generic MIME-Header decoder.
my $utf8 = Encode::decode('MIME-Header', $header);
# Decoding must reproduce the original text once re-encoded to euc-jp ...
is(encode('euc-jp', $utf8), $decoded);
# ... and the produced header must match the expected fixture value.
is($header, $encoded);
}
__END__
t/utf8warnings.t view on Meta::CPAN
# Twelve assertions: four groups of three (croak, error message, fallback
# coderef arguments).
plan tests => 12;

# Receives the arguments passed to the fallback coderef in each group.
# It is cleared before every coderef call so that a callback which is never
# invoked cannot pass by accident on values left over from an earlier group.
my @invalid;

# --- strict UTF-8 encoder: surrogate code point U+D800 ---
ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8';
like $@, qr/^"\\x\{d800\}" does not map to UTF-8 at $script line /, 'Error message contains strict UTF-8 name';

@invalid = ();
encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800';

# --- strict UTF-8 decoder: complete (3-byte) surrogate sequence ---
ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is not decoded with strict UTF-8 decoder';
like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode at $script line /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';

@invalid = ();
decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80';

# --- strict UTF-8 decoder: truncated (2-byte) sequence ---
ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder';
like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode at $script line /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';

@invalid = ();
decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';

# --- non-strict utf8 decoder: truncated (2-byte) sequence ---
ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder';
like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode at $script line /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence';

# Reset first: the expected value here is identical to the one captured by
# the strict-decoder group, so stale contents would satisfy the assertion.
@invalid = ();
decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';
( run in 0.289 second using v1.01-cache-2.11-cpan-0d8aa00de5b )