Unicode-Tussle
view release on metacpan or search on metacpan
script/uniquote view on Meta::CPAN
}
debug("setting encoding on $handle to $enc_name") if $enc_name;
return 1 if eval {
use warnings FATAL => "all";
no strict "refs";
binmode($handle, ":$enc_name") if $enc_name;
1;
};
for ($@) {
s/ at .* line \d+\.//;
s/$/ for $path/;
}
yuck("set_encoding: $@");
return undef;
}
sub find_converters() {
state $aref = [
qw{
names hex octal backslash
bytes
html4
html5
xml
}
];
return wantarray() ? @$aref : $aref;
}
sub convert_to_names(_) {
my $string = shift();
no warnings "utf8";
$string =~ s/($Ugly_RX)/sprintf("\\N{%s}", num2name(ord $1))/ge;
return $string;
}
sub convert_to_backslashes(_) {
local $_ = shift();
s,\\,\\\\,g if $Opt{backslashes};
s,\0,\\0,g;
s,\a,\\a,g;
s,\t,\\t,g;
s,\r,\\r,g;
# s,\n,\\n,g;
s,\f,\\f,g;
s,\e,\\e,g;
s/(?!\n)([\0-\37\177])/sprintf("\\c%s", chr(ord($1) ^ 64))/ge;
return $_;
}
UNITCHECK {
my %html4_2unicode = (
# Number aliases: these are \p{Other_Number}
"sup1" => "SUPERSCRIPT ONE", # ¹ U+00B9
"sup2" => "SUPERSCRIPT TWO", # ² U+00B2
"sup3" => "SUPERSCRIPT THREE", # ³ U+00B3
"frac12" => "VULGAR FRACTION ONE HALF", # ½ U+00BD
"frac14" => "VULGAR FRACTION ONE QUARTER", # ¼ U+00BC
"frac34" => "VULGAR FRACTION THREE QUARTERS", # ¾ U+00BE
# Currency sign aliases: \p{Currency_Symbol}
"curren" => "CURRENCY SIGN", # ¤ U+00A4
"cent" => "CENT SIGN", # ¢ U+00A2
"pound" => "POUND SIGN", # £ U+00A3
"yen" => "YEN SIGN", # ¥ U+00A5
"euro" => "EURO SIGN", # ⬠U+20AC
# Latin letter aliases in NFC and grouped by first letter
#
# NOTE: some like BLACK LETTER blah and the trademark
# symbol are only Latin in NFKD form.
"ordf" => "FEMININE ORDINAL INDICATOR", # ª U+00AA
"Aacute" => "LATIN CAPITAL LETTER A WITH ACUTE", # Ã U+00C1
"aacute" => "LATIN SMALL LETTER A WITH ACUTE", # á U+00E1
"Agrave" => "LATIN CAPITAL LETTER A WITH GRAVE", # Ã U+00C0
"agrave" => "LATIN SMALL LETTER A WITH GRAVE", # Ã U+00E0
"Acirc" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", # Ã U+00C2
"acirc" => "LATIN SMALL LETTER A WITH CIRCUMFLEX", # â U+00E2
"Aring" => "LATIN CAPITAL LETTER A WITH RING ABOVE", # Ã
U+00C5
"aring" => "LATIN SMALL LETTER A WITH RING ABOVE", # å U+00E5
"Auml" => "LATIN CAPITAL LETTER A WITH DIAERESIS", # Ã U+00C4
"auml" => "LATIN SMALL LETTER A WITH DIAERESIS", # ä U+00E4
"Atilde" => "LATIN CAPITAL LETTER A WITH TILDE", # Ã U+00C3
"atilde" => "LATIN SMALL LETTER A WITH TILDE", # ã U+00E3
"AElig" => "LATIN CAPITAL LETTER AE", # Ã U+00C6
"aelig" => "LATIN SMALL LETTER AE", # æ U+00E6
"Ccedil" => "LATIN CAPITAL LETTER C WITH CEDILLA", # Ã U+00C7
"ccedil" => "LATIN SMALL LETTER C WITH CEDILLA", # ç U+00E7
"ETH" => "LATIN CAPITAL LETTER ETH", # Ã U+00D0
"eth" => "LATIN SMALL LETTER ETH", # ð U+00F0
"Eacute" => "LATIN CAPITAL LETTER E WITH ACUTE", # Ã U+00C9
"eacute" => "LATIN SMALL LETTER E WITH ACUTE", # é U+00E9
"Egrave" => "LATIN CAPITAL LETTER E WITH GRAVE", # Ã U+00C8
"egrave" => "LATIN SMALL LETTER E WITH GRAVE", # è U+00E8
"Ecirc" => "LATIN CAPITAL LETTER E WITH CIRCUMFLEX", # Ã U+00CA
"ecirc" => "LATIN SMALL LETTER E WITH CIRCUMFLEX", # ê U+00EA
"Euml" => "LATIN CAPITAL LETTER E WITH DIAERESIS", # Ã U+00CB
"euml" => "LATIN SMALL LETTER E WITH DIAERESIS", # ë U+00EB
"fnof" => "LATIN SMALL LETTER F WITH HOOK", # Æ U+0192
"image" => "BLACK-LETTER CAPITAL I", # â U+2111
"Iacute" => "LATIN CAPITAL LETTER I WITH ACUTE", # Ã U+00CD
"iacute" => "LATIN SMALL LETTER I WITH ACUTE", # Ã U+00ED
script/uniquote view on Meta::CPAN
"Yscr" => "\N{MATHEMATICAL SCRIPT CAPITAL Y}", # ð´
"yopf" => "\N{MATHEMATICAL DOUBLE-STRUCK SMALL Y}", # ðª
"yfr" => "\N{MATHEMATICAL FRAKTUR SMALL Y}", # ð¶
"yscr" => "\N{MATHEMATICAL SCRIPT SMALL Y}", # ð
);
sub convert_to_html4(_) {
state $entity;
if (!$entity && $Opt{names}) {
$entity = {};
while (my($hsym, $usym) = each %html4_2unicode) {
my $codepoint = charnames::vianame($usym) || panic("huh?");
$entity->{ chr $codepoint } = "&$hsym;";
}
debug("entity count: " . keys %$entity);
}
no warnings "utf8";
local $_ = $_[0];
##s/&/&/g;
##s/</</g;
##s/>/>/g;
##s/"/"/g;
# ' is only in XHTML, not HTML4. Be conservative
###s/'/'/g;
s {
( $Ugly_RX )
}{
$entity->{$1} || sprintf("&#%d;", ord $1)
}gxe;
return $_;
}
sub convert_to_html5(_) {
state $entity;
if (!$entity && $Opt{names}) {
$entity = {};
while (my($hsym, $ustr) = each %html5_2unicode) {
$entity->{ $ustr } = "&$hsym;";
}
debug("entity count: " . keys %$entity);
}
no warnings "utf8";
local $_ = $_[0];
s/&/&/g;
s/</</g;
s/>/>/g;
s/"/"/g;
s/'/'/g;
s {
( $Ugly_RX )
}{
$entity->{$1} || sprintf("&#%d;", ord $1)
}gxe;
return $_;
}
} # end UNITCHECK private scope for hash
sub convert_to_xml(_) {
local $_ = shift();
no warnings "utf8";
s/&/&/g;
s/</</g;
s/>/>/g;
s/"/"/g;
s/'/'/g;
s/($Ugly_RX+)/sprintf("&#x%x;", ord $1)/ge;
return $_;
}
sub convert_to_U_numbers(_) {
my $string = shift();
no warnings "utf8";
$string =~ s/($Ugly_RX)/sprintf("\\N{U+%v02X}", $1)/ge;
return $string;;
}
sub convert_to_hex(_) {
my $string = shift();
my $minwidth = $Opt{bytes} ? 2 : 4;
$string =~ s/($Ugly_RX)/sprintf("\\x{%02X}", ord $1)/ge;
return $string;;
}
sub convert_to_octal(_) {
my $string = shift();
no warnings "utf8";
$string =~ s/($Ugly_RX)/sprintf("\\%o", ord $1)/ge;
return $string;;
}
sub convert_to_bytes(_) {
require Encode;
my $string = shift();
$string =~ s/($Ugly_RX)/sprintf("\\x%02X", ord $1)/ge;
return $string;
}
sub commaʼd_list {
my($argc, @arg) = (scalar @_, @_);
my $sepchar = grep(/,/ => @arg) ? ";" : ",";
return q() unless @arg;
return $arg[0] if @arg == 1;
panic("list @arg already has an and in it") if grep /\band\b/, @arg;
return "$arg[0] and $arg[1]" if @arg == 2;
return join "$sepchar " => (
@arg[ 0 .. ($#arg-1) ],
"and $arg[-1]",
);
}
sub usage($) {
( run in 0.914 second using v1.01-cache-2.11-cpan-39bf76dae61 )