Unicode-Tussle

 view release on metacpan or  search on metacpan

script/uniquote  view on Meta::CPAN

    }

    debug("setting encoding on $handle to $enc_name") if $enc_name;

    return 1 if eval {
        use warnings FATAL => "all";
        no strict "refs";
        binmode($handle, ":$enc_name") if $enc_name;
        1;
    };

    for ($@) {
        s/ at .* line \d+\.//;
        s/$/ for $path/;
    }

    yuck("set_encoding: $@");

    return undef;
}

sub find_converters() {
    state $aref = [
        qw{
            names hex octal backslash
            bytes
            html4
            html5
	    xml
        }
    ];

    return wantarray() ? @$aref : $aref;
}

sub convert_to_names(_) {
    my $string = shift();
    no warnings "utf8";
    $string =~ s/($Ugly_RX)/sprintf("\\N{%s}", num2name(ord $1))/ge;
    return $string;
}

sub convert_to_backslashes(_) {

    local $_ = shift();

    s,\\,\\\\,g if $Opt{backslashes};
    s,\0,\\0,g;
    s,\a,\\a,g;
    s,\t,\\t,g;
    s,\r,\\r,g;
  # s,\n,\\n,g;
    s,\f,\\f,g;
    s,\e,\\e,g;
    s/(?!\n)([\0-\37\177])/sprintf("\\c%s", chr(ord($1) ^ 64))/ge;

    return $_;
}


UNITCHECK {

my %html4_2unicode = (

# Number aliases: these are \p{Other_Number}
      "sup1" => "SUPERSCRIPT ONE",                            # ¹ U+00B9
      "sup2" => "SUPERSCRIPT TWO",                            # ² U+00B2
      "sup3" => "SUPERSCRIPT THREE",                          # ³ U+00B3
    "frac12" => "VULGAR FRACTION ONE HALF",                   # ½ U+00BD
    "frac14" => "VULGAR FRACTION ONE QUARTER",                # ¼ U+00BC
    "frac34" => "VULGAR FRACTION THREE QUARTERS",             # ¾ U+00BE

# Currency sign aliases: \p{Currency_Symbol}

    "curren" => "CURRENCY SIGN",                              # ¤ U+00A4
      "cent" => "CENT SIGN",                                  # ¢ U+00A2
     "pound" => "POUND SIGN",                                 # £ U+00A3
       "yen" => "YEN SIGN",                                   # ¥ U+00A5
      "euro" => "EURO SIGN",                                  # € U+20AC

# Latin letter aliases in NFC and grouped by first letter
#
#   NOTE: some like BLACK LETTER blah and the trademark
#         symbol are only Latin in NFKD form.

      "ordf" => "FEMININE ORDINAL INDICATOR",                 # ª U+00AA
    "Aacute" => "LATIN CAPITAL LETTER A WITH ACUTE",          # Á U+00C1
    "aacute" => "LATIN SMALL LETTER A WITH ACUTE",            # á U+00E1
    "Agrave" => "LATIN CAPITAL LETTER A WITH GRAVE",          # À U+00C0
    "agrave" => "LATIN SMALL LETTER A WITH GRAVE",            # à U+00E0
     "Acirc" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",     # Â U+00C2
     "acirc" => "LATIN SMALL LETTER A WITH CIRCUMFLEX",       # â U+00E2
     "Aring" => "LATIN CAPITAL LETTER A WITH RING ABOVE",     # Ã… U+00C5
     "aring" => "LATIN SMALL LETTER A WITH RING ABOVE",       # å U+00E5
      "Auml" => "LATIN CAPITAL LETTER A WITH DIAERESIS",      # Ä U+00C4
      "auml" => "LATIN SMALL LETTER A WITH DIAERESIS",        # ä U+00E4
    "Atilde" => "LATIN CAPITAL LETTER A WITH TILDE",          # Ã U+00C3
    "atilde" => "LATIN SMALL LETTER A WITH TILDE",            # ã U+00E3
     "AElig" => "LATIN CAPITAL LETTER AE",                    # Æ U+00C6
     "aelig" => "LATIN SMALL LETTER AE",                      # æ U+00E6

    "Ccedil" => "LATIN CAPITAL LETTER C WITH CEDILLA",        # Ç U+00C7
    "ccedil" => "LATIN SMALL LETTER C WITH CEDILLA",          # ç U+00E7

       "ETH" => "LATIN CAPITAL LETTER ETH",                   # Ð U+00D0
       "eth" => "LATIN SMALL LETTER ETH",                     # ð U+00F0

    "Eacute" => "LATIN CAPITAL LETTER E WITH ACUTE",          # É U+00C9
    "eacute" => "LATIN SMALL LETTER E WITH ACUTE",            # é U+00E9
    "Egrave" => "LATIN CAPITAL LETTER E WITH GRAVE",          # È U+00C8
    "egrave" => "LATIN SMALL LETTER E WITH GRAVE",            # è U+00E8
     "Ecirc" => "LATIN CAPITAL LETTER E WITH CIRCUMFLEX",     # Ê U+00CA
     "ecirc" => "LATIN SMALL LETTER E WITH CIRCUMFLEX",       # ê U+00EA
      "Euml" => "LATIN CAPITAL LETTER E WITH DIAERESIS",      # Ë U+00CB
      "euml" => "LATIN SMALL LETTER E WITH DIAERESIS",        # ë U+00EB

      "fnof" => "LATIN SMALL LETTER F WITH HOOK",             # Æ’ U+0192

     "image" => "BLACK-LETTER CAPITAL I",                     # â„‘ U+2111
    "Iacute" => "LATIN CAPITAL LETTER I WITH ACUTE",          # Í U+00CD
    "iacute" => "LATIN SMALL LETTER I WITH ACUTE",            # í U+00ED

script/uniquote  view on Meta::CPAN

	     "Yscr" => "\N{MATHEMATICAL SCRIPT CAPITAL Y}",	# 𝒴
	     "yopf" => "\N{MATHEMATICAL DOUBLE-STRUCK SMALL Y}",	# 𝕪
	      "yfr" => "\N{MATHEMATICAL FRAKTUR SMALL Y}",	# 𝔶
	     "yscr" => "\N{MATHEMATICAL SCRIPT SMALL Y}",	# 𝓎

);

sub convert_to_html4(_) {
    state $entity;
    if (!$entity && $Opt{names}) {
        $entity = {};
        while (my($hsym, $usym) = each %html4_2unicode) {
            my $codepoint = charnames::vianame($usym) || panic("huh?");
            $entity->{ chr $codepoint } = "&$hsym;";
        }
        debug("entity count: " . keys %$entity);
    }
    no warnings "utf8";
    local $_ = $_[0];
    ##s/&/&/g;
    ##s/</&lt;/g;
    ##s/>/&gt;/g;
    ##s/"/&quot;/g;
    # &apos; is only in XHTML, not HTML4.  Be conservative
    ###s/'/&apos;/g;
    s {
        ( $Ugly_RX )
    }{
        $entity->{$1} || sprintf("&#%d;", ord $1)
    }gxe;
    return $_;
}

sub convert_to_html5(_) {
    state $entity;

    if (!$entity && $Opt{names}) {
        $entity = {};
        while (my($hsym, $ustr) = each %html5_2unicode) {
            $entity->{ $ustr } = "&$hsym;";
        }
        debug("entity count: " . keys %$entity);
    }
    no warnings "utf8";
    local $_ = $_[0];

    s/&/&amp;/g;
    s/</&lt;/g;
    s/>/&gt;/g;
    s/"/&quot;/g;
    s/'/&apos;/g;

    s {
        ( $Ugly_RX )
    }{
        $entity->{$1} || sprintf("&#%d;", ord $1)
    }gxe;
    return $_;
}

} # end UNITCHECK private scope for hash

sub convert_to_xml(_) {
    local $_ = shift();
    no warnings "utf8";

    s/&/&amp;/g;
    s/</&lt;/g;
    s/>/&gt;/g;
    s/"/&quot;/g;
    s/'/&apos;/g;

    s/($Ugly_RX+)/sprintf("&#x%x;", ord $1)/ge;
    return $_;
}

sub convert_to_U_numbers(_) {
    my $string = shift();
    no warnings "utf8";
    $string =~ s/($Ugly_RX)/sprintf("\\N{U+%v02X}", $1)/ge;
    return $string;;
}

sub convert_to_hex(_) {
    my $string = shift();
    my $minwidth = $Opt{bytes} ? 2 : 4;
    $string =~ s/($Ugly_RX)/sprintf("\\x{%02X}", ord $1)/ge;
    return $string;;
}

sub convert_to_octal(_) {
    my $string = shift();
    no warnings "utf8";
    $string =~ s/($Ugly_RX)/sprintf("\\%o", ord $1)/ge;
    return $string;;
}

sub convert_to_bytes(_) {
    require Encode;
    my $string = shift();
    $string =~ s/($Ugly_RX)/sprintf("\\x%02X", ord $1)/ge;
    return $string;
}

sub commaʼd_list {
    my($argc, @arg) = (scalar @_, @_);
    my $sepchar = grep(/,/ => @arg) ? ";" : ",";

	return q() unless @arg;
	return $arg[0] if @arg == 1;

	panic("list @arg already has an and in it") if grep /\band\b/, @arg;
	return "$arg[0] and $arg[1]" if @arg == 2;

	return join "$sepchar " => (
		@arg[ 0 .. ($#arg-1) ],
		"and $arg[-1]",
	);
}

sub usage($) {



( run in 0.914 second using v1.01-cache-2.11-cpan-39bf76dae61 )