Encode-Guess-Educated

 view release on metacpan or  search on metacpan

lib/Encode/Guess/Educated.pm  view on Meta::CPAN

    _validate_list_context();

    return  map  { $_->[0] }
	    sort { $a->[1] cmp $b->[1] }
	    map  { [ $_ => lc str2nummistr($_) ] }
	    @_
	    ;

} 

# uniq(LIST)
#
# Return LIST with repeated values dropped.  The first occurrence of each
# value is kept and the original relative order is preserved.  Values are
# compared by their string form (they are used as hash keys).
sub uniq {
    _validate_private_method();
    _validate_list_context();

    my %already;
    # The post-increment leaves a false value only on the first sighting.
    my @distinct = grep { !$already{$_}++ } @_;
    return @distinct;
}

# uniquote(STRING)
#
# Return a copy of STRING in which every non-ASCII character has been
# rewritten as a \N{...} escape: \N{CHARACTER NAME} when charnames knows a
# name for the code point, otherwise \N{U+XXXX} (at least four hex digits).
# ASCII characters pass through untouched.  The (_) prototype lets the
# argument default to $_ when the sub is called without one.
sub uniquote(_) {
    _validate_argc(@_ => 1);

    my($str) = @_;
    _validate_nonref($str);

    $str =~ s{ ( \P{ASCII} ) }
	     {
		my $codepoint = ord $1;
		my $label     = charnames::viacode($codepoint);
		# No name available for this code point: use the U+ notation.
		$label = sprintf("U+%04X", $codepoint) unless $label;
		'\\N{' . $label . '}';
	    }xge;
    return $str;
}

# debugging()
#
# Return the current value of the package variable $DEBUG, which the
# debug() routine consults to decide whether to print anything.
sub debugging() {
    _validate_private_method();
    our $DEBUG;
    return $DEBUG;
}

# Call-stack helpers.  Both take no arguments and return the subroutine
# name (element 3 of caller's return list) for a frame above themselves:
#   whoami()  - frame 1: the sub that called whoami()
#   whowasi() - frame 2: the sub that called the sub that called whowasi()
sub whoami() {
    return (caller(1))[3];
}

sub whowasi() {
    return (caller(2))[3];
}

# debug(FORMAT, ARGS...)
#
# Print a printf-style diagnostic on STDOUT, prefixed with "DEBUG(name): "
# where name is the subroutine that called debug().  Does nothing unless
# debugging() returns true.  A newline is appended when FORMAT does not
# already end in one.
sub debug {
    _validate_private_method();
    return unless debugging();
    my($fmt, @args) = @_;
    # whowasi() has nothing to report when there is no subroutine frame two
    # levels up; substitute a label so printf does not warn about undef.
    my $subname = whowasi() // "(top level)";
    printf STDOUT "DEBUG(%s): $fmt", $subname, @args;
    # Name STDOUT explicitly: a bare print goes to the currently select()ed
    # handle, which would separate the newline from the message above.
    print STDOUT "\n" unless $fmt =~ /\n\z/;
}

########################################################################
########################################################################
########################################################################

# Class initializers

UNITCHECK { 

####################################
# Incidence of non-ASCII code points in PubMed Open Access as of December 2010.
#
# Table is UCA sorted and formatted using the dump_training_data
# object method, because sorting on anything else is trivial, so the 
# hard one is the default.
####################################

my %oed2_training = (
    0x000314 =>     241,   # ◌ ̔  gc=Mn   sc=Inherited  COMBINING REVERSED COMMA ABOVE
    0x000301 =>     325,   # ◌ ́  gc=Mn   sc=Inherited  COMBINING ACUTE ACCENT
    0x000300 =>       2,   # ◌ ̀  gc=Mn   sc=Inherited  COMBINING GRAVE ACCENT
    0x000306 =>    2214,   # ◌ ̆  gc=Mn   sc=Inherited  COMBINING BREVE
    0x000302 =>     201,   # ◌ ̂  gc=Mn   sc=Inherited  COMBINING CIRCUMFLEX ACCENT
    0x00030C =>       5,   # ◌ ̌  gc=Mn   sc=Inherited  COMBINING CARON
    0x000308 =>       5,   # ◌ ̈  gc=Mn   sc=Inherited  COMBINING DIAERESIS
    0x000303 =>     106,   # ◌ ̃  gc=Mn   sc=Inherited  COMBINING TILDE
    0x000307 =>      28,   # ◌ ̇  gc=Mn   sc=Inherited  COMBINING DOT ABOVE
    0x000327 =>     710,   # ◌ ̧  gc=Mn   sc=Inherited  COMBINING CEDILLA
    0x000304 =>     129,   # ◌ ̄  gc=Mn   sc=Inherited  COMBINING MACRON
    0x000320 =>     133,   # ◌ ̠  gc=Mn   sc=Inherited  COMBINING MINUS SIGN BELOW
    0x000336 =>     267,   # ◌ ̶  gc=Mn   sc=Inherited  COMBINING LONG STROKE OVERLAY
    0x000323 =>       6,   # ◌ ̣  gc=Mn   sc=Inherited  COMBINING DOT BELOW
    0x00032D =>      15,   # ◌ ̭  gc=Mn   sc=Inherited  COMBINING CIRCUMFLEX ACCENT BELOW
    0x000345 =>       9,   # ◌ ͅ  gc=Mn   sc=Inherited  COMBINING GREEK YPOGEGRAMMENI
    0x000651 =>       2,   # ◌ ّ  gc=Mn   sc=Inherited  ARABIC SHADDA
    0x0020E9 =>       2,   # ◌ ⃩  gc=Mn   sc=Inherited  COMBINING WIDE BRIDGE ABOVE
    0x0000B4 =>      48,   #  ´  gc=Sk   sc=Common     ACUTE ACCENT
    0x0000AF =>       5,   #  ¯  gc=Sk   sc=Common     MACRON
    0x0002D8 =>       4,   #  ˘  gc=Sk   sc=Common     BREVE
    0x0000A8 =>       6,   #  ¨  gc=Sk   sc=Common     DIAERESIS
    0x0000B8 =>       1,   #  ¸  gc=Sk   sc=Common     CEDILLA
    0x002010 => 1205194,   #  ‐  gc=Pd   sc=Common     HYPHEN
    0x002013 =>  163112,   #  –  gc=Pd   sc=Common     EN DASH
    0x002014 =>     430,   #  —  gc=Pd   sc=Common     EM DASH
    0x0000B7 =>  143383,   #  ·  gc=Po   sc=Common     MIDDLE DOT
    0x002018 =>  228766,   #  ‘  gc=Pi   sc=Common     LEFT SINGLE QUOTATION MARK
    0x002019 =>  737362,   #  ’  gc=Pf   sc=Common     RIGHT SINGLE QUOTATION MARK
    0x002039 =>      11,   #  ‹  gc=Pi   sc=Common     SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x00203A =>      12,   #  ›  gc=Pf   sc=Common     SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x00FF08 =>      13,   #  （ gc=Ps   sc=Common     FULLWIDTH LEFT PARENTHESIS
    0x00FF09 =>      13,   #  ） gc=Pe   sc=Common     FULLWIDTH RIGHT PARENTHESIS
    0x00FF3B =>       4,   #  ［ gc=Ps   sc=Common     FULLWIDTH LEFT SQUARE BRACKET
    0x00FF3D =>       4,   #  ］ gc=Pe   sc=Common     FULLWIDTH RIGHT SQUARE BRACKET
    0x00FF5B =>     102,   #  ｛ gc=Ps   sc=Common     FULLWIDTH LEFT CURLY BRACKET
    0x00FF5D =>     101,   #  ｝ gc=Pe   sc=Common     FULLWIDTH RIGHT CURLY BRACKET
    0x0000A7 =>   42343,   #  §  gc=So   sc=Common     SECTION SIGN
    0x0000B6 =>     235,   #  ¶  gc=So   sc=Common     PILCROW SIGN
    0x00204B =>   13003,   #  ⁋  gc=Po   sc=Common     REVERSED PILCROW SIGN
    0x0000A9 =>       4,   #  ©  gc=So   sc=Common     COPYRIGHT SIGN
    0x00FF0F =>       1,   #  ／ gc=Po   sc=Common     FULLWIDTH SOLIDUS
    0x002030 =>      16,   #  ‰  gc=Po   sc=Common     PER MILLE SIGN
    0x002020 =>    8882,   #  †  gc=Po   sc=Common     DAGGER
    0x002021 =>       9,   #  ‡  gc=Po   sc=Common     DOUBLE DAGGER
    0x002032 =>     967,   #  ′  gc=Po   sc=Common     PRIME
    0x002033 =>     362,   #  ″  gc=Po   sc=Common     DOUBLE PRIME
    0x002034 =>      24,   #  ‴  gc=Po   sc=Common     TRIPLE PRIME
    0x002038 =>       2,   #  ‸  gc=Po   sc=Common     CARET
    0x0002C8 =>    2550,   #  ˈ  gc=Lm   sc=Common     MODIFIER LETTER VERTICAL LINE



( run in 1.912 second using v1.01-cache-2.11-cpan-39bf76dae61 )