Encode-Guess-Educated
view release on metacpan or search on metacpan
lib/Encode/Guess/Educated.pm view on Meta::CPAN
_validate_list_context();
return map { $_->[0] }
sort { $a->[1] cmp $b->[1] }
map { [ $_ => lc str2nummistr($_) ] }
@_
;
}
# Return the argument list with repeated values dropped.  The first
# occurrence of each value is kept and the original order is preserved.
# Values are compared by their string form, since they serve as hash keys.
sub uniq {
    _validate_private_method();
    _validate_list_context();
    my %already_seen;
    # The post-increment is false only the first time a value shows up.
    return grep { !$already_seen{$_}++ } @_;
}
# Return a copy of the argument (or of $_ when called without one) in
# which every non-ASCII character is spelled out as a \N{...} escape.
# The escape carries the character name reported by charnames::viacode,
# or a U+XXXX code point when no name is available.
sub uniquote(_) {
    _validate_argc(@_ => 1);
    my $text = $_[0];
    _validate_nonref($text);
    $text =~ s{ ( \P{ASCII} ) }
              {
                  my $codepoint = ord $1;
                  my $label = charnames::viacode($codepoint)
                           || sprintf("U+%04X", $codepoint);
                  '\\N{' . $label . '}';
              }xge;
    return $text;
}
# Report the current value of the package variable $DEBUG.
sub debugging() {
    _validate_private_method();
    our $DEBUG;
    return $DEBUG;
}
# Name of the subroutine that called whoami(), as reported by caller().
# The list slice is kept so that a missing frame yields an empty list in
# list context.
sub whoami() {
    return (caller 1)[3];
}
# Name of the subroutine one level above the caller of whowasi(), as
# reported by caller().  The list slice is kept so that a missing frame
# yields an empty list in list context.
sub whowasi() {
    return (caller 2)[3];
}
# Emit a printf-style diagnostic on STDOUT when debugging() is true.
#
# Arguments: a printf format string followed by the values it consumes.
# The message is prefixed with "DEBUG(<sub>): ", where <sub> is whatever
# whowasi() reports, and a newline is appended when the format string does
# not already end in one.  Produces no output when debugging() is false.
sub debug {
    _validate_private_method();
    return unless debugging();
    my($fmt, @args) = @_;
    # whowasi() must be called directly from here: it looks a fixed number
    # of frames up the call stack.
    my $subname = whowasi();
    printf STDOUT "DEBUG(%s): $fmt", $subname, @args;
    # Name the handle explicitly.  A bare print writes to the currently
    # select()ed handle, which is not necessarily STDOUT, so the newline
    # could otherwise end up on a different stream than the message.
    print STDOUT "\n" unless $fmt =~ /\n\z/;
}
########################################################################
########################################################################
########################################################################
# Class initializers
UNITCHECK {
####################################
# Incidence of non-ASCII code points in PubMed Open Access as of December 2010.
#
# Table is UCA sorted and formatted using the dump_training_data
# object method, because sorting on anything else is trivial, so the
# hard one is the default.
####################################
my %oed2_training = (
0x000314 => 241, # â Ì gc=Mn sc=Inherited COMBINING REVERSED COMMA ABOVE
0x000301 => 325, # â Ì gc=Mn sc=Inherited COMBINING ACUTE ACCENT
0x000300 => 2, # â Ì gc=Mn sc=Inherited COMBINING GRAVE ACCENT
0x000306 => 2214, # â Ì gc=Mn sc=Inherited COMBINING BREVE
0x000302 => 201, # â Ì gc=Mn sc=Inherited COMBINING CIRCUMFLEX ACCENT
0x00030C => 5, # â Ì gc=Mn sc=Inherited COMBINING CARON
0x000308 => 5, # â Ì gc=Mn sc=Inherited COMBINING DIAERESIS
0x000303 => 106, # â Ì gc=Mn sc=Inherited COMBINING TILDE
0x000307 => 28, # â Ì gc=Mn sc=Inherited COMBINING DOT ABOVE
0x000327 => 710, # â ̧ gc=Mn sc=Inherited COMBINING CEDILLA
0x000304 => 129, # â Ì gc=Mn sc=Inherited COMBINING MACRON
0x000320 => 133, # â Ì gc=Mn sc=Inherited COMBINING MINUS SIGN BELOW
0x000336 => 267, # â ̶ gc=Mn sc=Inherited COMBINING LONG STROKE OVERLAY
0x000323 => 6, # â Ì£ gc=Mn sc=Inherited COMBINING DOT BELOW
0x00032D => 15, # â Ì gc=Mn sc=Inherited COMBINING CIRCUMFLEX ACCENT BELOW
0x000345 => 9, # ◌ͅ gc=Mn sc=Inherited COMBINING GREEK YPOGEGRAMMENI
0x000651 => 2, # â Ù gc=Mn sc=Inherited ARABIC SHADDA
0x0020E9 => 2, # â â© gc=Mn sc=Inherited COMBINING WIDE BRIDGE ABOVE
0x0000B4 => 48, # ´ gc=Sk sc=Common ACUTE ACCENT
0x0000AF => 5, # ¯ gc=Sk sc=Common MACRON
0x0002D8 => 4, # Ë gc=Sk sc=Common BREVE
0x0000A8 => 6, # ¨ gc=Sk sc=Common DIAERESIS
0x0000B8 => 1, # ¸ gc=Sk sc=Common CEDILLA
0x002010 => 1205194, # â gc=Pd sc=Common HYPHEN
0x002013 => 163112, # â gc=Pd sc=Common EN DASH
0x002014 => 430, # â gc=Pd sc=Common EM DASH
0x0000B7 => 143383, # · gc=Po sc=Common MIDDLE DOT
0x002018 => 228766, # â gc=Pi sc=Common LEFT SINGLE QUOTATION MARK
0x002019 => 737362, # â gc=Pf sc=Common RIGHT SINGLE QUOTATION MARK
0x002039 => 11, # â¹ gc=Pi sc=Common SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x00203A => 12, # ⺠gc=Pf sc=Common SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x00FF08 => 13, # ï¼ gc=Ps sc=Common FULLWIDTH LEFT PARENTHESIS
0x00FF09 => 13, # ï¼ gc=Pe sc=Common FULLWIDTH RIGHT PARENTHESIS
0x00FF3B => 4, # ï¼» gc=Ps sc=Common FULLWIDTH LEFT SQUARE BRACKET
0x00FF3D => 4, # ï¼½ gc=Pe sc=Common FULLWIDTH RIGHT SQUARE BRACKET
0x00FF5B => 102, # ï½ gc=Ps sc=Common FULLWIDTH LEFT CURLY BRACKET
0x00FF5D => 101, # ï½ gc=Pe sc=Common FULLWIDTH RIGHT CURLY BRACKET
0x0000A7 => 42343, # § gc=So sc=Common SECTION SIGN
0x0000B6 => 235, # ¶ gc=So sc=Common PILCROW SIGN
0x00204B => 13003, # â gc=Po sc=Common REVERSED PILCROW SIGN
0x0000A9 => 4, # © gc=So sc=Common COPYRIGHT SIGN
0x00FF0F => 1, # ï¼ gc=Po sc=Common FULLWIDTH SOLIDUS
0x002030 => 16, # â° gc=Po sc=Common PER MILLE SIGN
0x002020 => 8882, # â gc=Po sc=Common DAGGER
0x002021 => 9, # â¡ gc=Po sc=Common DOUBLE DAGGER
0x002032 => 967, # â² gc=Po sc=Common PRIME
0x002033 => 362, # â³ gc=Po sc=Common DOUBLE PRIME
0x002034 => 24, # â´ gc=Po sc=Common TRIPLE PRIME
0x002038 => 2, # ⸠gc=Po sc=Common CARET
0x0002C8 => 2550, # Ë gc=Lm sc=Common MODIFIER LETTER VERTICAL LINE
( run in 1.912 second using v1.01-cache-2.11-cpan-39bf76dae61 )