Text-TinySegmenter
view release on metacpan or search on metacpan
lib/Text/TinySegmenter.pm view on Meta::CPAN
package Text::TinySegmenter;
use 5.8.1;
use strict;
use warnings;
use utf8;
our $VERSION = '0.01';
# Character-class patterns and the type code each one maps to:
#   M = kanji numerals          H = kanji (plus the marks listed in the class)
#   I = hiragana                K = katakana (full-width and half-width)
#   A = Latin letters (ASCII and full-width)
#   N = digits (ASCII and full-width)
#
# The order of this table is significant. Every kanji numeral in the "M"
# class also lies inside the "H" range, so "M" has to be tried first.
# Building the lookup list from a hash (whose iteration order Perl does not
# guarantee) made the classification of those characters vary between runs.
my @PatternTable = (
    [ "[一二三四五六七八九十百千万億兆]" => "M" ],
    [ "[一-龠々〆ヵヶ]" => "H" ],
    [ "[ぁ-ん]" => "I" ],
    [ "[ァ-ヴーｱ-ﾝﾞｰ]" => "K" ],
    [ "[a-zA-Zａ-ｚＡ-Ｚ]" => "A" ],
    [ "[0-9０-９]" => "N" ],
);

# Pattern => type code view of the same data, kept so that any code that
# still refers to %Patterns continues to work.
my %Patterns = map { $_->[0] => $_->[1] } @PatternTable;

# Compiled [regex, type code] pairs in priority order; _ctype() returns the
# type code of the first regex that matches.
my @CharType = map { [ qr/$_->[0]/, $_->[1] ] } @PatternTable;
# Scoring constants: each table maps a short string key to an integer weight.
# NOTE(review): the code that consumes these values is outside this excerpt
# (presumably segment()), so the exact role of each table should be confirmed
# there.
# $BIAS: presumably the base value the weights are added to -- TODO confirm.
my $BIAS = -332;
# BC1..BC3: keys are pairs of character-type codes as returned by _ctype()
# (H, I, K, M, A, N, or the fallback O).
my %BC1 = ("HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378);
my %BC2 = ("AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920);
my %BC3 = ("HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266);
# BP1..BP2: keys are pairs drawn from B, O and U. segment() initialises its
# $p1/$p2 state to "U"; presumably these letters are earlier boundary
# decisions -- TODO confirm.
my %BP1 = ("BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352);
my %BP2 = ("BO" => 60,"OO" => -1762);
# BQ1..BQ4: three-letter keys; the first letter is B, O or U and the other two
# appear to be character-type codes.
my %BQ1 = ("BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965);
my %BQ2 = ("BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146);
my %BQ3 = ("BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699);
my %BQ4 = ("BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973);
my %BW1 = (",ã¨" => 660,",å" => 727,"B1ã" => 1404,"B1å" => 542,"ãã¨" => 660,"ãå" => 727,"ãã¨" => 1682,"ãã£" => 1505,"ãã" => 1743,"ãã£" => -2055,"ãã" => 672,"ãã" => -4817,"ãã" => 665,"ãã" => 3472,"ãã" => ...
my %BW2 = (".." => -11822,"11" => -669,"ââ" => -5730,"ââ" => -13175,"ãã" => -1609,"ãã" => 2490,"ãã" => -1350,"ãã" => -602,"ãã" => -7194,"ãã" => 4612,"ãã" => 853,"ãã" => -3198,"ãã" => 1941,"ããª" => -1597,...
my %BW3 = ("ãã" => -2194,"ãã" => 719,"ãã" => 3846,"ã." => -1185,"ãã" => -1185,"ãã" => 5308,"ãã" => 2079,"ãã" => 3029,"ãã" => 2056,"ãã£" => 1883,"ãã" => 5600,"ãã" => 1527,"ãã¡" => 1117,"ãã¨" => 4798,"...
my %TC1 = ("AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832);
my %TC2 = ("HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649);
my %TC3 = ("AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2...
my %TC4 = ("HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 30...
my %TQ1 = ("BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68);
my %TQ2 = ("BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591);
my %TQ3 = ("BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH...
my %TQ4 = ("BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8...
my %TW1 = ("ã«ã¤ã" => -4681,"æ±äº¬é½" => 2026);
my %TW2 = ("ããç¨" => -2049,"ãã£ã" => -1256,"ããã" => -2434,"ããã" => 3873,"ãã®å¾" => -4430,"ã ã£ã¦" => -1049,"ã¦ãã" => 1833,"ã¨ãã¦" => -4657,"ã¨ãã«" => -4517,"ãã®ã§" => 1882,"䏿°ã«" => -792,"åãã¦" ...
my %TW3 = ("ããã " => -1734,"ãã¦ã" => 1314,"ã¨ãã¦" => -4314,"ã«ã¤ã" => -5483,"ã«ã¨ã£" => -5989,"ã«å½ã" => -6247,"ã®ã§," => -727,"ã®ã§ã" => -727,"ã®ãã®" => -600,"ããã" => -3752,"åäºæ" => -2287);
my %TW4 = ("ãã." => 8576,"ããã" => 8576,"ãããª" => -2348,"ãã¦ã" => 2958,"ãã," => 1516,"ããã" => 1516,"ã¦ãã" => 1538,"ã¨ãã" => 1349,"ã¾ãã" => 5543,"ã¾ãã" => 1097,"ããã¨" => -4258,"ããã¨" => 5865);
# Single-position weight tables (string key => integer weight).
# NOTE(review): the code that reads them is outside this excerpt; confirm the
# meaning of the numeric suffixes against segment().
# UC1..UC6: keys are single character-type codes as returned by _ctype()
# (H, I, K, M, A, N, or the fallback O).
my %UC1 = ("A" => 484,"K" => 93,"M" => 645,"O" => -505);
my %UC2 = ("A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646);
my %UC3 = ("A" => -1370,"I" => 2311);
my %UC4 = ("A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646);
my %UC5 = ("H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831);
my %UC6 = ("H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387);
# UP1..UP3: keys are the single letters B or O; presumably an earlier
# boundary decision -- TODO confirm.
my %UP1 = ("O" => -214);
my %UP2 = ("B" => 69,"O" => 935);
my %UP3 = ("B" => 189);
# UQ1..UQ3: two-letter keys; the first letter is B or O and the second appears
# to be a character-type code.
my %UQ1 = ("BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422);
my %UQ2 = ("BH" => 216,"BI" => 113,"OK" => 1759);
my %UQ3 = ("BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212);
my %UW1 = ("," => 156,"ã" => 156,"ã" => -463,"ã" => -941,"ã" => -127,"ã" => -553,"ã" => 121,"ã" => 505,"ã§" => -201,"ã¨" => -547,"ã©" => -123,"ã«" => -789,"ã®" => -185,"ã¯" => -847,"ã" => -466,"ã" => -470,"ã" => 182,"ã" =>...
my %UW2 = ("," => -829,"ã" => -829,"ã" => 892,"ã" => -645,"ã" => 3145,"ã" => -538,"ã" => 505,"ã" => 134,"ã" => -502,"ã" => 1454,"ã" => -856,"ã" => -412,"ã" => 1141,"ã" => 878,"ã" => 540,"ã" => 1529,"ã" => -675,"ã" =>...
my %UW3 = ("," => 4889,"1" => -800,"â" => -1723,"ã" => 4889,"ã
" => -2311,"ã" => 5827,"ã" => 2670,"ã" => -3573,"ã" => -2696,"ã" => 1006,"ã" => 2342,"ã" => 1983,"ã" => -4864,"ã" => -1163,"ã" => 3271,"ã" => 1004,"ã" => 388,...
my %UW4 = ("," => 3930,"." => 3508,"â" => -4841,"ã" => 3930,"ã" => 3508,"ã" => 4999,"ã" => 1895,"ã" => 3798,"ã" => -5156,"ã" => 4752,"ã" => -3435,"ã" => -640,"ã" => -2514,"ã" => 2405,"ã" => 530,"ã" => 6006,"ã" => -4482,"...
my %UW5 = ("," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"ã" => 465,"ã" => -299,"ã" => 363,"ã" => 1655,"ã" => 331,"ã" => -503,"ã" => 1199,"ã" => 527,"ã" => 647,"ã" => -421,"ã" => 1624,"ã" => 1971,"ã" => 312,...
my %UW6 = ("," => 227,"." => 808,"1" => -270,"E1" => 306,"ã" => 227,"ã" => 808,"ã" => -307,"ã" => 189,"ã" => 241,"ã" => -73,"ã" => -121,"ã" => -200,"ã" => 1782,"ã" => 383,"ã" => -428,"ã£" => 573,"ã¦" => -1014,"ã§" => 101,"ã¨...
# Classify a string (callers in this file pass one character at a time).
# Walks @CharType in order and returns the type code paired with the first
# regex that matches; returns "O" when no entry matches.
sub _ctype {
    my ($char) = @_;
    foreach my $entry (@CharType) {
        my ($regex, $code) = @{$entry};
        return $code if $char =~ $regex;
    }
    return "O";
}
# Normalise a table lookup result: returns the argument when it is true,
# and 0 for anything false (undef, missing argument, 0, or the empty string).
sub _ts {
    my ($score) = @_;
    return $score ? $score : 0;
}
sub segment {
my ($class, $input) = @_;
if (!defined $input || $input eq '') {
return wantarray ? () : [];
}
my @result;
my @seg = ("B3","B2","B1");
my @ctype = ("O","O","O");
my @o = split //, $input;
for my $c (@o) {
push @seg, $c;
push @ctype, _ctype($c);
}
push @seg, "E1";
push @seg, "E2";
push @seg, "E3";
push @ctype, "O";
push @ctype, "O";
push @ctype, "O";
my $word = $seg[3];
my $p1 = "U";
my $p2 = "U";
( run in 1.350 second using v1.01-cache-2.11-cpan-437f7b0c052 )