Text-TinySegmenter

 view release on metacpan or  search on metacpan

lib/Text/TinySegmenter.pm  view on Meta::CPAN

package Text::TinySegmenter;
use 5.8.1;
use strict;
use warnings;
use utf8;

our $VERSION = '0.01';

# Character-class patterns paired with the one-letter type code that _ctype
# reports for a matching character.
#
# This is an ORDERED list on purpose: the first matching entry wins, and the
# kanji numerals of the "M" class also fall inside the general kanji range of
# the "H" class, so "M" must be tried first.  Building the lookup by iterating
# a hash with each() made that order (and therefore the type of every kanji
# numeral) vary from one run to the next.
#
# Halfwidth katakana and fullwidth Latin letters/digits are written as \x{...}
# escapes so that the ranges survive editors and tools that normalise
# character width (which collapses them into duplicates of the ASCII ranges).
my @OrderedPatterns = (
    [ "[一二三四五六七八九十百千万億兆]", "M" ],    # kanji numerals
    [ "[一-龠々〆ヵヶ]",                  "H" ],    # kanji
    [ "[ぁ-ん]",                          "I" ],    # hiragana
    # katakana, plus halfwidth U+FF71-U+FF9D, U+FF9E and U+FF70
    [ "[ァ-ヴー\x{FF71}-\x{FF9D}\x{FF9E}\x{FF70}]", "K" ],
    # ASCII letters, plus fullwidth a-z (U+FF41-U+FF5A) and A-Z (U+FF21-U+FF3A)
    [ "[a-zA-Z\x{FF41}-\x{FF5A}\x{FF21}-\x{FF3A}]", "A" ],
    # ASCII digits, plus fullwidth 0-9 (U+FF10-U+FF19)
    [ "[0-9\x{FF10}-\x{FF19}]", "N" ],
);

# Pattern => type-code lookup, kept so the name stays available to the rest
# of the file.  Hash order is unspecified: never iterate this to build
# @CharType.
my %Patterns = map { @{$_} } @OrderedPatterns;

# Precompiled [regex, type-code] pairs, in matching priority order.
my @CharType;

for my $pair (@OrderedPatterns) {
    my ($pattern, $code) = @{$pair};
    push @CharType, [ qr/$pattern/, $code ];
}

# Model parameters: a scalar bias followed by hash tables that map a feature
# key to an integer weight.  In the tables of this group the keys are built
# from the one-letter character-type codes returned by _ctype ("M", "H", "I",
# "K", "A", "N", "O") and, for BP*/BQ*, from the letters "B", "O" and "U".
# NOTE(review): names and values look like the trained model of the
# TinySegmenter JavaScript library (BC* = character-type bigrams, BP*/BQ* =
# features involving earlier boundary decisions) -- confirm against upstream
# before changing any value by hand.
my $BIAS = -332;
my %BC1 = ("HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378);
my %BC2 = ("AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920);
my %BC3 = ("HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266);
my %BP1 = ("BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352);
my %BP2 = ("BO" => 60,"OO" => -1762);
my %BQ1 = ("BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965);
my %BQ2 = ("BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146);
my %BQ3 = ("BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699);
my %BQ4 = ("BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973);
my %BW1 = (",と" => 660,",同" => 727,"B1あ" => 1404,"B1同" => 542,"、と" => 660,"、同" => 727,"」と" => 1682,"あっ" => 1505,"いう" => 1743,"いっ" => -2055,"いる" => 672,"うし" => -4817,"うん" => 665,"から" => 3472,"がら" => ...
my %BW2 = (".." => -11822,"11" => -669,"――" => -5730,"−−" => -13175,"いう" => -1609,"うか" => 2490,"かし" => -1350,"かも" => -602,"から" => -7194,"かれ" => 4612,"がい" => 853,"がら" => -3198,"きた" => 1941,"くな" => -1597,...
my %BW3 = ("あた" => -2194,"あり" => 719,"ある" => 3846,"い." => -1185,"い。" => -1185,"いい" => 5308,"いえ" => 2079,"いく" => 3029,"いた" => 2056,"いっ" => 1883,"いる" => 5600,"いわ" => 1527,"うち" => 1117,"うと" => 4798,"...
my %TC1 = ("AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832);
my %TC2 = ("HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649);
my %TC3 = ("AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2...
my %TC4 = ("HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 30...
my %TQ1 = ("BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68);
my %TQ2 = ("BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591);
my %TQ3 = ("BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH...
my %TQ4 = ("BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8...
my %TW1 = ("につい" => -4681,"東京都" => 2026);
my %TW2 = ("ある程" => -2049,"いった" => -1256,"ころが" => -2434,"しょう" => 3873,"その後" => -4430,"だって" => -1049,"ていた" => 1833,"として" => -4657,"ともに" => -4517,"もので" => 1882,"一気に" => -792,"初めて" ...
my %TW3 = ("いただ" => -1734,"してい" => 1314,"として" => -4314,"につい" => -5483,"にとっ" => -5989,"に当た" => -6247,"ので," => -727,"ので、" => -727,"のもの" => -600,"れから" => -3752,"十二月" => -2287);
my %TW4 = ("いう." => 8576,"いう。" => 8576,"からな" => -2348,"してい" => 2958,"たが," => 1516,"たが、" => 1516,"ている" => 1538,"という" => 1349,"ました" => 5543,"ません" => 1097,"ようと" => -4258,"よると" => 5865);
# Weight tables with short keys: UC1-UC6 are keyed by a single character-type
# code, UP1-UP3 by a single "B"/"O" letter, and UQ1-UQ3 by a "B"/"O" letter
# followed by a character-type code.  Keys that are absent simply have no
# entry in the hash.
# NOTE(review): presumably the single-position ("unigram") features of the
# TinySegmenter model, with the digit naming the position in the scoring
# window -- confirm against the scoring loop in segment().
my %UC1 = ("A" => 484,"K" => 93,"M" => 645,"O" => -505);
my %UC2 = ("A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646);
my %UC3 = ("A" => -1370,"I" => 2311);
my %UC4 = ("A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646);
my %UC5 = ("H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831);
my %UC6 = ("H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387);
my %UP1 = ("O" => -214);
my %UP2 = ("B" => 69,"O" => 935);
my %UP3 = ("B" => 189);
my %UQ1 = ("BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422);
my %UQ2 = ("BH" => 216,"BI" => 113,"OK" => 1759);
my %UQ3 = ("BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212);
my %UW1 = ("," => 156,"、" => 156,"「" => -463,"あ" => -941,"う" => -127,"が" => -553,"き" => 121,"こ" => 505,"で" => -201,"と" => -547,"ど" => -123,"に" => -789,"の" => -185,"は" => -847,"も" => -466,"や" => -470,"よ" => 182,"ら" =>...
my %UW2 = ("," => -829,"、" => -829,"〇" => 892,"「" => -645,"」" => 3145,"あ" => -538,"い" => 505,"う" => 134,"お" => -502,"か" => 1454,"が" => -856,"く" => -412,"こ" => 1141,"さ" => 878,"ざ" => 540,"し" => 1529,"す" => -675,"せ" =>...
my %UW3 = ("," => 4889,"1" => -800,"−" => -1723,"、" => 4889,"々" => -2311,"〇" => 5827,"」" => 2670,"〓" => -3573,"あ" => -2696,"い" => 1006,"う" => 2342,"え" => 1983,"お" => -4864,"か" => -1163,"が" => 3271,"く" => 1004,"け" => 388,...
my %UW4 = ("," => 3930,"." => 3508,"―" => -4841,"、" => 3930,"。" => 3508,"〇" => 4999,"「" => 1895,"」" => 3798,"〓" => -5156,"あ" => 4752,"い" => -3435,"う" => -640,"え" => -2514,"お" => 2405,"か" => 530,"が" => 6006,"き" => -4482,"...
my %UW5 = ("," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"、" => 465,"。" => -299,"「" => 363,"あ" => 1655,"い" => 331,"う" => -503,"え" => 1199,"お" => 527,"か" => 647,"が" => -421,"き" => 1624,"ぎ" => 1971,"く" => 312,...
my %UW6 = ("," => 227,"." => 808,"1" => -270,"E1" => 306,"、" => 227,"。" => 808,"あ" => -307,"う" => 189,"か" => 241,"が" => -73,"く" => -121,"こ" => -200,"じ" => 1782,"す" => 383,"た" => -428,"っ" => 573,"て" => -1014,"で" => 101,"と...

# _ctype($str)
#
# Classifies $str by testing it against each [regex, code] pair of @CharType
# in array order and returning the code of the first regex that matches.
# Returns "O" when no regex matches.
sub _ctype {
    my ($text) = @_;

    foreach my $entry (@CharType) {
        my ($regex, $code) = @{$entry};
        return $code if $text =~ $regex;
    }

    return "O";
}

# _ts($value)
#
# Returns $value when it is true; otherwise returns 0.  A missing argument,
# undef, the empty string and 0 therefore all come back as 0.
sub _ts {
    my ($value) = @_;

    return $value ? $value : 0;
}

sub segment {
    my ($class, $input) = @_;
    if (!defined $input || $input eq '') {
        return wantarray ? () : [];
    }
    my @result;
    my @seg = ("B3","B2","B1");
    my @ctype = ("O","O","O");
    my @o = split //, $input;
    for my $c (@o) {
        push @seg, $c;
        push @ctype, _ctype($c);
    }
    push @seg, "E1";
    push @seg, "E2";
    push @seg, "E3";
    push @ctype, "O";
    push @ctype, "O";
    push @ctype, "O";
    my $word = $seg[3];
    my $p1 = "U";
    my $p2 = "U";



( run in 1.350 second using v1.01-cache-2.11-cpan-437f7b0c052 )