Acme-Lou
view release on metacpan or search on metacpan
author/loucsv.pl view on Meta::CPAN
use strict;
use warnings;
use utf8;
use open OUT => qw/:utf8 :std/;
use autodie;
use Encode;
use FindBin;
=head1 USAGE
usage:
cat mecab-ipadic-2.7.0-20060707/*.csv | perl loucsv.pl > lou.csv
original:
http://chasen.org/~taku/blog/archives/2007/01/_mecab.html
=cut
my $ja2kana = get_ja2kana();
my $pos_form_okuri_map = {
'åè©-*' => '',
'æåè©-*' => '',
'æ¥ç¶è©-*' => '',
'é£ä½è©-*' => '',
'åè©-ä»®å®å½¢' => 'ãã',
'åè©-ä»®å®ç¸®ç´ï¼' => 'ãã',
'åè©-åºæ¬å½¢' => 'ãã',
'åè©-ä½è¨æ¥ç¶' => 'ãã',
'åè©-ä½è¨æ¥ç¶ç¹æ®ï¼' => 'ã',
'åè©-æèªåºæ¬å½¢' => 'ãã',
'åè©-æªç¶ã¬ã«æ¥ç¶' => 'ã',
#'åè©-æªç¶å½¢' => '',
#'åè©-æªç¶ç¹æ®' => '',
'åè©-å½ä»¤ï½
' => '',
'åè©-å½ä»¤ï½ï½' => '',
'åè©-å½ä»¤ï½ï½' => '',
#'åè©-é£ç¨ã¿æ¥ç¶' => '',
'形容è©-ã¬ã«æ¥ç¶' => '',
'åè©-é£ç¨å½¢' => 'ã',
'形容è©-ä»®å®å½¢' => 'ãªã',
'形容è©-ä»®å®ç¸®ç´ï¼' => 'ãªã',
'形容è©-ä»®å®ç¸®ç´ï¼' => 'ãªã',
'形容è©-åºæ¬å½¢' => 'ãª',
'形容è©-ä½è¨æ¥ç¶' => 'ãª',
'形容è©-æèªåºæ¬å½¢' => '',
'形容è©-æªç¶ã¦æ¥ç¶' => 'ã ã',
'形容è©-æªç¶ãæ¥ç¶' => 'ãããã',
'形容è©-å½ä»¤ï½
' => 'ã§ãã',
'形容è©-é£ç¨ã´ã¶ã¤æ¥ç¶' => '',
'形容è©-é£ç¨ã¿æ¥ç¶' => 'ã ã£',
'形容è©-é£ç¨ãæ¥ç¶' => 'ã«',
};
# Main conversion pass: read MeCab CSV rows from the files named on the
# command line (or STDIN), decode each one from EUC-JP, and print only the
# rows for which lou() returns a defined value.
while (defined(my $raw = <>)) {
    chomp $raw;
    my $converted = lou(decode('euc-jp', $raw));
    print $converted if defined $converted;
}

# Append the hand-written entries stored after __DATA__ unchanged.
print readline(*DATA);
exit;
# Convert one MeCab dictionary CSV row into its "Lou" dictionary row.
#
# The input row is expected to carry 13 comma-separated columns:
#   surface form, left context ID, right context ID, cost, part of speech,
#   POS subcategory 1, POS subcategory 2, POS subcategory 3,
#   conjugation type, conjugation form, base form, reading, pronunciation
#
# Returns the 13 columns (with an adjusted cost) followed by two extra
# columns: the replacement found in $ja2kana for the base form, and the
# okurigana found in $pos_form_okuri_map for the "POS-form" pair. Returns
# nothing when the row is empty/false, when the base form has no entry in
# $ja2kana, or when the "POS-form" pair has no entry in $pos_form_okuri_map.
sub lou {
    my $feature = shift or return;

    # A limit of -1 keeps trailing empty fields, so a row ending in ",,"
    # does not leave its last columns undefined.
    my @fields = split /,/, $feature, -1;

    # Pad short rows with empty strings and ignore anything past column 13.
    # The printed result is unchanged (undef would stringify to '' anyway),
    # but no "uninitialized value" warnings are emitted for malformed rows.
    push @fields, ('') x (13 - @fields) if @fields < 13;
    my ($c, $d, $e, $cost, $pos, $f, $g, $h, $type, $form, $lemma, $i, $j)
        = @fields[0 .. 12];

    my $kana = $ja2kana->{$lemma};
    return if not defined $kana;

    my $okuri = $pos_form_okuri_map->{"$pos-$form"};
    return if not defined $okuri;

    # Lower the cost by 1000 but never below 1 -- presumably so these rows
    # are preferred over the stock dictionary entries (TODO confirm).
    $cost -= 1000;
    $cost = 1 if $cost <= 0;

    # The trailing "\n" is a join element, so every row ends in ",\n";
    # that existing output format is kept as-is.
    return join ",", $c, $d, $e, $cost, $pos, $f, $g, $h, $type, $form,
        $lemma, $i, $j, $kana, $okuri, "\n";
}
# Load the replacement table from ja2kana.csv, located in the directory
# that contains this script ($FindBin::Bin).
#
# Each usable line has the form "word,replacement". Lines starting with "#"
# are skipped, as are lines where either column is missing or false, and
# lines whose replacement (after all whitespace is removed) consists only of
# ASCII letters.
#
# Returns a hash reference mapping each word to its replacement.
sub get_ja2kana {
    my %kana_for;

    # No explicit error check here: the file enables autodie at the top.
    open my $csv, '<:encoding(utf-8)', "$FindBin::Bin/ja2kana.csv";

    ENTRY:
    while (defined(my $row = <$csv>)) {
        chomp $row;
        next ENTRY if $row =~ /^#/;

        my ($word, $replacement) = split /,/, $row;
        next ENTRY unless $word and $replacement;

        $replacement =~ s/\s//g;
        next ENTRY if $replacement =~ /^[a-z]+$/i;

        $kana_for{$word} = $replacement;
    }

    return \%kana_for;
}
__DATA__
ã«ã¼èª,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã«ã¼èª,
å
æ°,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ãã¡ã¤ã³,
ä¸å¹´,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã¯ã³ã¤ã¤ã¼,
ãããå¹´,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã°ããã¤ã¤ã¼,
è¯ããå¹´,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã°ããã¤ã¤ã¼,
æ°å¹´,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ãã¥ã¼ã¤ã¤ã¼,
ãé¡ããã¾ã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ããªã¼ãº,
æ°´èã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã¦ã©ã¼ã¿ã¼èã,
ç¬ãæ©ãã°æ£ã«å½ãã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ç¬ãã¦ã©ã¼ã¯ããã°ãã¼ã«ã«ããããã,
æ³£ãã£é¢ã«è,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,æ³£ãã£é¢ã«ãã¼,
ãã¶ããæ£,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ãã¶ããã¹ãã£ãã¯,
è®ããæ£,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,è®ããã¹ãã£ãã¯,
èªããæ£,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,èªããã¹ãã£ãã¯,
念ã«ã¯å¿µãå
¥ãã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,念ã«ã¯å¿µããããã¤ã³,
ç«ã¦æ¿ã«æ°´,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ç«ã¦æ¿ã«ã¦ã©ã¼ã¿ã¼,
éèã«å¡©,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,éèã«ã½ã«ã,
èªå·±ç ´ç£,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ãã¤ãã¤ã»ã«ãç ´ç£,
å°ç温æå,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,å°çãããå,
ã¡ããç©ããã°å±±ã¨ãªã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ãã©ãã·ã¥ãç©ããã°ãã¦ã³ãã³ã¨ãªã,
ç¥ãã¬ãä»,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ãã³ããã¦ãä»,
è«ãã証æ ,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,è«ããã¨ããã³ã¹,
天ä¸ãåé¡,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ããã³ä¸ããããã¬ã ,
è² ãããåã¡,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã«ã¼ãºãããã¦ã¤ã³,
é¨éã£ã¦å°åºã¾ã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã¬ã¤ã³éã£ã¦ã¢ã¼ã¹åºã¾ã,
ç³ã®ä¸ã«ãä¸å¹´,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ã¹ãã¼ã³ã®ä¸ã«ãã¹ãªã¼ã¤ã¤ã¼,
ç³æ©ãå©ãã¦æ¸¡ã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,ç³æ©ãã¿ãããã¦æ¸¡ã,
河童ã®å·æµã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,河童ã®ãªãã¼æµã,
å£ã«è³ãããéåã«ç®ãã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,å£ã«ã¤ã¤ã¼ãããéåã«ã¢ã¤ãºãã,
弿³ãçã®èª¤ã¾ã,1,1,1200,ãã®ä»,ä¸è¬,*,*,*,*,*,*,*,弿³ããã³ã·ã«ã®èª¤ã¾ã,
( run in 0.643 second using v1.01-cache-2.11-cpan-99c4e6809bf )