OPUS-Tools

 view release on metacpan or  search on metacpan

scripts/convert/text2utf8.pl  view on Meta::CPAN

#!/usr/bin/perl
#

use strict;

use Getopt::Std;
use Encode;
use File::BOM qw( :all );

# Command-line options (filled in by Getopt::Std::getopts):
#   -l LANG      language code used to guess the input encoding
#   -e ENCODING  explicit input encoding (takes precedence over -l)
our ($opt_l, $opt_e);

getopts('l:e:');

my $lang = $opt_l || 'unknown';
my $enc = $opt_e || LangEncoding($lang);

# An optional file argument is read in place of standard input.  Previously
# only the first line came from that file (via <>) while all following lines
# were read from STDIN; re-opening STDIN keeps every read on one handle.
if (@ARGV && $ARGV[0] ne '-') {
    my $file = shift @ARGV;
    open(STDIN, '<', $file) or die "Cannot open $file: $!\n";
}

# Read raw bytes first so that a byte order mark can be detected on the
# first line; emit strict UTF-8 (not Perl's lax internal "utf8").
# NOTE(review): the first line is split on a raw newline byte, so UTF-16/32
# input is presumably not handled correctly -- confirm before relying on it.
binmode(STDIN);
binmode(STDOUT,':encoding(UTF-8)');

my $line = <STDIN>;

# Nothing to convert on empty input (decode_from_bom rejects undef).
if (defined $line) {
    # A BOM on the first line decides the encoding; otherwise $enc is used
    # as the default.  The encoding actually used is returned in $enc.
    ($line, $enc) = decode_from_bom($line,$enc);

    # From here on the I/O layer decodes the remaining input.
    binmode(STDIN,":encoding($enc)")
        or die "Cannot read input as '$enc': $!\n";

    # Test definedness, not truth: a final line consisting of "0" without a
    # trailing newline is false but must still be printed.
    while (defined $line) {
        # remove dos line endings
        $line=~s/\r\n\z/\n/;

        print $line;
        $line = <STDIN>;
    }
}



## guess character encoding

sub LangEncoding{
    my $lang = shift;

# supported by Perl Encode:
# http://perldoc.perl.org/Encode/Supported.html

    return 'utf-8' if ($lang=~/^(utf8)$/);
    return 'iso-8859-4' if ($lang=~/^(ice)$/);
    ## scc is the (now deprecated) ISO 639-2/B code for Serbian
    return 'cp1250' if ($lang=~/^(alb|bos|cze|pol|rum|scc|scr|slv|hrv)$/); 
#    return 'iso-8859-2' if ($lang=~/^(alb|bos)$/);
    return 'cp1251' if ($lang=~/^(bul|mac|rus|bel)$/);
#    return 'cp1252' if ($lang=~/^(dan|dut|epo|est|fin|fre|ger|hun|ita|nor|pob|pol|por|spa|swe)$/);
    return 'cp1253' if ($lang=~/^(ell|gre)$/);
    return 'cp1254' if ($lang=~/^(tur)$/);
    return 'cp1255' if ($lang=~/^(heb)$/);
    return 'cp1256' if ($lang=~/^(ara)$/);
    return 'cp1257' if ($lang=~/^(lat|lit)$/);  # correct?
    return 'big5-eten' if ($lang=~/^(chi|zho)$/);
#    return 'utf-8' if ($lang=~/^(jpn)$/);
    return 'shiftjis' if ($lang=~/^(jpn)$/);
#    return 'cp932' if ($lang=~/^(jpn)$/);
    return 'euc-kr' if ($lang=~/^(kor)$/);
#    return 'cp949' if ($lang=~/^(kor)$/);
    return 'cp1252';
#    return 'iso-8859-6' if ($lang=~/^(ara)$/);
#    return 'iso-8859-7' if ($lang=~/^(ell|gre)$/);
#    return 'iso-8859-1';

## unknown: haw (Hawaiian), hrv (Croatian), amh (Amharic), gai (Borei)



( run in 2.759 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )