Lingua-Identify-Blacklists
view release on metacpan or search on metacpan
lib/Lingua/Identify/Blacklists.pm view on Meta::CPAN
train( { cs => $file_with_cs_text, sk => $file_with_sk_text }, %para );
=head1 Description
This module adds a blacklist classifier to a general purpose language identification tool. Related languages can easily be confused with each other and standard language detection tools do not work very well for distinguishing them. With this module ...
Since version 0.03 it also integrates a standard language identifier (Lingua::Identify::CLD) and can now be used for general language identification. It calls the blacklist classifier only for those languages that can be confused and for which approp...
=head1 Settings
Module-internal variables that can be modified:
$BLACKLISTDIR # directory with all blacklists (default: module-share-dir)
$LOWERCASE # lowercase all data, yes/no (1/0), default: 1
$TOKENIZE # tokenize all data, yes/no (1/0), default: 1
$ALPHA_ONLY # don't use tokens with non-alphabetic characters, default: 1
$MAX_LINE_LENGTH # max line length when reading from files (default=2**16)
$CLD_TEXT_SIZE # text size in characters used for language ident. with CLD
$VERBOSE # verbose output (default=0)
Tokenization is very simple and replaces all non-alphabetic characters with a white-space character.
=cut
our $BLACKLISTDIR;
eval{ $BLACKLISTDIR = &dist_dir('Lingua-Identify-Blacklists') . '/blacklists' };
our $LOWERCASE = 1;
our $TOKENIZE = 1;
our $ALPHA_ONLY = 1;
our $MAX_LINE_LENGTH = 2**16; # limit the length of one line to be read
our $CLD_TEXT_SIZE = 2**16; # text size used for detecting lang with CLD
our $VERBOSE = 0;
my %blacklists = (); # hash of blacklists (langpair => {blacklist}, ...)
my %confusable = (); # hash of confusable languages (lang => [other_langs])
## the compact language identifier from Google Chrome
my $CLD = new Lingua::Identify::CLD;
# load all blacklists in the gneral BLACKLISTDIR
&load_blacklists( $BLACKLISTDIR );
=head1 Exported Functions
=head2 C<$langID = identify( $text [,%options] )>
Analyses a given text and returns a language ID as the result of the classification. C<%options> can be used to change the behaviour of the classifier. Possible options are
assumed => $assumed_lang
langs => \@list_of_possible_langs
use_margin => $score
If C<langs> are specified, it runs the classifier with blacklists for those languages (in a cascaded way, i.e. best1 = lang1 vs lang2, best2 = best1 vs lang3, ...). If C<use_margin> is specified, it runs all versus all and returns the language that w...
If the C<assumed> language is given, it runs the blacklist classifier for all languages that can be confused with $assumed_lang (if blacklist models exist for them).
If neither C<langs> not C<assumed> are specified, it first runs a general-purpose language identification (using Lingua::Identify::CLD and Lingua::Identify) and then checks with the blacklist classifier whether the detected language can be confused w...
=cut
sub identify{
my $text = shift;
my %options = @_;
my %dic = ();
my $total = 0;
# run the blacklist classifier if 'langs' are specified
if (exists $options{langs}){
&process_string( $text, \%dic, $total, $options{text_size} );
return &classify( \%dic, %options );
}
# otherwise: check if there is an 'assumed' language
# if not: classify with CLD
$options{assumed} = &identify_language( $text )
unless (exists $options{assumed});
# if there is an 'assumed' language:
# check if it can be confused with others (i.e. blacklists exist)
if (exists $confusable{$options{assumed}}){
$options{langs} = $confusable{$options{assumed}};
# finally: process the text and classify
&process_string( $text, \%dic, $total );
return &classify( \%dic, %options );
}
return $options{assumed};
}
=head2 C<$langID = identify_file( $filename [,%options] )>
Does the same as C<identify> but reads text from a file. It also takes the same options as the 'identify' function but allows two extra options:
text_size => $size, # number of tokens to be used for classification
every_line => 1
Using the C<every_line> option, the classifier checks every input line seperately and returns a list of language ID's.
@langIDs = identify_file( $filename, every_line => 1, %options )
=cut
sub identify_file{
my $file = shift;
my %options = @_;
my %dic = ();
my $total = 0;
my @predictions = ();
lib/Lingua/Identify/Blacklists.pm view on Meta::CPAN
&initialize();
# classify test data
if ($#evaldata == $#langs){
print STDERR "classify ....\n";
my $correct=0;
my $count=0;
my %guesses=();
my %correct_lang=();
my %count_lang=();
my $t1 = new Benchmark;
foreach my $i (0..$#langs){
open IN,"<:encoding(UTF-8)",$evaldata[$i] || die "...";
while (<IN>){
chomp;
my %dic = ();
&process_string($_,\%dic);
my $guess = &classify(\%dic,@langs);
$count++;
$count_lang{$langs[$i]}++;
if ($guess eq $langs[$i]){
$correct++;
$correct_lang{$langs[$i]}++;
}
$guesses{$langs[$i]}{$guess}++;
}
close IN;
}
print STDERR "classification took: ".
timestr(timediff(new Benchmark, $t1)).".\n";
printf "accuracy: %6.4f\n ",$correct/$count;
foreach my $c (@langs){
print " $c";
}
print "\n";
foreach my $c (@langs){
print "$c ";
foreach my $g (@langs){
printf "%4d",$guesses{$c}{$g};
}
printf " %6.4f",$correct_lang{$c}/$count_lang{$c};
print "\n";
}
}
system("wc -l $Lingua::Identify::Blacklists::BLACKLISTDIR/*.txt");
}
=head2 Module-internal functions
The following functions are not exported and are mainly used for internal purposes (but may be used from the outside if needed).
initialize() # reset the repository of blacklists
identify_language($text) # return lang-ID for $text (using CLD)
classify(\%dic,%options) # run the classifier
classify_cascaded(\%dic,@langs) # run a cascade of binary classifications
# run all versus all and return the one that wins most binary decisions
# (a score margin is used to adjust the reliability of the decisions)
classify_with_margin(\%dic,$margin,@langs)
load_blacklists($dir) # load all blacklists available in $dir
load_blacklist(\%list,$dir, # load a lang-pair specific blacklist
$lang1,$lang2)
read_file($file,\%dic,$max) # read a file and count token frequencies
process_string($string) # process a given string (lowercasing ...)
=cut
sub initialize{ %blacklists = (); %confusable = (); }
sub identify_language{
my ($lang, $id, $conf) = $CLD->identify( $_[0] );
# strangely enough CLD is not really reliable for English
# (all kinds of garbish input is recognized as English)
# --> check with Lingua::Identify
if ($id eq 'en'){
$id = $id = langof( $_[0] ) ? $id : 'unknown';
}
return $id;
}
sub classify{
my $dic = shift;
my %options = @_;
$options{langs} = '' unless ($options{langs});
my @langs = ref($options{langs}) eq 'ARRAY' ?
@{$options{langs}} : split( /\s+/, $options{langs} ) ;
@langs = available_languages() unless (@langs);
return &classify_with_margin( $dic, $options{use_margin}, @langs )
if ($options{use_margin});
return &classify_cascaded( $dic, @langs );
}
sub classify_cascaded{
my $dic = shift;
my @langs = @_;
my $lang1 = shift(@langs);
foreach my $lang2 (@langs){
# load blacklists on demand
unless (exists $blacklists{"$lang1-$lang2"}){
$blacklists{"$lang1-$lang2"}={};
&load_blacklist($blacklists{"$lang1-$lang2"},
$BLACKLISTDIR,$lang1,$lang2);
}
my $list = $blacklists{"$lang1-$lang2"};
my $score = 0;
foreach my $w (keys %{$dic}){
if (exists $$list{$w}){
$score += $$dic{$w} * $$list{$w};
print STDERR "$$dic{$w} x $w found ($$list{$w})\n" if ($VERBOSE);
}
}
if ($score < 0){
$lang1 = $lang2;
}
print STDERR "select $lang1 ($score)\n" if ($VERBOSE);
}
return $lang1;
}
# OTHER WAY OF CLASSIFYING
# test all against all ...
sub classify_with_margin{
my $dic = shift;
my $margin = shift;
my @langs = @_;
my %selected = ();
while (@langs){
my $lang1 = shift(@langs);
foreach my $lang2 (@langs){
# load blacklists on demand
unless (exists $blacklists{"$lang1-$lang2"}){
$blacklists{"$lang1-$lang2"}={};
&load_blacklist($blacklists{"$lang1-$lang2"},
$BLACKLISTDIR,$lang1,$lang2);
}
my $list = $blacklists{"$lang1-$lang2"};
my $score = 0;
foreach my $w (keys %{$dic}){
if (exists $$list{$w}){
$score += $$dic{$w} * $$list{$w};
print STDERR "$$dic{$w} x $w found ($$list{$w})\n"
if ($VERBOSE);
}
}
next if (abs($score) < $margin);
( run in 0.984 second using v1.01-cache-2.11-cpan-d7f47b0818f )