cascade results from the CPAN

cascade

Lingua-Identify-Blacklists

view release on metacpan or search on metacpan

lib/Lingua/Identify/Blacklists.pm view on Meta::CPAN


  train( { cs => $file_with_cs_text, sk => $file_with_sk_text }, %para );

=head1 Description

This module adds a blacklist classifier to a general purpose language identification tool. Related languages can easily be confused with each other and standard language detection tools do not work very well for distinguishing them. With this module ...

Since version 0.03 it also integrates a standard language identifier (Lingua::Identify::CLD) and can now be used for general language identification. It calls the blacklist classifier only for those languages that can be confused and for which approp...


=head1 Settings

Module-internal variables that can be modified:

 $BLACKLISTDIR     # directory with all blacklists (default: module-share-dir)
 $LOWERCASE        # lowercase all data, yes/no (1/0), default: 1
 $TOKENIZE         # tokenize all data, yes/no (1/0), default: 1
 $ALPHA_ONLY       # don't use tokens with non-alphabetic characters, default: 1
 $MAX_LINE_LENGTH  # max line length when reading from files (default=2**16)
 $CLD_TEXT_SIZE    # text size in characters used for language ident. with CLD
 $VERBOSE          # verbose output (default=0)

Tokenization is very simple and replaces all non-alphabetic characters with a white-space character.

=cut


our $BLACKLISTDIR;
eval{ $BLACKLISTDIR = &dist_dir('Lingua-Identify-Blacklists') . '/blacklists' };

our $LOWERCASE       = 1;
our $TOKENIZE        = 1;
our $ALPHA_ONLY      = 1;
our $MAX_LINE_LENGTH = 2**16;    # limit the length of one line to be read
our $CLD_TEXT_SIZE   = 2**16;    # text size used for detecting lang with CLD
our $VERBOSE         = 0;

my %blacklists = ();  # hash of blacklists (langpair => {blacklist}, ...)
my %confusable = ();  # hash of confusable languages (lang => [other_langs])

## the compact language identifier from Google Chrome
my $CLD = new Lingua::Identify::CLD;


# load all blacklists in the gneral BLACKLISTDIR
&load_blacklists( $BLACKLISTDIR );




=head1 Exported Functions

=head2 C<$langID = identify( $text [,%options] )>

Analyses a given text and returns a language ID as the result of the classification. C<%options> can be used to change the behaviour of the classifier. Possible options are

  assumed    => $assumed_lang
  langs      => \@list_of_possible_langs
  use_margin => $score

If C<langs> are specified, it runs the classifier with blacklists for those languages (in a cascaded way, i.e. best1 = lang1 vs lang2, best2 = best1 vs lang3, ...). If C<use_margin> is specified, it runs all versus all and returns the language that w...

If the C<assumed> language is given, it runs the blacklist classifier for all languages that can be confused with $assumed_lang (if blacklist models exist for them).

If neither C<langs> not C<assumed> are specified, it first runs a general-purpose language identification (using Lingua::Identify::CLD and Lingua::Identify) and then checks with the blacklist classifier whether the detected language can be confused w...

=cut

sub identify{
  my $text = shift;
  my %options = @_;

  my %dic = ();
  my $total = 0;

  # run the blacklist classifier if 'langs' are specified
  if (exists $options{langs}){
      &process_string( $text, \%dic, $total, $options{text_size} );
      return &classify( \%dic, %options );
  }

  # otherwise: check if there is an 'assumed' language
  # if not: classify with CLD
  $options{assumed} = &identify_language( $text ) 
      unless (exists $options{assumed});

  # if there is an 'assumed' language:
  # check if it can be confused with others (i.e. blacklists exist)
  if (exists $confusable{$options{assumed}}){
      $options{langs} = $confusable{$options{assumed}};
      # finally: process the text and classify
      &process_string( $text, \%dic, $total );
      return &classify( \%dic, %options );
  }
  return $options{assumed};
}



=head2 C<$langID = identify_file( $filename [,%options] )>

Does the same as C<identify> but reads text from a file. It also takes the same options as the 'identify' function but allows two extra options:

  text_size  => $size,  # number of tokens to be used for classification
  every_line => 1

Using the C<every_line> option, the classifier checks every input line seperately and returns a list of language ID's.

 @langIDs = identify_file( $filename, every_line => 1, %options )

=cut



sub identify_file{
    my $file = shift;
    my %options = @_;
    
    my %dic = ();
    my $total = 0;
    my @predictions = ();

lib/Lingua/Identify/Blacklists.pm view on Meta::CPAN

    &initialize();

    # classify test data

    if ($#evaldata == $#langs){
        print STDERR "classify ....\n";

        my $correct=0;
        my $count=0;
        my %guesses=();

        my %correct_lang=();
        my %count_lang=();

        my $t1 = new Benchmark;
        foreach my $i (0..$#langs){
            open IN,"<:encoding(UTF-8)",$evaldata[$i] || die "...";
            while (<IN>){
                chomp;
                my %dic = ();
                &process_string($_,\%dic);
                my $guess = &classify(\%dic,@langs);
                $count++;
                $count_lang{$langs[$i]}++;
                if ($guess eq $langs[$i]){
                    $correct++;
                    $correct_lang{$langs[$i]}++;
                }
                $guesses{$langs[$i]}{$guess}++;
            }
            close IN;
        }
        print STDERR "classification took: ".
            timestr(timediff(new Benchmark, $t1)).".\n";

        printf "accuracy: %6.4f\n   ",$correct/$count;
        foreach my $c (@langs){
            print "  $c";
        }
        print "\n";
        foreach my $c (@langs){
            print "$c ";
            foreach my $g (@langs){
                printf "%4d",$guesses{$c}{$g};
            }
            printf "  %6.4f",$correct_lang{$c}/$count_lang{$c};
            print "\n";
        }
    }
    system("wc -l $Lingua::Identify::Blacklists::BLACKLISTDIR/*.txt");
}


=head2 Module-internal functions

The following functions are not exported and are mainly used for internal purposes (but may be used from the outside if needed).

 initialize()                     # reset the repository of blacklists
 identify_language($text)         # return lang-ID for $text (using CLD)
 classify(\%dic,%options)         # run the classifier
 classify_cascaded(\%dic,@langs)  # run a cascade of binary classifications

 # run all versus all and return the one that wins most binary decisions
 # (a score margin is used to adjust the reliability of the decisions)

 classify_with_margin(\%dic,$margin,@langs) 

 load_blacklists($dir)                # load all blacklists available in $dir
 load_blacklist(\%list,$dir,      # load a lang-pair specific blacklist
                $lang1,$lang2)  
 read_file($file,\%dic,$max)      # read a file and count token frequencies
 process_string($string)          # process a given string (lowercasing ...)

=cut


sub initialize{ %blacklists = (); %confusable = (); }

sub identify_language{
    my ($lang, $id, $conf) = $CLD->identify( $_[0] );

    # strangely enough CLD is not really reliable for English
    # (all kinds of garbish input is recognized as English)
    # --> check with Lingua::Identify
    if ($id eq 'en'){
	$id = $id = langof( $_[0] ) ? $id : 'unknown';
    }
    return $id;
}

sub classify{
    my $dic         = shift;
    my %options     = @_;
    $options{langs} = '' unless ($options{langs});

    my @langs = ref($options{langs}) eq 'ARRAY' ? 
	@{$options{langs}} : split( /\s+/, $options{langs} ) ;

    @langs = available_languages() unless (@langs);

    return &classify_with_margin( $dic, $options{use_margin}, @langs ) 
	if ($options{use_margin});
    return &classify_cascaded( $dic, @langs );
}

sub classify_cascaded{
    my $dic = shift;
    my @langs = @_;

    my $lang1 = shift(@langs);
    foreach my $lang2 (@langs){

        # load blacklists on demand
        unless (exists $blacklists{"$lang1-$lang2"}){
            $blacklists{"$lang1-$lang2"}={};
            &load_blacklist($blacklists{"$lang1-$lang2"},
                            $BLACKLISTDIR,$lang1,$lang2);
        }
        my $list = $blacklists{"$lang1-$lang2"};

        my $score = 0;
	foreach my $w (keys %{$dic}){
	    if (exists $$list{$w}){
                $score += $$dic{$w} * $$list{$w};
                print STDERR "$$dic{$w} x $w found ($$list{$w})\n" if ($VERBOSE);
            }
        }
        if ($score < 0){
            $lang1 = $lang2;
        }
        print STDERR "select $lang1 ($score)\n" if ($VERBOSE);
    }
    return $lang1;
}


# OTHER WAY OF CLASSIFYING
# test all against all ...

sub classify_with_margin{
    my $dic = shift;
    my $margin = shift;
    my @langs = @_;

    my %selected = ();
    while (@langs){
        my $lang1 = shift(@langs);
        foreach my $lang2 (@langs){

            # load blacklists on demand
            unless (exists $blacklists{"$lang1-$lang2"}){
                $blacklists{"$lang1-$lang2"}={};
                &load_blacklist($blacklists{"$lang1-$lang2"},
                                $BLACKLISTDIR,$lang1,$lang2);
            }
            my $list = $blacklists{"$lang1-$lang2"};

            my $score = 0;
            foreach my $w (keys %{$dic}){
                if (exists $$list{$w}){
                    $score += $$dic{$w} * $$list{$w};
                    print STDERR "$$dic{$w} x $w found ($$list{$w})\n" 
                        if ($VERBOSE);
                }
            }
            next if (abs($score) < $margin);

( run in 0.984 second using v1.01-cache-2.11-cpan-d7f47b0818f )