Algorithm-WordLevelStatistics
view release on metacpan or search on metacpan
lib/Algorithm/WordLevelStatistics.pm view on Meta::CPAN
# Copyright 2009 Francesco Nidito. All rights reserved.
#
# This library is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
package Algorithm::WordLevelStatistics;
use strict;
use vars qw($VERSION);
$VERSION = '0.03';
# Constructor. Takes no arguments besides the class name and returns a new
# object: a blessed hash whose only entry, "version", holds the module's
# $VERSION.
sub new {
    my ($class) = @_;

    my %self = ( version => $VERSION );

    return bless \%self, $class;
}
# Computes the statistical level of a single word, given its spectrum and
# the total number of words in the text.
#
# Arguments:
#   $N - total number of words in the text
#   $s - array reference with the spectrum of the word; the differences
#        between consecutive elements are used as distances
#
# Returns a hash reference with three keys:
#   count     - number of elements in $s
#   sigma_nor - normalized standard deviation of the distances
#   C         - level statistic derived from sigma_nor and count
#
# "C" and "sigma_nor" are left at 0 when the spectrum has fewer than four
# elements, and also in the degenerate cases where the normalization is
# undefined: a mean distance of zero, a non-positive $N, or a spectrum that
# covers the whole text ($n >= $N). Previously those degenerate inputs died
# with "Illegal division by zero" or a sqrt-of-negative error.
sub compute_spectrum {
    my ($self, $N, $s) = @_;

    my $n  = @{$s};
    my $ls = { count => $n, C => 0, sigma_nor => 0 };

    # too few occurrences to say anything about their distribution
    return $ls if $n <= 3;

    # position -> distance from preceding element in text
    my @distances = map { $s->[$_] - $s->[$_ - 1] } 1 .. $n - 1;

    my ($avg, $variance) = $self->_mean_and_variance( \@distances );

    # a zero mean distance (e.g. all positions identical) cannot be used as
    # a divisor
    return $ls if !defined $avg || $avg == 0;

    my $sigma = sqrt($variance) / $avg;

    # normalize sigma using a hypothetical uniform distribution; this is only
    # defined while the word makes up less than the whole text (p < 1)
    my $p = $N > 0 ? $n / $N : 1;
    return $ls if $p >= 1;

    $ls->{sigma_nor} = $sigma / sqrt(1.0 - $p);

    # this is not simple: sigma_nor is shifted by (2n-1)/(2n+2) and scaled by
    # sqrt(n)*(1+2.8*n^-0.865)
    # NOTE(review): presumably the expected value and inverse standard
    # deviation of sigma_nor for a randomly placed word -- confirm against
    # the reference paper.
    $ls->{C} = ($ls->{sigma_nor} - (2.0*$n - 1.0)/(2.0*$n + 2.0))
             * ( sqrt($n) * (1.0 + 2.8*$n**-0.865) );

    return $ls;
}
# Computes the statistical level of a group of words, given their spectra.
#
# Arguments:
#   $s - hash reference mapping each word to an array reference (its spectrum)
#
# Returns a hash reference mapping each word to the result of
# compute_spectrum() for that word.
sub compute_spectra {
    my ($self, $s) = @_;

    # the total number of "words" in the text is the combined size of all
    # the spectra
    my $N = 0;
    $N += scalar @{$_} for values %{$s};

    # level statistic for every term; scalar() keeps the call in scalar
    # context, exactly as a plain hash-element assignment would
    my %level_of = map {
        ( $_ => scalar $self->compute_spectrum( $N, $s->{$_} ) )
    } keys %{$s};

    return \%level_of;
}
# fast, on-line algorithm to compute mean and variance:
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
sub _mean_and_variance {
my ($self, $v) = @_;
my ($n, $mean, $M2) = (0, 0, 0);
foreach my $x (@{$v}) {
$n++;
my $delta = $x - $mean;
$mean += $delta/$n;
$M2 += $delta*($x - $mean);
}
( run in 2.133 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )