Algorithm-WordLevelStatistics

 view release on metacpan or  search on metacpan

lib/Algorithm/WordLevelStatistics.pm  view on Meta::CPAN

# Copyright 2009 Francesco Nidito. All rights reserved.
#
# This library is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.

package Algorithm::WordLevelStatistics;

use strict;

use vars qw($VERSION);
$VERSION = '0.03';

# Constructor: returns a new object blessed into the class it was invoked on.
# The only state stored in the object is the module version it was built with.
sub new {
  my ($class) = @_;

  my %self = ( version => $VERSION );

  return bless \%self, $class;
}

# Computes the statistical level of a single word.
#
#   $N - total number of words in the text
#   $s - array reference with the word's spectrum (its positions in the text)
#
# Returns a hash reference with three keys:
#   count     - number of entries in the spectrum
#   sigma_nor - square root of the gaps' variance (as reported by
#               _mean_and_variance) divided by their mean and by
#               sqrt(1 - count/N)
#   C         - sigma_nor shifted and scaled by terms that depend only on count
#
# sigma_nor and C are left at 0 when the statistic cannot be computed: the
# spectrum has 3 entries or fewer, $N does not exceed the number of entries,
# or the mean gap is 0.
sub compute_spectrum {
  my ($self, $N, $s) = @_;

  my $n = @{$s};

  my $ls = { count => $n, C => 0, sigma_nor => 0 };
  return $ls unless $n > 3;

  # With $N <= $n the normalization factor sqrt(1 - $n/$N) is zero or the
  # square root of a negative number (and $n/$N itself fails for $N == 0).
  # This happens e.g. for a text made of a single repeated word, so report
  # the neutral result instead of dying.
  return $ls unless $N && $N > $n;

  # position -> distance from preceding element in text
  my @gaps = map { $s->[$_] - $s->[$_-1] } 1 .. $n-1;

  my ($avg, $spread) = $self->_mean_and_variance( \@gaps );

  # a zero mean gap would make the relative deviation below undefined
  return $ls unless $avg;

  my $sigma = sqrt($spread)/$avg;

  # normalize sigma using a hypothetical uniform distribution
  my $p = $n/$N;
  $ls->{sigma_nor} = $sigma/sqrt(1.0-$p);

  # this is not simple: subtract a reference value and apply a scale factor,
  # both functions of $n only (constants kept exactly as in the original;
  # NOTE(review): presumably taken from the level-statistics paper -- confirm)
  $ls->{C} = ($ls->{sigma_nor} - (2.0*$n-1.0)/(2.0*$n+2.0)) * ( sqrt($n)*(1.0+2.8*$n**-0.865) );

  return $ls;
}

# Computes the statistical level of every word in a group.
#
#   $s - hash reference mapping each word to its spectrum (an array reference)
#
# Returns a hash reference mapping each word to the result of
# compute_spectrum for that word.
sub compute_spectra {
  my ($self, $s) = @_;

  # the total number of "words" in the text is the sum of all spectrum sizes
  my $total = 0;
  for my $positions (values %{$s}) {
    $total += scalar @{$positions};
  }

  # level statistic of each term, computed against that total
  my $levels = {};
  for my $word (keys %{$s}) {
    $levels->{$word} = $self->compute_spectrum( $total, $s->{$word} );
  }

  return $levels;
}

# fast, on-line algorithm to compute mean and variance:
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
sub _mean_and_variance {
  my ($self, $v) = @_;
  my ($n, $mean, $M2) = (0, 0, 0);

  foreach my $x (@{$v}) {
    $n++;
    my $delta = $x - $mean;
    $mean += $delta/$n;
    $M2 += $delta*($x - $mean);
  }



( run in 2.133 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )