Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
package Algorithm::ExpectationMaximization;
#---------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::ExpectationMaximization is a pure Perl implementation for
# Expectation-Maximization based clustering of multi-dimensional data that can be
# modeled as a Gaussian mixture.
# ---------------------------------------------------------------------------
use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use Scalar::Util 'blessed';
our $VERSION = '1.22';
# from perl docs:
my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
# Constructor:
# Constructor.  Validates the keyword arguments and returns a blessed hash
# that carries both the user-supplied options and the internal state the EM
# iterations fill in later.
#
# Required arguments: datafile, mask, K.
# Optional: terminal_output, seeding ('random'|'kmeans'|'manual'),
# seed_tags, max_em_iterations, class_priors, debug.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
        if check_for_illegal_params(@params) == 0;
    bless {
        _datafile          => $args{datafile} || croak("datafile required"),
        _mask              => $args{mask} || croak("mask required"),
        _K                 => $args{K} || croak("number of clusters required"),
        _terminal_output   => $args{terminal_output} || 0,
        _seeding           => $args{seeding} || 'random',
        _seed_tags         => $args{seed_tags} || [],
        _max_em_iterations => $args{max_em_iterations} || 100,
        _class_priors      => $args{class_priors} || [],
        _debug             => $args{debug} || 0,
        _N                 => 0,        # number of data records
        _data              => {},       # record tag => array of numbers
        _data_id_tags      => [],
        _clusters          => [],
        _cluster_centers   => [],
        _data_dimensions   => 0,
        _cluster_normalizers            => [],
        _cluster_means                  => [],
        _cluster_covariances            => [],
        _class_labels_for_data          => {},
        _class_probs_at_each_data_point => {},
        _expected_class_probs           => {},
        _old_priors                     => [],
        _old_old_priors                 => [],
        _fisher_quality_vs_iteration    => [],
        # BUG FIX: this key used to be '_mdl_quality_vs_iterations' (trailing
        # 's'), but EM() and display_mdl_quality_vs_iterations() both use the
        # singular form, so the array initialized here was never the one used.
        _mdl_quality_vs_iteration       => [],
    }, $class;
}
# Dispatches to the CSV or the DAT reader based on the datafile extension.
# BUG FIX: the original patterns /.csv$/ and /.dat$/ left the dot unescaped
# (so e.g. "xcsv" matched), and an unrecognized extension was silently
# ignored; now an unsupported extension dies with a clear message.
sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    if ($filename =~ /\.csv$/) {
        $self->read_data_from_file_csv();
    } elsif ($filename =~ /\.dat$/) {
        $self->read_data_from_file_dat();
    } else {
        die "The datafile must carry either a '.csv' or a '.dat' extension";
    }
}
sub read_data_from_file_csv {
my $self = shift;
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
open FILEIN, $filename or die "Unable to open $filename: $!";
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
local $/ = undef;
my @all_data = split /\s+/, <FILEIN>;
my %data_hash = ();
my @data_tags = ();
foreach my $record (@all_data) {
my @splits = split /,/, $record;
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
"the size of at least one of the data records in the file.\n"
unless scalar(@mask) == scalar(@splits);
my $record_name = shift @splits;
$data_hash{$record_name} = \@splits;
push @data_tags, $record_name;
}
$self->{_data} = \%data_hash;
$self->{_data_id_tags} = \@data_tags;
$self->{_N} = scalar @data_tags;
# Need to make the following call to set the global mean and covariance:
# my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
# Need to make the following call to set the global eigenvec eigenval sets:
# $self->eigen_analysis_of_covariance($covariance);
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
if $#fields != $#mask;
my $record_id;
foreach my $i (0..@fields-1) {
if ($mask[$i] eq '0') {
next;
} elsif ($mask[$i] eq 'N') {
$record_id = $fields[$i];
} elsif ($mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "misformed mask for reading the data file";
}
}
my @nums = map {/$_num_regex/;$_} @data_fields;
$self->{_data}->{ $record_id } = \@nums;
}
my @all_data_ids = keys %{$self->{_data}};
$self->{_data_id_tags} = \@all_data_ids;
$self->{_N} = scalar @all_data_ids;
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
"the number of clusters requested for the clusters to be " .
"meaningful $!"
if ( $self->{_N} < (2 * $self->{_K} ** 2) );
}
}
# This is the heart of the module --- in the sense that this method implements the EM
# algorithm for the estimating the parameters of a Gaussian mixture model for the
# data. In the implementation shown below, we declare convergence for the EM
# algorithm when the change in the class priors over three iterations falls below a
# threshold. The current value of this threshold, as can be seen in the function
# compare_array_floats(), is 0.00001.
# Top-level driver for the EM iterations.  Assumes the cluster means and
# covariances have already been seeded (see seed_the_clusters()).  Each
# iteration performs the E-step (class-conditional likelihoods, then
# posterior class probabilities at every data point) followed by the M-step
# (update of the means, covariances, and priors).  Iterates until the class
# priors are stable over three iterations (see reached_convergence()) or
# until _max_em_iterations is exhausted.
sub EM {
my $self = shift;
$self->initialize_class_priors();
for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations};
$em_iteration++) {
if ($em_iteration == 0) {
print "\nSeeding the EM algorithm with:\n";
$self->display_seeding_stats();
print "\nFinished displaying the seeding information\n";
print "\nWill print out a dot for each iteration of EM:\n\n";
}
# Progress display: the iteration number on every fifth iteration, a dot
# otherwise.
my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
print $progress_indicator;
# Reset the per-point probability containers for this iteration.
foreach my $data_id (@{$self->{_data_id_tags}}) {
$self->{_class_probs_at_each_data_point}->{$data_id} = [];
$self->{_expected_class_probs}->{$data_id} = [];
}
# Calculate prob(x | C_i) --- this is the prob of data point x as
# a member of class C_i. You must do this for all K classes:
foreach my $cluster_index(0..$self->{_K}-1) {
$self->find_prob_at_each_datapoint_for_given_mean_and_covar(
$self->{_cluster_means}->[$cluster_index],
$self->{_cluster_covariances}->[$cluster_index] );
}
# The normalizers accumulate, per cluster, the total posterior mass over
# all data points; they are reused by the mean, covariance, and prior
# updates below.
$self->{_cluster_normalizers} = [];
if ($self->{_debug}) {
print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
foreach my $data_id (sort keys %{$self->{_data}}) {
my $class_probs_at_a_point =
$self->{_class_probs_at_each_data_point}->{$data_id};
print "Class probs for $data_id: @$class_probs_at_a_point\n"
}
}
# Calculate prob(C_i | x) which is the posterior prob of class
# considered as a r.v. to be C_i at a given point x. For a given
# x, the sum of such probabilities over all C_i must add up to 1:
$self->find_expected_classes_at_each_datapoint();
if ($self->{_debug}) {
print "\n\nDisplaying expected class probs at each data point:\n\n";
foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
my $expected_classes_at_a_point =
$self->{_expected_class_probs}->{$data_id};
print "Expected classes $data_id: @$expected_classes_at_a_point\n";
}
}
# 1. UPDATE MEANS: each new mean is the posterior-weighted average of all
#    data points, normalized by the cluster's total posterior mass.
my @new_means;
foreach my $cluster_index(0..$self->{_K}-1) {
$new_means[$cluster_index] =
Math::GSL::Matrix->new($self->{_data_dimensions},1);
$new_means[$cluster_index]->zero();
foreach my $data_id (keys %{$self->{_data}}) {
my $data_record = $self->{_data}->{$data_id};
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col(0,$data_record);
$new_means[$cluster_index] +=
$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
$data_vec->copy();
$self->{_cluster_normalizers}->[$cluster_index] +=
$self->{_expected_class_probs}->{$data_id}->[$cluster_index];
}
$new_means[$cluster_index] *= 1.0 /
$self->{_cluster_normalizers}->[$cluster_index];
}
if ($self->{_debug}) {
foreach my $meanvec (@new_means) {
display_matrix("At EM Iteration $em_iteration, new mean vector is",
$meanvec);
}
}
$self->{_cluster_means} = \@new_means;
# 2. UPDATE COVARIANCES: posterior-weighted average of the outer products
#    of the mean-subtracted data vectors (uses the means updated above).
my @new_covariances;
foreach my $cluster_index(0..$self->{_K}-1) {
$new_covariances[$cluster_index] =
Math::GSL::Matrix->new($self->{_data_dimensions},
$self->{_data_dimensions});
$new_covariances[$cluster_index]->zero();
# NOTE(review): $normalizer below is never used; the division uses the
# shared _cluster_normalizers computed during the mean update instead.
my $normalizer = 0;
foreach my $data_id (keys %{$self->{_data}}) {
my $data_record = $self->{_data}->{$data_id};
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col(0,$data_record);
my $mean_subtracted_data =
$data_vec - $self->{_cluster_means}->[$cluster_index];
my $outer_product = outer_product($mean_subtracted_data,
$mean_subtracted_data);
$new_covariances[$cluster_index] +=
$self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
$outer_product;
}
$new_covariances[$cluster_index] *=
1.0 /
$self->{_cluster_normalizers}->[$cluster_index];
}
$self->{_cluster_covariances} = \@new_covariances;
# 3. UPDATE PRIORS: keep the two previous prior sets around so that
#    reached_convergence() can test stability over three iterations.
$self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
if @{$self->{_old_priors}} > 0;
$self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
foreach my $cluster_index(0..$self->{_K}-1) {
$self->{_class_priors}->[$cluster_index] =
$self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
}
my @priors = @{$self->{_class_priors}};
print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
# Record both clustering-quality measures for this iteration.
push @{$self->{_fisher_quality_vs_iteration}},
$self->clustering_quality_fisher();
push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
# Convergence is only tested after iteration 5 so the priors have had a
# chance to move away from their initial values.
if ( ($em_iteration > 5 && $self->reached_convergence())
|| ($em_iteration == $self->{_max_em_iterations} - 1) ) {
my @old_old_priors = @{$self->{_old_old_priors}};
my @old_priors = @{$self->{_old_priors}};
print "\n\nPrevious to previous priors: @old_old_priors\n";
print "Previous priors: @old_priors\n";
print "Current class priors: @{$self->{_class_priors}}\n";
print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
if $em_iteration < $self->{_max_em_iterations} - 1;
last;
}
}
print "\n\n\n";
}
# True (1) when the class priors have remained numerically unchanged over
# the last three EM iterations, false (0) otherwise.
sub reached_convergence {
    my ($self) = @_;
    return 0 unless compare_array_floats($self->{_old_old_priors},
                                         $self->{_old_priors});
    return 0 unless compare_array_floats($self->{_old_priors},
                                         $self->{_class_priors});
    return 1;
}
# Classify the data into disjoint clusters using the Naive Bayes' classification:
# Partition the data into disjoint clusters by naive-Bayes classification
# against the means and covariances produced by the EM iterations.
sub run_bayes_classifier {
    my ($self) = @_;
    $self->classify_all_data_tuples_bayes(
        $self->{_cluster_means},
        $self->{_cluster_covariances});
}
# Should NOT be called before run_bayes_classifier is run
# Accessor for the disjoint clusters.  Must be called only after
# run_bayes_classifier() has populated them.
sub return_disjoint_clusters {
    my ($self) = @_;
    return $self->{_clusters};
}
# Returns a ref to an array of K membership lists (one per cluster).  A data
# tag is admitted to every cluster whose posterior probability at that point
# exceeds $theta, so the returned clusters may overlap.
sub return_clusters_with_posterior_probs_above_threshold {
    my ($self, $theta) = @_;
    my @class_distributions = map { [] } 0 .. $self->{_K} - 1;
    for my $data_tag (@{ $self->{_data_id_tags} }) {
        my $posteriors = $self->{_expected_class_probs}->{$data_tag};
        for my $k (0 .. $self->{_K} - 1) {
            push @{ $class_distributions[$k] }, $data_tag
                if $posteriors->[$k] > $theta;
        }
    }
    return \@class_distributions;
}
# For each cluster, returns the list of data tags whose Gaussian density
# under that cluster's (mean, covariance) exceeds $theta.  Unlike the
# posterior-based method above, this uses the raw class-conditional density.
#
# BUG FIX: the coefficient expression previously read "1.0 / \ (...)" --- the
# stray backslash took a *reference* to the parenthesized expression, so the
# division used the numified reference address instead of the value.  The
# normalization constant is also now the standard multivariate-Gaussian form
# 1/sqrt((2*pi)^d * det(Sigma)), matching the sibling computation in
# find_prob_at_each_datapoint_for_given_mean_and_covar().
sub return_individual_class_distributions_above_given_threshold {
    my ($self, $theta) = @_;
    my @probability_distributions = map { [] } 0 .. $self->{_K} - 1;
    foreach my $cluster_index (0 .. $self->{_K} - 1) {
        my $mean_vec = $self->{_cluster_means}->[$cluster_index];
        my $covar    = $self->{_cluster_covariances}->[$cluster_index];
        foreach my $data_id (keys %{$self->{_data}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions}, 1);
            $data_vec->set_col(0, $self->{_data}->{$data_id});
            my $datavec_minus_mean = $data_vec - $mean_vec;
            display_matrix("datavec minus mean is ", $datavec_minus_mean)
                if $self->{_debug};
            # Exponent of the Gaussian: -0.5 * (x-m)' * Sigma^-1 * (x-m).
            my $exponent = undef;
            if ($self->{_data_dimensions} > 1) {
                $exponent = -0.5 * vector_matrix_multiply(
                                transpose($datavec_minus_mean),
                                matrix_vector_multiply($covar->inverse(),
                                                       $datavec_minus_mean));
            } else {
                # One-dimensional case: everything reduces to scalars.
                my ($var_inverse_val)     = $covar->inverse()->as_list;
                my ($data_minus_mean_val) = $datavec_minus_mean->as_list;
                $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
            }
            print "\nThe value of the exponent is: $exponent\n\n"
                if $self->{_debug};
            my $coefficient = 1.0 /
                sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions})
                      * $covar->det() );
            my $prob = $coefficient * exp($exponent);
            push @{ $probability_distributions[$cluster_index] }, $data_id
                if $prob > $theta;
        }
    }
    return \@probability_distributions;
}
# Accessor for the class priors estimated by the EM iterations.
sub return_estimated_priors {
    my ($self) = @_;
    return $self->{_class_priors};
}
# Calculates the MDL (Minimum Description Length) clustering criterion according to
# Rissanen. (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
# and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
# Annals of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood
# term for all of the observed data and a model-complexity penalty term. In general,
# both the log-likelihood and the model-complexity terms increase as the number of
# clusters increases. The form of the MDL criterion used in the implementation below
# uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
# "Estimating the Dimensions of a Model," The Annals of Statistics, 1978. In
# general, the smaller the value of the MDL quality measure calculated below, the
# better the clustering of the data.
# MDL (Rissanen) clustering criterion: log-likelihood of the observed data
# under the current Gaussian mixture minus a BIC model-complexity penalty.
# Smaller is better.  NOTE(review): only the beginning of this method is
# visible in this view; the remainder (the per-tag likelihood accumulation
# and the penalty term) follows beyond this excerpt.
sub clustering_quality_mdl {
my $self = shift;
# Calculate the inverses of all of the covariance matrices in order to avoid
# having to calculate them repeatedly inside the inner 'foreach' loop in the
# main part of this method. Here we go:
my @covar_inverses;
foreach my $cluster_index (0..$self->{_K}-1) {
my $covar = $self->{_cluster_covariances}->[$cluster_index];
push @covar_inverses, $covar->inverse();
}
# For the clustering quality, first calculate the log-likelihood of all the
# observed data:
my $log_likelihood = 0;
foreach my $tag (@{$self->{_data_id_tags}}) {
my $likelihood_for_each_tag = 0;
foreach my $cluster_index (0..$self->{_K}-1) {
my $mean_vec = $self->{_cluster_means}->[$cluster_index];
my $covar = $self->{_cluster_covariances}->[$cluster_index];
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col( 0, $self->{_data}->{$tag});
my $datavec_minus_mean = $data_vec - $mean_vec;
# Gaussian exponent: -0.5 * (x-m)' * Sigma^-1 * (x-m).
my $exponent = undef;
if ($self->{_data_dimensions} > 1) {
$exponent = -0.5 * vector_matrix_multiply(
transpose($datavec_minus_mean),
matrix_vector_multiply($covar_inverses[$cluster_index],
$datavec_minus_mean ) );
} else {
# One-dimensional case: scalar arithmetic on the single entries.
my @var_inverse = $covar_inverses[$cluster_index]->as_list;
my $var_inverse_val = $var_inverse[0];
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
}
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
# the individual clusters.
# Fisher linear-discriminant quality of the current clustering: the trace of
# S_w^-1 * S_b, where S_w is the prior-weighted within-class scatter and S_b
# the between-class scatter of the cluster means about the global mean.
# Larger values indicate better-separated, tighter clusters.  Returns the
# raw matrix-multiply result when it is not a blessed object (degenerate
# 1-D case handled by matrix_multiply).
sub clustering_quality_fisher {
    my ($self) = @_;
    my $dim = $self->{_data_dimensions};
    my $S_w = Math::GSL::Matrix->new($dim, $dim);   # within-class scatter
    $S_w->zero;
    my $S_b = Math::GSL::Matrix->new($dim, $dim);   # between-class scatter
    $S_b->zero;
    my $global_mean = Math::GSL::Matrix->new($dim, 1);
    $global_mean->zero;
    foreach my $cluster_index (0 .. $self->{_K} - 1) {
        # BUG FIX: this was '=' instead of '+=', which discarded all but the
        # last cluster's contribution to the prior-weighted global mean.
        $global_mean += $self->{_class_priors}->[$cluster_index] *
                        $self->{_cluster_means}->[$cluster_index];
    }
    foreach my $cluster_index (0 .. $self->{_K} - 1) {
        $S_w += $self->{_cluster_covariances}->[$cluster_index] *
                $self->{_class_priors}->[$cluster_index];
        my $class_mean_minus_global_mean =
            $self->{_cluster_means}->[$cluster_index] - $global_mean;
        my $outer_product = outer_product( $class_mean_minus_global_mean,
                                           $class_mean_minus_global_mean );
        $S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
    }
    my $fisher = matrix_multiply($S_w->inverse, $S_b);
    return $fisher unless defined blessed($fisher);
    return matrix_trace($fisher);
}
# Prints the seed mean and covariance for each of the K clusters.
sub display_seeding_stats {
    my ($self) = @_;
    for my $k (0 .. $self->{_K} - 1) {
        print "\nSeeding for cluster $k:\n";
        display_matrix("The mean is: ", $self->{_cluster_means}->[$k]);
        display_matrix("The covariance is: ", $self->{_cluster_covariances}->[$k]);
    }
}
# Prints the Fisher quality measure recorded at each EM iteration.
sub display_fisher_quality_vs_iterations {
    my ($self) = @_;
    my @history = @{ $self->{_fisher_quality_vs_iteration} };
    print "\n\nFisher Quality vs. Iterations: @history\n\n";
}
# Prints the MDL quality measure recorded at each EM iteration.
sub display_mdl_quality_vs_iterations {
    my ($self) = @_;
    my @history = @{ $self->{_mdl_quality_vs_iteration} };
    print "\n\nMDL Quality vs. Iterations: @history\n\n";
}
# E-step helper: for the supplied (mean, covariance), evaluates the Gaussian
# density at every data point and appends the value to the per-point list in
# _class_probs_at_each_data_point.  Called once per cluster per iteration,
# so after K calls each list holds one likelihood per cluster, in cluster
# order.  In the one-dimensional case $covar_ref may be either a 1x1
# Math::GSL::Matrix or a plain scalar variance; both branches are handled.
sub find_prob_at_each_datapoint_for_given_mean_and_covar {
my $self = shift;
my $mean_vec_ref = shift;
my $covar_ref = shift;
foreach my $data_id (keys %{$self->{_data}}) {
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col( 0, $self->{_data}->{$data_id});
if ($self->{_debug}) {
display_matrix("data vec in find prob function", $data_vec);
display_matrix("mean vec in find prob function", $mean_vec_ref);
display_matrix("covariance in find prob function", $covar_ref);
}
my $datavec_minus_mean = $data_vec - $mean_vec_ref;
display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
# Gaussian exponent: -0.5 * (x-m)' * Sigma^-1 * (x-m).
my $exponent = undef;
if ($self->{_data_dimensions} > 1) {
$exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
} elsif (defined blessed($covar_ref)) {
# 1-D case with a 1x1 matrix for the variance.
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
my @covar_as_matrix = $covar_ref->as_list;
my $covar_val = $covar_as_matrix[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
} else {
# 1-D case with a plain scalar variance.
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
}
print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
# Normalization constant 1/sqrt((2*pi)^d * det(Sigma)), with the same
# matrix-vs-scalar branching as the exponent above.
my $coefficient = undef;
if ($self->{_data_dimensions} > 1) {
$coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
$covar_ref->det()) ;
} elsif (!defined blessed($covar_ref)) {
$coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
} else {
my @covar_as_matrix = $covar_ref->as_list;
my $covar_val = $covar_as_matrix[0];
$coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
}
my $prob = $coefficient * exp($exponent);
push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
}
}
# E-step helper: converts the class-conditional likelihoods prob(x|C_i)
# stored in _class_probs_at_each_data_point into posteriors prob(C_i|x) by
# Bayes' rule --- multiply element-wise by the class priors and normalize so
# the posteriors at each data point sum to 1.  Results are stored in
# _expected_class_probs.
# (Removed an unused local copy of the priors array present in the original.)
sub find_expected_classes_at_each_datapoint {
    my ($self) = @_;
    foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
        my $numerator = vector_2_vector_multiply(
                            $self->{_class_probs_at_each_data_point}->{$data_id},
                            $self->{_class_priors} );
        my $sum = 0;
        $sum += $_ for @$numerator;
        $self->{_expected_class_probs}->{$data_id} =
            [ map { $_ / $sum } @$numerator ];
    }
}
# Ensures the class priors are ready for the first EM iteration: when none
# were supplied in the constructor, the uniform prior 1/K is used; otherwise
# the supplied priors are validated for count and unit sum.
sub initialize_class_priors {
    my ($self) = @_;
    unless (@{ $self->{_class_priors} }) {
        my $uniform = 1.0 / $self->{_K};
        push @{ $self->{_class_priors} }, $uniform for 0 .. $self->{_K} - 1;
    }
    die "Mismatch between number of values for class priors " .
        "and the number of clusters expected"
        unless @{ $self->{_class_priors} } == $self->{_K};
    my $total = 0;
    $total += $_ for @{ $self->{_class_priors} };
    die "Your priors in the constructor call do not add up to 1"
        unless abs($total - 1) < 0.001;
    print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
}
# Estimates the class priors from per-record class labels by counting label
# occurrences and normalizing by the total record count.
# NOTE(review): this method reads _class_labels and _total_number_data_tuples,
# neither of which is initialized by the constructor visible in this file ---
# verify where those keys are set before relying on this method.  Only the
# beginning of the method is visible in this view.
sub estimate_class_priors {
my $self = shift;
foreach my $datatag (keys %{$self->{_data}}) {
my $class_label = $self->{_class_labels}->{$datatag};
$self->{_class_priors}[$class_label]++;
}
foreach my $prior (@{$self->{_class_priors}}) {
$prior /= $self->{_total_number_data_tuples};
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
print "\nFor cluster $cluster_index: rows: $num_rows and cols: $num_cols\n";
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
my $col_index = 0;
foreach my $ele (@{$clusters->[$cluster_index]}) {
$matrix->set_col($col_index++, $ele);
}
# display_matrix( "Displaying cluster matrix", $matrix );
foreach my $j (0..$num_cols-1) {
$mean_vec += $matrix->col($j);
}
$mean_vec *= 1.0 / $num_cols;
push @cluster_mean_vecs, $mean_vec;
display_matrix( "Displaying the mean vector", $mean_vec );
foreach my $j (0..$num_cols-1) {
my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
$matrix->set_col($j, \@new_col);
}
# display_matrix("Displaying mean subtracted data as a matrix", $matrix );
my $transposed = transpose( $matrix );
# display_matrix("Displaying transposed matrix",$transposed);
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
push @cluster_covariances, $covariance;
display_matrix("Displaying the cluster covariance", $covariance );
}
return (\@cluster_mean_vecs, \@cluster_covariances);
}
# Infers the data dimensionality from the first record of the first cluster
# (all records are assumed to share one dimensionality).
sub find_data_dimensionality {
    my ($clusters) = @_;
    my $first_record = $clusters->[0][0];
    return scalar @$first_record;
}
# For each seed tag, uses that seed's data vector as the cluster mean and
# computes a covariance of the ENTIRE data set centered at that seed (not
# just of nearby points).  Returns (ref to mean vectors, ref to covariance
# matrices), in seed-tag order.  Used by both the 'random' and the 'manual'
# seeding modes.
sub find_seed_centered_covariances {
my $self = shift;
my $seed_tags = shift;
my (@seed_mean_vecs, @seed_based_covariances);
foreach my $seed_tag (@$seed_tags) {
# Data matrix is dimensions x N: one column per data record.
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# The seed's own data vector serves as the cluster mean.
$mean_vec->set_col(0, $self->{_data}->{$seed_tag});
push @seed_mean_vecs, $mean_vec;
# NOTE(review): this display call is unconditional, unlike the
# covariance display below which is guarded by _debug --- presumably
# it too was meant to be debug-only; confirm before changing.
display_matrix( "Displaying the seed mean vector", $mean_vec );
my $col_index = 0;
foreach my $tag (@{$self->{_data_id_tags}}) {
$matrix->set_col($col_index++, $self->{_data}->{$tag});
}
# Subtract the seed mean from every column, then form (1/N) * M * M'.
foreach my $j (0..$num_cols-1) {
my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
$matrix->set_col($j, \@new_col);
}
my $transposed = transpose( $matrix );
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
push @seed_based_covariances, $covariance;
display_matrix("Displaying the seed covariance", $covariance )
if $self->{_debug};
}
return (\@seed_mean_vecs, \@seed_based_covariances);
}
# The most popular seeding mode for EM is random. We include two other seeding modes
# --- kmeans and manual --- since they do produce good results for specialized cases.
# For example, when the clusters in your data are non-overlapping and not too
# anisotropic, the kmeans based seeding should work at least as well as the random
# seeding. In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
# the advantage of avoiding the getting stuck in a local-maximum problem of the EM
# algorithm.
# Seeds the cluster means and covariances for the EM iterations according to
# the _seeding mode chosen in the constructor:
#   'random' -- pick K data tags at random and center a covariance at each;
#   'kmeans' -- run kmeans() and derive means/covariances from its clusters;
#   'manual' -- use the caller-supplied _seed_tags as the seed points.
# Dies on an unrecognized mode or on invalid manual seed tags.
sub seed_the_clusters {
my $self = shift;
if ($self->{_seeding} eq 'random') {
# NOTE(review): @covariances and @means below are declared but unused.
my @covariances;
my @means;
my @all_tags = @{$self->{_data_id_tags}};
my @seed_tags;
# Draw K tags uniformly at random (duplicates are possible).
foreach my $i (0..$self->{_K}-1) {
push @seed_tags, $all_tags[int rand( $self->{_N} )];
}
print "Random Seeding: Randomly selected seeding tags are @seed_tags\n\n";
my ($seed_means, $seed_covars) =
$self->find_seed_centered_covariances(\@seed_tags);
$self->{_cluster_means} = $seed_means;
$self->{_cluster_covariances} = $seed_covars;
} elsif ($self->{_seeding} eq 'kmeans') {
$self->kmeans();
my $clusters = $self->{_clusters};
my @dataclusters;
foreach my $index (0..@$clusters-1) {
push @dataclusters, [];
}
# Replace each cluster's tags by deep copies of the tagged data vectors
# so the statistics can be computed from the raw numbers.
foreach my $cluster_index (0..$self->{_K}-1) {
foreach my $tag (@{$clusters->[$cluster_index]}) {
my $data = $self->{_data}->{$tag};
push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
}
}
($self->{_cluster_means}, $self->{_cluster_covariances}) =
find_cluster_means_and_covariances(\@dataclusters);
} elsif ($self->{_seeding} eq 'manual') {
die "You have not supplied the seeding tags for the option \"manual\""
unless @{$self->{_seed_tags}} > 0;
print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
foreach my $tag (@{$self->{_seed_tags}}) {
die "invalid tag used for manual seeding"
unless exists $self->{_data}->{$tag};
}
my ($seed_means, $seed_covars) =
$self->find_seed_centered_covariances($self->{_seed_tags});
$self->{_cluster_means} = $seed_means;
$self->{_cluster_covariances} = $seed_covars;
} else {
die "Incorrect call syntax used. See documentation.";
}
}
# This is the top-level method for kmeans based initialization of the EM
# algorithm. The means and the covariances returned by kmeans are used to seed the EM
# algorithm.
# Top-level entry for kmeans-based seeding of the EM iterations.  Runs the
# clustering and returns (clusters, cluster centers); dies if the attempt
# left either undefined.
sub kmeans {
    my ($self) = @_;
    $self->cluster_for_fixed_K_single_smart_try($self->{_K});
    die "kmeans clustering failed."
        unless defined $self->{_clusters}
           and defined $self->{_cluster_centers};
    return ($self->{_clusters}, $self->{_cluster_centers});
}
# Used by the kmeans algorithm for the initialization of the EM iterations. We do
# initial kmeans cluster seeding by subjecting the data to principal components
# analysis in order to discover the direction of maximum variance in the data space.
# Subsequently, we try to find the K largest peaks along this direction. The
# coordinates of these peaks serve as the seeds for the K clusters.
# Runs one PCA-seeded kmeans attempt for the given K and records the
# resulting clusters and centers on the object.
sub cluster_for_fixed_K_single_smart_try {
    my ($self, $K) = @_;
    print "Clustering for K = $K\n" if $self->{_terminal_output};
    my ($clusters, $centers) = $self->cluster_for_given_K($K);
    $self->{_clusters}        = $clusters;
    $self->{_cluster_centers} = $centers;
}
# Used by the kmeans part of the code for the initialization of the EM algorithm:
# One kmeans run for a given K: smart initial centers, an initial assignment,
# then up to three reassignment passes, stopping early once every cluster is
# non-empty.
# BUG FIX: the empty-cluster flag used to be declared outside the loop and
# was never reset, so a single bad pass forced all remaining passes to run
# even after a later pass produced only non-empty clusters.  It is now reset
# at the start of every pass.  (Also removed an unused cluster-count local.)
sub cluster_for_given_K {
    my ($self, $K) = @_;
    my $cluster_centers = $self->get_initial_cluster_centers($K);
    my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
    foreach my $trial (0 .. 2) {
        ($clusters, $cluster_centers) =
            $self->assign_data_to_clusters( $clusters, $K );
        my $cluster_nonexistant_flag = 0;
        foreach my $cluster (@$clusters) {
            $cluster_nonexistant_flag = 1
                if !defined $cluster or @$cluster == 0;
        }
        last unless $cluster_nonexistant_flag;
    }
    return ($clusters, $cluster_centers);
}
# Used by the kmeans part of the code for the initialization of the EM algorithm:
# Computes K initial cluster centers for kmeans seeding.  For 1-D data it
# finds K peaks in a smoothed histogram of the raw values.  For higher
# dimensions it performs a PCA: builds the global covariance, finds the
# eigenvector of largest eigenvalue (the maximum-variance direction),
# projects all data onto it, locates K peaks among the projections, and maps
# each peak back into the data space along that direction.
sub get_initial_cluster_centers {
my $self = shift;
my $K = shift;
if ($self->{_data_dimensions} == 1) {
# 1-D shortcut: peak-find directly on the scalar data values.
my @one_d_data;
foreach my $j (0..$self->{_N}-1) {
my $tag = $self->{_data_id_tags}[$j];
push @one_d_data, $self->{_data}->{$tag}->[0];
}
my @peak_points =
find_peak_points_in_given_direction(\@one_d_data,$K);
print "highest points at data values: @peak_points\n"
if $self->{_debug};
my @cluster_centers;
foreach my $peakpoint (@peak_points) {
push @cluster_centers, [$peakpoint];
}
return \@cluster_centers;
}
# Multi-dimensional case: data matrix is dimensions x N, one column per
# record.
my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# All the record labels are stored in the array $self->{_data_id_tags}.
# The actual data for clustering is stored in a hash at $self->{_data}
# whose keys are the record labels; the value associated with each
# key is the array holding the corresponding numerical multidimensional
# data.
foreach my $j (0..$num_cols-1) {
my $tag = $self->{_data_id_tags}[$j];
my $data = $self->{_data}->{$tag};
$matrix->set_col($j, $data);
}
if ($self->{_debug}) {
print "\nDisplaying the original data as a matrix:";
display_matrix( $matrix );
}
# Global mean over all records.
foreach my $j (0..$num_cols-1) {
$mean_vec += $matrix->col($j);
}
$mean_vec *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "Displaying the mean vector for the data:";
display_matrix( $mean_vec );
}
# Mean-subtract every column, then form the covariance (1/N) * M * M'.
foreach my $j (0..$num_cols-1) {
my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
$matrix->set_col($j, \@new_col);
}
if ($self->{_debug}) {
print "Displaying mean subtracted data as a matrix:";
display_matrix( $matrix );
}
my $transposed = transpose( $matrix );
if ($self->{_debug}) {
print "Displaying transposed data matrix:";
display_matrix( $transposed );
}
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for your data:";
display_matrix( $covariance );
}
# Eigen-analysis: locate the largest (and, for debug display, smallest)
# eigenvalue of the covariance.
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
my $num_of_eigens = @$eigenvalues;
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
foreach my $i (1..$num_of_eigens-1) {
$largest_eigen_index = $i if $eigenvalues->[$i] >
$eigenvalues->[$largest_eigen_index];
$smallest_eigen_index = $i if $eigenvalues->[$i] <
$eigenvalues->[$smallest_eigen_index];
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
}
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
print "\nsmallest eigen index: $smallest_eigen_index\n\n"
if $self->{_debug};
foreach my $i (0..$num_of_eigens-1) {
my @vec = $eigenvectors->[$i]->as_list;
print "Eigenvector $i: @vec\n" if $self->{_debug};
}
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
my @max_var_direction;
# Each element of the array @largest_eigen_vec is a Math::Complex object
# stringified as "[magnitude,phase]"; a phase of 0 or pi means the
# component is real (positive or negative).  A genuinely complex
# component indicates something went wrong with the eigendecomposition.
foreach my $k (0..@largest_eigen_vec-1) {
my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
if ($theta eq '0') {
$max_var_direction[$k] = $mag;
} elsif ($theta eq 'pi') {
$max_var_direction[$k] = -1.0 * $mag;
} else {
die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
}
}
# "Maximum variance direction: @max_var_direction
print "\nMaximum Variance Direction: @max_var_direction\n\n"
if $self->{_debug};
# We now project all data points on the largest eigenvector.
# Each projection will yield a single point on the eigenvector.
my @projections;
foreach my $j (0..$self->{_N}-1) {
my $tag = $self->{_data_id_tags}[$j];
my $data = $self->{_data}->{$tag};
die "Dimensionality of the largest eigenvector does not "
. "match the dimensionality of the data"
unless @max_var_direction == $self->{_data_dimensions};
my $projection = vector_multiply($data, \@max_var_direction);
push @projections, $projection;
}
print "All projection points: @projections\n" if $self->{_debug};
# Peak-find among the 1-D projections, then lift each peak back into the
# data space along the maximum-variance direction.
my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
print "highest points at points along largest eigenvec: @peak_points\n"
if $self->{_debug};
my @cluster_centers;
foreach my $peakpoint (@peak_points) {
my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
push @cluster_centers, \@actual_peak_coords;
}
return \@cluster_centers;
}
# Used by the kmeans part of the code: This method is called by the previous method
# to locate K peaks in a smoothed histogram of the data points projected onto the
# maximal variance direction.
# Locates up to $how_many peaks in a smoothed 1000-cell histogram of the 1-D
# values in @$dataref, returning the peak locations in the original
# (unnormalized) coordinate, tallest first.  Used to pick kmeans seed
# centers along the maximum-variance direction.
#
# BUG FIX: the normalization loop used to stop at $#sorted_data - 1, leaving
# the largest sample unnormalized; its histogram cell index then fell far
# outside 0..999 and the sample was silently dropped.  The loop now covers
# every sample.  The inner smoothing loop variable has also been renamed so
# it no longer shadows the outer $index.
sub find_peak_points_in_given_direction {
    my $dataref = shift;
    my $how_many = shift;
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my @sorted_data = sort {$a <=> $b} @data;
    my $scale = $max - $min;
    # Normalize every sample into [0, 1].
    foreach my $index (0..$#sorted_data) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    # Running average of the inter-sample gaps (kept for the adaptive-delta
    # idea noted below; not otherwise used).
    my $avg_diff = 0;
    foreach my $index (0..$#sorted_data-1) {
        my $diff = $sorted_data[$index+1] - $sorted_data[$index];
        $avg_diff += ($diff - $avg_diff) / ($index + 1);
    }
    my $delta = 1.0 / 1000.0;
    # It would be nice to set the delta adaptively (e.g. $avg_diff / 20), but
    # the number of cells in the accumulator below would have to change
    # accordingly.
    my @accumulator = (0) x 1000;
    foreach my $index (0..@sorted_data-1) {
        my $cell_index = int($sorted_data[$index] / $delta);
        my $smoothness = 40;
        # Spread each sample over +/- $smoothness cells (box smoothing).
        for my $cell ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $cell < 0 || $cell > 999;
            $accumulator[$cell]++;
        }
    }
    my $peaks_array = non_maximum_supression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    # Take the $how_many tallest surviving peaks and map their cell indices
    # back to the original coordinate.
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        my $unscaled_peak_point =
            $min + $peaks_index_hash->{$peak} * $scale * $delta;
        push @K_highest_peak_locations, $unscaled_peak_point
            if $k < $how_many;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}
# Used by the kmeans part of the code: The purpose of this routine is to form initial
# clusters by assigning the data samples to the initial clusters formed by the
# previous routine on the basis of the best proximity of the data samples to the
# different cluster centers.
sub assign_data_to_clusters_initial {
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
}
# Meant only for constructing a deep copy of a hash in which each value is an
# anonymous array of numbers:
# Deep copy of a hash whose every value is a reference to a flat array of
# numbers.  (Returns undef for an empty input hash, matching the original.)
sub deep_copy_hash {
    my ($ref_in) = @_;
    my $ref_out;
    foreach my $key (keys %$ref_in) {
        $ref_out->{$key} = deep_copy_array( $ref_in->{$key} );
    }
    return $ref_out;
}
# Meant only for an array of numbers:
# Copy of a flat array of numbers, returned as a fresh array reference.
# BUG FIX: the output now starts as an empty anonymous array instead of
# undef, so an empty input yields [] rather than undef --- callers such as
# EM() dereference the result with @{...} and would die on undef.
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out = [];
    foreach my $element (@$ref_in) {
        push @$ref_out, $element;
    }
    return $ref_out;
}
# from perl docs:
# In-place uniform shuffle of the array referenced by the argument: walk
# from the top of the deck down, swapping each position with a uniformly
# chosen position at or below it (classic Fisher-Yates).
sub fisher_yates_shuffle {
    my ($deck) = @_;
    for (my $pos = $#$deck; $pos > 0; $pos--) {
        my $pick = int rand($pos + 1);
        @$deck[$pos, $pick] = @$deck[$pick, $pos];
    }
}
# Single-pass incremental computation of the mean and the variance of a flat
# array of numbers (passed by reference).  Returns ($mean, $variance); for a
# single sample the variance is 0, and for an empty input both are undef.
sub mean_and_variance {
    my @samples = @{ shift @_ };
    my ($mean, $variance);
    for my $n (1 .. scalar @samples) {
        my $x = $samples[$n - 1];    # zero-based access into the samples
        if ($n == 1) {
            ($mean, $variance) = ($x, 0);
            next;
        }
        # Incremental update: fold sample $n into the running statistics.
        $mean     = (($n - 1) / $n) * $mean + $x / $n;
        $variance = (($n - 1) / $n) * $variance + ($x - $mean) ** 2 / ($n - 1);
    }
    return ($mean, $variance);
}
# Returns 1 when every supplied constructor keyword is in the legal list,
# 0 as soon as any keyword is not.
# BUG FIX: the original returned undef for an empty argument list (the flag
# was never assigned), and the caller's 'check_for_illegal_params(...) == 0'
# test then treated a constructor call with no arguments as a misspelled
# keyword instead of reporting the genuinely missing required arguments.
# An empty list now correctly returns 1.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw/ datafile
                           mask
                           K
                           terminal_output
                           max_em_iterations
                           seeding
                           class_priors
                           seed_tags
                           debug
                         /;
    foreach my $param (@params) {
        # grep in scalar context counts the matches.
        return 0 unless grep { $param eq $_ } @legal_params;
    }
    return 1;
}
# Maps each positive value in the array to the index where it occurs; zero
# and negative entries are skipped.  If a value repeats, the later
# occurrence wins (matching the original's overwrite behavior).
sub get_value_index_hash {
    my ($arr) = @_;
    my %index_of;
    for my $idx (0 .. $#$arr) {
        my $val = $arr->[$idx];
        $index_of{$val} = $idx if $val > 0;
    }
    return \%index_of;
}
# Non-maximum suppression over a histogram array: collapses the histogram
# into runs of equal values, and keeps a value only at the center of each
# run that is strictly taller than both of its neighboring runs (with
# special handling for the first and last runs).  All other positions are
# zeroed.  Returns a ref to the suppressed array.
# NOTE(review): @final_output and %hash below are declared but never used in
# the code visible here.  The sub's closing brace lies beyond this excerpt.
sub non_maximum_supression {
my $arr = shift;
my @output = (0) x @$arr;
my @final_output = (0) x @$arr;
my %hash;
# Partition the input into maximal runs of equal adjacent values.
my @array_of_runs = ([$arr->[0]]);
foreach my $index (1..@$arr-1) {
if ($arr->[$index] == $arr->[$index-1]) {
push @{$array_of_runs[-1]}, $arr->[$index];
} else {
push @array_of_runs, [$arr->[$index]];
}
}
# Interior runs: keep the run's value at its center cell when the run is
# strictly taller than both neighbors.
my $runstart_index = 0;
foreach my $run_index (1..@array_of_runs-2) {
$runstart_index += @{$array_of_runs[$run_index-1]};
if ($array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index-1]->[0] &&
$array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index+1]->[0]) {
my $run_center = @{$array_of_runs[$run_index]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
}
# Last run: only its left neighbor exists to compare against.
if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
$runstart_index += @{$array_of_runs[-2]};
my $run_center = @{$array_of_runs[-1]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
# First run: only its right neighbor exists to compare against.
if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
my $run_center = @{$array_of_runs[0]} / 2;
$output[$run_center] = $arr->[$run_center];
}
return \@output;
( run in 0.569 second using v1.01-cache-2.11-cpan-0bd6704ced7 )