Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
}
# For the clustering quality, first calculate the log-likelihood of all the
# observed data:
my $log_likelihood = 0;
foreach my $tag (@{$self->{_data_id_tags}}) {
my $likelihood_for_each_tag = 0;
foreach my $cluster_index (0..$self->{_K}-1) {
my $mean_vec = $self->{_cluster_means}->[$cluster_index];
my $covar = $self->{_cluster_covariances}->[$cluster_index];
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col( 0, $self->{_data}->{$tag});
my $datavec_minus_mean = $data_vec - $mean_vec;
my $exponent = undef;
if ($self->{_data_dimensions} > 1) {
$exponent = -0.5 * vector_matrix_multiply(
transpose($datavec_minus_mean),
matrix_vector_multiply($covar_inverses[$cluster_index],
$datavec_minus_mean ) );
} else {
my @var_inverse = $covar_inverses[$cluster_index]->as_list;
my $var_inverse_val = $var_inverse[0];
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
}
next if $covar->det() < 0;
my $coefficient = 1.0 /
( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
* sqrt($covar->det()) );
my $prob = $coefficient * exp($exponent);
$likelihood_for_each_tag +=
$prob * $self->{_class_priors}->[$cluster_index];
}
$log_likelihood += log( $likelihood_for_each_tag );
}
# Now calculate the model complexity penalty. $L is the total number of
# parameters it takes to specify a mixture of K Gaussians. If d is the
# dimensionality of the data space, the covariance matrix of each Gaussian takes
# (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
# then you need d mean value parameters, and one prior probability parameter
# for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
# is subtracted is to account for the normalization on the class priors.
my $L = (0.5 * $self->{_K} *
($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
return $mdl_criterion;
}
# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
# is the within-class scatter matrix, more commonly denoted S_w, and SB the
# between-class scatter matrix, more commonly denoted S_b (the underscore means
# subscript). This measure can be thought of as the normalized average distance
# between the clusters, the normalization being provided by average cluster
# covariance SW^-1. Therefore, the larger the value of this quality measure, the
# better the separation between the clusters. Since this measure has its roots in
# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
# name of the quality measure. Note that this measure is good only when the clusters
# are disjoint. When the clusters exhibit significant overlap, the numbers produced
# by this quality measure tend to be generally meaningless. As an extreme case,
# let's say your data was produced by a set of Gaussians, all with the same mean
# vector, but each with a distinct covariance. For this extreme case, this measure
# will produce a value close to zero --- depending on the accuracy with which the
# means are estimated --- even when your clusterer is doing a good job of identifying
# the individual clusters.
sub clustering_quality_fisher {
    # Computes the Fisher clustering-quality measure trace( S_w^-1 . S_b ) from the
    # class priors, cluster means and cluster covariances currently held in $self.
    # Returns the value produced by matrix_trace(); if matrix_multiply() yields a
    # value that is not a blessed object, that value is returned unchanged.
    my $self = shift;
    my $dim = $self->{_data_dimensions};
    my $S_w = Math::GSL::Matrix->new($dim, $dim);
    $S_w->zero;
    my $S_b = Math::GSL::Matrix->new($dim, $dim);
    $S_b->zero;
    my $global_mean = Math::GSL::Matrix->new($dim, 1);
    $global_mean->zero;
    # The global mean is the prior-weighted sum of ALL the cluster means, so every
    # cluster's contribution has to be accumulated.  A plain assignment here would
    # keep only the last cluster's weighted mean and distort S_b.
    foreach my $cluster_index (0..$self->{_K}-1) {
        $global_mean += $self->{_class_priors}->[$cluster_index] *
                        $self->{_cluster_means}->[$cluster_index];
    }
    foreach my $cluster_index (0..$self->{_K}-1) {
        # Within-class scatter: prior-weighted sum of the cluster covariances.
        $S_w += $self->{_cluster_covariances}->[$cluster_index] *
                $self->{_class_priors}->[$cluster_index];
        # Between-class scatter: prior-weighted sum of the outer products of each
        # cluster mean's offset from the global mean.
        my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
                                           - $global_mean;
        my $outer_product = outer_product( $class_mean_minus_global_mean,
                                           $class_mean_minus_global_mean );
        $S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
    }
    my $fisher = matrix_multiply($S_w->inverse, $S_b);
    # NOTE(review): presumably matrix_multiply() hands back a plain number for 1-D
    # data -- confirm against its definition elsewhere in this file.
    return $fisher unless defined blessed($fisher);
    return matrix_trace($fisher);
}
sub display_seeding_stats {
    # Prints, for every cluster index 0 .. _K-1, the mean and the covariance that
    # are currently stored for that cluster, using display_matrix() for each.
    my ($self) = @_;
    my $last_index = $self->{_K} - 1;
    for my $k (0 .. $last_index) {
        print "\nSeeding for cluster $k:\n";
        display_matrix("The mean is: ", $self->{_cluster_means}->[$k]);
        display_matrix("The covariance is: ", $self->{_cluster_covariances}->[$k]);
    }
}
sub display_fisher_quality_vs_iterations {
    # Prints the values stored in _fisher_quality_vs_iteration on a single line,
    # separated by the list separator, framed by blank lines.
    my ($self) = @_;
    my @quality_history = @{ $self->{_fisher_quality_vs_iteration} };
    print "\n\nFisher Quality vs. Iterations: @quality_history\n\n";
}
sub display_mdl_quality_vs_iterations {
    # Prints the values stored in _mdl_quality_vs_iteration on a single line,
    # separated by the list separator, framed by blank lines.
    my ($self) = @_;
    my @quality_history = @{ $self->{_mdl_quality_vs_iteration} };
    print "\n\nMDL Quality vs. Iterations: " . "@quality_history\n\n";
}
sub find_prob_at_each_datapoint_for_given_mean_and_covar {
my $self = shift;
my $mean_vec_ref = shift;
my $covar_ref = shift;
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
output_datafile => $out_datafile,
total_number_of_data_points => $N );
# where the value of $N is the total number of data points you would like to see
# generated for all of the Gaussians. How this total number is divided up amongst
# the Gaussians is decided by the prior probabilities for the Gaussian components
# as declared in the input parameter file. The synthetic data may be visualized in a
# terminal window and the visualization written out as a PNG image to a diskfile
# by
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
=head1 CHANGES
Version 1.22 should work with data in CSV files.
Version 1.21 incorporates minor code clean up. Overall, the module implementation
remains unchanged.
Version 1.2 allows the module to also be used for 1-D data. The visualization code
for 1-D shows the clusters through their histograms.
Version 1.1 incorporates much cleanup of the documentation associated with the
module. Both the top-level module documentation, especially the Description part,
and the comments embedded in the code were revised for better utilization of the
module. The basic implementation code remains unchanged.
=head1 DESCRIPTION
B<Algorithm::ExpectationMaximization> is a I<perl5> module for the
Expectation-Maximization (EM) method of clustering numerical data that lends itself
to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
sense that it is not a Perl wrapper around a C library that actually does the
clustering), the code in the module can easily be modified to experiment with several
aspects of EM.
Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
C<K> Gaussian components, each characterized by its own mean vector and its own
covariance matrix. Obviously, given observed data for clustering, we do not know
which of the C<K> Gaussian components was responsible for any of the data elements.
GMM also associates a prior probability with each Gaussian component. In general,
these priors will also be unknown. So the problem of clustering consists of
estimating the posterior class probability at each data element and also estimating
the class priors. Once these posterior class probabilities and the priors are
estimated with EM, we can use the naive Bayes' classifier to partition the data into
disjoint clusters. Or, for "soft" clustering, we can find all the data elements that
belong to a Gaussian component on the basis of the posterior class probabilities at
the data elements exceeding a prescribed threshold.
If you do not mind the fact that it is possible for the EM algorithm to occasionally
get stuck in a local maximum and to, therefore, produce a wrong answer even when you
know the data to be perfectly multimodal Gaussian, EM is probably the most magical
approach to clustering multidimensional data. Consider the case of clustering
three-dimensional data. Each Gaussian cluster in 3D space is characterized by the
following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
must be symmetric positive-definite), the 3 unique elements of the mean, and the
prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
your data. What that means is that you would want the values for 59 variables
(remember the unit-summation constraint on the class priors which reduces the overall
number of variables by one) to be estimated by the algorithm that seeks to discover
the clusters in your data. What's amazing is that, despite the large number of
variables that must be optimized simultaneously, the EM algorithm will very likely
give you a good approximation to the right answer.
At its core, EM depends on the notion of unobserved data and the averaging of the
log-likelihood of the data actually observed over all admissible probabilities for
the unobserved data. But what is unobserved data? While in some cases where EM is
used, the unobserved data is literally the missing data, in others, it is something
that cannot be seen directly but that nonetheless is relevant to the data actually
observed. For the case of clustering multidimensional numerical data that can be
modeled as a Gaussian mixture, it turns out that the best way to think of the
unobserved data is in terms of a sequence of random variables, one for each observed
data point, whose values dictate the selection of the Gaussian for that data point.
This point is explained in great detail in my on-line tutorial at
L<https://engineering.purdue.edu/kak/Tutorials/ExpectationMaximization.pdf>.
The EM algorithm in our context reduces to an iterative invocation of the following
steps: (1) Given the current guess for the means and the covariances of the different
Gaussians in our mixture model, use Bayes' Rule to update the posterior class
probabilities at each of the data points; (2) Using the updated posterior class
probabilities, first update the class priors; (3) Using the updated class priors,
update the class means and the class covariances; and go back to Step (1). Ideally,
the iterations should terminate when the expected log-likelihood of the observed data
has reached a maximum and does not change with any further iterations. The stopping
rule used in this module is the detection of no change over three consecutive
iterations in the values calculated for the priors.
This module provides three different choices for seeding the clusters: (1) random,
(2) kmeans, and (3) manual. When random seeding is chosen, the algorithm randomly
selects C<K> data elements as cluster seeds. That is, the data vectors associated
with these seeds are treated as initial guesses for the means of the Gaussian
distributions. The covariances are then set to the values calculated from the entire
dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
the other hand, the means and the covariances are set to whatever values are returned
by the kmeans algorithm. And, when seeding is set to manual, you are allowed to
choose C<K> data elements --- by specifying their tag names --- for the seeds. The
rest of the EM initialization for the manual mode is the same as for the random mode.
The algorithm allows for the initial priors to be specified for the manual mode of
seeding.
Much of the code for the kmeans based seeding of EM was drawn from the
C<Algorithm::KMeans> module by me. The code from that module used here corresponds to
the case when the C<cluster_seeding> option in the C<Algorithm::KMeans> module is set
to C<smart>. The C<smart> option for KMeans consists of subjecting the data to a
principal components analysis (PCA) to discover the direction of maximum variance in
the data space. The data points are then projected on to this direction and a
histogram constructed from the projections. Centers of the C<K> largest peaks in
this smoothed histogram are used to seed the KMeans based clusterer. As you'd
expect, the output of the KMeans is then used to seed the EM algorithm.
This module uses two different criteria to measure the quality of the clustering
achieved. The first is the Minimum Description Length (MDL) proposed originally by
Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
of Statistics, 1983.) The MDL criterion is a difference of a log-likelihood term for
all of the observed data and a model-complexity penalty term. In general, both the
log-likelihood and the model-complexity terms increase as the number of clusters
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
now present an example of clustering in 3D. The datafile used in this example is
C<mydatafile4.dat>. This mixture data corresponds to three well-separated but highly
anisotropic Gaussians. The EM derived clustering for this data is shown in the files
C<save_example_4_cluster_plot.png> and C<save_example_4_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained on the basis of the posterior class
probabilities at the data points.
You may also wish to run this example on the data in a CSV file in the C<examples>
directory. The name of the file is C<sphericaldata.csv>.
=item I<canned_example5.pl>
We again demonstrate clustering in 3D but now we have one Gaussian cluster that
"cuts" through the other two Gaussian clusters. The datafile used in this example is
C<mydatafile5.dat>. The three Gaussians in this case are highly overlapping and
highly anisotropic. The EM derived clustering for this data is shown in the files
C<save_example_5_cluster_plot.png> and C<save_example_5_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained through the posterior class
probabilities at the data points.
=item I<canned_example6.pl>
This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
In order to visualize the clusters for the 1-D case, we show them through their
respective histograms. The datafile used in this example is C<mydatafile7.dat>. The
data consists of two overlapping Gaussians. The EM derived clustering for this data
is shown in the files C<save_example_6_cluster_plot.png> and
C<save_example_6_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained through the posterior class probabilities at the data points.
=back
Going through the six examples listed above will make you familiar with how to make
the calls to the clustering and the visualization methods. The C<examples> directory
also includes several parameter files with names like
param1.txt
param2.txt
param3.txt
...
These were used to generate the synthetic data for which the results are shown in the
C<examples> directory. Just make a copy of one of these files and edit it if you
would like to generate your own multivariate data for clustering. Note that you can
generate data with any dimensionality through appropriate entries in the parameter
file.
=head1 CAVEATS
When you run the scripts in the C<examples> directory, your results will NOT always
look like what I have shown in the PNG image files in the directory. As mentioned
earlier in Description, the EM algorithm starting from randomly chosen initial
guesses for the cluster means can get stuck in a local maximum.
That raises an interesting question of how one judges the correctness of clustering
results when dealing with real experimental data. For real data, the best approach
is to try the EM algorithm multiple times with all of the seeding options included in
this module. It would be safe to say that, at least in low dimensional spaces and
with sufficient data, a majority of your runs should yield "correct" results.
Also bear in mind that a pure Perl implementation is not meant for the clustering of
very large data files. It is really designed more for researching issues related to
EM based approaches to clustering.
=head1 REQUIRED
This module requires the following three modules:
Math::Random
Graphics::GnuplotIF
Math::GSL::Matrix
the first for generating the multivariate random numbers, the second for the
visualization of the clusters, and the last for access to the Perl wrappers for the
GNU Scientific Library. The C<Matrix> module of this library is used for various
algebraic operations on the covariance matrices of the Gaussians.
=head1 EXPORT
None by design.
=head1 BUGS
Please notify the author if you encounter any bugs. When sending email, please place
the string 'Algorithm EM' in the subject line.
=head1 INSTALLATION
Download the archive from CPAN in any directory of your choice. Unpack the archive
with a command that on a Linux machine would look like:
tar zxvf Algorithm-ExpectationMaximization-1.22.tar.gz
This will create an installation directory for you whose name will be
C<Algorithm-ExpectationMaximization-1.22>. Enter this directory and execute the
following commands for a standard install of the module if you have root privileges:
perl Makefile.PL
make
make test
sudo make install
If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:
perl Makefile.PL prefix=/some/other/directory/
make
make test
make install
With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on. In order to install this module on
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by
( run in 1.105 second using v1.01-cache-2.11-cpan-483215c6ad5 )