Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
}
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
my $ellipse_filename = "__contour_" . $i . ".dat";
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
}
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
my $all_joined_data = join ':', @all_data;
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
foreach my $i (0..@separated-1) {
$separated[$i] =~ s/SEPERATOR//g;
my @cluster_for_hist = split /:/, $separated[$i];
@cluster_for_hist = grep $_, @cluster_for_hist;
my ($minval,$maxval) = minmax(\@cluster_for_hist);
push @all_minvals, $minval;
push @all_maxvals, $maxval;
push @all_clusters_for_hist, \@cluster_for_hist;
}
push @all_minmaxvals, @all_minvals;
push @all_minmaxvals, @all_maxvals;
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
my $delta = ($abs_maxval - $abs_minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
foreach my $cindex (0..@all_clusters_for_hist-1) {
my $filename = basename($master_datafile);
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
close OUTPUT;
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( 'set terminal png color',
'set output "cluster_plot.png"');
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd('set terminal png',
'set output "cluster_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd('set terminal png',
'set output "cluster_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
}
}
# This method is for the visualization of the posterior class distributions. In
# other words, this method allows us to see the soft clustering produced by the EM
# algorithm. While much of the gnuplot logic here is the same as in the
# visualize_clusters() method, there are significant differences in how the data is
# pooled for the purpose of display.
sub visualize_distributions {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || die "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || die "visualization mask missing";
$pause_time = shift;
} else {
die "visualize_distributions() called with wrong args";
}
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
if ($visualization_data_field_width == 2) {
foreach my $cluster_index (0..$self->{_K}-1) {
my $contour_filename = "__contour2_" . $cluster_index . ".dat";
my $mean = $self->{_cluster_means}->[$cluster_index];
my $covariance = $self->{_cluster_covariances}->[$cluster_index];
my ($mux,$muy) = $mean->as_list();
my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
my ($sigmayx,$vary) = $covariance->row(1)->as_list();
die "Your covariance matrix does not look right"
unless $sigmaxy == $sigmayx;
my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
}
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$self->{_K}-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
my $ellipse_filename = "__contour2_" . $i . ".dat";
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
}
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
my $all_joined_data = join ':', @all_data;
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
foreach my $i (0..@separated-1) {
$separated[$i] =~ s/SEPERATOR//g;
my @cluster_for_hist = split /:/, $separated[$i];
@cluster_for_hist = grep $_, @cluster_for_hist;
my ($minval,$maxval) = minmax(\@cluster_for_hist);
push @all_minvals, $minval;
push @all_maxvals, $maxval;
push @all_clusters_for_hist, \@cluster_for_hist;
}
push @all_minmaxvals, @all_minvals;
push @all_minmaxvals, @all_maxvals;
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
my $delta = ($abs_maxval - $abs_minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
foreach my $cindex (0..@all_clusters_for_hist-1) {
my $localfilename = basename($filename);
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
close OUTPUT;
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "posterior_prob_plot.png"');
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "posterior_prob_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "posterior_prob_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
}
}
# The method shown below should be called only AFTER you have called the method
# read_data_from_file(). The visualize_data() is meant for the visualization of the
# original data in its various 2D or 3D subspaces.
sub visualize_data {
my $self = shift;
my $v_mask = shift || die "visualization mask missing";
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
my $data_source = $self->{_data};
while ( my ($record_id, $data) = each %{$data_source} ) {
my @fields = @$data;
die "\nABORTED: Visualization mask size exceeds data record size"
if $#v_mask > $#fields;
my @data_fields;
foreach my $i (0..@fields-1) {
if ($v_mask[$i] eq '0') {
next;
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "Misformed visualization mask. It can only have 1s and 0s";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my $filename = basename($master_datafile);
my $temp_file;
$temp_file = "__temp_data_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!";
foreach my $datapoint (values %visualization_data) {
print OUTPUT "@$datapoint";
print OUTPUT "\n";
}
close OUTPUT;
my $plot = Graphics::GnuplotIF->new( persist => 1 );
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
my $plot_title = '"original data provided for EM"';
my $arg_string ;
if ($visualization_data_field_width > 2) {
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 2) {
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "Misformed visualization mask. It can only have 1s and 0s";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my $filename = basename($master_datafile);
my $temp_file;
$temp_file = "__temp_data_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!";
foreach my $datapoint (values %visualization_data) {
print OUTPUT "@$datapoint";
print OUTPUT "\n";
}
close OUTPUT;
my $plot = Graphics::GnuplotIF->new( persist => 1 );
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
my $plot_title = '"original data provided for EM"';
my $arg_string ;
if ($visualization_data_field_width > 2) {
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 2) {
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_} @all_data;
@all_data = grep $_, @all_data;
my ($minval,$maxval) = minmax(\@all_data);
my $delta = ($maxval - $minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
my $localfilename = basename($filename);
my $temp_file = "__temp1dhist_" . $localfilename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@all_data-1) {
$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
close OUTPUT;
}
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "data_scatter_plot.png"');
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "data_scatter_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "data_scatter_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
}
}
################### Generating Synthetic Data for Clustering ###################
# The data generated corresponds to a multivariate distribution. The mean and the
# covariance of each Gaussian in the distribution are specified individually in a
# parameter file. The parameter file must also state the prior probabilities to be
# associated with each Gaussian. See the example parameter file param1.txt in the
# examples directory. Just edit this file for your own needs.
#
# The multivariate random numbers are generated by calling the Math::Random module.
# As you would expect, that module will insist that the covariance matrix you
# specify be symmetric and positive definite.
sub cluster_data_generator {
my $class = shift;
die "illegal call of a class method"
unless $class eq 'Algorithm::ExpectationMaximization';
my %args = @_;
my $input_parameter_file = $args{input_parameter_file};
my $output_file = $args{output_datafile};
my $N = $args{total_number_of_data_points};
my @all_params;
my $param_string;
if (defined $input_parameter_file) {
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
chomp @all_params;
$param_string = join ' ', @all_params;
} else {
# Just for testing. Used in t/test.t
$param_string = "priors 0.34 0.33 0.33 " .
"cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
"cluster 0 0 5 1 0 0 0 1 0 0 0 1";
}
my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
croak "You did not specify the prior probabilities in the parameter file"
unless $priors_string;
my @priors = split /\s+/, $priors_string;
@priors = grep {/$_num_regex/; $_} @priors;
my $sum = 0;
foreach my $prior (@priors) {
$sum += $prior;
}
croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
@cluster_strings = grep $_, @cluster_strings;
my $K = @cluster_strings;
croak "Too many clusters requested" if $K > 12;
croak "Mismatch between the number of values for priors and the number " .
"of clusters" unless $K == @priors;
my @point_labels = ('a'..'z');
print "Prior probabilities recorded from param file: @priors\n";
print "Number of Gaussians used for the synthetic data: $K\n";
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
# The call `run_bayes_classifier()' shown above carries out a disjoint clustering
# of all the data points using the naive Bayes' classifier. And the call
# `return_disjoint_clusters()' returns the clusters thus formed to you. Once you
# have obtained access to the clusters in this manner, you can display them in
# your terminal window by
foreach my $index (0..@$clusters-1) {
print "Cluster $index (Naive Bayes): @{$clusters->[$index]}\n\n"
}
# If you would like to also see the clusters purely on the basis of the posterior
# class probabilities exceeding a threshold, call
my $theta1 = 0.2;
my $posterior_prob_clusters =
$clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);
# where you can obviously set the threshold $theta1 to any value you wish. Note
# that now you may end up with clusters that overlap. You can display them in
# your terminal window in the same manner as shown above for the naive Bayes'
# clusters.
# You can write the naive Bayes' clusters out to files, one cluster per file, by
# calling
$clusterer->write_naive_bayes_clusters_to_files();
# The clusters are placed in files with names like
naive_bayes_cluster1.txt
naive_bayes_cluster2.txt
...
# In the same manner, you can write out the posterior probability based possibly
# overlapping clusters to files by calling:
$clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);
# where the threshold $theta1 sets the probability threshold for deciding which
# data elements to place in a cluster. These clusters are placed in files with
# names like
posterior_prob_cluster1.txt
posterior_prob_cluster2.txt
...
# CLUSTER VISUALIZATION:
# You must first set the mask for cluster visualization. This mask tells the
# module which 2D or 3D subspace of the original data space you wish to visualize
# the clusters in:
my $visualization_mask = "111";
$clusterer->visualize_clusters($visualization_mask);
$clusterer->visualize_distributions($visualization_mask);
$clusterer->plot_hardcopy_clusters($visualization_mask);
$clusterer->plot_hardcopy_distributions($visualization_mask);
# where the last two invocations are for writing out the PNG plots of the
# visualization displays to disk files. The PNG image of the posterior
# probability distributions is written out to a file named posterior_prob_plot.png
# and the PNG image of the disjoint clusters to a file called cluster_plot.png.
# SYNTHETIC DATA GENERATION:
# The module has been provided with a class method for generating multivariate
# data for experimenting with the EM algorithm. The data generation is controlled
# by the contents of a parameter file that is supplied as an argument to the data
# generator method. The priors, the means, and the covariance matrices in the
# parameter file must be according to the syntax shown in the `param1.txt' file in
# the `examples' directory. It is best to edit a copy of this file for your
# synthetic data generation needs.
my $parameter_file = "param1.txt";
my $out_datafile = "mydatafile1.dat";
Algorithm::ExpectationMaximization->cluster_data_generator(
input_parameter_file => $parameter_file,
output_datafile => $out_datafile,
total_number_of_data_points => $N );
# where the value of $N is the total number of data points you would like to see
# generated for all of the Gaussians. How this total number is divided up amongst
# the Gaussians is decided by the prior probabilities for the Gaussian components
# as declared in input parameter file. The synthetic data may be visualized in a
# terminal window and the visualization written out as a PNG image to a diskfile
# by
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
=head1 CHANGES
Version 1.22 should work with data in CSV files.
Version 1.21 incorporates minor code clean up. Overall, the module implementation
remains unchanged.
Version 1.2 allows the module to also be used for 1-D data. The visualization code
for 1-D shows the clusters through their histograms.
Version 1.1 incorporates much cleanup of the documentation associated with the
module. Both the top-level module documentation, especially the Description part,
and the comments embedded in the code were revised for better utilization of the
module. The basic implementation code remains unchanged.
=head1 DESCRIPTION
B<Algorithm::ExpectationMaximization> is a I<perl5> module for the
Expectation-Maximization (EM) method of clustering numerical data that lends itself
to modeling as a Gaussian mixture. Since the module is entirely in Perl (in the
sense that it is not a Perl wrapper around a C library that actually does the
clustering), the code in the module can easily be modified to experiment with several
aspects of EM.
Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
C<K> Gaussian components, each characterized by its own mean vector and its own
covariance matrix. Obviously, given observed data for clustering, we do not know
which of the C<K> Gaussian components was responsible for any of the data elements.
GMM also associates a prior probability with each Gaussian component. In general,
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
be written out to disk files by invoking C<write_naive_bayes_clusters_to_files()>.
This method writes out the clusters to files, one cluster per file. What is written
out to each file consists of the symbolic names of the data records that belong to
the cluster corresponding to that file. The clusters are placed in files with names
like
naive_bayes_cluster1.txt
naive_bayes_cluster2.txt
...
The soft clusters on the other hand are created by calling
C<return_clusters_with_posterior_probs_above_threshold($theta1)>
on an instance of the module, where the argument C<$theta1>
is the threshold for deciding whether a data element belongs
in a soft cluster. The posterior class probability at a
data element must exceed the threshold for the element to
belong to the corresponding cluster. The soft cluster can
be written out to disk files by calling
C<write_posterior_prob_clusters_above_threshold_to_files($theta1)>.
As with the hard clusters, each cluster is placed in a separate
file. The filenames for such clusters look like:
posterior_prob_cluster1.txt
posterior_prob_cluster2.txt
...
=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?
The module constructor requires that you supply a value for the parameter C<K>, which
is the number of clusters you expect to see in the data. But what if you do not have
a good value for C<K>? Note that it is possible to search for the best C<K> by using
the two clustering quality criteria included in the module. However, I have
intentionally not yet incorporated that feature in the module because it slows down
the execution of the code --- especially when the dimensionality of the data
increases. However, nothing prevents you from writing a script --- along the lines
of the five "canned_example" scripts in the C<examples> directory --- that would use
the two clustering quality metrics for finding the best choice for C<K> for a given
dataset. Obviously, you will now have to incorporate the call to the constructor in
a loop and check the value of the quality measures for each value of C<K>.
=head1 SOME RESULTS OBTAINED WITH THIS MODULE
If you would like to see some results that have been obtained with this module, check
out Section 7 of the report
L<https://engineering.purdue.edu/kak/Tutorials/ExpectationMaximization.pdf>.
=head1 THE C<examples> DIRECTORY
Becoming familiar with this directory should be your best
strategy to become comfortable with this module (and its
future versions). You are urged to start by executing the
following five example scripts:
=over 16
=item I<canned_example1.pl>
This example applies the EM algorithm to the data contained in the datafile
C<mydatafile.dat>. The mixture data in the file corresponds to three overlapping
Gaussian components in a star-shaped pattern. The EM based clustering for this data
is shown in the files C<save_example_1_cluster_plot.png> and
C<save_example_1_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained on the basis of the posterior class probabilities at the data
points.
=item I<canned_example2.pl>
The datafile used in this example is C<mydatafile2.dat>. This mixture data
corresponds to two well-separated relatively isotropic Gaussians. EM based clustering for this
data is shown in the files C<save_example_2_cluster_plot.png> and
C<save_example_2_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained by using the posterior class probabilities at the data points.
=item I<canned_example3.pl>
Like the first example, this example again involves three Gaussians, but now their
means are not co-located. Additionally, we now seed the clusters manually by
specifying three selected data points as the initial guesses for the cluster means.
The datafile used for this example is C<mydatafile3.dat>. The EM based clustering
for this data is shown in the files C<save_example_3_cluster_plot.png> and
C<save_example_3_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained on the basis of the posterior class probabilities at the data
points.
=item I<canned_example4.pl>
Whereas the three previous examples demonstrated EM based clustering of 2D data, we
now present an example of clustering in 3D. The datafile used in this example is
C<mydatafile4.dat>. This mixture data corresponds to three well-separated but highly
anisotropic Gaussians. The EM derived clustering for this data is shown in the files
C<save_example_4_cluster_plot.png> and C<save_example_4_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained on the basis of the posterior class
probabilities at the data points.
You may also wish to run this example on the data in a CSV file in the C<examples>
directory. The name of the file is C<sphericaldata.csv>.
=item I<canned_example5.pl>
We again demonstrate clustering in 3D but now we have one Gaussian cluster that
"cuts" through the other two Gaussian clusters. The datafile used in this example is
C<mydatafile5.dat>. The three Gaussians in this case are highly overlapping and
highly anisotropic. The EM derived clustering for this data is shown in the files
C<save_example_5_cluster_plot.png> and C<save_example_5_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained through the posterior class
probabilities at the data points.
=item I<canned_example6.pl>
This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
In order to visualize the clusters for the 1-D case, we show them through their
respective histograms. The datafile used in this example is C<mydatafile7.dat>. The
data consists of two overlapping Gaussians. The EM derived clustering for this data
is shown in the files C<save_example_6_cluster_plot.png> and
C<save_example_6_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained through the posterior class probabilities at the data points.
=back
Going through the six examples listed above will make you familiar with how to make
the calls to the clustering and the visualization methods. The C<examples> directory
also includes several parameter files with names like
param1.txt
param2.txt
param3.txt
...
These were used to generate the synthetic data for which the results are shown in the
C<examples> directory. Just make a copy of one of these files and edit it if you
would like to generate your own multivariate data for clustering. Note that you can
generate data with any dimensionality through appropriate entries in the parameter
file.
=head1 CAVEATS
When you run the scripts in the C<examples> directory, your results will NOT always
look like what I have shown in the PNG image files in the directory. As mentioned
earlier in Description, the EM algorithm starting from randomly chosen initial
guesses for the cluster means can get stuck in a local maximum.
That raises an interesting question of how one judges the correctness of clustering
results when dealing with real experimental data. For real data, the best approach
is to try the EM algorithm multiple times with all of the seeding options included in
this module. It would be safe to say that, at least in low dimensional spaces and
with sufficient data, a majority of your runs should yield "correct" results.
Also bear in mind that a pure Perl implementation is not meant for the clustering of
very large data files. It is really designed more for researching issues related to
EM based approaches to clustering.
=head1 REQUIRED
This module requires the following three modules:
Math::Random
Graphics::GnuplotIF
Math::GSL::Matrix
the first for generating the multivariate random numbers, the second for the
visualization of the clusters, and the last for access to the Perl wrappers for the
GNU Scientific Library. The C<Matrix> module of this library is used for various
algebraic operations on the covariance matrices of the Gaussians.
=head1 EXPORT
None by design.
=head1 BUGS
Please notify the author if you encounter any bugs. When sending email, please place
the string 'Algorithm EM' in the subject line.
( run in 2.268 seconds using v1.01-cache-2.11-cpan-df04353d9ac )