Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  '"original data provided for EM"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_} @all_data;
        @all_data = grep $_, @all_data;
        my ($minval,$maxval) = minmax(\@all_data);
        my $delta = ($maxval - $minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        my $localfilename = basename($filename);
        my $temp_file = "__temp1dhist_" .  $localfilename;
        unlink $temp_file if -e $temp_file;
        open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
        print OUTPUT "Xstep histval\n";
        my @histogram = (0) x 100;
        foreach my $i (0..@all_data-1) {
            $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
        }
        foreach my $i (0..@histogram-1) {
            print OUTPUT "$i $histogram[$i]\n";        
        }
        $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
        close OUTPUT;
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}


###################  Generating Synthetic Data for Clustering  ###################

#  The data generated corresponds to a multivariate distribution.  The mean and the
#  covariance of each Gaussian in the distribution are specified individually in a
#  parameter file. The parameter file must also state the prior probabilities to be
#  associated with each Gaussian.  See the example parameter file param1.txt in the
#  examples directory.  Just edit this file for your own needs.
#
#  The multivariate random numbers are generated by calling the Math::Random module.
#  As you would expect, that module will insist that the covariance matrix you
#  specify be symmetric and positive definite.
sub cluster_data_generator {
    my $class = shift;
    die "illegal call of a class method" 
        unless $class eq 'Algorithm::ExpectationMaximization';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{total_number_of_data_points};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open INPUT, $input_parameter_file || "unable to open parameter file: $!";
        @all_params = <INPUT>;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "priors 0.34 0.33 0.33 " .
                        "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }
    my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
    croak "You did not specify the prior probabilities in the parameter file"
        unless $priors_string;
    my @priors = split /\s+/, $priors_string;
    @priors = grep {/$_num_regex/; $_} @priors;
    my $sum = 0;
    foreach my $prior (@priors) {
        $sum += $prior;
    }
    croak "Your priors in the parameter file do not add up to 1"  unless $sum == 1;
    my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
    my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
    @cluster_strings = grep  $_, @cluster_strings;

    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    croak "Mismatch between the number of values for priors and the number " . 
        "of clusters" unless $K == @priors;
    my @point_labels = ('a'..'z');
    print "Prior probabilities recorded from param file: @priors\n";
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        my @num_strings = split /  /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums != 
                                      ($data_dimension ** 2);
        my $cluster_covariance;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

  #  If you would like to also see the clusters purely on the basis of the posterior
  #  class probabilities exceeding a threshold, call

  my $theta1 = 0.2;
  my $posterior_prob_clusters =
           $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);

  #  where you can obviously set the threshold $theta1 to any value you wish.  Note
  #  that now you may end up with clusters that overlap.  You can display them in
  #  your terminal window in the same manner as shown above for the naive Bayes'
  #  clusters.

  #  You can write the naive Bayes' clusters out to files, one cluster per file, by
  #  calling

  $clusterer->write_naive_bayes_clusters_to_files();  

  #  The clusters are placed in files with names like

         naive_bayes_cluster1.txt
         naive_bayes_cluster2.txt
         ...

  #  In the same manner, you can write out the posterior probability based possibly
  #  overlapping clusters to files by calling:

  $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);

  #  where the threshold $theta1 sets the probability threshold for deciding which
  #  data elements to place in a cluster.  These clusters are placed in files with
  #  names like

         posterior_prob_cluster1.txt
         posterior_prob_cluster2.txt
         ...

  # CLUSTER VISUALIZATION:

  #  You must first set the mask for cluster visualization. This mask tells the 
  #  module which 2D or 3D subspace of the original data space you wish to visualize 
  #  the clusters in:

  my $visualization_mask = "111";
  $clusterer->visualize_clusters($visualization_mask);
  $clusterer->visualize_distributions($visualization_mask);
  $clusterer->plot_hardcopy_clusters($visualization_mask);
  $clusterer->plot_hardcopy_distributions($visualization_mask);

  #  where the last two invocations are for writing out the PNG plots of the
  #  visualization displays to disk files.  The PNG image of the posterior
  #  probability distributions is written out to a file named posterior_prob_plot.png
  #  and the PNG image of the disjoint clusters to a file called cluster_plot.png.

  # SYNTHETIC DATA GENERATION:

  #  The module has been provided with a class method for generating multivariate
  #  data for experimenting with the EM algorithm.  The data generation is controlled
  #  by the contents of a parameter file that is supplied as an argument to the data
  #  generator method.  The priors, the means, and the covariance matrices in the
  #  parameter file must be according to the syntax shown in the `param1.txt' file in
  #  the `examples' directory. It is best to edit a copy of this file for your
  #  synthetic data generation needs.

  my $parameter_file = "param1.txt";
  my $out_datafile = "mydatafile1.dat";
  Algorithm::ExpectationMaximization->cluster_data_generator(
                          input_parameter_file => $parameter_file,
                          output_datafile => $out_datafile,
                          total_number_of_data_points => $N );

  #  where the value of $N is the total number of data points you would like to see
  #  generated for all of the Gaussians.  How this total number is divided up amongst
  #  the Gaussians is decided by the prior probabilities for the Gaussian components
  #  as declared in input parameter file.  The synthetic data may be visualized in a
  #  terminal window and the visualization written out as a PNG image to a diskfile
  #  by

  my $data_visualization_mask = "11";                                            
  $clusterer->visualize_data($data_visualization_mask);                          
  $clusterer->plot_hardcopy_data($data_visualization_mask);


=head1 CHANGES

Version 1.22 should work with data in CSV files.

Version 1.21 incorporates minor code clean up.  Overall, the module implementation
remains unchanged.

Version 1.2 allows the module to also be used for 1-D data.  The visualization code
for 1-D shows the clusters through their histograms.

Version 1.1 incorporates much cleanup of the documentation associated with the
module.  Both the top-level module documentation, especially the Description part,
and the comments embedded in the code were revised for better utilization of the
module.  The basic implementation code remains unchanged.


=head1 DESCRIPTION

B<Algorithm::ExpectationMaximization> is a I<perl5> module for the
Expectation-Maximization (EM) method of clustering numerical data that lends itself
to modeling as a Gaussian mixture.  Since the module is entirely in Perl (in the
sense that it is not a Perl wrapper around a C library that actually does the
clustering), the code in the module can easily be modified to experiment with several
aspects of EM.

Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
C<K> Gaussian components, each characterized by its own mean vector and its own
covariance matrix.  Obviously, given observed data for clustering, we do not know
which of the C<K> Gaussian components was responsible for any of the data elements.
GMM also associates a prior probability with each Gaussian component.  In general,
these priors will also be unknown.  So the problem of clustering consists of
estimating the posterior class probability at each data element and also estimating
the class priors. Once these posterior class probabilities and the priors are
estimated with EM, we can use the naive Bayes' classifier to partition the data into
disjoint clusters.  Or, for "soft" clustering, we can find all the data elements that
belong to a Gaussian component on the basis of the posterior class probabilities at
the data elements exceeding a prescribed threshold.

If you do not mind the fact that it is possible for the EM algorithm to occasionally

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

This method create a PNG file from the C<gnuplot> created
display of the naive Bayes' clusters obtained from the data.
The plotting functionality of C<gnuplot> is accessed through
the Perl wrappers provided by the C<Graphics::GnuplotIF>
module.

=item B<plot_hardcopy_distributions($visualization_mask):>

    $clusterer->plot_hardcopy_distributions($visualization_mask);

This method create a PNG file from the C<gnuplot> created
display of the clusters that correspond to the posterior
class probabilities exceeding a specified threshold. The
plotting functionality of C<gnuplot> is accessed through the
Perl wrappers provided by the C<Graphics::GnuplotIF> module.

=item B<display_fisher_quality_vs_iterations():>

    $clusterer->display_fisher_quality_vs_iterations();

This method measures the quality of clustering by
calculating the normalized average squared distance between
the cluster centers, the normalization being provided by the
average cluster covariance. See the Description for further
details.  In general, this measure is NOT useful for
overlapping clusters.

=item B<display_mdl_quality_vs_iterations():>

    $clusterer->display_mdl_quality_vs_iterations();

At the end of each iteration, this method measures the
quality of clustering my calculating its MDL (Minimum
Description Length).  As stated earlier in Description, the
MDL measure is a difference of a log-likelihood term for all
of the observed data and a model-complexity penalty term.
The smaller the value returned by this method, the better
the clustering.

=item B<return_estimated_priors():>

    my $estimated_priors = $clusterer->return_estimated_priors();
    print "Estimated class priors: @$estimated_priors\n";

This method can be used to access the final values of the
class priors as estimated by the EM algorithm.

=item  B<cluster_data_generator()>

    Algorithm::ExpectationMaximization->cluster_data_generator(
                            input_parameter_file => $parameter_file,
                            output_datafile => $out_datafile,
                            total_number_of_data_points => 300 
    );

for generating multivariate data for clustering if you wish to play with synthetic
data for experimenting with the EM algorithm.  The input parameter file must specify
the priors to be used for the Gaussians, their means, and their covariance matrices.
The format of the information contained in the parameter file must be as shown in the
file C<param1.txt> provided in the C<examples> directory.  It will be easiest for you
to just edit a copy of this file for your data generation needs.  In addition to the
format of the parameter file, the main constraint you need to observe in specifying
the parameters is that the dimensionality of the covariance matrices must correspond
to the dimensionality of the mean vectors.  The multivariate random numbers are
generated by calling the C<Math::Random> module.  As you would expect, this module
requires that the covariance matrices you specify in your parameter file be symmetric
and positive definite.  Should the covariances in your parameter file not obey this
condition, the C<Math::Random> module will let you know.

=item B<visualize_data($data_visualization_mask):>

    $clusterer->visualize_data($data_visualization_mask);                          

This is the method to call if you want to visualize the data
you plan to cluster with the EM algorithm.  You'd need to
specify argument mask in a manner similar to the
visualization of the clusters, as explained earlier.

=item B<plot_hardcopy_data($data_visualization_mask):>

    $clusterer->plot_hardcopy_data($data_visualization_mask); 

This method creates a PNG file that can be used to print out
a hardcopy of the data in different 2D and 3D subspaces of
the data space. The visualization mask is used to select the
subspace for the PNG image.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

This module produces two different types of clusters: the "hard" clusters and the
"soft" clusters.  The hard clusters correspond to the naive Bayes' classification of
the data points on the basis of the Gaussian distributions and the class priors
estimated by the EM algorithm. Such clusters partition the data into disjoint
subsets.  On the other hand, the soft clusters correspond to the posterior class
probabilities calculated by the EM algorithm.  A data element belongs to a cluster if
its posterior probability for that Gaussian exceeds a threshold.

After the EM algorithm has finished running, the hard clusters are created by
invoking the method C<run_bayes_classifier()> on an instance of the module and then
made user-accessible by calling C<return_disjoint_clusters()>.  These clusters may
then be displayed in a terminal window by dereferencing each element of the array
whose reference is returned b C<return_disjoint_clusters()>.  The hard clusters can
be written out to disk files by invoking C<write_naive_bayes_clusters_to_files()>.
This method writes out the clusters to files, one cluster per file.  What is written
out to each file consists of the symbolic names of the data records that belong to
the cluster corresponding to that file.  The clusters are placed in files with names
like

    naive_bayes_cluster1.txt
    naive_bayes_cluster2.txt
    ...

The soft clusters on the other hand are created by calling
C<return_clusters_with_posterior_probs_above_threshold($theta1)>
on an instance of the module, where the argument C<$theta1>
is the threshold for deciding whether a data element belongs
in a soft cluster.  The posterior class probability at a
data element must exceed the threshold for the element to
belong to the corresponding cluster.  The soft cluster can

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

=item I<canned_example3.pl>

Like the first example, this example again involves three Gaussians, but now their
means are not co-located.  Additionally, we now seed the clusters manually by
specifying three selected data points as the initial guesses for the cluster means.
The datafile used for this example is C<mydatafile3.dat>.  The EM based clustering
for this data is shown in the files C<save_example_3_cluster_plot.png> and
C<save_example_3_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained on the basis of the posterior class probabilities at the data
points.

=item I<canned_example4.pl>

Whereas the three previous examples demonstrated EM based clustering of 2D data, we
now present an example of clustering in 3D.  The datafile used in this example is
C<mydatafile4.dat>.  This mixture data corresponds to three well-separated but highly
anisotropic Gaussians. The EM derived clustering for this data is shown in the files
C<save_example_4_cluster_plot.png> and C<save_example_4_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained on the basis of the posterior class
probabilities at the data points.

You may also wish to run this example on the data in a CSV file in the C<examples>
directory. The name of the file is C<sphericaldata.csv>.  

=item I<canned_example5.pl>

We again demonstrate clustering in 3D but now we have one Gaussian cluster that
"cuts" through the other two Gaussian clusters.  The datafile used in this example is
C<mydatafile5.dat>.  The three Gaussians in this case are highly overlapping and
highly anisotropic.  The EM derived clustering for this data is shown in the files
C<save_example_5_cluster_plot.png> and C<save_example_5_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained through the posterior class
probabilities at the data points.

=item I<canned_example6.pl>

This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
In order to visualize the clusters for the 1-D case, we show them through their
respective histograms.  The datafile used in this example is C<mydatafile7.dat>.  The
data consists of two overlapping Gaussians.  The EM derived clustering for this data
is shown in the files C<save_example_6_cluster_plot.png> and
C<save_example_6_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained through the posterior class probabilities at the data points.

=back

Going through the six examples listed above will make you familiar with how to make
the calls to the clustering and the visualization methods.  The C<examples> directory
also includes several parameter files with names like

    param1.txt
    param2.txt
    param3.txt 
    ...

These were used to generate the synthetic data for which the results are shown in the
C<examples> directory.  Just make a copy of one of these files and edit it if you
would like to generate your own multivariate data for clustering.  Note that you can
generate data with any dimensionality through appropriate entries in the parameter
file.

=head1 CAVEATS

When you run the scripts in the C<examples> directory, your results will NOT always
look like what I have shown in the PNG image files in the directory.  As mentioned
earlier in Description, the EM algorithm starting from randomly chosen initial
guesses for the cluster means can get stuck in a local maximum.

That raises an interesting question of how one judges the correctness of clustering
results when dealing with real experimental data.  For real data, the best approach
is to try the EM algorithm multiple times with all of the seeding options included in
this module.  It would be safe to say that, at least in low dimensional spaces and
with sufficient data, a majority of your runs should yield "correct" results.

Also bear in mind that a pure Perl implementation is not meant for the clustering of
very large data files.  It is really designed more for researching issues related to
EM based approaches to clustering.


=head1 REQUIRED

This module requires the following three modules:

   Math::Random
   Graphics::GnuplotIF
   Math::GSL::Matrix

the first for generating the multivariate random numbers, the second for the
visualization of the clusters, and the last for access to the Perl wrappers for the
GNU Scientific Library.  The C<Matrix> module of this library is used for various
algebraic operations on the covariance matrices of the Gaussians.

=head1 EXPORT

None by design.


=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'Algorithm EM' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-ExpectationMaximization-1.22.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-ExpectationMaximization-1.22>.  Enter this directory and execute the
following commands for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install



( run in 0.852 second using v1.01-cache-2.11-cpan-385001e3568 )