Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        $covariance *= 1.0 / $num_cols;
        push @cluster_covariances, $covariance;
        display_matrix("Displaying the cluster covariance",  $covariance );
    }
    return (\@cluster_mean_vecs, \@cluster_covariances);
}

# Returns the dimensionality of the data, i.e. the number of coordinates in
# each data vector.  Takes a ref to the array of clusters and inspects the
# very first data element of the very first cluster; assumes all data
# elements share that dimensionality.
sub find_data_dimensionality {
    my ($clusters) = @_;
    # clusters => [ cluster => [ data vector => [coords...] ] ]
    return scalar @{ $clusters->[0][0] };
}

# For each seed tag, treats the corresponding data vector as a cluster mean
# and computes the covariance of the ENTIRE dataset centered on that seed.
# Used by both the 'random' and 'manual' seeding modes of EM initialization.
#
# Arguments: a ref to a list of seed tags (keys into $self->{_data}).
# Returns:   (\@seed_mean_vecs, \@seed_based_covariances), one entry per seed,
#            each a Math::GSL::Matrix (mean: d x 1, covariance: d x d).
sub find_seed_centered_covariances {
    my $self = shift;
    my $seed_tags = shift;
    my (@seed_mean_vecs, @seed_based_covariances);
    foreach my $seed_tag (@$seed_tags) {
        # The data matrix is laid out with one data vector per COLUMN:
        # rows = data dimensionality, cols = number of data points N.
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);  
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        # The seed's own data vector serves as the "mean" for centering.
        $mean_vec->set_col(0, $self->{_data}->{$seed_tag});
        push @seed_mean_vecs, $mean_vec;
        # BUGFIX: this diagnostic display was unconditional while the
        # covariance display below was gated on _debug; gate both the same way.
        display_matrix( "Displaying the seed mean vector",  $mean_vec )
            if $self->{_debug};
        my $col_index = 0;
        foreach my $tag (@{$self->{_data_id_tags}}) {
            $matrix->set_col($col_index++, $self->{_data}->{$tag});
        }
        # Center every data column on the seed vector.
        foreach my $j (0..$num_cols-1) {
            my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
            $matrix->set_col($j, \@new_col);
        }
        # Covariance = (1/N) * M * M'  (biased estimator, divides by N not N-1).
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        push @seed_based_covariances, $covariance;
        display_matrix("Displaying the seed covariance",  $covariance )
            if $self->{_debug};
    }
    return (\@seed_mean_vecs, \@seed_based_covariances);
}

# The most popular seeding mode for EM is random. We include two other seeding modes
# --- kmeans and manual --- since they do produce good results for specialized cases.
# For example, when the clusters in your data are non-overlapping and not too
# anisotropic, the kmeans based seeding should work at least as well as the random
# seeding.  In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
# the advantage of avoiding the EM algorithm's well-known problem of getting
# stuck in a local maximum.
# Initializes the cluster means and covariances according to the seeding mode
# chosen at construction time ($self->{_seeding}):
#
#   'random' - pick K DISTINCT data elements at random; their vectors become
#              the initial means, with covariances computed over the whole
#              dataset centered on each seed.
#   'kmeans' - run the KMeans clusterer and derive means/covariances from the
#              resulting clusters.
#   'manual' - use the user-supplied $self->{_seed_tags} as the seeds.
#
# Dies on an unrecognized seeding mode, on missing/invalid manual seed tags,
# or if K distinct random seeds cannot be drawn.
sub seed_the_clusters {
    my $self = shift;
    if ($self->{_seeding} eq 'random') {
        my @all_tags = @{$self->{_data_id_tags}}; 
        # BUGFIX: the original drew seeds WITH replacement, so the same data
        # element could seed two clusters, yielding degenerate identical
        # initializations.  Draw K distinct tags instead.
        die "cannot select $self->{_K} distinct random seeds from $self->{_N} data elements"
            if $self->{_K} > $self->{_N};
        my (%already_chosen, @seed_tags);
        while (@seed_tags < $self->{_K}) {
            my $candidate = $all_tags[int rand( $self->{_N} )];
            next if $already_chosen{$candidate}++;
            push @seed_tags, $candidate;
        }
        print "Random Seeding: Randomly selected seeding tags are  @seed_tags\n\n";
        my ($seed_means, $seed_covars) = 
                        $self->find_seed_centered_covariances(\@seed_tags);
        $self->{_cluster_means} = $seed_means;
        $self->{_cluster_covariances} = $seed_covars;
    } elsif ($self->{_seeding} eq 'kmeans') {
        $self->kmeans();
        my $clusters = $self->{_clusters};
        my @dataclusters;
        foreach my $index (0..@$clusters-1) {
            push @dataclusters, [];
        }
        # Re-express each KMeans cluster as a list of (deep-copied) data
        # vectors so the mean/covariance routine can work on raw numbers.
        foreach my $cluster_index (0..$self->{_K}-1) {
            foreach my $tag (@{$clusters->[$cluster_index]}) {
                my $data = $self->{_data}->{$tag};
                push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
            }
        }
        ($self->{_cluster_means}, $self->{_cluster_covariances}) =
                           find_cluster_means_and_covariances(\@dataclusters);
    } elsif ($self->{_seeding} eq 'manual') {
        die "You have not supplied the seeding tags for the option \"manual\""
            unless @{$self->{_seed_tags}} > 0;
        print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
        # Validate every user-supplied tag before doing any work.
        foreach my $tag (@{$self->{_seed_tags}}) {
            die "invalid tag used for manual seeding" 
                unless exists $self->{_data}->{$tag};
        }
        my ($seed_means, $seed_covars) = 
            $self->find_seed_centered_covariances($self->{_seed_tags});
        $self->{_cluster_means} = $seed_means;
        $self->{_cluster_covariances} = $seed_covars;
    } else {
        die "Incorrect call syntax used.  See documentation.";
    }
}

# Top-level method for the kmeans-based initialization of the EM algorithm.
# Runs a single "smart" (PCA-seeded) KMeans attempt for K clusters; the
# resulting means and covariances seed the EM iterations.
#
# Returns ($self->{_clusters}, $self->{_cluster_centers}); dies if the
# KMeans run failed to populate either.
sub kmeans {
    my $self = shift;
    $self->cluster_for_fixed_K_single_smart_try($self->{_K});
    die "kmeans clustering failed."
        unless defined $self->{_clusters} and defined $self->{_cluster_centers};
    return ($self->{_clusters}, $self->{_cluster_centers});
}

# Used by the kmeans algorithm for the initialization of the EM iterations.  We do
# initial kmeans cluster seeding by subjecting the data to principal components
# analysis in order to discover the direction of maximum variance in the data space.
# Subsequently, we try to find the K largest peaks along this direction.  The
# coordinates of these peaks serve as the seeds for the K clusters.
sub cluster_for_fixed_K_single_smart_try {
    my $self = shift;
    my $K = shift;
    print "Clustering for K = $K\n" if $self->{_terminal_output};
    my ($clusters, $cluster_centers) =

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    foreach my $i (0..$nrows1-1) {
        my $row = $matrix1->row($i);
        my $row_times_col = matrix_vector_multiply($row, $col);
            push @product_col, $row_times_col;
    }
    $product->set_col(0, \@product_col);
    return $product;
}

# Computes the trace (sum of diagonal elements) of a square matrix.  The
# matrix object must support rows(), cols(), and as_list() (row-major flat
# list), as Math::GSL::Matrix does.  Dies if the matrix is not square.
sub matrix_trace {
    my ($matrix) = @_;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    die "trace can only be calculated for a square matrix"
                                      unless $ncols == $nrows;
    my @flat = $matrix->as_list;
    my $sum = 0;
    # In row-major order, diagonal element i sits at offset i * (ncols + 1).
    $sum += $flat[$_ * ($ncols + 1)] for 0 .. $nrows - 1;
    return $sum;
}

1;

=pod

=head1 NAME

Algorithm::ExpectationMaximization -- A Perl module for clustering numerical
multi-dimensional data with the Expectation-Maximization algorithm.

=head1 SYNOPSIS

  use Algorithm::ExpectationMaximization;

  #  First name the data file:

  my $datafile = "mydatafile.csv";

  #  Next, set the mask to indicate which columns of the datafile to use for
  #  clustering and which column contains a symbolic ID for each data record. For
  #  example, if the symbolic name is in the first column, you want the second column
  #  to be ignored, and you want the next three columns to be used for 3D clustering:

  my $mask = "N0111";

  #  Now construct an instance of the clusterer.  The parameter `K' controls the
  #  number of clusters.  Here is an example call to the constructor for instance
  #  creation:

  my $clusterer = Algorithm::ExpectationMaximization->new(
                                      datafile            => $datafile,
                                      mask                => $mask,
                                      K                   => 3,
                                      max_em_iterations   => 300,
                                      seeding             => 'random',
                                      terminal_output     => 1,
                  );
 
  #  Note the choice for `seeding'. The choice `random' means that the clusterer will
  #  randomly select `K' data points to serve as initial cluster centers.  Other
  #  possible choices for the constructor parameter `seeding' are `kmeans' and
  #  `manual'.  With the `kmeans' option for `seeding', the output of a K-means
  #  clusterer is used for the cluster seeds and the initial cluster covariances.  If
  #  you use the `manual' option for seeding, you must also specify the data elements
  #  to use for seeding the clusters.

  #  Here is an example of a call to the constructor when we choose the `manual'
  #  option for seeding the clusters and for specifying the data elements for
  #  seeding.  The data elements are specified by their tag names.  In this case,
  #  these names are `a26', `b53', and `c49':

  my $clusterer = Algorithm::ExpectationMaximization->new(
                                      datafile            => $datafile,
                                      mask                => $mask,
                                      class_priors        => [0.6, 0.2, 0.2],
                                      K                   => 3,
                                      max_em_iterations   => 300,
                                      seeding             => 'manual',
                                      seed_tags           => ['a26', 'b53', 'c49'],
                                      terminal_output     => 1,
                                    );

  #  This example call to the constructor also illustrates how you can inject class
  #  priors into the clustering process. The class priors are the prior probabilities
  #  of the class distributions in your dataset.  As explained later, injecting class
  #  priors in the manner shown above makes statistical sense only for the case of
  #  manual seeding.  When you do inject class priors, the order in which the priors
  #  are expressed must correspond to the manually specified seeds for the clusters.

  #  After the invocation of the constructor, the following calls are mandatory
  #  for reasons that should be obvious from the names of the methods:

  $clusterer->read_data_from_file();
  srand(time);
  $clusterer->seed_the_clusters();
  $clusterer->EM();
  $clusterer->run_bayes_classifier();
  my $clusters = $clusterer->return_disjoint_clusters();

  #  where the call to `EM()' is the invocation of the expectation-maximization
  #  algorithm.  The call to `srand(time)' is to seed the pseudo random number
  #  generator afresh for each run of the cluster seeding procedure.  If you want to
  #  see repeatable results from one run to another of the algorithm with random
  #  seeding, you would obviously not invoke `srand(time)'.

  #  The call `run_bayes_classifier()' shown above carries out a disjoint clustering
  #  of all the data points using the naive Bayes' classifier. And the call
  #  `return_disjoint_clusters()' returns the clusters thus formed to you.  Once you
  #  have obtained access to the clusters in this manner, you can display them in
  #  your terminal window by

  foreach my $index (0..@$clusters-1) {
      print "Cluster $index (Naive Bayes):   @{$clusters->[$index]}\n\n"
  }

  #  If you would like to also see the clusters purely on the basis of the posterior
  #  class probabilities exceeding a threshold, call

  my $theta1 = 0.2;
  my $posterior_prob_clusters =

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN


Version 1.22 should work with data in CSV files.

Version 1.21 incorporates minor code clean up.  Overall, the module implementation
remains unchanged.

Version 1.2 allows the module to also be used for 1-D data.  The visualization code
for 1-D shows the clusters through their histograms.

Version 1.1 incorporates much cleanup of the documentation associated with the
module.  Both the top-level module documentation, especially the Description part,
and the comments embedded in the code were revised for better utilization of the
module.  The basic implementation code remains unchanged.


=head1 DESCRIPTION

B<Algorithm::ExpectationMaximization> is a I<perl5> module for the
Expectation-Maximization (EM) method of clustering numerical data that lends itself
to modeling as a Gaussian mixture.  Since the module is entirely in Perl (in the
sense that it is not a Perl wrapper around a C library that actually does the
clustering), the code in the module can easily be modified to experiment with several
aspects of EM.

Gaussian Mixture Modeling (GMM) is based on the assumption that the data consists of
C<K> Gaussian components, each characterized by its own mean vector and its own
covariance matrix.  Obviously, given observed data for clustering, we do not know
which of the C<K> Gaussian components was responsible for any of the data elements.
GMM also associates a prior probability with each Gaussian component.  In general,
these priors will also be unknown.  So the problem of clustering consists of
estimating the posterior class probability at each data element and also estimating
the class priors. Once these posterior class probabilities and the priors are
estimated with EM, we can use the naive Bayes' classifier to partition the data into
disjoint clusters.  Or, for "soft" clustering, we can find all the data elements that
belong to a Gaussian component on the basis of the posterior class probabilities at
the data elements exceeding a prescribed threshold.

If you do not mind the fact that it is possible for the EM algorithm to occasionally
get stuck in a local maximum and to, therefore, produce a wrong answer even when you
know the data to be perfectly multimodal Gaussian, EM is probably the most magical
approach to clustering multidimensional data.  Consider the case of clustering
three-dimensional data.  Each Gaussian cluster in 3D space is characterized by the
following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
must be symmetric positive-definite), the 3 unique elements of the mean, and the
prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
your data.  What that means is that you would want the values for 59 variables
(remember the unit-summation constraint on the class priors which reduces the overall
number of variables by one) to be estimated by the algorithm that seeks to discover
the clusters in your data.  What's amazing is that, despite the large number of
variables that must be optimized simultaneously, the EM algorithm will very likely
give you a good approximation to the right answer.

At its core, EM depends on the notion of unobserved data and the averaging of the
log-likelihood of the data actually observed over all admissible probabilities for
the unobserved data.  But what is unobserved data?  While in some cases where EM is
used, the unobserved data is literally the missing data, in others, it is something
that cannot be seen directly but that nonetheless is relevant to the data actually
observed. For the case of clustering multidimensional numerical data that can be
modeled as a Gaussian mixture, it turns out that the best way to think of the
unobserved data is in terms of a sequence of random variables, one for each observed
data point, whose values dictate the selection of the Gaussian for that data point.
This point is explained in great detail in my on-line tutorial at
L<https://engineering.purdue.edu/kak/Tutorials/ExpectationMaximization.pdf>.

The EM algorithm in our context reduces to an iterative invocation of the following
steps: (1) Given the current guess for the means and the covariances of the different
Gaussians in our mixture model, use Bayes' Rule to update the posterior class
probabilities at each of the data points; (2) Using the updated posterior class
probabilities, first update the class priors; (3) Using the updated class priors,
update the class means and the class covariances; and go back to Step (1).  Ideally,
the iterations should terminate when the expected log-likelihood of the observed data
has reached a maximum and does not change with any further iterations.  The stopping
rule used in this module is the detection of no change over three consecutive
iterations in the values calculated for the priors.

This module provides three different choices for seeding the clusters: (1) random,
(2) kmeans, and (3) manual.  When random seeding is chosen, the algorithm randomly
selects C<K> data elements as cluster seeds.  That is, the data vectors associated
with these seeds are treated as initial guesses for the means of the Gaussian
distributions.  The covariances are then set to the values calculated from the entire
dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
the other hand, the means and the covariances are set to whatever values are returned
by the kmeans algorithm.  And, when seeding is set to manual, you are allowed to
choose C<K> data elements --- by specifying their tag names --- for the seeds.  The
rest of the EM initialization for the manual mode is the same as for the random mode.
The algorithm allows for the initial priors to be specified for the manual mode of
seeding.

Much of the code for the kmeans-based seeding of EM was drawn from my
C<Algorithm::KMeans> module. The code from that module used here corresponds to
the case when the C<cluster_seeding> option in the C<Algorithm::KMeans> module is set
to C<smart>.  The C<smart> option for KMeans consists of subjecting the data to a
principal components analysis (PCA) to discover the direction of maximum variance in
the data space.  The data points are then projected on to this direction and a
histogram constructed from the projections.  Centers of the C<K> largest peaks in
this smoothed histogram are used to seed the KMeans based clusterer.  As you'd
expect, the output of the KMeans clusterer is then used to seed the EM algorithm.

This module uses two different criteria to measure the quality of the clustering
achieved. The first is the Minimum Description Length (MDL) proposed originally by
Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
of Statistics, 1983.)  The MDL criterion is a difference of a log-likelihood term for
all of the observed data and a model-complexity penalty term. In general, both the
log-likelihood and the model-complexity terms increase as the number of clusters
increases.  The form of the MDL criterion in this module uses for the penalty term
the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
of a Model," The Annals of Statistics, 1978.  In general, the smaller the value of
MDL quality measure, the better the clustering of the data.

For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
scatter matrix, more commonly denoted S_b (the underscore means subscript).  This
measure can be thought of as the normalized average distance between the clusters,
the normalization being provided by average cluster covariance SW^-1. Therefore, the
larger the value of this quality measure, the better the separation between the
clusters.  Since this measure has its roots in the Fisher linear discriminant
function, we incorporate the word C<fisher> in the name of the quality measure.
I<Note that this measure is good only when the clusters are disjoint.> When the
clusters exhibit significant overlap, the numbers produced by this quality measure
tend to be generally meaningless.

=head1 METHODS

The module provides the following methods for EM based
clustering, for cluster visualization, for data
visualization, and for the generation of data for testing a
clustering algorithm:

=over

=item B<new():>

A call to C<new()> constructs a new instance of the
C<Algorithm::ExpectationMaximization> class.  A typical form
of this call when you want to use random option for seeding
the algorithm is:

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

details.  In general, this measure is NOT useful for
overlapping clusters.

=item B<display_mdl_quality_vs_iterations():>

    $clusterer->display_mdl_quality_vs_iterations();

At the end of each iteration, this method measures the
quality of clustering by calculating its MDL (Minimum
Description Length).  As stated earlier in Description, the
MDL measure is a difference of a log-likelihood term for all
of the observed data and a model-complexity penalty term.
The smaller the value returned by this method, the better
the clustering.

=item B<return_estimated_priors():>

    my $estimated_priors = $clusterer->return_estimated_priors();
    print "Estimated class priors: @$estimated_priors\n";

This method can be used to access the final values of the
class priors as estimated by the EM algorithm.

=item  B<cluster_data_generator()>

    Algorithm::ExpectationMaximization->cluster_data_generator(
                            input_parameter_file => $parameter_file,
                            output_datafile => $out_datafile,
                            total_number_of_data_points => 300 
    );

for generating multivariate data for clustering if you wish to play with synthetic
data for experimenting with the EM algorithm.  The input parameter file must specify
the priors to be used for the Gaussians, their means, and their covariance matrices.
The format of the information contained in the parameter file must be as shown in the
file C<param1.txt> provided in the C<examples> directory.  It will be easiest for you
to just edit a copy of this file for your data generation needs.  In addition to the
format of the parameter file, the main constraint you need to observe in specifying
the parameters is that the dimensionality of the covariance matrices must correspond
to the dimensionality of the mean vectors.  The multivariate random numbers are
generated by calling the C<Math::Random> module.  As you would expect, this module
requires that the covariance matrices you specify in your parameter file be symmetric
and positive definite.  Should the covariances in your parameter file not obey this
condition, the C<Math::Random> module will let you know.

=item B<visualize_data($data_visualization_mask):>

    $clusterer->visualize_data($data_visualization_mask);                          

This is the method to call if you want to visualize the data
you plan to cluster with the EM algorithm.  You'd need to
specify argument mask in a manner similar to the
visualization of the clusters, as explained earlier.

=item B<plot_hardcopy_data($data_visualization_mask):>

    $clusterer->plot_hardcopy_data($data_visualization_mask); 

This method creates a PNG file that can be used to print out
a hardcopy of the data in different 2D and 3D subspaces of
the data space. The visualization mask is used to select the
subspace for the PNG image.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

This module produces two different types of clusters: the "hard" clusters and the
"soft" clusters.  The hard clusters correspond to the naive Bayes' classification of
the data points on the basis of the Gaussian distributions and the class priors
estimated by the EM algorithm. Such clusters partition the data into disjoint
subsets.  On the other hand, the soft clusters correspond to the posterior class
probabilities calculated by the EM algorithm.  A data element belongs to a cluster if
its posterior probability for that Gaussian exceeds a threshold.

After the EM algorithm has finished running, the hard clusters are created by
invoking the method C<run_bayes_classifier()> on an instance of the module and then
made user-accessible by calling C<return_disjoint_clusters()>.  These clusters may
then be displayed in a terminal window by dereferencing each element of the array
whose reference is returned by C<return_disjoint_clusters()>.  The hard clusters can
be written out to disk files by invoking C<write_naive_bayes_clusters_to_files()>.
This method writes out the clusters to files, one cluster per file.  What is written
out to each file consists of the symbolic names of the data records that belong to
the cluster corresponding to that file.  The clusters are placed in files with names
like

    naive_bayes_cluster1.txt
    naive_bayes_cluster2.txt
    ...

The soft clusters on the other hand are created by calling
C<return_clusters_with_posterior_probs_above_threshold($theta1)>
on an instance of the module, where the argument C<$theta1>
is the threshold for deciding whether a data element belongs
in a soft cluster.  The posterior class probability at a
data element must exceed the threshold for the element to
belong to the corresponding cluster.  The soft cluster can
be written out to disk files by calling
C<write_posterior_prob_clusters_above_threshold_to_files($theta1)>.
As with the hard clusters, each cluster is placed in a separate
file. The filenames for such clusters look like:

    posterior_prob_cluster1.txt
    posterior_prob_cluster2.txt
    ...

=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?

The module constructor requires that you supply a value for the parameter C<K>, which
is the number of clusters you expect to see in the data.  But what if you do not have
a good value for C<K>?  Note that it is possible to search for the best C<K> by using
the two clustering quality criteria included in the module.  However, I have
intentionally not yet incorporated that feature in the module because it slows down
the execution of the code --- especially when the dimensionality of the data
increases.  However, nothing prevents you from writing a script --- along the lines
of the five "canned_example" scripts in the C<examples> directory --- that would use
the two clustering quality metrics for finding the best choice for C<K> for a given
dataset.  Obviously, you will now have to incorporate the call to the constructor in
a loop and check the value of the quality measures for each value of C<K>.

=head1 SOME RESULTS OBTAINED WITH THIS MODULE

If you would like to see some results that have been obtained with this module, check
out Section 7 of the report
L<https://engineering.purdue.edu/kak/Tutorials/ExpectationMaximization.pdf>.

=head1 THE C<examples> DIRECTORY

Becoming familiar with this directory should be your best
strategy to become comfortable with this module (and its
future versions).  You are urged to start by executing the
following five example scripts:

=over 16

=item I<canned_example1.pl>

This example applies the EM algorithm to the data contained in the datafile
C<mydatafile.dat>.  The mixture data in the file corresponds to three overlapping
Gaussian components in a star-shaped pattern.  The EM based clustering for this data
is shown in the files C<save_example_1_cluster_plot.png> and
C<save_example_1_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained on the basis of the posterior class probabilities at the data
points.  

=item I<canned_example2.pl>

The datafile used in this example is C<mydatafile2.dat>.  This mixture data
corresponds to two well-separated relatively isotropic Gaussians.  EM based clustering for this
data is shown in the files C<save_example_2_cluster_plot.png> and
C<save_example_2_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained by using the posterior class probabilities at the data points.

=item I<canned_example3.pl>

Like the first example, this example again involves three Gaussians, but now their
means are not co-located.  Additionally, we now seed the clusters manually by
specifying three selected data points as the initial guesses for the cluster means.
The datafile used for this example is C<mydatafile3.dat>.  The EM based clustering
for this data is shown in the files C<save_example_3_cluster_plot.png> and
C<save_example_3_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained on the basis of the posterior class probabilities at the data
points.

=item I<canned_example4.pl>

Whereas the three previous examples demonstrated EM based clustering of 2D data, we
now present an example of clustering in 3D.  The datafile used in this example is
C<mydatafile4.dat>.  This mixture data corresponds to three well-separated but highly
anisotropic Gaussians. The EM derived clustering for this data is shown in the files
C<save_example_4_cluster_plot.png> and C<save_example_4_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained on the basis of the posterior class
probabilities at the data points.

You may also wish to run this example on the data in a CSV file in the C<examples>
directory. The name of the file is C<sphericaldata.csv>.  

=item I<canned_example5.pl>

We again demonstrate clustering in 3D but now we have one Gaussian cluster that
"cuts" through the other two Gaussian clusters.  The datafile used in this example is
C<mydatafile5.dat>.  The three Gaussians in this case are highly overlapping and
highly anisotropic.  The EM derived clustering for this data is shown in the files
C<save_example_5_cluster_plot.png> and C<save_example_5_posterior_prob_plot.png>, the
former displaying the hard clusters obtained by using the naive Bayes' classifier and
the latter showing the soft clusters obtained through the posterior class
probabilities at the data points.

=item I<canned_example6.pl>

This example, added in Version 1.2, demonstrates the use of this module for 1-D data.
In order to visualize the clusters for the 1-D case, we show them through their
respective histograms.  The datafile used in this example is C<mydatafile7.dat>.  The
data consists of two overlapping Gaussians.  The EM derived clustering for this data
is shown in the files C<save_example_6_cluster_plot.png> and
C<save_example_6_posterior_prob_plot.png>, the former displaying the hard clusters
obtained by using the naive Bayes' classifier and the latter showing the soft
clusters obtained through the posterior class probabilities at the data points.

=back

Going through the six examples listed above will make you familiar with how to make
the calls to the clustering and the visualization methods.  The C<examples> directory
also includes several parameter files with names like

    param1.txt
    param2.txt
    param3.txt 
    ...

These were used to generate the synthetic data for which the results are shown in the
C<examples> directory.  Just make a copy of one of these files and edit it if you
would like to generate your own multivariate data for clustering.  Note that you can
generate data with any dimensionality through appropriate entries in the parameter
file.



( run in 0.415 second using v1.01-cache-2.11-cpan-4991d5b9bd9 )