Algorithm-KMeans

 view release on metacpan or  search on metacpan

lib/Algorithm/KMeans.pm  view on Meta::CPAN

    }
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    if ($datatype eq 'original') {
        $temp_file = "__temp_data_" . $filename;
    } elsif ($datatype eq 'normed') {
        $temp_file = "__temp_normed_data_" . $filename;
    } else {
        croak "ABORTED: Improper call to visualize_data()";
    }
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  $datatype eq 'original' ? '"data"' : '"normed data"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}

###########################  Generating Synthetic Data for Clustering  ##############################

#  The data generated corresponds to a multivariate distribution.  The mean and the
#  covariance of each Gaussian in the distribution are specified individually in a
#  parameter file.  See the example parameter file param.txt in the examples
#  directory.  Just edit this file for your own needs.
#
#  The multivariate random numbers are generated by calling the Math::Random module.
#  As you would expect, that module will insist that the covariance matrix you
#  specify be symmetric and positive definite.
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open INPUT, $input_parameter_file || "unable to open parameter file: $!";
        @all_params = <INPUT>;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep  $_, @cluster_strings;
    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        my @num_strings = split /  /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums != 
                                      ($data_dimension ** 2);
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {        
                $cluster_covariance->[$j]->[$k] = 
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellojello' );
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;

lib/Algorithm/KMeans.pm  view on Meta::CPAN

                                          cluster_seeding => 'random',    # or 'smart'
                                          terminal_output => 1,
                                          write_clusters_to_files => 1,
                                        );

  # Although not shown above, you can obviously set the 'do_variance_normalization'
  # flag here also if you wish.

  # For very large data files, setting K to 0 will result in searching through too
  # many values for K.  For such cases, you can range limit the values of K to search
  # through by

  my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
                                          mask     => "N111",
                                          Kmin     => 3,
                                          Kmax     => 10,
                                          cluster_seeding => 'random',    # or 'smart'
                                          terminal_output => 1,
                                          write_clusters_to_files => 1,
                                        );

  # FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
  # INSTANCE TO ACTUALLY CLUSTER THE DATA:

  $clusterer->read_data_from_file();
  $clusterer->kmeans();

  # If you want to directly access the clusters and the cluster centers in your own
  # top-level script, replace the above two statements with:

  $clusterer->read_data_from_file();
  my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();

  # You can subsequently access the clusters directly in your own code, as in:

  foreach my $cluster_id (sort keys %{$clusters_hash}) {
      print "\n$cluster_id   =>   @{$clusters_hash->{$cluster_id}}\n";
  }
  foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
      print "\n$cluster_id   =>   @{$cluster_centers_hash->{$cluster_id}}\n";
  }


  # CLUSTER VISUALIZATION:

  # You must first set the mask for cluster visualization. This mask tells the module
  # which 2D or 3D subspace of the original data space you wish to visualize the
  # clusters in:

  my $visualization_mask = "111";
  $clusterer->visualize_clusters($visualization_mask);


  # SYNTHETIC DATA GENERATION:

  # The module has been provided with a class method for generating multivariate data
  # for experimenting with clustering.  The data generation is controlled by the
  # contents of the parameter file that is supplied as an argument to the data
  # generator method.  The mean and covariance matrix entries in the parameter file
  # must be according to the syntax shown in the param.txt file in the examples
  # directory. It is best to edit this file as needed:

  my $parameter_file = "param.txt";
  my $out_datafile = "mydatafile.dat";
  Algorithm::KMeans->cluster_data_generator(
                          input_parameter_file => $parameter_file,
                          output_datafile => $out_datafile,
                          number_data_points_per_cluster => $N );

=head1 CHANGES

Version 2.05 removes the restriction on the version of Perl that is required.  This
is based on Srezic's recommendation.  He had no problem building and testing the
previous version with Perl 5.8.9.  Version 2.05 also includes a small augmentation of
the code in the method C<read_data_from_file_csv()> for guarding against user errors
in the specification of the mask that tells the module which columns of the data file
are to be used for clustering.

Version 2.04 allows you to use CSV data files for clustering.

Version 2.03 incorporates minor code cleanup.  The main implementation of the module
remains unchanged.

Version 2.02 downshifts the version of Perl that is required for this module.  The
module should work with versions 5.10 and higher of Perl.  The implementation code
for the module remains unchanged.

Version 2.01 removes many errors in the documentation. The changes made to the module
in Version 2.0 were not reflected properly in the documentation page for that
version.  The implementation code remains unchanged.

Version 2.0 includes significant additional functionality: (1) You now have the
option to cluster using the Mahalanobis distance metric (the default is the Euclidean
metric); and (2) With the two C<which_cluster> methods that have been added to the
module, you can now determine the best cluster for a new data sample after you have
created the clusters with the previously available data.  Finding the best cluster
for a new data sample can be done using either the Euclidean metric or the
Mahalanobis metric.

Version 1.40 includes a C<smart> option for seeding the clusters.  This option,
supplied through the constructor parameter C<cluster_seeding>, means that the
clusterer will (1) Subject the data to principal components analysis in order to
determine the maximum variance direction; (2) Project the data onto this direction;
(3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
locations of the highest peaks as initial guesses for the cluster centers.  If you
don't want to use this option, set C<cluster_seeding> to C<random>. That should work
as in the previous version of the module.

Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
that is, lines with no data records.  Another bug fix in Version 1.30 deals with the
case when you want the module to figure out how many clusters to form (this is the
C<K=0> option in the constructor call) and the number of data records is close to the
minimum.

Version 1.21 includes fixes to handle the possibility that, when clustering the data
for a fixed number of clusters, a cluster may become empty during iterative
calculation of cluster assignments of the data elements and the updating of the
cluster centers.  The code changes are in the C<assign_data_to_clusters()> and
C<update_cluster_centers()> subroutines.

Version 1.20 includes an option to normalize the data with respect to its variability

lib/Algorithm/KMeans.pm  view on Meta::CPAN

Remember, C<K> is the number of clusters the data is partitioned into.

=item B<show_QoC_values()>

    $clusterer->show_QoC_values();

presents a table with C<K> values in the left column and the corresponding QoC
(Quality-of-Clustering) values in the right column.  Note that this call makes sense
only if you either supply the C<K=0> option to the constructor, or if you specify
values for the C<Kmin> and C<Kmax> options.

=item B<visualize_clusters()>

    $clusterer->visualize_clusters( $visualization_mask )

The visualization mask here does not have to be identical to the one used for
clustering, but must be a subset of that mask.  This is convenient for visualizing
the clusters in two- or three-dimensional subspaces of the original space.

=item B<visualize_data()>

    $clusterer->visualize_data($visualization_mask, 'original');

    $clusterer->visualize_data($visualization_mask, 'normed');

This method requires a second argument and, as shown, it must be either the string
C<original> or the string C<normed>, the former for the visualization of the raw data
and the latter for the visualization of the data after its different dimensions are
normalized by the standard-deviations along those directions.  If you call the method
with the second argument set to C<normed>, but do so without turning on the
C<do_variance_normalization> option in the KMeans constructor, it will let you know.


=item  B<which_cluster_for_new_data_element()>

If you wish to determine the cluster membership of a new data sample after you have
created the clusters with the existing data samples, you would need to call this
method. The C<which_cluster_for_new_data.pl> script in the C<examples> directory
shows how to use this method.


=item  B<which_cluster_for_new_data_element_mahalanobis()>

This does the same thing as the previous method, except that it determines the
cluster membership using the Mahalanobis distance metric.  As for the previous
method, see the C<which_cluster_for_new_data.pl> script in the C<examples> directory
for how to use this method.


=item  B<cluster_data_generator()>

    Algorithm::KMeans->cluster_data_generator(
                            input_parameter_file => $parameter_file,
                            output_datafile => $out_datafile,
                            number_data_points_per_cluster => 20 );

for generating multivariate data for clustering if you wish to play with synthetic
data for clustering.  The input parameter file contains the means and the variances
for the different Gaussians you wish to use for the synthetic data.  See the file
C<param.txt> provided in the examples directory.  It will be easiest for you to just
edit this file for your data generation needs.  In addition to the format of the
parameter file, the main constraint you need to observe in specifying the parameters
is that the dimensionality of the covariance matrix must correspond to the
dimensionality of the mean vectors.  The multivariate random numbers are generated by
calling the C<Math::Random> module.  As you would expect, this module requires that
the covariance matrices you specify in your parameter file be symmetric and positive
definite.  Should the covariances in your parameter file not obey this condition, the
C<Math::Random> module will let you know.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

When the option C<terminal_output> is set in the call to the constructor, the
clusters are displayed on the terminal screen.

When the option C<write_clusters_to_files> is set in the call to the constructor, the
module dumps the clusters in files named

    cluster0.txt
    cluster1.txt
    cluster2.txt
    ...
    ...

in the directory in which you execute the module.  The number of such files will
equal the number of clusters formed.  All such existing files in the directory are
destroyed before any fresh ones are created.  Each cluster file contains the symbolic
ID tags of the data samples in that cluster.

The module also leaves in your directory a printable `.png' file that is a point plot
of the different clusters. The name of this file is always C<clustering_results.png>.

Also, as mentioned previously, a call to C<kmeans()> in your own code will return the
clusters and the cluster centers.

=head1 REQUIRED

This module requires the following three modules:

   Math::Random
   Graphics::GnuplotIF
   Math::GSL

With regard to the third item above, what is actually required is the
C<Math::GSL::Matrix> module.  However, that module is a part of the C<Math::GSL>
distribution. The acronym GSL stands for the GNU Scientific Library.  C<Math::GSL> is
a Perl interface to the GSL C-based library.


=head1 THE C<examples> DIRECTORY

The C<examples> directory contains several scripts to help you become familiar with
this module.  The following script is an example of how the module can be expected to
be used most of the time. It calls for clustering to be carried out with a fixed
C<K>:

        cluster_and_visualize.pl

The more time you spend with this script, the more comfortable you will become with
the use of this module. The script file contains a large comment block that mentions
six locations in the script where you have to make decisions about how to use the
module.

See the following script if you do not know what value to use for C<K> for clustering
your data:

        find_best_K_and_cluster.pl

This script uses the C<K=0> option in the constructor that causes the module to
search for the best C<K> for your data.  Since this search is virtually unbounded ---
limited only by the number of samples in your data file --- the script may take a
long time to run for a large data file.  Hence the next script.

If your datafile is too large, you may need to range limit the values of C<K> that
are searched through, as in the following script:

        find_best_K_in_range_and_cluster.pl

If you also want to include data normalization (it may reduce the performance of the
clusterer in some cases), see the following script:

        cluster_after_data_normalization.pl

When you include the data normalization step and you would like to visualize the data
before and after normalization, see the following script:

        cluster_and_visualize_with_data_visualization.pl*

After you are done clustering, let's say you want to find the cluster membership of a
new data sample. To see how you can do that, see the script:

        which_cluster_for_new_data.pl

This script returns two answers for which cluster a new data sample belongs to: one
using the Euclidean metric to calculate the distances between the new data sample and
the cluster centers, and the other using the Mahalanobis metric.  If the clusters are
strongly elliptical in shape, you are likely to get better results with the
Mahalanobis metric.  (To see that you can get two different answers using the two
different distance metrics, run the C<which_cluster_for_new_data.pl> script on the
data in the file C<mydatafile3.dat>.  To make this run, note that you have to comment
out and uncomment the lines at four different locations in the script.)

The C<examples> directory also contains the following support scripts:

For generating the data for experiments with clustering:

        data_generator.pl

For cleaning up the examples directory:

        cleanup_directory.pl

The examples directory also includes a parameter file, C<param.txt>, for generating
synthetic data for clustering.  Just edit this file if you would like to generate
your own multivariate data for clustering.  The parameter file is for the 3D case,
but you can generate data with any dimensionality through appropriate entries in the
parameter file.

=head1 EXPORT

None by design.

=head1 CAVEATS

K-Means based clustering usually does not work well when the clusters are strongly
overlapping and when the extent of variability along the different dimensions is
different for the different clusters.  The module does give you the ability to
normalize the variability in your data with the constructor option
C<do_variance_normalization>.  However, as described elsewhere, this may actually
reduce the performance of the clusterer if the data variability along a direction is
more a result of the separation between the means than because of intra-cluster
variability.  For better clustering with difficult-to-cluster data, you could try
using the author's C<Algorithm::ExpectationMaximization> module.

=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'KMeans' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-KMeans-2.05.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-KMeans-2.05>.  Enter this directory and execute the following commands
for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on.  In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/



( run in 0.996 second using v1.01-cache-2.11-cpan-13bb782fe5a )