Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
} elsif ($visualization_data_field_width == 2) {
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 1 ) {
$arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
}
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
croak "No provision for plotting 1-D data\n";
}
}
########################### Generating Synthetic Data for Clustering ##############################
# The data generated corresponds to a multivariate distribution. The mean and the
# covariance of each Gaussian in the distribution are specified individually in a
# parameter file. See the example parameter file param.txt in the examples
# directory. Just edit this file for your own needs.
#
# The multivariate random numbers are generated by calling the Math::Random module.
# As you would expect, that module will insist that the covariance matrix you
# specify be symmetric and positive definite.
# Class method: generates synthetic multivariate Gaussian data and writes it,
# one labelled point per line, to a file.
#
# Named arguments:
#   input_parameter_file           - optional path to a parameter file that
#                                    lists, per cluster, the keyword 'cluster'
#                                    followed by the mean and the covariance
#                                    entries.  Lines starting with '#' are
#                                    ignored.  When omitted, a built-in
#                                    three-cluster specification is used.
#   output_datafile                - path of the file to write (required).
#   number_data_points_per_cluster - sample count handed to
#                                    Math::Random::random_multivariate_normal
#                                    for every cluster.
#
# Croaks when invoked on anything other than 'Algorithm::KMeans', when the
# output file is not specified, when either file cannot be opened, when more
# than 12 clusters are requested, or when the number of covariance entries is
# not the square of the number of mean entries.
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    # Fail early: a three-argument open on an undefined name would not report
    # the missing argument in a useful way.
    croak "output_datafile must be specified" unless defined $output_file;
    my $param_string;
    if (defined $input_parameter_file) {
        # 'or' (not '||') so that the check applies to open() itself.
        open my $in_fh, '<', $input_parameter_file
            or croak "unable to open parameter file $input_parameter_file: $!";
        my @all_params = grep { $_ !~ /^[ ]*#/ } <$in_fh>;
        close $in_fh;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
    }
    # Each non-empty chunk between 'cluster' keywords describes one Gaussian.
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep $_, @cluster_strings;
    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        # NOTE(review): as written here, the separator between the mean and
        # the covariance is the same single space that separates individual
        # numbers, so $num_strings[0] holds a single token.  Confirm the
        # intended separator against the parameter-file format.
        my @num_strings = split / /, $cluster_strings[$i];
        # The match against $_num_regex does not filter or alter the values;
        # every token is passed through unchanged.
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums !=
                                        ($data_dimension ** 2);
        # Reshape the flat, row-major list into a square array of rows.
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {
                $cluster_covariance->[$j]->[$k] =
                    $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    # Fixed seed phrase for the Math::Random generator.
    random_seed_from_phrase( 'hellojello' );
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $label = $point_labels[$i];
        # Prefix every point with a tag made of the cluster letter and a
        # running number.  The numbering starts at the cluster index, which
        # keeps the tags the same as those produced by earlier versions of
        # this method, but uses its own counter instead of modifying the
        # loop variable.
        my $serial = $i;
        foreach my $point (@new_data) {
            unshift @$point, $label . $serial++;
        }
        push @data_dump, @new_data;
    }
    fisher_yates_shuffle( \@data_dump );
    open my $out_fh, '>', $output_file
        or croak "unable to open output file $output_file: $!";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print {$out_fh} "$coord ";
        }
        print {$out_fh} "\n";
    }
    # Buffered write errors only surface when the handle is closed.
    close $out_fh
        or croak "error while writing output file $output_file: $!";
    print "Data written out to file $output_file\n";
}
# Returns a reference to an array holding the component-wise sum of the
# coordinate vectors stored in $self->{_data} for the given data element names.
#
# Argument: a reference to an array of data element names.
# The accumulator starts with $self->{_data_dimensions} zero entries.
sub add_point_coords {
    my ($self, $ids_ref) = @_;
    my @ids = @{$ids_ref};
    my $dim = $self->{_data_dimensions};
    my @sums = (0.0) x $dim;
    for my $name (@ids) {
        my $coords = $self->{_data}->{$name};
        my $slot = 0;
        # Walk the components in order, adding each into its own slot.
        for my $value (@$coords) {
            $sums[$slot++] += $value;
        }
    }
    return \@sums;
}
sub add_point_coords_from_original_data {
my $self = shift;
my @arr_of_ids = @{shift @_}; # array of data element names
my @result;
my $data_dimensionality = $self->{_data_dimensions};
foreach my $i (0..$data_dimensionality-1) {
$result[$i] = 0.0;
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
Kmin => 3,
Kmax => 10,
cluster_seeding => 'random', # or 'smart'
terminal_output => 1,
write_clusters_to_files => 1,
);
# FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
# INSTANCE TO ACTUALLY CLUSTER THE DATA:
$clusterer->read_data_from_file();
$clusterer->kmeans();
# If you want to directly access the clusters and the cluster centers in your own
# top-level script, replace the above two statements with:
$clusterer->read_data_from_file();
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
# You can subsequently access the clusters directly in your own code, as in:
foreach my $cluster_id (sort keys %{$clusters_hash}) {
print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
}
foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
}
# CLUSTER VISUALIZATION:
# You must first set the mask for cluster visualization. This mask tells the module
# which 2D or 3D subspace of the original data space you wish to visualize the
# clusters in:
my $visualization_mask = "111";
$clusterer->visualize_clusters($visualization_mask);
# SYNTHETIC DATA GENERATION:
# The module has been provided with a class method for generating multivariate data
# for experimenting with clustering. The data generation is controlled by the
# contents of the parameter file that is supplied as an argument to the data
# generator method. The mean and covariance matrix entries in the parameter file
# must be according to the syntax shown in the param.txt file in the examples
# directory. It is best to edit this file as needed:
my $parameter_file = "param.txt";
my $out_datafile = "mydatafile.dat";
Algorithm::KMeans->cluster_data_generator(
input_parameter_file => $parameter_file,
output_datafile => $out_datafile,
number_data_points_per_cluster => $N );
=head1 CHANGES
Version 2.05 removes the restriction on the version of Perl that is required. This
is based on Srezic's recommendation. He had no problem building and testing the
previous version with Perl 5.8.9. Version 2.05 also includes a small augmentation of
the code in the method C<read_data_from_file_csv()> for guarding against user errors
in the specification of the mask that tells the module which columns of the data file
are to be used for clustering.
Version 2.04 allows you to use CSV data files for clustering.
Version 2.03 incorporates minor code cleanup. The main implementation of the module
remains unchanged.
Version 2.02 downshifts the version of Perl that is required for this module. The
module should work with versions 5.10 and higher of Perl. The implementation code
for the module remains unchanged.
Version 2.01 removes many errors in the documentation. The changes made to the module
in Version 2.0 were not reflected properly in the documentation page for that
version. The implementation code remains unchanged.
Version 2.0 includes significant additional functionality: (1) You now have the
option to cluster using the Mahalanobis distance metric (the default is the Euclidean
metric); and (2) With the two C<which_cluster> methods that have been added to the
module, you can now determine the best cluster for a new data sample after you have
created the clusters with the previously available data. Finding the best cluster
for a new data sample can be done using either the Euclidean metric or the
Mahalanobis metric.
Version 1.40 includes a C<smart> option for seeding the clusters. This option,
supplied through the constructor parameter C<cluster_seeding>, means that the
clusterer will (1) Subject the data to principal components analysis in order to
determine the maximum variance direction; (2) Project the data onto this direction;
(3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
locations of the highest peaks as initial guesses for the cluster centers. If you
don't want to use this option, set C<cluster_seeding> to C<random>. That should work
as in the previous version of the module.
Version 1.30 includes a bug fix for the case when the datafile contains empty lines,
that is, lines with no data records. Another bug fix in Version 1.30 deals with the
case when you want the module to figure out how many clusters to form (this is the
C<K=0> option in the constructor call) and the number of data records is close to the
minimum.
Version 1.21 includes fixes to handle the possibility that, when clustering the data
for a fixed number of clusters, a cluster may become empty during iterative
calculation of cluster assignments of the data elements and the updating of the
cluster centers. The code changes are in the C<assign_data_to_clusters()> and
C<update_cluster_centers()> subroutines.
Version 1.20 includes an option to normalize the data with respect to its variability
along the different coordinates before clustering is carried out.
Version 1.1.1 allows for range limiting the values of C<K> to search through. C<K>
stands for the number of clusters to form. This version also declares the module
dependencies in the C<Makefile.PL> file.
Version 1.1 is an object-oriented version of the implementation presented in
version 1.0. The current version should lend itself more easily to code extension.
You could, for example, create your own class by subclassing from the class presented
here and, in your subclass, use your own criteria for the similarity distance between
the data points and for the QoC (Quality of Clustering) metric, and, possibly, a
different rule to stop the iterations. Version 1.1 also allows you to directly
access the clusters formed and the cluster centers in your calling script.
=head1 SPECIAL USAGE NOTE
If you were directly accessing in your own scripts the clusters produced by the older
versions of this module, you'd need to make changes to your code if you wish to use
Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
Versions 2.0 and higher return hashes. This change was made necessary by the logic
required for implementing the two new C<which_cluster> methods that were introduced
in Version 2.0. These methods return the best cluster for a new data sample from the
clusters you created using the existing data. One of the C<which_cluster> methods is
based on the Euclidean metric for finding the cluster that is closest to the new data
sample, and the other on the Mahalanobis metric. Another point of incompatibility
with the previous versions is that you must now explicitly set the C<cluster_seeding>
( run in 0.997 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )