Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
package Algorithm::KMeans;
#------------------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::KMeans is a Perl module for clustering multidimensional data.
# -----------------------------------------------------------------------------------
#use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
our $VERSION = '2.05';
# from Perl docs:
my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
# Constructor:
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params(@params) == 0;
bless {
_datafile => $args{datafile} || croak("datafile required"),
_mask => $args{mask} || croak("mask required"),
_K => $args{K} || 0,
_K_min => $args{Kmin} || 'unknown',
_K_max => $args{Kmax} || 'unknown',
_cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
"for cluster seeding"),
_var_normalize => $args{do_variance_normalization} || 0,
_use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
_clusters_2_files => $args{write_clusters_to_files} || 0,
_terminal_output => $args{terminal_output} || 0,
_debug => $args{debug} || 0,
_N => 0,
_K_best => 'unknown',
_original_data => {},
_data => {},
_data_id_tags => [],
_QoC_values => {},
_clusters => [],
_cluster_centers => [],
_clusters_hash => {},
_cluster_centers_hash => {},
_cluster_covariances_hash => {},
_data_dimensions => 0,
}, $class;
}
sub read_data_from_file {
my $self = shift;
my $filename = $self->{_datafile};
$self->read_data_from_file_csv() if $filename =~ /.csv$/;
$self->read_data_from_file_dat() if $filename =~ /.dat$/;
}
sub read_data_from_file_csv {
my $self = shift;
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
open FILEIN, $filename or die "Unable to open $filename: $!";
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
local $/ = undef;
my @all_data = split /\s+/, <FILEIN>;
my %data_hash = ();
my @data_tags = ();
foreach my $record (@all_data) {
my @splits = split /,/, $record;
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
"the size of at least one of the data records in the file: $!"
unless scalar(@mask) == scalar(@splits);
my $record_name = shift @splits;
$data_hash{$record_name} = \@splits;
push @data_tags, $record_name;
}
$self->{_original_data} = \%data_hash;
$self->{_data_id_tags} = \@data_tags;
$self->{_N} = scalar @data_tags;
if ($self->{_var_normalize}) {
$self->{_data} = variance_normalization( $self->{_original_data} );
} else {
$self->{_data} = deep_copy_hash( $self->{_original_data} );
}
# Need to make the following call to set the global mean and covariance:
# my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
# Need to make the following call to set the global eigenvec eigenval sets:
# $self->eigen_analysis_of_covariance($covariance);
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
"the number of clusters requested for the clusters to be " .
"meaningful $!"
if ( $self->{_N} < (2 * $self->{_K} ** 2) );
print "\n\n\n";
}
}
sub read_data_from_file_dat {
my $self = shift;
my $datafile = $self->{_datafile};
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
open INPUT, $datafile or die "unable to open file $datafile: $!\n";
chomp( my @raw_data = <INPUT> );
close INPUT;
# Transform strings into number data
foreach my $record (@raw_data) {
next unless $record;
next if $record =~ /^#/;
my @data_fields;
my @fields = split /\s+/, $record;
die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
my $record_id;
foreach my $i (0..@fields-1) {
if ($mask[$i] eq '0') {
next;
} elsif ($mask[$i] eq 'N') {
$record_id = $fields[$i];
} elsif ($mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "misformed mask for reading the data file\n";
}
}
my @nums = map {/$_num_regex/;$_} @data_fields;
$self->{_original_data}->{ $record_id } = \@nums;
}
if ($self->{_var_normalize}) {
$self->{_data} = variance_normalization( $self->{_original_data} );
} else {
$self->{_data} = deep_copy_hash( $self->{_original_data} );
}
my @all_data_ids = keys %{$self->{_data}};
$self->{_data_id_tags} = \@all_data_ids;
$self->{_N} = scalar @all_data_ids;
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
"the number of clusters requested for the clusters to be " .
"meaningful $!"
if ( $self->{_N} < (2 * $self->{_K} ** 2) );
print "\n\n\n";
}
}
sub kmeans {
my $self = shift;
my $K = $self->{_K};
if ( ($K == 0) ||
( ($self->{_K_min} ne 'unknown') &&
($self->{_K_max} ne 'unknown') ) ) {
eval {
$self->iterate_through_K();
};
die "in kmeans(): $@" if ($@);
} elsif ( $K =~ /\d+/) {
eval {
$self->cluster_for_fixed_K_multiple_random_tries($K)
if $self->{_cluster_seeding} eq 'random';
$self->cluster_for_fixed_K_single_smart_try($K)
if $self->{_cluster_seeding} eq 'smart';
};
die "in kmeans(): $@" if ($@);
} else {
croak "Incorrect call syntax used. See documentation.\n";
}
if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
foreach my $i (0..@{$self->{_clusters}}-1) {
$self->{_clusters_hash}->{"cluster$i"} = $self->{_clusters}->[$i];
$self->{_cluster_centers_hash}->{"cluster$i"} = $self->{_cluster_centers}->[$i];
$self->{_cluster_covariances_hash}->{"cluster$i"} =
$self->estimate_cluster_covariance($self->{_clusters}->[$i]);
}
$self->write_clusters_to_files( $self->{_clusters} ) if $self->{_clusters_2_files};
return ($self->{_clusters_hash}, $self->{_cluster_centers_hash});
} else {
croak "Clustering failed. Try again.";
}
}
# This is the subroutine to call if you prefer purely random choices for the initial
# seeding of the cluster centers.
sub cluster_for_fixed_K_multiple_random_tries {
my $self = shift;
my $K = shift;
my @all_data_ids = @{$self->{_data_id_tags}};
my $QoC;
my @QoC_values;
my @array_of_clusters;
my @array_of_cluster_centers;
my $clusters;
lib/Algorithm/KMeans.pm view on Meta::CPAN
}
my ($min,$max) = minmax(\@cluster_center_indices );
my $quality = ($max - $min) / $data_store_size;
last if $quality > 0.3;
}
return @cluster_center_indices;
}
# This function is used when you set the "cluster_seeding" option to 'random' in the
# constructor. This routine merely reaches into the data array with the random
# integers, as constructed by the previous routine, serving as indices and fetching
# values corresponding to those indices. The fetched data samples serve as the
# initial cluster centers.
sub get_initial_cluster_centers {
my $self = shift;
my $K = shift;
my @cluster_center_indices = $self->initialize_cluster_centers($K);
my @result;
foreach my $i (@cluster_center_indices) {
my $tag = $self->{_data_id_tags}[$i];
push @result, $self->{_data}->{$tag};
}
return \@result;
}
# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor. It subjects the data to a principal components analysis
# to figure out the direction of maximal variance. Subsequently, it tries to locate
# K peaks in a smoothed histogram of the data points projected onto the maximal
# variance direction.
sub get_initial_cluster_centers_smart {
my $self = shift;
my $K = shift;
if ($self->{_data_dimensions} == 1) {
my @one_d_data;
foreach my $j (0..$self->{_N}-1) {
my $tag = $self->{_data_id_tags}[$j];
push @one_d_data, $self->{_data}->{$tag}->[0];
}
my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
print "highest points at data values: @peak_points\n" if $self->{_debug};
my @cluster_centers;
foreach my $peakpoint (@peak_points) {
push @cluster_centers, [$peakpoint];
}
return \@cluster_centers;
}
my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# All the record labels are stored in the array $self->{_data_id_tags}. The
# actual data for clustering is stored in a hash at $self->{_data} whose keys are
# the record labels; the value associated with each key is the array holding the
# corresponding numerical multidimensional data.
foreach my $j (0..$num_cols-1) {
my $tag = $self->{_data_id_tags}[$j];
my $data = $self->{_data}->{$tag};
$matrix->set_col($j, $data);
}
if ($self->{_debug}) {
print "\nDisplaying the original data as a matrix:";
display_matrix( $matrix );
}
foreach my $j (0..$num_cols-1) {
$mean_vec += $matrix->col($j);
}
$mean_vec *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "Displaying the mean vector for the data:";
display_matrix( $mean_vec );
}
foreach my $j (0..$num_cols-1) {
my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
$matrix->set_col($j, \@new_col);
}
if ($self->{_debug}) {
print "Displaying mean subtracted data as a matrix:";
display_matrix( $matrix );
}
my $transposed = transpose( $matrix );
if ($self->{_debug}) {
print "Displaying transposed data matrix:";
display_matrix( $transposed );
}
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for your data:";
display_matrix( $covariance );
}
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
my $num_of_eigens = @$eigenvalues;
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
foreach my $i (1..$num_of_eigens-1) {
$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
}
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
foreach my $i (0..$num_of_eigens-1) {
my @vec = $eigenvectors->[$i]->as_list;
print "Eigenvector $i: @vec\n" if $self->{_debug};
}
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
my @max_var_direction;
# Each element of the array @largest_eigen_vec is a Math::Complex object
foreach my $k (0..@largest_eigen_vec-1) {
my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
if ($theta eq '0') {
$max_var_direction[$k] = $mag;
} elsif ($theta eq 'pi') {
$max_var_direction[$k] = -1.0 * $mag;
} else {
die "eigendecomposition of covariance matrix produced a complex " .
"eigenvector --- something is wrong";
}
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
display_matrix( $covariance );
}
return $covariance;
}
sub write_clusters_to_files {
my $self = shift;
my @clusters = @{$self->{_clusters}};
unlink glob "cluster*.dat";
foreach my $i (0..@clusters-1) {
my $filename = "cluster" . $i . ".txt";
print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
foreach my $ele (@{$clusters[$i]}) {
print FILEHANDLE "$ele ";
}
close FILEHANDLE;
}
}
sub get_K_best {
my $self = shift;
croak "You need to run the clusterer with K=0 option " .
"before you can call this method" if $self->{_K_best} eq 'unknown';
print "\nThe best value of K: $self->{_K_best}\n" if $self->{_terminal_output};
return $self->{_K_best};
}
sub show_QoC_values {
my $self = shift;
croak "\nYou need to run the clusterer with K=0 option before you can call this method"
if $self->{_K_best} eq 'unknown';
print "\nShown below are K on the left and the QoC values on the right (the smaller " .
"the QoC, the better it is)\n";
foreach my $key (sort keys %{$self->{_QoC_values}} ) {
print " $key => $self->{_QoC_values}->{$key}\n" if defined $self->{_QoC_values}->{$key};
}
}
sub DESTROY {
unlink "__temp_" . basename($_[0]->{_datafile});
unlink "__temp_data_" . basename($_[0]->{_datafile});
unlink "__temp_normed_data_" . basename($_[0]->{_datafile});
}
################################## Visualization Code ###################################
# It makes sense to call visualize_clusters() only AFTER you have called kmeans().
#
# The visualize_clusters() implementation automatically figures out whether it
# should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
# supplied as one of the arguments is greater than 2, it does a 3D plot for the
# first three data coordinates. That is, the clusters will be displayed in the 3D
# space formed by the first three data coordinates. On the other hand, if the number
# of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
# only one on bit is specified for the mask, visualize_clusters() aborts.
#
# The visualization code consists of first accessing each of clusters created by the
# kmeans() subroutine. Note that the clusters contain only the symbolic names for
# the individual records in the source data file. We therefore next reach into the
# $self->{_original_data} hash and get the data coordinates associated with each
# symbolic label in a cluster. The numerical data thus generated is then written
# out to a temp file. When doing so we must remember to insert TWO BLANK LINES
# between the data blocks corresponding to the different clusters. This constraint
# is imposed on us by Gnuplot when plotting data from the same file since we want to
# use different point styles for the data points in different cluster files.
#
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
# module to plot the data clusters.
sub visualize_clusters {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || croak "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || croak "visualization mask missing";
$pause_time = shift;
} else {
croak "visualize_clusters() called with wrong args";
}
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
croak "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
my @fields = @$data;
croak "\nABORTED: Visualization mask size exceeds data record size\n"
if $#v_mask > $#fields;
my @data_fields;
foreach my $i (0..@fields-1) {
if ($v_mask[$i] eq '0') {
next;
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
croak "Misformed visualization mask. It can only have 1s and 0s\n";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my @all_data_ids = @{$self->{_data_id_tags}};
my $K = scalar @{$self->{_clusters}};
my $filename = basename($master_datafile);
my $temp_file = "__temp_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!\n";
foreach my $cluster (@{$self->{_clusters}}) {
foreach my $item (@$cluster) {
print OUTPUT "@{$visualization_data{$item}}";
print OUTPUT "\n";
}
print OUTPUT "\n\n";
}
close OUTPUT;
my $plot;
my $hardcopy_plot;
if (!defined $pause_time) {
$plot = Graphics::GnuplotIF->new( persist => 1 );
$hardcopy_plot = Graphics::GnuplotIF->new();
$hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
} else {
$plot = Graphics::GnuplotIF->new();
}
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
my $arg_string = "";
if ($visualization_data_field_width > 2) {
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, ";
}
} elsif ($visualization_data_field_width == 2) {
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, ";
}
} elsif ($visualization_data_field_width == 1 ) {
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, ";
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "splot $arg_string" );
$hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "plot $arg_string" );
$hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
} elsif ($visualization_data_field_width == 1) {
croak "No provision for plotting 1-D data\n";
}
}
# It makes sense to call visualize_data() only AFTER you have called the method
# read_data_from_file().
#
# The visualize_data() is meant for the visualization of the original data in its
# various 2D or 3D subspaces. The method can also be used to visualize the normed
# data in a similar manner. Recall the normed data is the original data after each
# data dimension is normalized by the standard-deviation along that dimension.
#
# Whether you see the original data or the normed data depends on the second
# argument supplied in the method call. It must be either the string 'original' or
# the string 'normed'.
sub visualize_data {
my $self = shift;
my $v_mask = shift || croak "visualization mask missing";
my $datatype = shift; # must be either 'original' or 'normed'
croak "\n\nABORTED: You called visualize_data() for normed data " .
"but without first turning on data normalization in the " .
"in the KMeans constructor"
if ($datatype eq 'normed') && ! $self->{_var_normalize};
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
croak "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
my $data_source;
if ($datatype eq 'original') {
$data_source = $self->{_original_data};
} elsif ($datatype eq 'normed') {
$data_source = $self->{_data};
} else {
croak "\n\nABORTED: improper call to visualize_data()";
}
while ( my ($record_id, $data) = each %{$data_source} ) {
my @fields = @$data;
croak "\nABORTED: Visualization mask size exceeds data record size\n"
if $#v_mask > $#fields;
my @data_fields;
foreach my $i (0..@fields-1) {
if ($v_mask[$i] eq '0') {
next;
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
croak "Misformed visualization mask. It can only have 1s and 0s\n";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my $filename = basename($master_datafile);
my $temp_file;
if ($datatype eq 'original') {
$temp_file = "__temp_data_" . $filename;
} elsif ($datatype eq 'normed') {
$temp_file = "__temp_normed_data_" . $filename;
} else {
croak "ABORTED: Improper call to visualize_data()";
}
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!\n";
foreach my $datapoint (values %visualization_data) {
print OUTPUT "@$datapoint";
print OUTPUT "\n";
}
close OUTPUT;
my $plot = Graphics::GnuplotIF->new( persist => 1 );
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
my $arg_string ;
if ($visualization_data_field_width > 2) {
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 2) {
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 1 ) {
$arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
}
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
croak "No provision for plotting 1-D data\n";
}
}
########################### Generating Synthetic Data for Clustering ##############################
# The data generated corresponds to a multivariate distribution. The mean and the
# covariance of each Gaussian in the distribution are specified individually in a
# parameter file. See the example parameter file param.txt in the examples
# directory. Just edit this file for your own needs.
#
# The multivariate random numbers are generated by calling the Math::Random module.
# As you would expect, that module will insist that the covariance matrix you
# specify be symmetric and positive definite.
sub cluster_data_generator {
my $class = shift;
croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
my %args = @_;
my $input_parameter_file = $args{input_parameter_file};
my $output_file = $args{output_datafile};
my $N = $args{number_data_points_per_cluster};
my @all_params;
my $param_string;
if (defined $input_parameter_file) {
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
chomp @all_params;
$param_string = join ' ', @all_params;
} else {
# Just for testing. Used in t/test.t
$param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
"cluster 0 0 5 1 0 0 0 1 0 0 0 1";
}
my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
@cluster_strings = grep $_, @cluster_strings;
my $K = @cluster_strings;
croak "Too many clusters requested" if $K > 12;
my @point_labels = ('a'..'z');
print "Number of Gaussians used for the synthetic data: $K\n";
my @means;
my @covariances;
my $data_dimension;
foreach my $i (0..$K-1) {
my @num_strings = split / /, $cluster_strings[$i];
my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
$data_dimension = @cluster_mean;
push @means, \@cluster_mean;
my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
croak "dimensionality error" if @covariance_nums !=
($data_dimension ** 2);
my $cluster_covariance;
foreach my $j (0..$data_dimension-1) {
foreach my $k (0..$data_dimension-1) {
$cluster_covariance->[$j]->[$k] =
$covariance_nums[$j*$data_dimension + $k];
}
}
push @covariances, $cluster_covariance;
}
random_seed_from_phrase( 'hellojello' );
my @data_dump;
foreach my $i (0..$K-1) {
my @m = @{shift @means};
my @covar = @{shift @covariances};
my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
my $p = 0;
my $label = $point_labels[$i];
@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
push @data_dump, @new_data;
}
fisher_yates_shuffle( \@data_dump );
open OUTPUT, ">$output_file";
foreach my $ele (@data_dump) {
foreach my $coord ( @$ele ) {
print OUTPUT "$coord ";
}
print OUTPUT "\n";
}
print "Data written out to file $output_file\n";
close OUTPUT;
}
sub add_point_coords {
my $self = shift;
my @arr_of_ids = @{shift @_}; # array of data element names
my @result;
my $data_dimensionality = $self->{_data_dimensions};
foreach my $i (0..$data_dimensionality-1) {
$result[$i] = 0.0;
}
foreach my $id (@arr_of_ids) {
my $ele = $self->{_data}->{$id};
my $i = 0;
foreach my $component (@$ele) {
$result[$i] += $component;
$i++;
}
}
return \@result;
}
sub add_point_coords_from_original_data {
my $self = shift;
my @arr_of_ids = @{shift @_}; # array of data element names
my @result;
my $data_dimensionality = $self->{_data_dimensions};
foreach my $i (0..$data_dimensionality-1) {
$result[$i] = 0.0;
}
foreach my $id (@arr_of_ids) {
my $ele = $self->{_original_data}->{$id};
my $i = 0;
foreach my $component (@$ele) {
$result[$i] += $component;
$i++;
}
}
return \@result;
}
################################### Support Routines ########################################
sub get_index_at_value {
my $value = shift;
my @array = @{shift @_};
foreach my $i (0..@array-1) {
return $i if $value == $array[$i];
}
return -1;
}
# This routine is really not necessary in light of the new `~~' operator in Perl.
# Will use the new operator in the next version.
sub vector_equal {
my $vec1 = shift;
my $vec2 = shift;
die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2;
foreach my $i (0..@$vec1-1){
return 0 if $vec1->[$i] != $vec2->[$i];
}
return 1;
}
# Returns the minimum value and its positional index in an array
sub minimum {
my $arr = shift;
my $min;
my $index;
foreach my $i (0..@{$arr}-1) {
if ( (!defined $min) || ($arr->[$i] < $min) ) {
$index = $i;
$min = $arr->[$i];
}
}
return ($min, $index);
}
sub minmax {
my $arr = shift;
my $min;
my $max;
foreach my $i (0..@{$arr}-1) {
next if !defined $arr->[$i];
if ( (!defined $min) && (!defined $max) ) {
$min = $arr->[$i];
$max = $arr->[$i];
} elsif ( $arr->[$i] < $min ) {
$min = $arr->[$i];
} elsif ( $arr->[$i] > $max ) {
$max = $arr->[$i];
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
return ($min, $max);
}
# Meant only for constructing a deep copy of an array of arrays:
sub deep_copy_AoA {
my $ref_in = shift;
my $ref_out;
foreach my $i (0..@{$ref_in}-1) {
foreach my $j (0..@{$ref_in->[$i]}-1) {
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
}
}
return $ref_out;
}
# Meant only for constructing a deep copy of an array of arrays for the case when
# some elements of the top-level array may be undefined:
sub deep_copy_AoA_with_nulls {
my $ref_in = shift;
my $ref_out;
foreach my $i (0..@{$ref_in}-1) {
if ( !defined $ref_in->[$i] ) {
$ref_out->[$i] = undef;
next;
}
foreach my $j (0..@{$ref_in->[$i]}-1) {
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
}
}
return $ref_out;
}
# Meant only for constructing a deep copy of a hash in which each value is an
# anonymous array of numbers:
sub deep_copy_hash {
my $ref_in = shift;
my $ref_out;
while ( my ($key, $value) = each( %$ref_in ) ) {
$ref_out->{$key} = deep_copy_array( $value );
}
return $ref_out;
}
# Meant only for an array of numbers:
sub deep_copy_array {
my $ref_in = shift;
my $ref_out;
foreach my $i (0..@{$ref_in}-1) {
$ref_out->[$i] = $ref_in->[$i];
}
return $ref_out;
}
sub display_cluster_centers {
my $self = shift;
my @clusters = @{shift @_};
my $i = 0;
foreach my $cluster (@clusters) {
my $cluster_size = @$cluster;
my @cluster_center =
@{$self->add_point_coords_from_original_data( $cluster )};
@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
print "\ncluster $i ($cluster_size records):\n";
print "cluster center $i: " .
"@{[map {my $x = sprintf('%.4f', $_); $x} @cluster_center]}\n";
$i++;
}
}
# For displaying the individual clusters on a terminal screen. Each cluster is
# displayed through the symbolic names associated with the data points.
sub display_clusters {
my @clusters = @{shift @_};
my $i = 0;
foreach my $cluster (@clusters) {
@$cluster = sort @$cluster;
my $cluster_size = @$cluster;
print "\n\nCluster $i ($cluster_size records):\n";
foreach my $ele (@$cluster) {
print " $ele";
}
$i++
}
print "\n\n";
}
# from perl docs:
sub fisher_yates_shuffle {
my $arr = shift;
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
sub variance_normalization {
my %data_hash = %{shift @_};
my @all_data_points = values %data_hash;
my $dimensions = @{$all_data_points[0]};
my @data_projections;
foreach my $data_point (@all_data_points) {
my $i = 0;
foreach my $proj (@$data_point) {
push @{$data_projections[$i++]}, $proj;
}
}
my @variance_vec;
foreach my $vec (@data_projections) {
my ($mean, $variance) = mean_and_variance( $vec );
push @variance_vec, $variance;
}
my %new_data_hash;
while (my ($label, $data) = each(%data_hash) ) {
my @new_data;
foreach my $i (0..@{$data}-1) {
my $new = $data->[$i] / sqrt($variance_vec[$i]);
push @new_data, $data->[$i] / sqrt($variance_vec[$i]);
}
$new_data_hash{$label} = \@new_data;
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
write_clusters_to_files => 1,
);
# But bear in mind that such data normalization may actually decrease the
# performance of the clusterer if the variability in the data is more a result of
# the separation between the means than a consequence of intra-cluster variance.
# Set K to 0 if you want the module to figure out the optimum number of clusters
# from the data. (It is best to run this option with the terminal_output set to 1
# so that you can see the different value of QoC for the different K):
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => $mask,
K => 0,
cluster_seeding => 'random', # or 'smart'
terminal_output => 1,
write_clusters_to_files => 1,
);
# Although not shown above, you can obviously set the 'do_variance_normalization'
# flag here also if you wish.
# For very large data files, setting K to 0 will result in searching through too
# many values for K. For such cases, you can range limit the values of K to search
# through by
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => "N111",
Kmin => 3,
Kmax => 10,
cluster_seeding => 'random', # or 'smart'
terminal_output => 1,
write_clusters_to_files => 1,
);
# FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
# INSTANCE TO ACTUALLY CLUSTER THE DATA:
$clusterer->read_data_from_file();
$clusterer->kmeans();
# If you want to directly access the clusters and the cluster centers in your own
# top-level script, replace the above two statements with:
$clusterer->read_data_from_file();
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
# You can subsequently access the clusters directly in your own code, as in:
foreach my $cluster_id (sort keys %{$clusters_hash}) {
print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
}
foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
}
# CLUSTER VISUALIZATION:
# You must first set the mask for cluster visualization. This mask tells the module
# which 2D or 3D subspace of the original data space you wish to visualize the
# clusters in:
my $visualization_mask = "111";
$clusterer->visualize_clusters($visualization_mask);
# SYNTHETIC DATA GENERATION:
# The module has been provided with a class method for generating multivariate data
# for experimenting with clustering. The data generation is controlled by the
# contents of the parameter file that is supplied as an argument to the data
# generator method. The mean and covariance matrix entries in the parameter file
# must be according to the syntax shown in the param.txt file in the examples
# directory. It is best to edit this file as needed:
my $parameter_file = "param.txt";
my $out_datafile = "mydatafile.dat";
Algorithm::KMeans->cluster_data_generator(
input_parameter_file => $parameter_file,
output_datafile => $out_datafile,
number_data_points_per_cluster => $N );
=head1 CHANGES
Version 2.05 removes the restriction on the version of Perl that is required. This
is based on Srezic's recommendation. He had no problem building and testing the
previous version with Perl 5.8.9. Version 2.05 also includes a small augmentation of
the code in the method C<read_data_from_file_csv()> for guarding against user errors
in the specification of the mask that tells the module which columns of the data file
are to be used for clustering.
Version 2.04 allows you to use CSV data files for clustering.
Version 2.03 incorporates minor code cleanup. The main implementation of the module
remains unchanged.
Version 2.02 downshifts the version of Perl that is required for this module. The
module should work with versions 5.10 and higher of Perl. The implementation code
for the module remains unchanged.
Version 2.01 removes many errors in the documentation. The changes made to the module
in Version 2.0 were not reflected properly in the documentation page for that
version. The implementation code remains unchanged.
Version 2.0 includes significant additional functionality: (1) You now have the
option to cluster using the Mahalanobis distance metric (the default is the Euclidean
metric); and (2) With the two C<which_cluster> methods that have been added to the
module, you can now determine the best cluster for a new data sample after you have
created the clusters with the previously available data. Finding the best cluster
for a new data sample can be done using either the Euclidean metric or the
Mahalanobis metric.
Version 1.40 includes a C<smart> option for seeding the clusters. This option,
supplied through the constructor parameter C<cluster_seeding>, means that the
clusterer will (1) Subject the data to principal components analysis in order to
determine the maximum variance direction; (2) Project the data onto this direction;
(3) Find peaks in a smoothed histogram of the projected points; and (4) Use the
locations of the highest peaks as initial guesses for the cluster centers. If you
don't want to use this option, set C<cluster_seeding> to C<random>. That should work
as in the previous version of the module.
lib/Algorithm/KMeans.pm view on Meta::CPAN
=item C<write_clusters_to_files>:
This parameter is also boolean. When set to 1, the clusters are written out to files
that are named in the following manner:
cluster0.txt
cluster1.txt
cluster2.txt
...
...
Before the clusters are written to these files, the module destroys all files with
such names in the directory in which you call the module.
=back
=over
=item B<read_data_from_file()>
$clusterer->read_data_from_file()
=item B<kmeans()>
$clusterer->kmeans();
or
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
The first call above works solely by side-effect. The second call also returns the
clusters and the cluster centers. See the C<cluster_and_visualize.pl> script in the
C<examples> directory for how you can in your own code extract the clusters and the
cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.
=item B<get_K_best()>
$clusterer->get_K_best();
This call makes sense only if you supply either the C<K=0> option to the constructor,
or if you specify values for the C<Kmin> and C<Kmax> options. The C<K=0> and the
C<(Kmin,Kmax)> options cause the module to determine the best value for C<K>.
Remember, C<K> is the number of clusters the data is partitioned into.
=item B<show_QoC_values()>
$clusterer->show_QoC_values();
presents a table with C<K> values in the left column and the corresponding QoC
(Quality-of-Clustering) values in the right column. Note that this call makes sense
only if you either supply the C<K=0> option to the constructor, or if you specify
values for the C<Kmin> and C<Kmax> options.
=item B<visualize_clusters()>
$clusterer->visualize_clusters( $visualization_mask )
The visualization mask here does not have to be identical to the one used for
clustering, but must be a subset of that mask. This is convenient for visualizing
the clusters in two- or three-dimensional subspaces of the original space.
=item B<visualize_data()>
$clusterer->visualize_data($visualization_mask, 'original');
$clusterer->visualize_data($visualization_mask, 'normed');
This method requires a second argument and, as shown, it must be either the string
C<original> or the string C<normed>, the former for the visualization of the raw data
and the latter for the visualization of the data after its different dimensions are
normalized by the standard-deviations along those directions. If you call the method
with the second argument set to C<normed>, but do so without turning on the
C<do_variance_normalization> option in the KMeans constructor, it will let you know.
=item B<which_cluster_for_new_data_element()>
If you wish to determine the cluster membership of a new data sample after you have
created the clusters with the existing data samples, you would need to call this
method. The C<which_cluster_for_new_data.pl> script in the C<examples> directory
shows how to use this method.
=item B<which_cluster_for_new_data_element_mahalanobis()>
This does the same thing as the previous method, except that it determines the
cluster membership using the Mahalanobis distance metric. As for the previous
method, see the C<which_cluster_for_new_data.pl> script in the C<examples> directory
for how to use this method.
=item B<cluster_data_generator()>
Algorithm::KMeans->cluster_data_generator(
input_parameter_file => $parameter_file,
output_datafile => $out_datafile,
number_data_points_per_cluster => 20 );
for generating multivariate data for clustering if you wish to play with synthetic
data for clustering. The input parameter file contains the means and the variances
for the different Gaussians you wish to use for the synthetic data. See the file
C<param.txt> provided in the examples directory. It will be easiest for you to just
edit this file for your data generation needs. In addition to the format of the
parameter file, the main constraint you need to observe in specifying the parameters
is that the dimensionality of the covariance matrix must correspond to the
dimensionality of the mean vectors. The multivariate random numbers are generated by
calling the C<Math::Random> module. As you would expect, this module requires that
the covariance matrices you specify in your parameter file be symmetric and positive
definite. Should the covariances in your parameter file not obey this condition, the
C<Math::Random> module will let you know.
=back
=head1 HOW THE CLUSTERS ARE OUTPUT
When the option C<terminal_output> is set in the call to the constructor, the
clusters are displayed on the terminal screen.
When the option C<write_clusters_to_files> is set in the call to the constructor, the
module dumps the clusters in files named
cluster0.txt
cluster1.txt
cluster2.txt
...
...
in the directory in which you execute the module. The number of such files will
equal the number of clusters formed. All such existing files in the directory are
( run in 1.387 second using v1.01-cache-2.11-cpan-140bd7fdf52 )