Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
"for cluster seeding"),
_var_normalize => $args{do_variance_normalization} || 0,
_use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
_clusters_2_files => $args{write_clusters_to_files} || 0,
_terminal_output => $args{terminal_output} || 0,
_debug => $args{debug} || 0,
_N => 0,
_K_best => 'unknown',
_original_data => {},
_data => {},
_data_id_tags => [],
_QoC_values => {},
_clusters => [],
_cluster_centers => [],
_clusters_hash => {},
_cluster_centers_hash => {},
_cluster_covariances_hash => {},
_data_dimensions => 0,
}, $class;
}
# Dispatch to the CSV or whitespace-delimited (.dat) reader based on the
# data file's extension.  Dies if the extension is neither.
sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    # Anchor on a literal dot: the original patterns /.csv$/ and /.dat$/ left
    # the dot unescaped, so a name like "mycsv" would incorrectly match too.
    if ($filename =~ /\.csv$/) {
        $self->read_data_from_file_csv();
    } elsif ($filename =~ /\.dat$/) {
        $self->read_data_from_file_dat();
    } else {
        # Previously an unrecognized extension was silently ignored, leaving
        # the object with no data loaded at all.
        die "The data file must have a '.csv' or '.dat' extension: $filename\n";
    }
}
# Read a CSV data file into $self->{_original_data} (record ID => array of
# field strings), set $self->{_data} (optionally variance-normalized),
# $self->{_data_id_tags}, $self->{_N}, and $self->{_data_dimensions}.
# Dies if no data file was specified, if it is not a .csv file, or if a
# record's field count does not match the mask size.
sub read_data_from_file_csv {
    my $self = shift;
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    # Verify the extension before touching the filesystem (the original only
    # checked after a successful open, and named the wrong sub in the message).
    die("Aborted. read_data_from_file_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    # The data dimensionality is the number of '1' entries in the mask.
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
    # Three-arg open with a lexical handle (the original used a 2-arg open on
    # the bareword FILEIN and never closed it).  Slurp the whole file and
    # split on whitespace so each CSV record becomes one string.
    open my $fh, '<', $filename or die "Unable to open $filename: $!";
    my @all_data = do { local $/ = undef; split /\s+/, <$fh> };
    close $fh;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {
        my @splits = split /,/, $record;
        # ($! carried no meaning here -- no system call had failed -- so it
        # was dropped from the message.)
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file\n"
            unless scalar(@mask) == scalar(@splits);
        # The first CSV column is the symbolic record ID; the rest is data.
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_original_data} = \%data_hash;
    $self->{_data_id_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
    if ($self->{_var_normalize}) {
        $self->{_data} = variance_normalization( $self->{_original_data} );
    } else {
        $self->{_data} = deep_copy_hash( $self->{_original_data} );
    }
    # Need to make the following call to set the global mean and covariance:
    # my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
    # Need to make the following call to set the global eigenvec eigenval sets:
    # $self->eigen_analysis_of_covariance($covariance);
    # Warn when K is too large relative to N for statistically meaningful
    # clusters (rule of thumb: N > 2*K**2).
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful"
            if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
}
# Read a whitespace-delimited (.dat) data file into
# $self->{_original_data} (record ID => array of numeric field strings),
# set $self->{_data} (optionally variance-normalized),
# $self->{_data_id_tags}, $self->{_N}, and $self->{_data_dimensions}.
# The mask selects columns: 'N' marks the ID column, '1' a data column,
# and '0' a column to ignore.  Blank lines and '#' comment lines are skipped.
sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    # The data dimensionality is the number of '1' entries in the mask.
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
    # Three-arg open with a lexical filehandle (the original used a 2-arg
    # open on the bareword INPUT).
    open my $fh, '<', $datafile or die "unable to open file $datafile: $!\n";
    chomp( my @raw_data = <$fh> );
    close $fh;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;          # skip blank lines
        next if $record =~ /^#/;      # skip comment lines
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {          # column excluded from clustering
                next;
            } elsif ($mask[$i] eq 'N') {     # column holds the record's ID tag
                $record_id = $fields[$i];
            } elsif ($mask[$i] eq '1') {     # column used for clustering
                push @data_fields, $fields[$i];
            } else {
                die "misformed mask for reading the data file\n";
            }
        }
        # Guard against a mask without an 'N': the original would silently
        # store every record under the undef/"" key.
        die "mask contains no 'N' entry for the record ID column\n" unless defined $record_id;
        # The original ran each field through m/$_num_regex/ and then kept $_
        # unchanged, so the match was a no-op; copy the fields directly.
        $self->{_original_data}->{ $record_id } = [ @data_fields ];
    }
    if ($self->{_var_normalize}) {
        $self->{_data} = variance_normalization( $self->{_original_data} );
    } else {
        $self->{_data} = deep_copy_hash( $self->{_original_data} );
    }
    my @all_data_ids = keys %{$self->{_data}};
    $self->{_data_id_tags} = \@all_data_ids;
    $self->{_N} = scalar @all_data_ids;
    # Warn when K is too large relative to N for statistically meaningful
    # clusters (rule of thumb: N > 2*K**2).  ($! carried no meaning in the
    # original message and was dropped.)
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful"
            if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
# coordinates of the data element. Our goal is to find the Mahalanobis distance
# from a given data element to a cluster whose center and covariance are known. As
# for the previous method, it requires three arguments.
# Mahalanobis distance from a data point to a cluster whose center and
# covariance (a Math::GSL::Matrix) are known:
#     sqrt( (x - mu)^T * C^{-1} * (x - mu) )
# Returns undef when no covariance is supplied; dies when the covariance is
# (near-)singular, since it then has no usable inverse.
sub distance_mahalanobis2 {
    my $self = shift;
    my $ele = shift;                 # ref to the array of coords for a point
    my $cluster_center = shift;
    my $cluster_covariance = shift;
    return undef unless defined $cluster_covariance;
    my @diff_from_mean = vector_subtract($ele, $cluster_center);
    # Column vector (x - mu).
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # Compute the determinant once and reuse it.  The original called det()
    # twice and left two unused variables ($det_of_covar, $determinant).
    my $det_of_covar = $cluster_covariance->det();
    # Threshold guards against inverting a numerically singular matrix.
    die "covariance matrix may not have an inverse" unless $det_of_covar > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    # 1x1 matrix holding the squared Mahalanobis distance.
    my $distance = $transposed * $covariance_inverse * $vec;
    my @distance = $distance->as_list;
    return sqrt $distance[0];
}
# Estimate the covariance matrix of one cluster.  $cluster is a ref to an
# array of record tags; the coordinates live in $self->{_data}.  Returns a
# Math::GSL::Matrix computed as (1/N) * D * D^T, where column j of D is the
# j-th member's coordinates minus the cluster mean.
sub estimate_cluster_covariance {
    my ($self, $cluster) = @_;
    my $cluster_size = @$cluster;
    # Cluster mean: component-wise sum of member coordinates divided by size.
    my @cluster_center = map { $_ / $cluster_size } @{ $self->add_point_coords($cluster) };
    # for covariance calculation:
    my ($num_rows, $num_cols) = ($self->{_data_dimensions}, scalar @$cluster);
    # (The original also allocated a $mean_vec matrix here that was never
    # used; the dead allocation has been removed.)
    my $matrix = Math::GSL::Matrix->new($num_rows, $num_cols);
    # All the record labels are stored in the array $self->{_data_id_tags}. The
    # actual data for clustering is stored in a hash at $self->{_data} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $cluster->[$j];
        my $data = $self->{_data}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);
    }
    my $covariance = $matrix * transpose( $matrix );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}
# Write each cluster's record tags (sorted, space-separated) to a file named
# "cluster<i>.txt", removing any stale cluster files from previous runs first.
sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    # The original deleted only "cluster*.dat" while writing "cluster<i>.txt",
    # so .txt files left over from a run with more clusters were never cleaned
    # up.  Remove both patterns.
    unlink glob "cluster*.dat";
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        # Sort in Perl and write directly with a 3-arg lexical open.  The
        # original piped through the external "sort" utility, which was both
        # unportable and ineffective: every tag was printed on a single line,
        # so sort had nothing to reorder.
        open my $fh, '>', $filename or die "Unable to open file $filename: $!";
        foreach my $ele (sort @{$clusters[$i]}) {
            print $fh "$ele ";
        }
        # Buffered write errors only surface at close, so check it.
        close $fh or die "Unable to close file $filename: $!";
    }
}
# Return the best number of clusters found by a K=0 run.  Croaks if the
# clusterer has not yet searched for it (i.e. _K_best is still 'unknown').
sub get_K_best {
    my $self = shift;
    my $best_k = $self->{_K_best};
    croak "You need to run the clusterer with K=0 option " .
          "before you can call this method" if $best_k eq 'unknown';
    if ($self->{_terminal_output}) {
        print "\nThe best value of K: $best_k\n";
    }
    return $best_k;
}
# Print the QoC (Quality of Clustering) value recorded for each K tried.
# Only meaningful after a K=0 run; croaks otherwise.
sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method"
        if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    my $qoc_for = $self->{_QoC_values};
    for my $k (sort keys %$qoc_for) {
        # Some K values may have no recorded QoC; skip those.
        next unless defined $qoc_for->{$k};
        print " $k => $qoc_for->{$k}\n";
    }
}
# Destructor: remove the temporary scratch files whose names are derived
# from the data file's basename.
sub DESTROY {
    my $self = shift;
    my $base = basename($self->{_datafile});
    unlink "__temp_$base", "__temp_data_$base", "__temp_normed_data_$base";
}
################################## Visualization Code ###################################
# It makes sense to call visualize_clusters() only AFTER you have called kmeans().
#
# The visualize_clusters() implementation automatically figures out whether it
# should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
# supplied as one of the arguments is greater than 2, it does a 3D plot for the
# first three data coordinates. That is, the clusters will be displayed in the 3D
# space formed by the first three data coordinates. On the other hand, if the number
# of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
# only one on bit is specified for the mask, visualize_clusters() aborts.
#
# The visualization code consists of first accessing each of the clusters created by the
# kmeans() subroutine. Note that the clusters contain only the symbolic names for
# the individual records in the source data file. We therefore next reach into the
# $self->{_original_data} hash and get the data coordinates associated with each
# symbolic label in a cluster. The numerical data thus generated is then written
# out to a temp file. When doing so we must remember to insert TWO BLANK LINES
# between the data blocks corresponding to the different clusters. This constraint
# is imposed on us by Gnuplot when plotting data from the same file since we want to
# use different point styles for the data points in different cluster files.
#
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
# module to plot the data clusters.
lib/Algorithm/KMeans.pm view on Meta::CPAN
You could, for example, create your own class by subclassing from the class presented
here and, in your subclass, use your own criteria for the similarity distance between
the data points and for the QoC (Quality of Clustering) metric, and, possibly a
different rule to stop the iterations. Version 1.1 also allows you to directly
access the clusters formed and the cluster centers in your calling script.
=head1 SPECIAL USAGE NOTE
If you were directly accessing in your own scripts the clusters produced by the older
versions of this module, you'd need to make changes to your code if you wish to use
Version 2.0 or higher. Instead of returning arrays of clusters and cluster centers,
Versions 2.0 and higher return hashes. This change was made necessary by the logic
required for implementing the two new C<which_cluster> methods that were introduced
in Version 2.0. These methods return the best cluster for a new data sample from the
clusters you created using the existing data. One of the C<which_cluster> methods is
based on the Euclidean metric for finding the cluster that is closest to the new data
sample, and the other on the Mahalanobis metric. Another point of incompatibility
with the previous versions is that you must now explicitly set the C<cluster_seeding>
parameter in the call to the constructor to either C<random> or C<smart>. This
parameter does not have a default associated with it starting with Version 2.0.
=head1 DESCRIPTION
Clustering with K-Means takes place iteratively and involves two steps: 1) assignment
of data samples to clusters on the basis of how far the data samples are from the
cluster centers; and 2) Recalculation of the cluster centers (and cluster covariances
if you are using the Mahalanobis distance metric for clustering).
Obviously, before the two-step approach can proceed, we need to initialize the
cluster centers. How this initialization is carried out is important. The module
gives you two very different ways for carrying out this initialization. One option,
called the C<smart> option, consists of subjecting the data to principal components
analysis to discover the direction of maximum variance in the data space. The data
points are then projected on to this direction and a histogram constructed from the
projections. Centers of the smoothed histogram are used to seed the clustering
operation. The other option is to choose the cluster centers purely randomly. You
get the first option if you set C<cluster_seeding> to C<smart> in the constructor,
and you get the second option if you set it to C<random>.
How to specify the number of clusters, C<K>, is one of the most vexing issues in any
approach to clustering. In some cases, we can set C<K> on the basis of prior
knowledge. But, more often than not, no such prior knowledge is available. When the
programmer does not explicitly specify a value for C<K>, the approach taken in the
current implementation is to try all possible values between 2 and some largest
possible value that makes statistical sense. We then choose that value for C<K>
which yields the best value for the QoC (Quality of Clustering) metric. It is
generally believed that the largest value for C<K> should not exceed C<sqrt(N/2)>
where C<N> is the number of data samples to be clustered.
What to use for the QoC metric is obviously a critical issue unto itself. In the
current implementation, the value of QoC is the ratio of the average radius of the
clusters and the average distance between the cluster centers.
Every iterative algorithm requires a stopping criterion. The criterion implemented
here is that we stop iterations when there is no re-assignment of the data points
during the assignment step.
Ordinarily, the output produced by a K-Means clusterer will correspond to a local
minimum for the QoC values, as opposed to a global minimum. The current
implementation protects against that when the module constructor is called with the
C<random> option for C<cluster_seeding> by trying different randomly selected initial
cluster centers and then selecting the one that gives the best overall QoC value.
A K-Means clusterer will generally produce good results if the overlap between the
clusters is minimal and if each cluster exhibits variability that is uniform in all
directions. When the data variability is different along the different directions in
the data space, the results you obtain with a K-Means clusterer may be improved by
first normalizing the data appropriately, as can be done in this module when you set
the C<do_variance_normalization> option in the constructor. However, as pointed out
elsewhere in this documentation, such normalization may actually decrease the
performance of the clusterer if the overall data variability along any dimension is
more a result of separation between the means than a consequence of intra-cluster
variability.
=head1 METHODS
The module provides the following methods for clustering, for cluster visualization,
for data visualization, for the generation of data for testing a clustering
algorithm, and for determining the cluster membership of a new data sample:
=over 4
=item B<new():>
my $clusterer = Algorithm::KMeans->new(datafile => $datafile,
mask => $mask,
K => $K,
cluster_seeding => 'random', # also try 'smart'
use_mahalanobis_metric => 1, # also try '0'
terminal_output => 1,
write_clusters_to_files => 1,
);
A call to C<new()> constructs a new instance of the C<Algorithm::KMeans> class. When
C<$K> is a non-zero positive integer, the module will construct exactly that many
clusters. However, when C<$K> is 0, the module will find the best number of clusters
to partition the data into. As explained in the Description, setting
C<cluster_seeding> to C<smart> causes PCA (principal components analysis) to be used
for discovering the best choices for the initial cluster centers. If you want purely
random decisions to be made for the initial choices for the cluster centers, set
C<cluster_seeding> to C<random>.
The data file is expected to contain entries in the following format
c20 0 10.7087017086940 9.63528386251712 10.9512155258108 ...
c7 0 12.8025925026787 10.6126270065785 10.5228482095349 ...
b9 0 7.60118206283120 5.05889245193079 5.82841781759102 ...
....
....
where the first column contains the symbolic ID tag for each data record and the rest
of the columns the numerical information. As to which columns are actually used for
clustering is decided by the string value of the mask. For example, if we wanted to
cluster on the basis of the entries in just the 3rd, the 4th, and the 5th columns
above, the mask value would be C<N0111> where the character C<N> indicates that the
ID tag is in the first column, the character C<0> that the second column is to be
ignored, and the C<1>'s that follow that the 3rd, the 4th, and the 5th columns are to
be used for clustering.
( run in 0.804 second using v1.01-cache-2.11-cpan-39bf76dae61 )