Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
push @array_of_clusters, $clusters;
push @array_of_cluster_centers, $cluster_centers;
}
my ($min, $max) = minmax( \@QoC_values );
my $K_best_relative_to_Kmin = get_index_at_value($min, \@QoC_values );
my $K_best = $K_best_relative_to_Kmin + $Kmin;
if ($self->{_terminal_output}) {
print "\nDisplaying final clusters for best K (= $K_best) :\n";
display_clusters( $array_of_clusters[$K_best_relative_to_Kmin] );
$self->display_cluster_centers($array_of_clusters[$K_best_relative_to_Kmin]);
print "\nBest clustering achieved for K=$K_best with QoC = $min\n" if defined $min;
my @printableQoC = grep {$_} @QoC_values;
print "\nQoC values array (the smaller the value, the better it is) for different " .
"K starting with K=$Kmin: @printableQoC\n";
}
$self->{_K_best} = $K_best;
foreach my $i (0..@QoC_values-1) {
my $k = $i + $Kmin;
$self->{_QoC_values}->{"$k"} = $QoC_values[$i];
}
$self->{_clusters} = $array_of_clusters[$K_best_relative_to_Kmin];
$self->{_cluster_centers} =
$array_of_cluster_centers[$K_best_relative_to_Kmin];
}
# This is the function to call if you already know what value you want to use for K,
# the number of expected clusters. The purpose of this function is to do the
# initialization of the cluster centers and to carry out the initial assignment of
# the data to the clusters with the initial cluster centers. The initialization
# consists of 3 steps: Construct a random sequence of K integers between 0 and N-1
# where N is the number of data points to be clustered; 2) Call
# get_initial_cluster_centers() to index into the data array with the random integers
# to get a list of K data points that would serve as the initial cluster centers; and
# (3) Call assign_data_to_clusters_initial() to assign the rest of the data to each
# of the K clusters on the basis of the proximity to the cluster centers.
sub cluster_for_given_K {
my $self = shift;
my $K = shift;
my @all_data_ids = @{$self->{_data_id_tags}};
my $cluster_centers;
if ($self->{_cluster_seeding} eq 'smart') {
$cluster_centers = $self->get_initial_cluster_centers_smart($K);
} elsif ($self->{_cluster_seeding} eq 'random') {
$cluster_centers = $self->get_initial_cluster_centers($K);
} else {
die "You must either choose 'smart' for cluster_seeding or 'random'. " .
"Fix your constructor call."
}
my $clusters;
if ($self->{_use_mahalanobis_metric}) {
my $clusters_and_determinants =
$self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
$clusters = $clusters_and_determinants->[0];
my @determinants = @{$clusters_and_determinants->[1]};
my ($min,$max) = minmax(\@determinants);
die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
"least one cluster is too small" if $min / $max < 0.001;
} else {
$clusters = $self->assign_data_to_clusters_initial($cluster_centers);
}
my $cluster_nonexistent_flag = 0;
foreach my $trial (0..2) {
if ($self->{_use_mahalanobis_metric}) {
($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
} else {
($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
}
my $num_of_clusters_returned = @$clusters;
foreach my $cluster (@$clusters) {
$cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0));
}
last unless $cluster_nonexistent_flag;
}
return ($clusters, $cluster_centers);
}
# This function is used when you set the "cluster_seeding" option to 'random' in the
# constructor. Returns a set of K random integers. These serve as indices to reach
# into the data array. A data element whose index is one of the random numbers
# returned by this routine serves as an initial cluster center. Note the quality
# check it runs on the list of K random integers constructed. We first make sure
# that all K random integers are different. Subsequently, we carry out a quality
# assessment of the K random integers constructed. This quality measure consists of
# the ratio of the values spanned by the random integers to the value of N, the total
# number of data points to be clustered. Currently, if this ratio is less than 0.3,
# we discard the K integers and try again.
sub initialize_cluster_centers {
my $self = shift;
my $K = shift;
my $data_store_size = $self->{_N};
my @cluster_center_indices;
while (1) {
foreach my $i (0..$K-1) {
$cluster_center_indices[$i] = int rand( $data_store_size );
next if $i == 0;
foreach my $j (0..$i-1) {
while ( $cluster_center_indices[$j] ==
$cluster_center_indices[$i] ) {
my $old = $cluster_center_indices[$i];
$cluster_center_indices[$i] = int rand($data_store_size);
}
}
}
my ($min,$max) = minmax(\@cluster_center_indices );
my $quality = ($max - $min) / $data_store_size;
last if $quality > 0.3;
}
return @cluster_center_indices;
}
# This function is used when you set the "cluster_seeding" option to 'random' in the
# constructor. This routine merely reaches into the data array with the random
# integers, as constructed by the previous routine, serving as indices and fetching
# values corresponding to those indices. The fetched data samples serve as the
# initial cluster centers.
sub get_initial_cluster_centers {
my $self = shift;
my $K = shift;
my @cluster_center_indices = $self->initialize_cluster_centers($K);
my @result;
foreach my $i (@cluster_center_indices) {
my $tag = $self->{_data_id_tags}[$i];
push @result, $self->{_data}->{$tag};
}
return \@result;
}
# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor. It subjects the data to a principal components analysis
# to figure out the direction of maximal variance. Subsequently, it tries to locate
# K peaks in a smoothed histogram of the data points projected onto the maximal
# variance direction.
lib/Algorithm/KMeans.pm view on Meta::CPAN
my $self = shift;
my @cluster_centers = @{ shift @_ };
my @clusters;
foreach my $ele (@{$self->{_data_id_tags}}) {
my $best_cluster;
my @dist_from_clust_centers;
foreach my $center (@cluster_centers) {
push @dist_from_clust_centers, $self->distance($ele, $center);
}
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
push @{$clusters[$best_center_index]}, $ele;
}
return \@clusters;
}
sub assign_data_to_clusters_initial_mahalanobis {
my $self = shift;
my @cluster_centers = @{ shift @_ };
my @clusters;
foreach my $ele (@{$self->{_data_id_tags}}) {
my $best_cluster;
my @dist_from_clust_centers;
foreach my $center (@cluster_centers) {
push @dist_from_clust_centers, $self->distance($ele, $center);
}
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
push @{$clusters[$best_center_index]}, $ele if defined $best_center_index;
}
# Since a cluster center may not correspond to any particular sample, it is possible
# for one of the elements of the array @clusters to be null using the above
# strategy for populating the initial clusters. Let's say there are five cluster
# centers in the array @cluster_centers. The $best_center_index may populate the
# the elements of the array @clusters for the indices 0, 1, 2, 4, which would leave
# $clusters[3] as undefined. So, in what follows, we must first check if all of
# the elements of @clusters are defined.
my @determinants;
foreach my $cluster(@clusters) {
die "The clustering program started with bad initialization. Please start over"
unless defined $cluster;
my $covariance = $self->estimate_cluster_covariance($cluster);
my $determinant = $covariance->det();
push @determinants, $determinant;
}
return [\@clusters, \@determinants];
}
# This is the main routine that along with the update_cluster_centers() routine
# constitute the two key steps of the K-Means algorithm. In most cases, the infinite
# while() loop will terminate automatically when the cluster assignments of the data
# points remain unchanged. For the sake of safety, we keep track of the number of
# iterations. If this number reaches 100, we exit the while() loop anyway. In most
# cases, this limit will not be reached.
sub assign_data_to_clusters {
my $self = shift;
my $clusters = shift;
my $K = shift;
my $final_cluster_centers;
my $iteration_index = 0;
while (1) {
my $new_clusters;
my $assignment_changed_flag = 0;
my $current_cluster_center_index = 0;
my $cluster_size_zero_condition = 0;
my $how_many = @$clusters;
my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
$iteration_index++;
foreach my $cluster (@$clusters) {
my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
foreach my $ele (@$cluster) {
my @dist_from_clust_centers;
foreach my $center (@$cluster_centers) {
push @dist_from_clust_centers,
$self->distance($ele, $center);
}
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
my $best_cluster_center = $cluster_centers->[$best_center_index];
if (vector_equal($current_cluster_center, $best_cluster_center)){
push @{$new_clusters->[$current_cluster_center_index]}, $ele;
} else {
$assignment_changed_flag = 1;
push @{$new_clusters->[$best_center_index]}, $ele;
}
}
$current_cluster_center_index++;
}
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
# Now make sure that none of the K clusters is an empty cluster:
foreach my $newcluster (@$new_clusters) {
$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
}
push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
# During clustering for a fixed K, should a cluster inadvertently
# become empty, steal a member from the largest cluster to hopefully
# spawn a new cluster:
my $largest_cluster;
foreach my $local_cluster (@$new_clusters) {
next if !defined $local_cluster;
$largest_cluster = $local_cluster if !defined $largest_cluster;
if (@$local_cluster > @$largest_cluster) {
$largest_cluster = $local_cluster;
}
}
foreach my $local_cluster (@$new_clusters) {
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
push @$local_cluster, pop @$largest_cluster;
}
}
next if (($cluster_size_zero_condition) && ($iteration_index < 100));
last if $iteration_index == 100;
$clusters = deep_copy_AoA( $new_clusters );
last if $assignment_changed_flag == 0;
}
$final_cluster_centers = $self->update_cluster_centers( $clusters );
return ($clusters, $final_cluster_centers);
}
sub assign_data_to_clusters_mahalanobis {
my $self = shift;
my $clusters = shift;
my $K = shift;
my $final_cluster_centers;
my $iteration_index = 0;
while (1) {
my $new_clusters;
my $assignment_changed_flag = 0;
my $current_cluster_center_index = 0;
my $cluster_size_zero_condition = 0;
my $how_many = @$clusters;
my $cluster_centers_and_covariances =
$self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
my $cluster_centers = $cluster_centers_and_covariances->[0];
my $cluster_covariances = $cluster_centers_and_covariances->[1];
$iteration_index++;
foreach my $cluster (@$clusters) {
my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
foreach my $ele (@$cluster) {
my @mahalanobis_dist_from_clust_centers;
foreach my $i (0..@$clusters-1) {
my $center = $cluster_centers->[$i];
my $covariance = $cluster_covariances->[$i];
my $maha_distance;
eval {
$maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
};
next if $@;
push @mahalanobis_dist_from_clust_centers, $maha_distance;
}
my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
die "The Mahalanobis metric may not be appropriate for the data"
unless defined $best_center_index;
my $best_cluster_center = $cluster_centers->[$best_center_index];
if (vector_equal($current_cluster_center, $best_cluster_center)){
push @{$new_clusters->[$current_cluster_center_index]}, $ele;
} else {
$assignment_changed_flag = 1;
push @{$new_clusters->[$best_center_index]}, $ele;
}
}
$current_cluster_center_index++;
}
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
# Now make sure that none of the K clusters is an empty cluster:
foreach my $newcluster (@$new_clusters) {
$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
}
push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
# During clustering for a fixed K, should a cluster inadvertently
# become empty, steal a member from the largest cluster to hopefully
# spawn a new cluster:
my $largest_cluster;
foreach my $local_cluster (@$new_clusters) {
next if !defined $local_cluster;
$largest_cluster = $local_cluster if !defined $largest_cluster;
if (@$local_cluster > @$largest_cluster) {
$largest_cluster = $local_cluster;
}
}
foreach my $local_cluster (@$new_clusters) {
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
push @$local_cluster, pop @$largest_cluster;
}
}
next if (($cluster_size_zero_condition) && ($iteration_index < 100));
last if $iteration_index == 100;
# Now do a deep copy of new_clusters into clusters
$clusters = deep_copy_AoA( $new_clusters );
last if $assignment_changed_flag == 0;
}
$final_cluster_centers = $self->update_cluster_centers( $clusters );
return ($clusters, $final_cluster_centers);
}
sub update_cluster_centers_and_covariances_mahalanobis {
my $self = shift;
my @clusters = @{ shift @_ };
my @new_cluster_centers;
my @new_cluster_covariances;
# During clustering for a fixed K, should a cluster inadvertently become empty,
# steal a member from the largest cluster to hopefully spawn a new cluster:
my $largest_cluster;
foreach my $cluster (@clusters) {
next if !defined $cluster;
$largest_cluster = $cluster if !defined $largest_cluster;
if (@$cluster > @$largest_cluster) {
$largest_cluster = $cluster;
}
}
foreach my $cluster (@clusters) {
if ( (!defined $cluster) || (@$cluster == 0) ) {
push @$cluster, pop @$largest_cluster;
}
}
foreach my $cluster (@clusters) {
die "Cluster became empty --- untenable condition " .
"for a given K. Try again. \n" if !defined $cluster;
my $cluster_size = @$cluster;
die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
my @new_cluster_center = @{$self->add_point_coords( $cluster )};
@new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
push @new_cluster_centers, \@new_cluster_center;
# for covariance calculation:
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# All the record labels are stored in the array $self->{_data_id_tags}. The
# actual data for clustering is stored in a hash at $self->{_data} whose keys are
# the record labels; the value associated with each key is the array holding the
# corresponding numerical multidimensional data.
foreach my $j (0..$num_cols-1) {
my $tag = $cluster->[$j];
my $data = $self->{_data}->{$tag};
my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
$matrix->set_col($j, \@diff_from_mean);
}
my $transposed = transpose( $matrix );
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for cluster:";
display_matrix( $covariance );
}
push @new_cluster_covariances, $covariance;
}
return [\@new_cluster_centers, \@new_cluster_covariances];
}
# After each new assignment of the data points to the clusters on the basis of the
lib/Algorithm/KMeans.pm view on Meta::CPAN
}
}
sub variance_normalization {
my %data_hash = %{shift @_};
my @all_data_points = values %data_hash;
my $dimensions = @{$all_data_points[0]};
my @data_projections;
foreach my $data_point (@all_data_points) {
my $i = 0;
foreach my $proj (@$data_point) {
push @{$data_projections[$i++]}, $proj;
}
}
my @variance_vec;
foreach my $vec (@data_projections) {
my ($mean, $variance) = mean_and_variance( $vec );
push @variance_vec, $variance;
}
my %new_data_hash;
while (my ($label, $data) = each(%data_hash) ) {
my @new_data;
foreach my $i (0..@{$data}-1) {
my $new = $data->[$i] / sqrt($variance_vec[$i]);
push @new_data, $data->[$i] / sqrt($variance_vec[$i]);
}
$new_data_hash{$label} = \@new_data;
}
return \%new_data_hash;
}
sub mean_and_variance {
my @data = @{shift @_};
my ($mean, $variance);
foreach my $i (1..@data) {
if ($i == 1) {
$mean = $data[0];
$variance = 0;
} else {
$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
$variance = ( (($i-1)/$i) * $variance ) + ($data[$i-1]-$mean)**2 / ($i-1);
}
}
return ($mean, $variance);
}
sub check_for_illegal_params {
my @params = @_;
my @legal_params = qw / datafile
mask
K
Kmin
Kmax
terminal_output
write_clusters_to_files
do_variance_normalization
cluster_seeding
use_mahalanobis_metric
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub get_value_index_hash {
my $arr = shift;
my %hash;
foreach my $index (0..@$arr-1) {
$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
}
return \%hash;
}
sub non_maximum_suppression {
my $arr = shift;
my @output = (0) x @$arr;
my @final_output = (0) x @$arr;
my %hash;
my @array_of_runs = ([$arr->[0]]);
foreach my $index (1..@$arr-1) {
if ($arr->[$index] == $arr->[$index-1]) {
push @{$array_of_runs[-1]}, $arr->[$index];
} else {
push @array_of_runs, [$arr->[$index]];
}
}
my $runstart_index = 0;
foreach my $run_index (1..@array_of_runs-2) {
$runstart_index += @{$array_of_runs[$run_index-1]};
if ($array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index-1]->[0] &&
$array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index+1]->[0]) {
my $run_center = @{$array_of_runs[$run_index]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
}
if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
$runstart_index += @{$array_of_runs[-2]};
my $run_center = @{$array_of_runs[-1]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
my $run_center = @{$array_of_runs[0]} / 2;
$output[$run_center] = $arr->[$run_center];
}
return \@output;
}
sub display_matrix {
my $matrix = shift;
my $nrows = $matrix->rows();
my $ncols = $matrix->cols();
print "\n\nDisplaying matrix of size $nrows rows and $ncols columns:\n";
foreach my $i (0..$nrows-1) {
my $row = $matrix->row($i);
my @row_as_list = $row->as_list;
print "@row_as_list\n";
}
print "\n\n";
lib/Algorithm/KMeans.pm view on Meta::CPAN
# Now construct an instance of the clusterer. The parameter K controls the number
# of clusters. If you know how many clusters you want (let's say 3), call
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => $mask,
K => 3,
cluster_seeding => 'random',
terminal_output => 1,
write_clusters_to_files => 1,
);
# By default, this constructor call will set you up for clustering based on
# Euclidean distances. If you want the module to use Mahalanobis distances, your
# constructor call will look like:
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => $mask,
K => 3,
cluster_seeding => 'random',
use_mahalanobis_metric => 1,
terminal_output => 1,
write_clusters_to_files => 1,
);
# For both constructor calls shown above, you can use smart seeding of the clusters
# by changing 'random' to 'smart' for the cluster_seeding option. See the
# explanation of smart seeding in the Methods section of this documentation.
# If your data is such that its variability along the different dimensions of the
# data space is significantly different, you may get better clustering if you first
# normalize your data by setting the constructor parameter
# do_variance_normalization as shown below:
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => $mask,
K => 3,
cluster_seeding => 'smart', # or 'random'
terminal_output => 1,
do_variance_normalization => 1,
write_clusters_to_files => 1,
);
# But bear in mind that such data normalization may actually decrease the
# performance of the clusterer if the variability in the data is more a result of
# the separation between the means than a consequence of intra-cluster variance.
# Set K to 0 if you want the module to figure out the optimum number of clusters
# from the data. (It is best to run this option with the terminal_output set to 1
# so that you can see the different value of QoC for the different K):
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => $mask,
K => 0,
cluster_seeding => 'random', # or 'smart'
terminal_output => 1,
write_clusters_to_files => 1,
);
# Although not shown above, you can obviously set the 'do_variance_normalization'
# flag here also if you wish.
# For very large data files, setting K to 0 will result in searching through too
# many values for K. For such cases, you can range limit the values of K to search
# through by
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => "N111",
Kmin => 3,
Kmax => 10,
cluster_seeding => 'random', # or 'smart'
terminal_output => 1,
write_clusters_to_files => 1,
);
# FOR ALL CASES ABOVE, YOU'D NEED TO MAKE THE FOLLOWING CALLS ON THE CLUSTERER
# INSTANCE TO ACTUALLY CLUSTER THE DATA:
$clusterer->read_data_from_file();
$clusterer->kmeans();
# If you want to directly access the clusters and the cluster centers in your own
# top-level script, replace the above two statements with:
$clusterer->read_data_from_file();
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
# You can subsequently access the clusters directly in your own code, as in:
foreach my $cluster_id (sort keys %{$clusters_hash}) {
print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
}
foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
}
# CLUSTER VISUALIZATION:
# You must first set the mask for cluster visualization. This mask tells the module
# which 2D or 3D subspace of the original data space you wish to visualize the
# clusters in:
my $visualization_mask = "111";
$clusterer->visualize_clusters($visualization_mask);
# SYNTHETIC DATA GENERATION:
# The module has been provided with a class method for generating multivariate data
# for experimenting with clustering. The data generation is controlled by the
# contents of the parameter file that is supplied as an argument to the data
# generator method. The mean and covariance matrix entries in the parameter file
# must be according to the syntax shown in the param.txt file in the examples
# directory. It is best to edit this file as needed:
my $parameter_file = "param.txt";
my $out_datafile = "mydatafile.dat";
Algorithm::KMeans->cluster_data_generator(
input_parameter_file => $parameter_file,
output_datafile => $out_datafile,
( run in 0.992 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )