Algorithm-KMeans
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
foreach my $cluster (@$clusters) {
my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index];
foreach my $ele (@$cluster) {
my @mahalanobis_dist_from_clust_centers;
foreach my $i (0..@$clusters-1) {
my $center = $cluster_centers->[$i];
my $covariance = $cluster_covariances->[$i];
my $maha_distance;
eval {
$maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
};
next if $@;
push @mahalanobis_dist_from_clust_centers, $maha_distance;
}
my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
die "The Mahalanobis metric may not be appropriate for the data"
unless defined $best_center_index;
my $best_cluster_center = $cluster_centers->[$best_center_index];
if (vector_equal($current_cluster_center, $best_cluster_center)){
push @{$new_clusters->[$current_cluster_center_index]}, $ele;
} else {
$assignment_changed_flag = 1;
push @{$new_clusters->[$best_center_index]}, $ele;
}
}
$current_cluster_center_index++;
}
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
# Now make sure that none of the K clusters is an empty cluster:
foreach my $newcluster (@$new_clusters) {
$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
}
push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
# During clustering for a fixed K, should a cluster inadvertently
# become empty, steal a member from the largest cluster to hopefully
# spawn a new cluster:
my $largest_cluster;
foreach my $local_cluster (@$new_clusters) {
next if !defined $local_cluster;
$largest_cluster = $local_cluster if !defined $largest_cluster;
if (@$local_cluster > @$largest_cluster) {
$largest_cluster = $local_cluster;
}
}
foreach my $local_cluster (@$new_clusters) {
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
push @$local_cluster, pop @$largest_cluster;
}
}
next if (($cluster_size_zero_condition) && ($iteration_index < 100));
last if $iteration_index == 100;
# Now do a deep copy of new_clusters into clusters
$clusters = deep_copy_AoA( $new_clusters );
last if $assignment_changed_flag == 0;
}
$final_cluster_centers = $self->update_cluster_centers( $clusters );
return ($clusters, $final_cluster_centers);
}
sub update_cluster_centers_and_covariances_mahalanobis {
my $self = shift;
my @clusters = @{ shift @_ };
my @new_cluster_centers;
my @new_cluster_covariances;
# During clustering for a fixed K, should a cluster inadvertently become empty,
# steal a member from the largest cluster to hopefully spawn a new cluster:
my $largest_cluster;
foreach my $cluster (@clusters) {
next if !defined $cluster;
$largest_cluster = $cluster if !defined $largest_cluster;
if (@$cluster > @$largest_cluster) {
$largest_cluster = $cluster;
}
}
foreach my $cluster (@clusters) {
if ( (!defined $cluster) || (@$cluster == 0) ) {
push @$cluster, pop @$largest_cluster;
}
}
foreach my $cluster (@clusters) {
die "Cluster became empty --- untenable condition " .
"for a given K. Try again. \n" if !defined $cluster;
my $cluster_size = @$cluster;
die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
my @new_cluster_center = @{$self->add_point_coords( $cluster )};
@new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
push @new_cluster_centers, \@new_cluster_center;
# for covariance calculation:
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# All the record labels are stored in the array $self->{_data_id_tags}. The
# actual data for clustering is stored in a hash at $self->{_data} whose keys are
# the record labels; the value associated with each key is the array holding the
# corresponding numerical multidimensional data.
foreach my $j (0..$num_cols-1) {
my $tag = $cluster->[$j];
my $data = $self->{_data}->{$tag};
my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
$matrix->set_col($j, \@diff_from_mean);
}
my $transposed = transpose( $matrix );
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for cluster:";
display_matrix( $covariance );
}
push @new_cluster_covariances, $covariance;
}
return [\@new_cluster_centers, \@new_cluster_covariances];
}
# After each new assignment of the data points to the clusters on the basis of the
# current values for the cluster centers, we call the routine shown here for updating
# the values of the cluster centers.
sub update_cluster_centers {
my $self = shift;
my @clusters = @{ shift @_ };
my @new_cluster_centers;
# During clustering for a fixed K, should a cluster inadvertently become empty,
# steal a member from the largest cluster to hopefully spawn a new cluster:
my $largest_cluster;
foreach my $cluster (@clusters) {
next if !defined $cluster;
$largest_cluster = $cluster if !defined $largest_cluster;
if (@$cluster > @$largest_cluster) {
$largest_cluster = $cluster;
}
}
foreach my $cluster (@clusters) {
if ( (!defined $cluster) || (@$cluster == 0) ) {
push @$cluster, pop @$largest_cluster;
}
}
foreach my $cluster (@clusters) {
die "Cluster became empty --- untenable condition " .
"for a given K. Try again. \n" if !defined $cluster;
my $cluster_size = @$cluster;
die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
my @new_cluster_center = @{$self->add_point_coords( $cluster )};
@new_cluster_center = map {my $x = $_/$cluster_size; $x}
@new_cluster_center;
push @new_cluster_centers, \@new_cluster_center;
}
return \@new_cluster_centers;
}
sub which_cluster_for_new_data_element {
my $self = shift;
my $ele = shift;
die "The dimensionality of the new data element is not correct: $!"
unless @$ele == $self->{_data_dimensions};
my %distance_to_new_ele_hash;
foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
$distance_to_new_ele_hash{$cluster_id} = $self->distance2($ele,
$self->{_cluster_centers_hash}->{$cluster_id});
}
my @values = values %distance_to_new_ele_hash;
my ($min,$max) = minmax(\@values);
my $answer;
foreach my $cluster_id (keys %distance_to_new_ele_hash) {
$answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
}
return $answer;
}
sub which_cluster_for_new_data_element_mahalanobis {
my $self = shift;
my $ele = shift;
die "The dimensionality of the new data element is not correct: $!"
unless @$ele == $self->{_data_dimensions};
my %distance_to_new_ele_hash;
foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
$distance_to_new_ele_hash{$cluster_id} =
$self->distance_mahalanobis2($ele, $self->{_cluster_centers_hash}->{$cluster_id},
$self->{_cluster_covariances_hash}->{$cluster_id});
view all matches for this distributionview release on metacpan - search on metacpan
( run in 0.433 second using v1.00-cache-2.02-grep-82fe00e-cpan-b63e86051f13 )