Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
my $clusters_and_determinants =
$self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);
$clusters = $clusters_and_determinants->[0];
my @determinants = @{$clusters_and_determinants->[1]};
my ($min,$max) = minmax(\@determinants);
die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
"least one cluster is too small" if $min / $max < 0.001;
} else {
$clusters = $self->assign_data_to_clusters_initial($cluster_centers);
}
my $cluster_nonexistent_flag = 0;
foreach my $trial (0..2) {
if ($self->{_use_mahalanobis_metric}) {
($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
} else {
($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
}
my $num_of_clusters_returned = @$clusters;
foreach my $cluster (@$clusters) {
$cluster_nonexistent_flag = 1 if ((!defined $cluster) || (@$cluster == 0));
}
last unless $cluster_nonexistent_flag;
}
return ($clusters, $cluster_centers);
}
# This function is used when you set the "cluster_seeding" option to 'random' in the
# constructor. Returns a set of K random integers. These serve as indices to reach
# into the data array. A data element whose index is one of the random numbers
# returned by this routine serves as an initial cluster center. Note the quality
# check it runs on the list of K random integers constructed. We first make sure
# that all K random integers are different. Subsequently, we carry out a quality
lib/Algorithm/KMeans.pm view on Meta::CPAN
# iterations. If this number reaches 100, we exit the while() loop anyway. In most
# cases, this limit will not be reached.
sub assign_data_to_clusters {
my $self = shift;
my $clusters = shift;
my $K = shift;
my $final_cluster_centers;
my $iteration_index = 0;
while (1) {
my $new_clusters;
my $assignment_changed_flag = 0;
my $current_cluster_center_index = 0;
my $cluster_size_zero_condition = 0;
my $how_many = @$clusters;
my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
$iteration_index++;
foreach my $cluster (@$clusters) {
my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
foreach my $ele (@$cluster) {
my @dist_from_clust_centers;
foreach my $center (@$cluster_centers) {
push @dist_from_clust_centers,
$self->distance($ele, $center);
}
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
my $best_cluster_center = $cluster_centers->[$best_center_index];
if (vector_equal($current_cluster_center, $best_cluster_center)){
push @{$new_clusters->[$current_cluster_center_index]}, $ele;
} else {
$assignment_changed_flag = 1;
push @{$new_clusters->[$best_center_index]}, $ele;
}
}
$current_cluster_center_index++;
}
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
# Now make sure that none of the K clusters is an empty cluster:
foreach my $newcluster (@$new_clusters) {
$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
}
}
foreach my $local_cluster (@$new_clusters) {
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
push @$local_cluster, pop @$largest_cluster;
}
}
next if (($cluster_size_zero_condition) && ($iteration_index < 100));
last if $iteration_index == 100;
$clusters = deep_copy_AoA( $new_clusters );
last if $assignment_changed_flag == 0;
}
$final_cluster_centers = $self->update_cluster_centers( $clusters );
return ($clusters, $final_cluster_centers);
}
sub assign_data_to_clusters_mahalanobis {
my $self = shift;
my $clusters = shift;
my $K = shift;
my $final_cluster_centers;
my $iteration_index = 0;
while (1) {
my $new_clusters;
my $assignment_changed_flag = 0;
my $current_cluster_center_index = 0;
my $cluster_size_zero_condition = 0;
my $how_many = @$clusters;
my $cluster_centers_and_covariances =
$self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
my $cluster_centers = $cluster_centers_and_covariances->[0];
my $cluster_covariances = $cluster_centers_and_covariances->[1];
$iteration_index++;
foreach my $cluster (@$clusters) {
my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
lib/Algorithm/KMeans.pm view on Meta::CPAN
next if $@;
push @mahalanobis_dist_from_clust_centers, $maha_distance;
}
my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
die "The Mahalanobis metric may not be appropriate for the data"
unless defined $best_center_index;
my $best_cluster_center = $cluster_centers->[$best_center_index];
if (vector_equal($current_cluster_center, $best_cluster_center)){
push @{$new_clusters->[$current_cluster_center_index]}, $ele;
} else {
$assignment_changed_flag = 1;
push @{$new_clusters->[$best_center_index]}, $ele;
}
}
$current_cluster_center_index++;
}
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
# Now make sure that none of the K clusters is an empty cluster:
foreach my $newcluster (@$new_clusters) {
$cluster_size_zero_condition = 1 if ((!defined $newcluster) or (@$newcluster == 0));
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
}
foreach my $local_cluster (@$new_clusters) {
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
push @$local_cluster, pop @$largest_cluster;
}
}
next if (($cluster_size_zero_condition) && ($iteration_index < 100));
last if $iteration_index == 100;
# Now do a deep copy of new_clusters into clusters
$clusters = deep_copy_AoA( $new_clusters );
last if $assignment_changed_flag == 0;
}
$final_cluster_centers = $self->update_cluster_centers( $clusters );
return ($clusters, $final_cluster_centers);
}
sub update_cluster_centers_and_covariances_mahalanobis {
my $self = shift;
my @clusters = @{ shift @_ };
my @new_cluster_centers;
my @new_cluster_covariances;
lib/Algorithm/KMeans.pm view on Meta::CPAN
K
Kmin
Kmax
terminal_output
write_clusters_to_files
do_variance_normalization
cluster_seeding
use_mahalanobis_metric
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub get_value_index_hash {
my $arr = shift;
my %hash;
foreach my $index (0..@$arr-1) {
$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
}
return \%hash;
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => $mask,
K => 0,
cluster_seeding => 'random', # or 'smart'
terminal_output => 1,
write_clusters_to_files => 1,
);
# Although not shown above, you can obviously set the 'do_variance_normalization'
# flag here also if you wish.
# For very large data files, setting K to 0 will result in searching through too
# many values for K. For such cases, you can range limit the values of K to search
# through by
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => "N111",
Kmin => 3,
Kmax => 10,
cluster_seeding => 'random', # or 'smart'
( run in 7.052 seconds using v1.01-cache-2.11-cpan-94b05bcf43c )