Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
$iteration_index++;
foreach my $cluster (@$clusters) {
my $current_cluster_center =
$cluster_centers->[$current_cluster_center_index];
foreach my $ele (@$cluster) {
my @dist_from_clust_centers;
foreach my $center (@$cluster_centers) {
push @dist_from_clust_centers,
$self->distance($ele, $center);
}
my ($min, $best_center_index) =
minimum( \@dist_from_clust_centers );
my $best_cluster_center =
$cluster_centers->[$best_center_index];
if (vector_equal($current_cluster_center,
$best_cluster_center)){
push @{$new_clusters->[$current_cluster_center_index]},
$ele;
} else {
$assignment_changed_flag = 1;
push @{$new_clusters->[$best_center_index]}, $ele;
}
}
$current_cluster_center_index++;
}
# Now make sure that we still have K clusters since K is fixed:
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
# Now make sure that none of the K clusters is an empty cluster:
foreach my $newcluster (@$new_clusters) {
$cluster_size_zero_condition = 1 if ((!defined $newcluster)
or (@$newcluster == 0));
}
push @$new_clusters, (undef) x ($K - @$new_clusters)
if @$new_clusters < $K;
my $largest_cluster;
foreach my $local_cluster (@$new_clusters) {
next if !defined $local_cluster;
$largest_cluster = $local_cluster if !defined $largest_cluster;
if (@$local_cluster > @$largest_cluster) {
$largest_cluster = $local_cluster;
}
}
foreach my $local_cluster (@$new_clusters) {
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
push @$local_cluster, pop @$largest_cluster;
}
}
next if (($cluster_size_zero_condition) && ($iteration_index < 100));
last if $iteration_index == 100;
# Now do a deep copy of new_clusters into clusters
$clusters = deep_copy_AoA( $new_clusters );
last if $assignment_changed_flag == 0;
}
$final_cluster_centers = $self->update_cluster_centers( $clusters );
return ($clusters, $final_cluster_centers);
}
# Used by the kmeans part of the code: After each new assignment of the data points
# to the clusters on the basis of the current values for the cluster centers, we call
# the routine shown here for updating the values of the cluster centers.
sub update_cluster_centers {
my $self = shift;
my @clusters = @{ shift @_ };
my @new_cluster_centers;
my $largest_cluster;
foreach my $cluster (@clusters) {
next if !defined $cluster;
$largest_cluster = $cluster if !defined $largest_cluster;
if (@$cluster > @$largest_cluster) {
$largest_cluster = $cluster;
}
}
foreach my $cluster (@clusters) {
if ( (!defined $cluster) || (@$cluster == 0) ) {
push @$cluster, pop @$largest_cluster;
}
}
foreach my $cluster (@clusters) {
die "Cluster became empty --- untenable condition " .
"for a given K. Try again. " if !defined $cluster;
my $cluster_size = @$cluster;
die "Cluster size is zero --- untenable." if $cluster_size == 0;
my @new_cluster_center = @{$self->add_point_coords( $cluster )};
@new_cluster_center = map {my $x = $_/$cluster_size; $x}
@new_cluster_center;
push @new_cluster_centers, \@new_cluster_center;
}
return \@new_cluster_centers;
}
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
my $self = shift;
my $ele1_id = shift @_; # symbolic name of data sample
my @ele1 = @{$self->{_data}->{$ele1_id}};
my @ele2 = @{shift @_};
die "wrong data types for distance calculation" if @ele1 != @ele2;
my $how_many = @ele1;
my $squared_sum = 0;
foreach my $i (0..$how_many-1) {
$squared_sum += ($ele1[$i] - $ele2[$i])**2;
}
my $dist = sqrt $squared_sum;
return $dist;
}
# The following routine does the same as above but now both
# arguments are expected to be arrays of numbers:
sub distance2 {
my $self = shift;
my @ele1 = @{shift @_};
my @ele2 = @{shift @_};
die "wrong data types for distance calculation" if @ele1 != @ele2;
my $how_many = @ele1;
my $squared_sum = 0;
foreach my $i (0..$how_many-1) {
$squared_sum += ($ele1[$i] - $ele2[$i])**2;
}
return sqrt $squared_sum;
( run in 1.348 second using v1.01-cache-2.11-cpan-385001e3568 )