Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
foreach my $cluster_index (0..$self->{_K}-1) {
foreach my $tag (@{$clusters->[$cluster_index]}) {
my $data = $self->{_data}->{$tag};
push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
}
}
($self->{_cluster_means}, $self->{_cluster_covariances}) =
find_cluster_means_and_covariances(\@dataclusters);
} elsif ($self->{_seeding} eq 'manual') {
die "You have not supplied the seeding tags for the option \"manual\""
unless @{$self->{_seed_tags}} > 0;
print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
foreach my $tag (@{$self->{_seed_tags}}) {
die "invalid tag used for manual seeding"
unless exists $self->{_data}->{$tag};
}
my ($seed_means, $seed_covars) =
$self->find_seed_centered_covariances($self->{_seed_tags});
$self->{_cluster_means} = $seed_means;
$self->{_cluster_covariances} = $seed_covars;
} else {
die "Incorrect call syntax used. See documentation.";
}
}
# This is the top-level method for kmeans based initialization of the EM
# algorithm. The means and the covariances returned by kmeans are used to seed the EM
# algorithm.
sub kmeans {
my $self = shift;
my $K = $self->{_K};
$self->cluster_for_fixed_K_single_smart_try($K);
if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
return ($self->{_clusters}, $self->{_cluster_centers});
} else {
die "kmeans clustering failed.";
}
}
# Used by the kmeans algorithm for the initialization of the EM iterations. We do
# initial kmeans cluster seeding by subjecting the data to principal components
# analysis in order to discover the direction of maximum variance in the data space.
# Subsequently, we try to find the K largest peaks along this direction. The
# coordinates of these peaks serve as the seeds for the K clusters.
sub cluster_for_fixed_K_single_smart_try {
my $self = shift;
my $K = shift;
print "Clustering for K = $K\n" if $self->{_terminal_output};
my ($clusters, $cluster_centers) =
$self->cluster_for_given_K($K);
$self->{_clusters} = $clusters;
$self->{_cluster_centers} = $cluster_centers;
}
# Used by the kmeans part of the code for the initialization of the EM algorithm:
sub cluster_for_given_K {
my $self = shift;
my $K = shift;
my $cluster_centers = $self->get_initial_cluster_centers($K);
my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);
my $cluster_nonexistant_flag = 0;
foreach my $trial (0..2) {
($clusters, $cluster_centers) =
$self->assign_data_to_clusters( $clusters, $K );
my $num_of_clusters_returned = @$clusters;
foreach my $cluster (@$clusters) {
$cluster_nonexistant_flag = 1 if ((!defined $cluster)
|| (@$cluster == 0));
}
last unless $cluster_nonexistant_flag;
}
return ($clusters, $cluster_centers);
}
# Used by the kmeans part of the code for the initialization of the EM algorithm:
sub get_initial_cluster_centers {
my $self = shift;
my $K = shift;
if ($self->{_data_dimensions} == 1) {
my @one_d_data;
foreach my $j (0..$self->{_N}-1) {
my $tag = $self->{_data_id_tags}[$j];
push @one_d_data, $self->{_data}->{$tag}->[0];
}
my @peak_points =
find_peak_points_in_given_direction(\@one_d_data,$K);
print "highest points at data values: @peak_points\n"
if $self->{_debug};
my @cluster_centers;
foreach my $peakpoint (@peak_points) {
push @cluster_centers, [$peakpoint];
}
return \@cluster_centers;
}
my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# All the record labels are stored in the array $self->{_data_id_tags}.
# The actual data for clustering is stored in a hash at $self->{_data}
# whose keys are the record labels; the value associated with each
# key is the array holding the corresponding numerical multidimensional
# data.
foreach my $j (0..$num_cols-1) {
my $tag = $self->{_data_id_tags}[$j];
my $data = $self->{_data}->{$tag};
$matrix->set_col($j, $data);
}
if ($self->{_debug}) {
print "\nDisplaying the original data as a matrix:";
display_matrix( $matrix );
}
foreach my $j (0..$num_cols-1) {
$mean_vec += $matrix->col($j);
}
$mean_vec *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "Displaying the mean vector for the data:";
display_matrix( $mean_vec );
}
foreach my $j (0..$num_cols-1) {
my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
$matrix->set_col($j, \@new_col);
}
if ($self->{_debug}) {
print "Displaying mean subtracted data as a matrix:";
display_matrix( $matrix );
}
my $transposed = transpose( $matrix );
if ($self->{_debug}) {
print "Displaying transposed data matrix:";
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
# It would be nice to set the delta adaptively, but I must
# change the number of cells in the next foreach loop accordingly
# my $delta = $avg_diff / 20;
my @accumulator = (0) x 1000;
foreach my $index (0..@sorted_data-1) {
my $cell_index = int($sorted_data[$index] / $delta);
my $smoothness = 40;
for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
next if $index < 0 || $index > 999;
$accumulator[$index]++;
}
}
my $peaks_array = non_maximum_supression( \@accumulator );
my $peaks_index_hash = get_value_index_hash( $peaks_array );
my @K_highest_peak_locations;
my $k = 0;
foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
my $unscaled_peak_point =
$min + $peaks_index_hash->{$peak} * $scale * $delta;
push @K_highest_peak_locations, $unscaled_peak_point
if $k < $how_many;
last if ++$k == $how_many;
}
return @K_highest_peak_locations;
}
# Used by the kmeans part of the code: The purpose of this routine is to form initial
# clusters by assigning the data samples to the initial clusters formed by the
# previous routine on the basis of the best proximity of the data samples to the
# different cluster centers.
sub assign_data_to_clusters_initial {
my $self = shift;
my @cluster_centers = @{ shift @_ };
my @clusters;
foreach my $ele (@{$self->{_data_id_tags}}) {
my $best_cluster;
my @dist_from_clust_centers;
foreach my $center (@cluster_centers) {
push @dist_from_clust_centers, $self->distance($ele, $center);
}
my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
push @{$clusters[$best_center_index]}, $ele;
}
return \@clusters;
}
# Used by the kmeans part of the code: This is the main routine that along with the
# update_cluster_centers() routine constitute the two key steps of the K-Means
# algorithm. In most cases, the infinite while() loop will terminate automatically
# when the cluster assignments of the data points remain unchanged. For the sake of
# safety, we keep track of the number of iterations. If this number reaches 100, we
# exit the while() loop anyway. In most cases, this limit will not be reached.
sub assign_data_to_clusters {
my $self = shift;
my $clusters = shift;
my $K = shift;
my $final_cluster_centers;
my $iteration_index = 0;
while (1) {
my $new_clusters;
my $assignment_changed_flag = 0;
my $current_cluster_center_index = 0;
my $cluster_size_zero_condition = 0;
my $how_many = @$clusters;
my $cluster_centers = $self->update_cluster_centers(
deep_copy_AoA_with_nulls( $clusters ) );
$iteration_index++;
foreach my $cluster (@$clusters) {
my $current_cluster_center =
$cluster_centers->[$current_cluster_center_index];
foreach my $ele (@$cluster) {
my @dist_from_clust_centers;
foreach my $center (@$cluster_centers) {
push @dist_from_clust_centers,
$self->distance($ele, $center);
}
my ($min, $best_center_index) =
minimum( \@dist_from_clust_centers );
my $best_cluster_center =
$cluster_centers->[$best_center_index];
if (vector_equal($current_cluster_center,
$best_cluster_center)){
push @{$new_clusters->[$current_cluster_center_index]},
$ele;
} else {
$assignment_changed_flag = 1;
push @{$new_clusters->[$best_center_index]}, $ele;
}
}
$current_cluster_center_index++;
}
# Now make sure that we still have K clusters since K is fixed:
next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
# Now make sure that none of the K clusters is an empty cluster:
foreach my $newcluster (@$new_clusters) {
$cluster_size_zero_condition = 1 if ((!defined $newcluster)
or (@$newcluster == 0));
}
push @$new_clusters, (undef) x ($K - @$new_clusters)
if @$new_clusters < $K;
my $largest_cluster;
foreach my $local_cluster (@$new_clusters) {
next if !defined $local_cluster;
$largest_cluster = $local_cluster if !defined $largest_cluster;
if (@$local_cluster > @$largest_cluster) {
$largest_cluster = $local_cluster;
}
}
foreach my $local_cluster (@$new_clusters) {
if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
push @$local_cluster, pop @$largest_cluster;
}
}
next if (($cluster_size_zero_condition) && ($iteration_index < 100));
last if $iteration_index == 100;
# Now do a deep copy of new_clusters into clusters
$clusters = deep_copy_AoA( $new_clusters );
last if $assignment_changed_flag == 0;
}
$final_cluster_centers = $self->update_cluster_centers( $clusters );
return ($clusters, $final_cluster_centers);
}
# Used by the kmeans part of the code: After each new assignment of the data points
# to the clusters on the basis of the current values for the cluster centers, we call
# the routine shown here for updating the values of the cluster centers.
sub update_cluster_centers {
my $self = shift;
my @clusters = @{ shift @_ };
my @new_cluster_centers;
my $largest_cluster;
foreach my $cluster (@clusters) {
next if !defined $cluster;
$largest_cluster = $cluster if !defined $largest_cluster;
if (@$cluster > @$largest_cluster) {
$largest_cluster = $cluster;
}
}
foreach my $cluster (@clusters) {
if ( (!defined $cluster) || (@$cluster == 0) ) {
push @$cluster, pop @$largest_cluster;
}
}
foreach my $cluster (@clusters) {
die "Cluster became empty --- untenable condition " .
"for a given K. Try again. " if !defined $cluster;
my $cluster_size = @$cluster;
die "Cluster size is zero --- untenable." if $cluster_size == 0;
my @new_cluster_center = @{$self->add_point_coords( $cluster )};
@new_cluster_center = map {my $x = $_/$cluster_size; $x}
@new_cluster_center;
push @new_cluster_centers, \@new_cluster_center;
}
return \@new_cluster_centers;
}
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
my $self = shift;
my $ele1_id = shift @_; # symbolic name of data sample
my @ele1 = @{$self->{_data}->{$ele1_id}};
my @ele2 = @{shift @_};
die "wrong data types for distance calculation" if @ele1 != @ele2;
my $how_many = @ele1;
my $squared_sum = 0;
foreach my $i (0..$how_many-1) {
$squared_sum += ($ele1[$i] - $ele2[$i])**2;
}
my $dist = sqrt $squared_sum;
return $dist;
}
# The following routine does the same as above but now both
# arguments are expected to be arrays of numbers:
sub distance2 {
my $self = shift;
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
# Meant only for constructing a deep copy of a hash in which each value is an
# anonymous array of numbers:
sub deep_copy_hash {
my $ref_in = shift;
my $ref_out;
while ( my ($key, $value) = each( %$ref_in ) ) {
$ref_out->{$key} = deep_copy_array( $value );
}
return $ref_out;
}
# Meant only for an array of numbers:
sub deep_copy_array {
my $ref_in = shift;
my $ref_out;
foreach my $i (0..@{$ref_in}-1) {
$ref_out->[$i] = $ref_in->[$i];
}
return $ref_out;
}
# from perl docs:
sub fisher_yates_shuffle {
my $arr = shift;
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
sub mean_and_variance {
my @data = @{shift @_};
my ($mean, $variance);
foreach my $i (1..@data) {
if ($i == 1) {
$mean = $data[0];
$variance = 0;
} else {
# data[$i-1] because of zero-based indexing of vector
$mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
$variance = ( (($i-1)/$i) * $variance )
+ ($data[$i-1]-$mean)**2 / ($i-1);
}
}
return ($mean, $variance);
}
sub check_for_illegal_params {
my @params = @_;
my @legal_params = qw / datafile
mask
K
terminal_output
max_em_iterations
seeding
class_priors
seed_tags
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub get_value_index_hash {
my $arr = shift;
my %hash;
foreach my $index (0..@$arr-1) {
$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
}
return \%hash;
}
sub non_maximum_supression {
my $arr = shift;
my @output = (0) x @$arr;
my @final_output = (0) x @$arr;
my %hash;
my @array_of_runs = ([$arr->[0]]);
foreach my $index (1..@$arr-1) {
if ($arr->[$index] == $arr->[$index-1]) {
push @{$array_of_runs[-1]}, $arr->[$index];
} else {
push @array_of_runs, [$arr->[$index]];
}
}
my $runstart_index = 0;
foreach my $run_index (1..@array_of_runs-2) {
$runstart_index += @{$array_of_runs[$run_index-1]};
if ($array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index-1]->[0] &&
$array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index+1]->[0]) {
my $run_center = @{$array_of_runs[$run_index]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
}
if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
$runstart_index += @{$array_of_runs[-2]};
my $run_center = @{$array_of_runs[-1]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
my $run_center = @{$array_of_runs[0]} / 2;
$output[$run_center] = $arr->[$run_center];
}
return \@output;
}
sub display_matrix {
my $message = shift;
my $matrix = shift;
if (!defined blessed($matrix)) {
print "display_matrix called on a scalar value: $matrix\n";
return;
}
my $nrows = $matrix->rows();
my $ncols = $matrix->cols();
print "$message ($nrows rows and $ncols columns)\n";
foreach my $i (0..$nrows-1) {
( run in 0.531 second using v1.01-cache-2.11-cpan-f0fbb3f571b )