# Algorithm-LinearManifoldDataClusterer
# view release on metacpan or search on metacpan
# lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
# NOTE(review): the three lines above are MetaCPAN source-viewer page chrome,
# not module source; commented out so the file remains parseable.
package Algorithm::LinearManifoldDataClusterer;
#------------------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
# resides on a low-dimensional manifold in a high-dimensional measurement space.
# -----------------------------------------------------------------------------------
use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(reduce any);
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use POSIX ();
our $VERSION = '1.01';
# Constructor:
# Constructor.  Takes keyword arguments; `datafile' and `mask' are required
# and the constructor croaks if either is missing.  Croaks on any
# unrecognized keyword name (checked by check_for_illegal_params).  All
# remaining keywords receive defaults when omitted.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
              if check_for_illegal_params(@params) == 0;
    # Normalize a tri-state option to a strict 0/1 flag: 1 only when the
    # caller explicitly supplied a nonzero value, 0 otherwise.  (The former
    # expression `$args{...} == 0 ? 0 : 1` produced an "uninitialized value"
    # warning when the option was omitted; the result here is the same,
    # minus the warning.)
    my $flag = sub {
        my $val = shift;
        return (defined $val && $val != 0) ? 1 : 0;
    };
    bless {
        _datafile                    => $args{datafile} || croak("datafile required"),
        _mask                        => $args{mask} || croak("mask required"),
        _K                           => $args{K} || 0,
        _P                           => $args{P} || 0,
        _terminal_output             => $args{terminal_output} || 0,
        _max_iterations              => $args{max_iterations} || 0,
        _delta_reconstruction_error  => $args{delta_reconstruction_error} || 0.001,
        _delta_normalized_error      => undef,
        _cluster_search_multiplier   => $args{cluster_search_multiplier} || 1,
        _visualize_each_iteration    => $flag->($args{visualize_each_iteration}),
        _show_hidden_in_3D_plots     => $flag->($args{show_hidden_in_3D_plots}),
        _make_png_for_each_iteration => $flag->($args{make_png_for_each_iteration}),
        _debug                       => $args{debug} || 0,
        _N                           => 0,
        # KM = K * search multiplier.  Apply the same defaults used for _K
        # and _cluster_search_multiplier so that omitting either optional
        # argument no longer triggers an uninitialized-value warning.
        _KM                          => ($args{K} || 0) * ($args{cluster_search_multiplier} || 1),
        _data_hash                   => {},
        _data_tags                   => [],
        _data_dimensions             => 0,
        _final_clusters              => [],
        _auto_retry_flag             => 0,
        _num_iterations_actually_used => undef,
        _scale_factor                => undef,
        _data_tags_to_cluster_label_hash => {},
        _final_reference_vecs_for_all_subspaces => [],
        _reconstruction_error_as_a_function_of_iteration => [],
        _final_trailing_eigenvec_matrices_for_all_subspaces => [],
        _subspace_construction_error_as_a_function_of_iteration => [],
    }, $class;
}
# Reads the CSV datafile named in $self->{_datafile}.  Each record is of the
# form "label,v1,v2,...": the label becomes a data tag and the remaining
# fields become that tag's coordinate array in $self->{_data_hash}.  Also
# sets _data_tags (labels in file order), _N (record count), and
# _data_dimensions (count of '1' characters in the mask).  Dies if the file
# is unnamed, lacks a .csv extension, or cannot be opened.
# NOTE(review): records are split on whitespace, so labels and fields must
# not themselves contain spaces.
sub get_data_from_csv {
    my $self = shift;
    my $filename = $self->{_datafile}
        || die "you did not specify a file with the data to be clustered";
    # Check the extension BEFORE opening (the previous version opened the
    # file first and, on a non-.csv name, died with the handle still open).
    die("Aborted. get_training_data_csv() is only for CSV files")
        unless $filename =~ /\.csv$/;
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    # Dimensionality = number of '1' entries in the mask string:
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
    # Three-arg open with a lexical filehandle; slurp the whole file:
    open my $fh, '<', $filename or die "Unable to open $filename: $!";
    my $content = do { local $/; <$fh> };
    close $fh or die "Unable to close $filename: $!";
    my @all_data = split /\s+/, $content;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data_hash} = \%data_hash;
    $self->{_data_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
}
# Computes the sample mean and the (1/N-normalized) sample covariance of the
# data records named in @$tag_set.  The records themselves live in
# $self->{_data_hash}, keyed by tag.  Returns the two-element list
# ($mean_vec, $covariance): a d x 1 and a d x d Math::GSL::Matrix, where d
# is $self->{_data_dimensions} and N is the cluster size.
sub estimate_mean_and_covariance {
    my ($self, $tag_set) = @_;
    my $num_samples = scalar @$tag_set;
    # Componentwise mean of all member vectors:
    my @center = map { $_ / $num_samples } @{$self->add_point_coords($tag_set)};
    my ($num_rows, $num_cols) = ($self->{_data_dimensions}, $num_samples);
    print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
        if $self->{_debug};
    my $deviations = Math::GSL::Matrix->new($num_rows, $num_cols);
    my $mean_vec   = Math::GSL::Matrix->new($num_rows, 1);
    $mean_vec->set_col(0, \@center);
    if ($self->{_debug}) {
        print "\nDisplaying the mean vector for the cluster:\n";
        display_matrix( $mean_vec ) if $self->{_terminal_output};
    }
    # Column j of $deviations holds (x_j - mean) for the j-th member:
    foreach my $col (0 .. $num_cols - 1) {
        my $record = $self->{_data_hash}->{ $tag_set->[$col] };
        my @deviation = vector_subtract($record, \@center);
        $deviations->set_col($col, \@deviation);
    }
    # Covariance = (1/N) * M * M', with M the d x N matrix of deviations:
    my $covariance = $deviations * transpose($deviations);
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance ) if $self->{_terminal_output};
    }
    return ($mean_vec, $covariance);
}
# Eigendecomposition of the given covariance matrix.  Returns a two-element
# list: (ref to the eigenvectors sorted from largest to smallest eigenvalue,
# ref to the correspondingly sorted eigenvalues).  Dies if the decomposition
# yields a genuinely complex eigenvector, which should not happen for a
# covariance matrix.
sub eigen_analysis_of_covariance {
my $self = shift;
my $covariance = shift;
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
my $num_of_eigens = @$eigenvalues;
# Track the extreme eigenvalues (used only for debug reporting below):
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
foreach my $i (1..$num_of_eigens-1) {
$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
}
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
my @all_my_eigenvecs;
foreach my $i (0..$num_of_eigens-1) {
my @vec = $eigenvectors->[$i]->as_list;
my @eigenvec;
# Each element stringifies as a complex pair "[magnitude,phase]"
# (presumably from Math::GSL's complex representation -- the regex below
# depends on that exact format).  A phase of "0" means +magnitude and a
# phase of "pi" means -magnitude; anything else is a truly complex
# component and is treated as fatal.
foreach my $ele (@vec) {
my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
if ($theta eq "0") {
push @eigenvec, $mag;
} elsif ($theta eq "pi") {
push @eigenvec, -1.0 * $mag;
} else {
die "Eigendecomposition produced a complex eigenvector -- " .
"which should not happen for a covariance matrix!";
}
}
print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
push @all_my_eigenvecs, \@eigenvec;
}
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
# Sort eigenvector indexes by decreasing eigenvalue, then rebuild the two
# parallel arrays in that order:
my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
my @sorted_eigenvecs;
my @sorted_eigenvals;
foreach my $i (0..@sorted_eigenvec_indexes-1) {
$sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
$sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
}
if ($self->{_debug}) {
print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
foreach my $i (0..@sorted_eigenvecs-1) {
print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
}
}
return (\@sorted_eigenvecs, \@sorted_eigenvals);
}
# Repeatedly invokes linear_manifold_clusterer() until a run completes
# without dying, and returns its result.  Sets _auto_retry_flag so that the
# clusterer itself dies (rather than trying to repair) when a cluster goes
# empty, which hands control back here for another attempt.
# NOTE(review): there is no cap on the number of retries; a dataset on which
# every run fails will loop forever.
sub auto_retry_clusterer {
    my $self = shift;
    $self->{_auto_retry_flag} = 1;
    my $attempt = 0;
    while (1) {
        $attempt++;
        my $clusters = eval { $self->linear_manifold_clusterer() };
        if ($@) {
            # This attempt died; report and go around again.
            if ($self->{_terminal_output}) {
                print "Clustering failed. Trying again. --- $@";
                print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
                print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
            }
            next;
        }
        print "\n\nNumber of retry attempts: $attempt\n\n" if $self->{_terminal_output};
        return $clusters;
    }
}
sub linear_manifold_clusterer {
my $self = shift;
my $KM = $self->{_KM};
my @initial_cluster_center_tags;
my $visualization_msg;
my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
if $self->{_debug};
@initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
if ($self->{_debug}) {
foreach my $centroid (@initial_cluster_center_coords) {
print "Initial cluster center coords: @{$centroid}\n";
}
}
my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "initial_clusters";
$self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
if $self->{_visualize_each_iteration};
$self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
if $self->{_make_png_for_each_iteration};
}
foreach my $cluster (@$initial_clusters) {
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
display_mean_and_covariance($mean, $covariance) if $self->{_debug};
}
my @clusters = @$initial_clusters;
display_clusters(\@clusters) if $self->{_debug};
my $iteration_index = 0;
my $unimodal_correction_flag;
my $previous_min_value_for_unimodality_quotient;
while ($iteration_index < $self->{_max_iterations}) {
print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
if $self->{_terminal_output};
my $total_reconstruction_error_this_iteration = 0;
my @subspace_construction_errors_this_iteration;
my @trailing_eigenvec_matrices_for_all_subspaces;
my @reference_vecs_for_all_subspaces;
foreach my $cluster (@clusters) {
next if @$cluster == 0;
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
display_mean_and_covariance($mean, $covariance) if $self->{_debug};
print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
push @subspace_construction_errors_this_iteration, $subspace_construction_error;
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
$self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
my %best_subspace_based_partition_of_data;
foreach my $i (0..$self->{_KM}-1) {
$best_subspace_based_partition_of_data{$i} = [];
}
foreach my $data_tag (@{$self->{_data_tags}}) {
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
my @errors = map {$self->reconstruction_error($data_vec,
$trailing_eigenvec_matrices_for_all_subspaces[$_],
$reference_vecs_for_all_subspaces[$_])}
0 .. $self->{_KM}-1;
my ($minval, $index_for_closest_subspace) = minimum(\@errors);
$total_reconstruction_error_this_iteration += $minval;
push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
$data_tag;
}
print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
"iteration and re-assigning the data samples to the new subspaces on the basis of\n".
"the least reconstruction error.\n\n" .
"Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
if $self->{_terminal_output};
foreach my $i (0..$self->{_KM}-1) {
$clusters[$i] = $best_subspace_based_partition_of_data{$i};
}
display_clusters(\@clusters) if $self->{_terminal_output};
# Check if any cluster has lost all its elements. If so, fragment the worst
# existing cluster to create the additional clusters needed:
if (any {@$_ == 0} @clusters) {
die "empty cluster found" if $self->{_auto_retry_flag};
print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" .
"from the cluster with the largest subspace construction error.\n\n";
$total_reconstruction_error_this_iteration = 0;
@subspace_construction_errors_this_iteration = ();
my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
my $max = List::Util::max @subspace_construction_errors_this_iteration;
my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
my @cluster_fragments = cluster_split($clusters[$maxindex],
$how_many_extra_clusters_needed + 1);
my @newclusters;
push @newclusters, @clusters[0 .. $maxindex-1];
push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
push @newclusters, @cluster_fragments;
@newclusters = grep {@$_ != 0} @newclusters;
die "something went wrong with cluster fragmentation"
unless $self->{_KM} = @newclusters;
@trailing_eigenvec_matrices_for_all_subspaces = ();
@reference_vecs_for_all_subspaces = ();
foreach my $cluster (@newclusters) {
die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
push @subspace_construction_errors_this_iteration, $subspace_construction_error;
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
my %best_subspace_based_partition_of_data;
foreach my $i (0..$self->{_KM}-1) {
$best_subspace_based_partition_of_data{$i} = [];
}
foreach my $data_tag (@{$self->{_data_tags}}) {
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
my @errors = map {reconstruction_error($data_vec,
$trailing_eigenvec_matrices_for_all_subspaces[$_],
$reference_vecs_for_all_subspaces[$_])}
0 .. $self->{_KM}-1;
my ($minval, $index_for_closest_subspace) = minimum(\@errors);
$total_reconstruction_error_this_iteration += $minval;
push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
$data_tag;
}
print "empty-cluster jag: total reconstruction error in this iteration: \n" .
"$total_reconstruction_error_this_iteration\n"
if $self->{_debug};
foreach my $i (0..$self->{_KM}-1) {
$clusters[$i] = $best_subspace_based_partition_of_data{$i};
}
display_clusters(\@newclusters) if $self->{_terminal_output};
@clusters = grep {@$_ != 0} @newclusters;
die "linear manifold based algorithm does not appear to work in this case $!"
unless @clusters == $self->{_KM};
}# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "clustering_at_iteration_$iteration_index";
$self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
if $self->{_visualize_each_iteration};
$self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
if $self->{_make_png_for_each_iteration};
}
my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_],
$reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
die "\n\nBailing out!\n" .
"It does not look like these iterations will lead to a good clustering result.\n" .
"Program terminating. Try running again.\n"
if defined($previous_min_value_for_unimodality_quotient)
&& ($min_value_for_unimodality_quotient < 0.4)
&& ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
if ( $min_value_for_unimodality_quotient < 0.5 ) {
$unimodal_correction_flag = 1;
print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
my @sorted_cluster_indexes =
sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
my @newclusters;
foreach my $cluster_index (0..@clusters - 1) {
push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
}
@clusters = @newclusters;
my $worst_cluster = pop @clusters;
print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
my $second_worst_cluster = pop @clusters;
print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
push @$worst_cluster, @$second_worst_cluster;
fisher_yates_shuffle($worst_cluster);
my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
push @clusters, \@first_half;
push @clusters, \@second_half;
if ($self->{_terminal_output}) {
print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
display_clusters(\@clusters);
}
}
if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
if (($last_recon_error - $total_reconstruction_error_this_iteration)
< $self->{_delta_normalized_error}) {
push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
$total_reconstruction_error_this_iteration;
last;
}
}
# lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
# NOTE(review): the line above is a page-break artifact from the MetaCPAN
# source viewer; source code appears to be missing at this point (e.g. the
# declaration of @final_clusters used just below).
my @array_of_partitioned_cluster_groups = (\@clusters);
my @partitioned_cluster_groups;
my $how_many_clusters_looking_for = $self->{_K};
while (scalar(@final_clusters) < $self->{_K}) {
@partitioned_cluster_groups =
$self->graph_partition(shift @array_of_partitioned_cluster_groups,
$how_many_clusters_looking_for );
if (@{$partitioned_cluster_groups[0]} == 1) {
my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
push @final_clusters, $singular_cluster;
$how_many_clusters_looking_for--;
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
} elsif (@{$partitioned_cluster_groups[1]} == 1) {
my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
push @final_clusters, $singular_cluster;
$how_many_clusters_looking_for--;
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
} else {
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
}
}
my @data_clustered;
foreach my $cluster (@final_clusters) {
push @data_clustered, @$cluster;
}
unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
$self->{_final_clusters} = \@final_clusters;
my %data_clustered = map {$_ => 1} @data_clustered;
my @data_tags_not_clustered =
grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
if ($self->{_terminal_output}) {
print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n";
display_clusters(\@final_clusters);
print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
}
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
$self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
if $self->{_visualize_each_iteration};
$self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
if $self->{_make_png_for_each_iteration};
}
my %data_tags_to_cluster_label_hash;
foreach my $i (0..@final_clusters-1) {
map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
}
$self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
foreach my $tag (@data_tags_not_clustered) {
my $which_cluster = $self->which_cluster_for_new_element($tag);
$self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
}
die "Some data elements are still missing from the final tally"
unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
scalar(@{$self->{_data_tags}});
my @new_final_clusters;
map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
push @{$new_final_clusters[$_]}, $ele
if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
} 0..$self->{_K}-1;
if ($self->{_debug}) {
print "\ndisplaying the final clusters after accounting for unclustered data:\n";
display_clusters(\@new_final_clusters);
}
$self->{_final_clusters} = \@new_final_clusters;
@final_clusters = @new_final_clusters;
}
} else {
@final_clusters = @clusters;
}
print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
display_clusters(\@final_clusters) if $self->{_terminal_output};
return \@final_clusters;
}
# Prints (to STDOUT) the number of Phase-1 iterations actually used and the
# per-iteration total reconstruction errors recorded during Phase 1.
sub display_reconstruction_errors_as_a_function_of_iterations {
    my $self = shift;
    my $iterations = $self->{_num_iterations_actually_used};
    my $errors     = $self->{_reconstruction_error_as_a_function_of_iteration};
    print "\n\nNumber of iterations used in Phase 1: $iterations\n";
    print "\nTotal reconstruction error as a function of iterations in Phase 1: @$errors\n";
}
# Derives the iteration-termination threshold from the supplied reference
# (mean) vectors: computes the average Euclidean magnitude of the means,
# stores it in _scale_factor, and sets _delta_normalized_error to that
# average scaled by the user's _delta_reconstruction_error.
sub set_termination_reconstruction_error_threshold {
    my ($self, $all_ref_vecs) = @_;
    my @mean_vecs = @$all_ref_vecs;
    my $sum_of_magnitudes = 0;
    foreach my $vec (@mean_vecs) {
        # transpose(v) * v is a 1x1 matrix holding the squared magnitude:
        my $squared = transpose($vec) * $vec;
        my ($dot_product) = $squared->as_list;
        $sum_of_magnitudes += sqrt($dot_product);
    }
    my $average_magnitude = $sum_of_magnitudes / @mean_vecs;
    $self->{_scale_factor} = $average_magnitude;
    $self->{_delta_normalized_error} = $average_magnitude * $self->{_delta_reconstruction_error};
}
# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method. It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters. The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
# Returns the index (into _final_clusters) of the cluster whose subspace
# reconstructs the data element $data_tag with the smallest error.  On the
# first call it lazily computes -- and caches on the object -- the trailing
# eigenvector matrix and the reference (mean) vector for every final
# cluster; later calls reuse the cache.
sub which_cluster_for_new_element {
my $self = shift;
my $data_tag = shift;
# The following `unless' clause is called only the first time the current method
# is called:
unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
my @trailing_eigenvec_matrices_for_all_subspaces;
my @reference_vecs_for_all_subspaces;
foreach my $cluster (@{$self->{_final_clusters}}) {
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
# Keep the eigenvectors BEYOND the first P (the trailing ones, i.e.
# those orthogonal to the cluster's P-dimensional manifold):
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
# Cache the per-cluster subspace descriptions for subsequent calls:
$self->{_final_trailing_eigenvec_matrices_for_all_subspaces} =
\@trailing_eigenvec_matrices_for_all_subspaces;
$self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
}
# Wrap the element's coordinates in a d x 1 matrix and measure its
# reconstruction error against each of the K final subspaces:
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
my @errors = map {$self->reconstruction_error($data_vec,
$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
$self->{_final_reference_vecs_for_all_subspaces}->[$_])}
0 .. $self->{_K}-1;
# minimum() returns (smallest value, its index); we only need the index:
my ($minval, $index_for_closest_subspace) = minimum(\@errors);
return $index_for_closest_subspace;
}
sub graph_partition {
my $self = shift;
my $clusters = shift;
my $how_many_clusters_looking_for = shift;
print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
my $num_nodes = scalar @$clusters;
my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);
my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
$D->identity;
my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
$neg_sqrt_of_D->identity;
my @subspace_construction_errors;
my @trailing_eigenvec_matrices_for_all_subspaces;
my @reference_vecs_for_all_subspaces;
foreach my $cluster (@$clusters) {
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
push @subspace_construction_errors, $subspace_construction_error;
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
# We consider the similarity matrix W to be a sum of two parts W_recon_based and
# W_dist_bet_means based. IMPORTANT: For coding reasons, we first store the two
# similarity measures separately in W_recon_based and W_dist_bet_means based. Our
# goal is to fill up these component matrices with the raw values while at the
# same time collecting information needed for normalizing these two separate
# measures of similarity.
my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
my @all_pairwise_reconstruction_errors;
my @all_dist_between_means_errors;
foreach my $i (0..$num_nodes-1) {
foreach my $j (0..$num_nodes-1) {
my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
$clusters->[$i],
$trailing_eigenvec_matrices_for_all_subspaces[$i],
$reference_vecs_for_all_subspaces[$i],
$clusters->[$j],
$trailing_eigenvec_matrices_for_all_subspaces[$j],
$reference_vecs_for_all_subspaces[$j]);
$W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
$W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
push @all_pairwise_reconstruction_errors, $recon_error_similarity;
push @all_dist_between_means_errors, $dist_bet_means;
}
}
my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) /
(scalar @all_pairwise_reconstruction_errors);
my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
(scalar @all_dist_between_means_errors );
die "\n\nBailing out!\n" .
"Dealing with highly defective clusters. Try again.\n"
if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
foreach my $i (0..$num_nodes-1) {
foreach my $j (0..$num_nodes-1) {
my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
$W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
$W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
}
}
$W = $W_reconstruction_error_based + $W_dist_between_means_based;
if ($self->{_debug}) {
print "\nDisplaying the similarity matrix W for the cluster graph:\n";
display_matrix($W) if $self->{_terminal_output};
}
my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
foreach my $col (0..$num_nodes-1) {
$add_all_columns += $W->col($col);
}
foreach my $i (0..$num_nodes-1) {
$D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
$neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
}
# the Laplacian matrix:
my $Laplacian = $D - $W;
# the Symmetric Normalized Laplacian matrix A:
my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
foreach my $i (0..$num_nodes-1) {
foreach my $j (0..$num_nodes-1) {
$A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
}
}
if ($self->{_terminal_output}) {
print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
"A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
display_matrix( $A );
}
my ($eigenvalues, $eigenvectors) = $A->eigenpair;
my $num_of_eigens = @$eigenvalues;
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
if ($self->{_debug2}) {
print "Eigenvalue 0: $eigenvalues->[0]\n";
foreach my $i (1..$num_of_eigens-1) {
$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
print "Eigenvalue $i: $eigenvalues->[$i]\n";
}
print "\nlargest eigen index: $largest_eigen_index\n";
print "\nsmallest eigen index: $smallest_eigen_index\n\n";
}
my @all_my_eigenvecs;
foreach my $i (0..$num_of_eigens-1) {
my @vec = $eigenvectors->[$i]->as_list;
my @eigenvec;
foreach my $ele (@vec) {
my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
if ($theta eq "0") {
push @eigenvec, $mag;
} elsif ($theta eq "pi") {
push @eigenvec, -1.0 * $mag;
} else {
die "Eigendecomposition produced a complex eigenvector!";
}
}
print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
push @all_my_eigenvecs, \@eigenvec;
}
if ($self->{_debug2}) {
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector of A: @largest_eigen_vec\n";
}
my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
my @sorted_eigenvecs;
my @sorted_eigenvals;
foreach my $i (0..@sorted_eigenvec_indexes-1) {
$sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
$sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
}
if ($self->{_debug2}) {
print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
foreach my $i (0..@sorted_eigenvecs-1) {
print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
}
}
my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
if $self->{_terminal_output};
if ($how_many_clusters_looking_for <= 3) {
my @merged_cluster;
my $final_cluster;
my @newclusters;
if ($how_many_positive == 1) {
foreach my $i (0..@$clusters-1) {
if ($best_partitioning_eigenvec->[$i] > 0) {
$final_cluster = $clusters->[$i];
} else {
push @newclusters, $clusters->[$i];
}
}
return ([$final_cluster], \@newclusters);
} elsif ($how_many_negative == 1) {
foreach my $i (0..@$clusters-1) {
if ($best_partitioning_eigenvec->[$i] < 0) {
$final_cluster = $clusters->[$i];
} else {
push @newclusters, $clusters->[$i];
}
}
return ([$final_cluster], \@newclusters);
} elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
foreach my $i (0..@$clusters-1) {
if ($best_partitioning_eigenvec->[$i] > 0) {
push @merged_cluster, @{$clusters->[$i]};
} else {
push @newclusters, $clusters->[$i];
}
}
return ([\@merged_cluster], \@newclusters);
} elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
foreach my $i (0..@$clusters-1) {
if ($best_partitioning_eigenvec->[$i] < 0) {
push @merged_cluster, @{$clusters->[$i]};
} else {
push @newclusters, $clusters->[$i];
}
}
return ([\@merged_cluster], \@newclusters);
} else {
die "\n\nBailing out!\n\n" .
"No consensus support for dominant clusters in the graph partitioning step\n" .
"of the algorithm. This can be caused by bad random selection of initial\n" .
"cluster centers. Please run this program again.\n";
}
} else {
my @positive_clusters;
my @negative_clusters;
# lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
# NOTE(review): page-break artifact from the MetaCPAN source viewer; the
# opening of the subroutine continuing below (its `sub' line and leading
# `shift' statements) is missing at this point.
my $mean = shift;
my $delta = 0.4 * $self->{_scale_factor}; # Radius of the delta ball along each dimension
my @mean = $mean->as_list;
my @data_tags_for_range_tests;
foreach my $dimen (0..$self->{_data_dimensions}-1) {
my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
my $range = $max - $min;
my $mean_along_this_dimen = $mean[$dimen];
my @tags = grep {$_}
map { ( ($self->{_data_hash}->{$_}->[$dimen] > $mean_along_this_dimen - $delta * $range)
&&
($self->{_data_hash}->{$_}->[$dimen] < $mean_along_this_dimen + $delta * $range) )
? $_ : undef }
@$cluster;
push @data_tags_for_range_tests, \@tags;
}
# Now find the intersection of the tag sets for each of the dimensions
my %intersection_hash;
foreach my $dimen (0..$self->{_data_dimensions}-1) {
my %tag_hash_for_this_dimen = map {$_ => 1} @{$data_tags_for_range_tests[$dimen]};
if ($dimen == 0) {
%intersection_hash = %tag_hash_for_this_dimen;
} else {
%intersection_hash = map {$_ => 1} grep {$tag_hash_for_this_dimen{$_}}
keys %intersection_hash;
}
}
my @intersection_set = keys %intersection_hash;
my $cluster_unimodality_index = scalar(@intersection_set) / scalar(@$cluster);
return $cluster_unimodality_index;
}
##  Searches for a reference vector (the offset point through which the manifold
##  hyperplane passes) that minimizes the total reconstruction error over all of
##  the data points.  The candidates consist of the cluster mean plus a set of
##  points drawn uniformly at random from the bounding box of the cluster.
##
##  Arguments:  $cluster                  -- ref to array of data tags in the cluster
##              $trailing_eigenvec_matrix -- GSL matrix whose columns span the
##                                           directions orthogonal to the manifold
##              $mean                     -- cluster mean as a GSL matrix (column vector)
##  Returns:    ref to an array holding the coordinates of the best reference vector
sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;
    my $trailing_eigenvec_matrix = shift;
    my $mean = shift;                               # a GSL matrix ref (column vector)
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    # Establish the bounding box of the cluster, one (min,max) pair per dimension:
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    # Draw the candidate reference vectors uniformly from the bounding box:
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {
            push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    # The cluster mean itself is also a candidate; it occupies index 0:
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    # BUGFIX: the original used List::Util::first() here, which returns the
    # matching *value* rather than its *index*, and then used that floating-point
    # error value to subscript @random_points; it also shifted the mean's error
    # off the list first, so even a genuine index would have been off by one.
    # We now do a direct argmin over all the candidates, the mean included:
    my $minindex = 0;
    foreach my $i (1..$#reconstruction_errors) {
        $minindex = $i if $reconstruction_errors[$i] < $reconstruction_errors[$minindex];
    }
    my $refvec = $random_points[$minindex];
    return $refvec;
}
## The reconstruction error measures the perpendicular distance (squared) from a
## data point X to the hyperplane that defines a given subspace on the manifold:
## it is the squared length of the projection of (X - ref) onto the trailing
## eigenvectors, i.e.  (X - ref)^T * E * E^T * (X - ref).
sub reconstruction_error {
    my ($self, $data_vec, $trailing_eigenvecs, $ref_vec) = @_;
    # Shift the data point so that the reference vector becomes the origin:
    my $displacement = $data_vec - $ref_vec;
    my $error_squared = transpose($displacement) * $trailing_eigenvecs *
                        transpose($trailing_eigenvecs) * $displacement;
    # The product above is a 1x1 GSL matrix; extract its single entry:
    my ($error_squared_as_scalar) = $error_squared->as_list();
    return $error_squared_as_scalar;
}
# Returns a set of KM random integers. These serve as indices to reach into the data
# array. A data element whose index is one of the random numbers returned by this
# routine serves as an initial cluster center. Note the quality check it runs on the
# list of the random integers constructed. We first make sure that all the random
# integers returned are different. Subsequently, we carry out a quality assessment
# of the random integers constructed. This quality measure consists of the ratio of
# the values spanned by the random integers to the value of N, the total number of
# data points to be clustered. Currently, if this ratio is less than 0.3, we discard
# the K integers and try again.
sub initialize_cluster_centers {
my $self = shift;
my $K = shift; # This value is set to the parameter KM in the call to this subroutine
my $data_store_size = shift;
my @cluster_center_indices;
while (1) {
foreach my $i (0..$K-1) {
$cluster_center_indices[$i] = int rand( $data_store_size );
next if $i == 0;
foreach my $j (0..$i-1) {
while ( $cluster_center_indices[$j] == $cluster_center_indices[$i] ) {
# [NOTE: source elided here by the MetaCPAN file viewer -- see the full module
#  lib/Algorithm/LinearManifoldDataClusterer.pm on CPAN for the missing lines.]
} elsif ( $arr->[$i] < $min ) {
$min = $arr->[$i];
} elsif ( $arr->[$i] > $max ) {
$max = $arr->[$i];
}
}
return ($min, $max);
}
# Meant only for constructing a deep copy of an array of arrays:
#
#   Argument:  ref to an array whose elements are refs to arrays of scalars
#   Returns:   ref to a structurally independent copy
#
# BUGFIX: the original returned undef -- rather than a ref to an empty array --
# when handed an empty outer array, because the output ref was only ever
# autovivified inside the copy loop.
sub deep_copy_AoA {
    my $ref_in = shift;
    # The inner arrays hold plain scalars, so copying one level down suffices:
    my @copy = map { [ @$_ ] } @$ref_in;
    return \@copy;
}
# For displaying the individual clusters on a terminal screen.  Each cluster is
# displayed through the symbolic names associated with the data points.  Note
# that, as a side effect, every cluster array is sorted in place.
sub display_clusters {
    my $clusters = shift;
    my $cluster_index = 0;
    for my $cluster (@$clusters) {
        @$cluster = sort @$cluster;             # sorts the caller's data in place
        printf "\n\nCluster %d (%d records):\n", $cluster_index, scalar @$cluster;
        print " $_" for @$cluster;
        $cluster_index++;
    }
    print "\n\n";
}
# Prints a mean vector followed by its covariance matrix, each rendered through
# the generic matrix pretty-printer display_matrix().
sub display_mean_and_covariance {
    my ($mean, $covariance) = @_;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}
# Verifies that every keyword argument supplied to the constructor is one of the
# recognized parameter names.
#
#   Arguments:  list of the parameter names supplied by the user
#   Returns:    1 if all the names are legal, 0 if any name is unrecognized
#
# BUGFIX: the original returned undef (neither 0 nor 1) when called with an
# empty list, which made the caller's `== 0' test succeed spuriously (undef
# compares numerically equal to 0) and croak.  An empty list of parameters is
# trivially legal, so we return 1 for it.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    foreach my $param (@params) {
        # `any' is imported from List::Util at the top of this file:
        return 0 unless any { $param eq $_ } @legal_params;
    }
    return 1;
}
# Pretty-prints a Math::GSL::Matrix instance, one row per line with every entry
# formatted to four decimal places.
#
#   Argument:  a Math::GSL::Matrix object
#   Returns:   nothing useful (display only)
sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        my @row_as_list = $matrix->row($i)->as_list;
        # A plain postfix `for' replaces the original's void-context map:
        printf("%.4f ", $_) for @row_as_list;
        print "\n";
    }
    print "\n\n";
}
# Builds and returns the transpose of a Math::GSL::Matrix instance: each row of
# the input becomes the corresponding column of the result.
sub transpose {
    my $matrix = shift;
    my $source_rows = $matrix->rows();
    my $source_cols = $matrix->cols();
    my $flipped = Math::GSL::Matrix->new($source_cols, $source_rows);
    for my $row_index (0..$source_rows-1) {
        # Row $row_index of the input turns into column $row_index of the output:
        my @row_entries = $matrix->row($row_index)->as_list;
        $flipped->set_col($row_index, \@row_entries);
    }
    return $flipped;
}
# Element-wise difference of two equal-length numeric arrays (given as refs),
# returned as a list.  Dies when the lengths differ.
sub vector_subtract {
    my ($vec1, $vec2) = @_;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    return map { $vec1->[$_] - $vec2->[$_] } 0 .. $#{$vec1};
}
# In-place Fisher-Yates shuffle (adapted from the Perl documentation), which
# produces a uniformly random permutation of the array:
sub fisher_yates_shuffle {
    my $deck = shift;
    for (my $idx = $#{$deck}; $idx > 0; $idx--) {
        # Pick a swap partner uniformly from the slots at or below $idx:
        my $partner = int rand( $idx + 1 );
        ($deck->[$idx], $deck->[$partner]) = ($deck->[$partner], $deck->[$idx]);
    }
}
######################### Generating Synthetic Data for Manifold Clustering ##########################
################################## Class DataGenerator ########################################
## The embedded class defined below is for generating synthetic data for
## experimenting with linear manifold clustering when the data resides on the
## surface of a sphere. See the script generate_data_on_a_sphere.pl in the
## `examples' directory for how to specify the number of clusters and the spread of
## each cluster in the data that is generated.
package DataGenerator;
use strict;
use Carp;
##  Constructor for the synthetic-data generator.
##
##  Recognized keyword arguments:
##      output_file                    -- (required) name of the CSV file to write
##      total_number_of_samples_needed -- (required) total points over all clusters
##      number_of_clusters_on_sphere   -- default 3
##      cluster_width                  -- default 0.1
##      show_hidden_in_3D_plots        -- default 1
##      debug                          -- default 0
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    =>  $args{output_file}
                                            || croak("name for output_file required"),
        _total_number_of_samples_needed =>  $args{total_number_of_samples_needed}
                                            || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   =>  $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  =>  $args{cluster_width} || 0.1,
        # BUGFIX: was `$args{show_hidden_in_3D_plots} || 1', under which an
        # explicit 0 from the user was silently overridden to 1.  The
        # defined-or operator keeps a supplied 0 while still defaulting to 1:
        _show_hidden_in_3D_plots        =>  $args{show_hidden_in_3D_plots} // 1,
        _debug                          =>  $args{debug} || 0,
    }, $class;
}
##  Verifies the keyword arguments supplied to the DataGenerator constructor.
##
##    Arguments:  list of the parameter names supplied by the user
##    Returns:    1 if every name is recognized, 0 otherwise
##
##  BUGFIXES: (1) `debug' was missing from the legal list even though new()
##  reads $args{debug}, so a perfectly valid  debug => 1  argument croaked;
##  (2) an empty argument list used to yield undef rather than 1.
##  List::Util::any is called fully qualified because this package does not
##  import it (the file loads List::Util at the top).
sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                            debug
                          /;
    foreach my $param (@params) {
        return 0 unless List::Util::any { $param eq $_ } @legal_params;
    }
    return 1;
}
## We first generate a set of points randomly on the unit sphere --- the number of
## points being equal to the number of clusters desired. These points will serve as
## cluster means (or, as cluster centroids) subsequently when we ask
## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
## points on the sphere. The second argument is the cluster mean and the third
## argument the cluster covariance. For the synthetic data, we set the cluster
## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
## variance along the azimuth direction and the (1,1) element corresponding to the
## variance along the elevation direction.
##
## When you generate the points in the 2D spherical coordinates of
## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
## the multivariate-normal function whose azimuth angle is outside the interval
## (0,360) and/or whose elevation angle is outside the interval (-90,90).
##
## Note that the first of the two dimensions for which the multivariate-normal
## function returns the points is for the azimuth angle and the second for the
## elevation angle.
##
## With regard to the relationship of the Cartesian coordinates to the spherical
## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
## and z the vertical axis. The elevation angle theta is measured with respect to
## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the
## elevation angle of +90 and the lowest point on the sphere (the Nadir)
## corresponds to the elevation angle of -90. The azimuth is measured with respect
## to the X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is
## measured from the XY plane and its range is (-90,90) degrees.
##  Generates the synthetic data on the unit sphere and writes it out in CSV
##  form, one `label,x,y,z' record per line, to the output file named at
##  construction time.  Each cluster is sampled from a 2D Gaussian in the
##  (azimuth, elevation) space centered at a randomly chosen point on the sphere.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designate all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere.  This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        # BUGFIX: the elevation range is documented as (-90,90), but the
        # original `rand(90) - 90' confined all the cluster centers to (-90,0),
        # i.e. to the lower hemisphere only:
        my $elevation = rand(180) - 90;
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
                    if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
                    if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center: @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k] ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            print "@$_ " for @new_data;
            print "\n\n";
        }
        # `Wrap-around' logic: points the Gaussian sampler placed outside the
        # (0,360) azimuth and/or (-90,90) elevation intervals are mapped back:
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            # Crossing a pole flips the point to the opposite azimuth:
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            }
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            print "@$_ " for @wrapped_data;
            print "\n\n";
        }
        # Tag each point with a label of the form `a_0', `a_1', ...:
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        print "@$_\n" for @data_dump;
    }
    # NOTE(review): this unqualified call resolves in package DataGenerator;
    # confirm that a fisher_yates_shuffle is defined for this package (the one
    # visible earlier in this file belongs to the main clusterer package).
    fisher_yates_shuffle( \@data_dump );
    # Three-arg open with an error check (the original used the unchecked
    # two-arg form with a bareword handle):
    open my $OUT, '>', $output_file or die "Unable to open $output_file: $!";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    # Conversion factor from degrees to radians (pi/180):
    my $deg_to_rad = atan2(1, 1) / 45.0;
    foreach my $ele (@data_dump) {
        my $label = $ele->[0];
        # BUGFIX: the azimuth/elevation values are maintained in degrees
        # throughout this class, but Perl's sin()/cos() expect radians, so the
        # angles must be converted before projecting onto Cartesian coordinates:
        my $azimuth = $ele->[1] * $deg_to_rad;
        my $elevation = $ele->[2] * $deg_to_rad;
        my $x = cos($elevation) * cos($azimuth);
        my $y = cos($elevation) * sin($azimuth);
        my $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print $OUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close $OUT or die "Unable to close $output_file: $!";
}
# This version for the embedded class for data generation.
#
#   Displays the labeled records in $datafile as points on a wireframe unit
#   sphere via gnuplot.  A temporary whitespace-separated copy of the data,
#   grouped by cluster label, is written out for gnuplot's `index' mechanism.
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open my $OUTPUT, '>', $temp_file
        or die "Unable to open a temp file in this directory: $!";
    # BUGFIX: the original opened the basename of the data file, which fails
    # whenever $datafile carries a directory component; open the path as given:
    open my $INPUT, '<', $datafile or die "Unable to open $datafile: $!";
    local $/ = undef;                 # slurp mode: read the whole file at once
    my @all_records = split /\s+/, <$INPUT>;
    close $INPUT;
    my %clusters;
    foreach my $record (@all_records) {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        # The cluster is encoded in the label prefix (e.g. `a_12' belongs to `a'):
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    # Write each cluster as its own gnuplot data set, separated by blank lines:
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        print {$OUTPUT} "@$_\n" for @{$clusters{$key}};
        print {$OUTPUT} "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close $OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    # NOTE(review): hidden3d is enabled only when show_hidden is *false*; this
    # reads as inverted but is left untouched here -- confirm against the main
    # clusterer class's plotting code before changing.
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v) = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    # Strip the trailing ", " left by the loop above.  (The original did this
    # with a pattern match whose boolean result momentarily clobbered
    # $arg_string before it was restored from $1.)
    $arg_string =~ s/,\s*$//;
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}
# [MetaCPAN viewer footer: ( run in 0.701 second using v1.01-cache-2.11-cpan-0bd6704ced7 )]