Algorithm-LinearManifoldDataClusterer
view release on metacpan or search on metacpan
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
package Algorithm::LinearManifoldDataClusterer;
#------------------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
# resides on a low-dimensional manifold in a high-dimensional measurement space.
# -----------------------------------------------------------------------------------
use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(reduce any);
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use POSIX ();
our $VERSION = '1.01';
# Constructor:
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params(@params) == 0;
bless {
_datafile => $args{datafile} || croak("datafile required"),
_mask => $args{mask} || croak("mask required"),
_K => $args{K} || 0,
_P => $args{P} || 0,
_terminal_output => $args{terminal_output} || 0,
_max_iterations => $args{max_iterations} || 0,
_delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001,
_delta_normalized_error => undef,
_cluster_search_multiplier => $args{cluster_search_multiplier} || 1,
_visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1,
_show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
_make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1,
_debug => $args{debug} || 0,
_N => 0,
_KM => $args{K} * $args{cluster_search_multiplier},
_data_hash => {},
_data_tags => [],
_data_dimensions => 0,
_final_clusters => [],
_auto_retry_flag => 0,
_num_iterations_actually_used => undef,
_scale_factor => undef,
_data_tags_to_cluster_label_hash => {},
_final_reference_vecs_for_all_subspaces => [],
_reconstruction_error_as_a_function_of_iteration => [],
_final_trailing_eigenvec_matrices_for_all_subspaces => [],
_subspace_construction_error_as_a_function_of_iteration => [],
}, $class;
}
sub get_data_from_csv {
my $self = shift;
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};
open FILEIN, $filename or die "Unable to open $filename: $!";
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
local $/ = undef;
my @all_data = split /\s+/, <FILEIN>;
my %data_hash = ();
my @data_tags = ();
foreach my $record (@all_data) {
my @splits = split /,/, $record;
my $record_name = shift @splits;
$data_hash{$record_name} = \@splits;
push @data_tags, $record_name;
}
$self->{_data_hash} = \%data_hash;
$self->{_data_tags} = \@data_tags;
$self->{_N} = scalar @data_tags;
}
sub estimate_mean_and_covariance {
my $self = shift;
my $tag_set = shift;
my $cluster_size = @$tag_set;
my @cluster_center = @{$self->add_point_coords($tag_set)};
@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
# for covariance calculation:
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
if $self->{_debug};
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# All the record labels are stored in the array $self->{_data_tags}. The actual
# data for clustering is stored in a hash at $self->{_data_hash} whose keys are
# the record labels; the value associated with each key is the array holding the
# corresponding numerical multidimensional data.
$mean_vec->set_col(0, \@cluster_center);
if ($self->{_debug}) {
print "\nDisplaying the mean vector for the cluster:\n";
display_matrix( $mean_vec ) if $self->{_terminal_output};
}
foreach my $j (0..$num_cols-1) {
my $tag = $tag_set->[$j];
my $data = $self->{_data_hash}->{$tag};
my @diff_from_mean = vector_subtract($data, \@cluster_center);
$matrix->set_col($j, \@diff_from_mean);
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for cluster:";
display_matrix( $covariance ) if $self->{_terminal_output};
}
return ($mean_vec, $covariance);
}
sub eigen_analysis_of_covariance {
my $self = shift;
my $covariance = shift;
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
my $num_of_eigens = @$eigenvalues;
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
foreach my $i (1..$num_of_eigens-1) {
$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
}
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
my @all_my_eigenvecs;
foreach my $i (0..$num_of_eigens-1) {
my @vec = $eigenvectors->[$i]->as_list;
my @eigenvec;
foreach my $ele (@vec) {
my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
if ($theta eq "0") {
push @eigenvec, $mag;
} elsif ($theta eq "pi") {
push @eigenvec, -1.0 * $mag;
} else {
die "Eigendecomposition produced a complex eigenvector -- " .
"which should not happen for a covariance matrix!";
}
}
print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
push @all_my_eigenvecs, \@eigenvec;
}
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
my @sorted_eigenvecs;
my @sorted_eigenvals;
foreach my $i (0..@sorted_eigenvec_indexes-1) {
$sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
$sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
}
if ($self->{_debug}) {
print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
foreach my $i (0..@sorted_eigenvecs-1) {
print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
}
}
return (\@sorted_eigenvecs, \@sorted_eigenvals);
}
sub auto_retry_clusterer {
my $self = shift;
$self->{_auto_retry_flag} = 1;
my $clusters;
$@ = 1;
my $retry_attempts = 1;
while ($@) {
eval {
$clusters = $self->linear_manifold_clusterer();
};
if ($@) {
if ($self->{_terminal_output}) {
print "Clustering failed. Trying again. --- $@";
print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
print "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
}
$retry_attempts++;
} else {
print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
return $clusters;
}
}
}
sub linear_manifold_clusterer {
my $self = shift;
my $KM = $self->{_KM};
my @initial_cluster_center_tags;
my $visualization_msg;
my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
if $self->{_debug};
@initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
if ($self->{_debug}) {
foreach my $centroid (@initial_cluster_center_coords) {
print "Initial cluster center coords: @{$centroid}\n";
}
}
my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "initial_clusters";
$self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
if $self->{_visualize_each_iteration};
$self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
if $self->{_make_png_for_each_iteration};
}
foreach my $cluster (@$initial_clusters) {
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
display_mean_and_covariance($mean, $covariance) if $self->{_debug};
}
my @clusters = @$initial_clusters;
display_clusters(\@clusters) if $self->{_debug};
my $iteration_index = 0;
my $unimodal_correction_flag;
my $previous_min_value_for_unimodality_quotient;
while ($iteration_index < $self->{_max_iterations}) {
print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
if $self->{_terminal_output};
my $total_reconstruction_error_this_iteration = 0;
my @subspace_construction_errors_this_iteration;
my @trailing_eigenvec_matrices_for_all_subspaces;
my @reference_vecs_for_all_subspaces;
foreach my $cluster (@clusters) {
next if @$cluster == 0;
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
display_mean_and_covariance($mean, $covariance) if $self->{_debug};
print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
push @subspace_construction_errors_this_iteration, $subspace_construction_error;
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
$self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
my %best_subspace_based_partition_of_data;
foreach my $i (0..$self->{_KM}-1) {
$best_subspace_based_partition_of_data{$i} = [];
}
foreach my $data_tag (@{$self->{_data_tags}}) {
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
my @errors = map {$self->reconstruction_error($data_vec,
$trailing_eigenvec_matrices_for_all_subspaces[$_],
$reference_vecs_for_all_subspaces[$_])}
0 .. $self->{_KM}-1;
my ($minval, $index_for_closest_subspace) = minimum(\@errors);
$total_reconstruction_error_this_iteration += $minval;
push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
$data_tag;
}
print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
"iteration and re-assigning the data samples to the new subspaces on the basis of\n".
"the least reconstruction error.\n\n" .
"Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
if $self->{_terminal_output};
foreach my $i (0..$self->{_KM}-1) {
$clusters[$i] = $best_subspace_based_partition_of_data{$i};
}
display_clusters(\@clusters) if $self->{_terminal_output};
# Check if any cluster has lost all its elements. If so, fragment the worst
# existing cluster to create the additional clusters needed:
if (any {@$_ == 0} @clusters) {
die "empty cluster found" if $self->{_auto_retry_flag};
print "\nOne or more clusters have become empty. Will carve out the needed clusters\n" .
"from the cluster with the largest subspace construction error.\n\n";
$total_reconstruction_error_this_iteration = 0;
@subspace_construction_errors_this_iteration = ();
my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
my $max = List::Util::max @subspace_construction_errors_this_iteration;
my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
my @cluster_fragments = cluster_split($clusters[$maxindex],
$how_many_extra_clusters_needed + 1);
my @newclusters;
push @newclusters, @clusters[0 .. $maxindex-1];
push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
push @newclusters, @cluster_fragments;
@newclusters = grep {@$_ != 0} @newclusters;
die "something went wrong with cluster fragmentation"
unless $self->{_KM} = @newclusters;
@trailing_eigenvec_matrices_for_all_subspaces = ();
@reference_vecs_for_all_subspaces = ();
foreach my $cluster (@newclusters) {
die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
push @subspace_construction_errors_this_iteration, $subspace_construction_error;
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
my %best_subspace_based_partition_of_data;
foreach my $i (0..$self->{_KM}-1) {
$best_subspace_based_partition_of_data{$i} = [];
}
foreach my $data_tag (@{$self->{_data_tags}}) {
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
my @errors = map {reconstruction_error($data_vec,
$trailing_eigenvec_matrices_for_all_subspaces[$_],
$reference_vecs_for_all_subspaces[$_])}
0 .. $self->{_KM}-1;
my ($minval, $index_for_closest_subspace) = minimum(\@errors);
$total_reconstruction_error_this_iteration += $minval;
push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
$data_tag;
}
print "empty-cluster jag: total reconstruction error in this iteration: \n" .
"$total_reconstruction_error_this_iteration\n"
if $self->{_debug};
foreach my $i (0..$self->{_KM}-1) {
$clusters[$i] = $best_subspace_based_partition_of_data{$i};
}
display_clusters(\@newclusters) if $self->{_terminal_output};
@clusters = grep {@$_ != 0} @newclusters;
die "linear manifold based algorithm does not appear to work in this case $!"
unless @clusters == $self->{_KM};
}# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "clustering_at_iteration_$iteration_index";
$self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
if $self->{_visualize_each_iteration};
$self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
if $self->{_make_png_for_each_iteration};
}
my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_],
$reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
die "\n\nBailing out!\n" .
"It does not look like these iterations will lead to a good clustering result.\n" .
"Program terminating. Try running again.\n"
if defined($previous_min_value_for_unimodality_quotient)
&& ($min_value_for_unimodality_quotient < 0.4)
&& ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
if ( $min_value_for_unimodality_quotient < 0.5 ) {
$unimodal_correction_flag = 1;
print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
my @sorted_cluster_indexes =
sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
my @newclusters;
foreach my $cluster_index (0..@clusters - 1) {
push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
}
@clusters = @newclusters;
my $worst_cluster = pop @clusters;
print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
my $second_worst_cluster = pop @clusters;
print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
push @$worst_cluster, @$second_worst_cluster;
fisher_yates_shuffle($worst_cluster);
my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
push @clusters, \@first_half;
push @clusters, \@second_half;
if ($self->{_terminal_output}) {
print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
display_clusters(\@clusters);
}
}
if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
if (($last_recon_error - $total_reconstruction_error_this_iteration)
< $self->{_delta_normalized_error}) {
push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
$total_reconstruction_error_this_iteration;
last;
}
}
push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
$total_reconstruction_error_this_iteration;
$iteration_index++;
$previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient;
} # end of while loop on iteration_index
$self->{_num_iterations_actually_used} =
scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
if ($self->{_terminal_output}) {
print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
"merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
print "Total reconstruction error as a function of iterations: " .
"@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
}
# now merge sub-clusters if cluster_search_multiplier > 1
my @final_clusters;
if ($self->{_cluster_search_multiplier} > 1) {
print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
my @array_of_partitioned_cluster_groups = (\@clusters);
my @partitioned_cluster_groups;
my $how_many_clusters_looking_for = $self->{_K};
while (scalar(@final_clusters) < $self->{_K}) {
@partitioned_cluster_groups =
$self->graph_partition(shift @array_of_partitioned_cluster_groups,
$how_many_clusters_looking_for );
if (@{$partitioned_cluster_groups[0]} == 1) {
my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
} elsif ( $arr->[$i] > $max ) {
$max = $arr->[$i];
}
}
return ($min, $max);
}
# Meant only for constructing a deep copy of an array of arrays:
sub deep_copy_AoA {
my $ref_in = shift;
my $ref_out;
foreach my $i (0..@{$ref_in}-1) {
foreach my $j (0..@{$ref_in->[$i]}-1) {
$ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
}
}
return $ref_out;
}
# For displaying the individual clusters on a terminal screen. Each cluster is
# displayed through the symbolic names associated with the data points.
sub display_clusters {
my @clusters = @{shift @_};
my $i = 0;
foreach my $cluster (@clusters) {
@$cluster = sort @$cluster;
my $cluster_size = @$cluster;
print "\n\nCluster $i ($cluster_size records):\n";
foreach my $ele (@$cluster) {
print " $ele";
}
$i++
}
print "\n\n";
}
sub display_mean_and_covariance {
my $mean = shift;
my $covariance = shift;
print "\nDisplay the mean:\n";
display_matrix($mean);
print "\nDisplay the covariance:\n";
display_matrix($covariance);
}
sub check_for_illegal_params {
my @params = @_;
my @legal_params = qw / datafile
mask
K
P
terminal_output
cluster_search_multiplier
max_iterations
delta_reconstruction_error
visualize_each_iteration
show_hidden_in_3D_plots
make_png_for_each_iteration
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub display_matrix {
my $matrix = shift;
my $nrows = $matrix->rows();
my $ncols = $matrix->cols();
print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
foreach my $i (0..$nrows-1) {
my $row = $matrix->row($i);
my @row_as_list = $row->as_list;
# print "@row_as_list\n";
map { printf("%.4f ", $_) } @row_as_list;
print "\n";
}
print "\n\n";
}
sub transpose {
my $matrix = shift;
my $num_rows = $matrix->rows();
my $num_cols = $matrix->cols();
my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
foreach my $i (0..$num_rows-1) {
my @row = $matrix->row($i)->as_list;
$transpose->set_col($i, \@row );
}
return $transpose;
}
sub vector_subtract {
my $vec1 = shift;
my $vec2 = shift;
die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
my @result;
foreach my $i (0..@$vec1-1){
push @result, $vec1->[$i] - $vec2->[$i];
}
return @result;
}
# from perl docs:
sub fisher_yates_shuffle {
my $arr = shift;
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
######################### Generating Synthetic Data for Manifold Clustering ##########################
################################## Class DataGenerator ########################################
## The embedded class defined below is for generating synthetic data for
## experimenting with linear manifold clustering when the data resides on the
## surface of a sphere. See the script generate_data_on_a_sphere.pl in the
## `examples' directory for how to specify the number of clusters and the spread of
## each cluster in the data that is generated.
package DataGenerator;
use strict;
use Carp;
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if _check_for_illegal_params3(@params) == 0;
bless {
_output_file => $args{output_file}
|| croak("name for output_file required"),
_total_number_of_samples_needed => $args{total_number_of_samples_needed}
|| croak("total_number_of_samples_needed required"),
_number_of_clusters_on_sphere => $args{number_of_clusters_on_sphere} || 3,
_cluster_width => $args{cluster_width} || 0.1,
_show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} || 1,
_debug => $args{debug} || 0,
}, $class;
}
sub _check_for_illegal_params3 {
my @params = @_;
my @legal_params = qw / output_file
total_number_of_samples_needed
number_of_clusters_on_sphere
cluster_width
show_hidden_in_3D_plots
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
## We first generate a set of points randomly on the unit sphere --- the number of
## points being equal to the number of clusters desired. These points will serve as
## cluster means (or, as cluster centroids) subsequently when we ask
## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
## points on the sphere. The second argument is the cluster mean and the third
## argument the cluster covariance. For the synthetic data, we set the cluster
## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
## variance along the azimuth direction and the (1,1) element corresponding to the
## variance along the elevation direction.
##
## When you generate the points in the 2D spherical coordinates of
## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
## the multivariate-normal function whose azimuth angle is outside the interval
## (0,360) and/or whose elevation angle is outside the interval (-90,90).
##
## Note that the first of the two dimensions for which the multivariate-normal
## function returns the points is for the azimuth angle and the second for the
## elevation angle.
##
## With regard to the relationship of the Cartesian coordinates to the spherical
## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
## and z the vertical axis. The elevation angle theta is measure with respect to
## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the
## elevation angle of +90 and the lowest points on the sphere (the Nadir)
## corresponds to the elevation angle of -90. The azimuth is measured with respect
## X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation is
## measured from the XY plane and its range is (-90,90) degrees.
sub gen_data_and_write_to_csv {
my $self = shift;
my $K = $self->{_number_of_clusters_on_sphere};
# $N is the number of samples to be generated for each cluster:
my $N = int($self->{_total_number_of_samples_needed} / $K);
my $output_file = $self->{_output_file};
# Designated all of the data elements in a cluster by a letter that is followed by
# an integer that identifies a specific data element.
my @point_labels = ('a'..'z');
# Our first job is to define $K random points in the 2D space (azimuth,
# elevation) to serve as cluster centers on the surface of the sphere. This we
# do by calling a uniformly distributed 1-D random number generator, first for
# the azimuth and then for the elevation in the loop shown below:
my @cluster_centers;
my @covariances;
foreach my $i (0..$K-1) {
my $azimuth = rand(360);
my $elevation = rand(90) - 90;
my @mean = ($azimuth, $elevation);
push @cluster_centers, \@mean;
my $cluster_covariance;
# The j-th dimension is for azimuth and k-th for elevation for the directions
# to surface of the sphere:
foreach my $j (0..1) {
foreach my $k (0..1) {
$cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
if $j == 0 && $k == 0;
$cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
if $j == 1 && $k == 1;
$cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
}
( run in 1.103 second using v1.01-cache-2.11-cpan-5735350b133 )