Algorithm-LinearManifoldDataClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/LinearManifoldDataClusterer.pm  view on Meta::CPAN

package Algorithm::LinearManifoldDataClusterer;

#------------------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
# resides on a low-dimensional manifold in a high-dimensional measurement space.
# -----------------------------------------------------------------------------------

use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(reduce any);
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use POSIX (); 

our $VERSION = '1.01';

# Constructor:
sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params(@params) == 0;
    bless {
        _datafile                     =>   $args{datafile} || croak("datafile required"),
        _mask                         =>   $args{mask}     || croak("mask required"),
        _K                            =>   $args{K}        || 0,
        _P                            =>   $args{P}        || 0,
        _terminal_output              =>   $args{terminal_output} || 0,
        _max_iterations               =>   $args{max_iterations} || 0,
        _delta_reconstruction_error   =>   $args{delta_reconstruction_error} || 0.001,
        _delta_normalized_error       =>   undef,
        _cluster_search_multiplier    =>   $args{cluster_search_multiplier} || 1,
        _visualize_each_iteration     =>   $args{visualize_each_iteration} == 0 ? 0 : 1,
        _show_hidden_in_3D_plots      =>   $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
        _make_png_for_each_iteration  =>   $args{make_png_for_each_iteration} == 0 ? 0 : 1,
        _debug                        =>   $args{debug} || 0,
        _N                            =>   0,
        _KM                           =>   $args{K} * $args{cluster_search_multiplier},
        _data_hash                    =>   {},
        _data_tags                    =>   [],
        _data_dimensions              =>   0,
        _final_clusters               =>   [],
        _auto_retry_flag              =>   0,
        _num_iterations_actually_used =>   undef,
        _scale_factor                 =>   undef,
        _data_tags_to_cluster_label_hash  => {},
        _final_reference_vecs_for_all_subspaces => [],
        _reconstruction_error_as_a_function_of_iteration => [],
        _final_trailing_eigenvec_matrices_for_all_subspaces => [],
        _subspace_construction_error_as_a_function_of_iteration => [],
    }, $class;
}

sub get_data_from_csv {
    my $self = shift;
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n" if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data_hash} = \%data_hash;
    $self->{_data_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
}

sub estimate_mean_and_covariance {
    my $self = shift;
    my $tag_set = shift;
    my $cluster_size = @$tag_set;
    my @cluster_center = @{$self->add_point_coords($tag_set)};
    @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
    # for covariance calculation:
    my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
    print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
        if $self->{_debug};
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_tags}.  The actual
    # data for clustering is stored in a hash at $self->{_data_hash} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    $mean_vec->set_col(0, \@cluster_center);
    if ($self->{_debug}) {
        print "\nDisplaying the mean vector for the cluster:\n";
        display_matrix( $mean_vec ) if $self->{_terminal_output};
    }
    foreach my $j (0..$num_cols-1) {
        my $tag = $tag_set->[$j];            
        my $data = $self->{_data_hash}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);
    }
    my $transposed = transpose( $matrix );
    my $covariance = $matrix * $transposed;
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance ) if $self->{_terminal_output};
    }
    return ($mean_vec, $covariance);
}

sub eigen_analysis_of_covariance {
    my $self = shift;
    my $covariance = shift;
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;   
            } else {
                die "Eigendecomposition produced a complex eigenvector -- " .
                    "which should not happen for a covariance matrix!";
            }
        }
        print "Eigenvector $i:   @eigenvec\n" if $self->{_debug};
        push @all_my_eigenvecs, \@eigenvec;
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];        
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];       
    }
    if ($self->{_debug}) {
        print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec:  @{$sorted_eigenvecs[$i]}       eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    return (\@sorted_eigenvecs, \@sorted_eigenvals);
}

sub auto_retry_clusterer {
    my $self = shift;    
    $self->{_auto_retry_flag} = 1;
    my $clusters;
    $@ = 1;
    my $retry_attempts = 1;
    while ($@) {
        eval {
            $clusters = $self->linear_manifold_clusterer();
        };
        if ($@) {
            if ($self->{_terminal_output}) {
                print "Clustering failed. Trying again. --- $@";
                print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
                print     "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
            }
            $retry_attempts++;
        } else {
            print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
            return $clusters;
        }
    }
}

sub linear_manifold_clusterer {
    my $self = shift;
    my $KM = $self->{_KM};
    my @initial_cluster_center_tags;
    my $visualization_msg;
    my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
    print "Randomly selected indexes for cluster center tags:  @initial_cluster_center_indexes\n"
        if $self->{_debug};
    @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
    my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
    if ($self->{_debug}) {
        foreach my $centroid (@initial_cluster_center_coords) {
            print "Initial cluster center coords:  @{$centroid}\n";
        }
    }
    my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
    if ($self->{_data_dimensions} == 3) {
        $visualization_msg = "initial_clusters";
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters) 
            if $self->{_visualize_each_iteration};
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
            if $self->{_make_png_for_each_iteration};
    }
    foreach my $cluster (@$initial_clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        display_mean_and_covariance($mean, $covariance) if $self->{_debug};
    }
    my @clusters = @$initial_clusters;
    display_clusters(\@clusters) if $self->{_debug};
    my $iteration_index = 0;
    my $unimodal_correction_flag;
    my $previous_min_value_for_unimodality_quotient;
    while ($iteration_index < $self->{_max_iterations}) {
        print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
            if $self->{_terminal_output};
        my $total_reconstruction_error_this_iteration = 0;
        my @subspace_construction_errors_this_iteration;
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@clusters) {
            next if @$cluster == 0;
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            display_mean_and_covariance($mean, $covariance) if $self->{_debug};
            print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my @trailing_eigenvals =  @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
            push @subspace_construction_errors_this_iteration, $subspace_construction_error;
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                                 scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
        my %best_subspace_based_partition_of_data;
        foreach my $i (0..$self->{_KM}-1) {
            $best_subspace_based_partition_of_data{$i} = [];
        }
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my @errors = map {$self->reconstruction_error($data_vec,
                                  $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                  $reference_vecs_for_all_subspaces[$_])}
                         0 .. $self->{_KM}-1;
            my ($minval, $index_for_closest_subspace) = minimum(\@errors);
            $total_reconstruction_error_this_iteration += $minval;
            push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                                                                      $data_tag;
        }
        print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
              "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
              "the least reconstruction error.\n\n" .
              "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
                  if $self->{_terminal_output};
        foreach my $i (0..$self->{_KM}-1) {
            $clusters[$i] = $best_subspace_based_partition_of_data{$i};
        }
        display_clusters(\@clusters) if $self->{_terminal_output};
        # Check if any cluster has lost all its elements. If so, fragment the worst
        # existing cluster to create the additional clusters needed:
        if (any {@$_ == 0} @clusters) {
            die "empty cluster found" if $self->{_auto_retry_flag};
            print "\nOne or more clusters have become empty.  Will carve out the needed clusters\n" .
                  "from the cluster with the largest subspace construction error.\n\n";
            $total_reconstruction_error_this_iteration = 0;
            @subspace_construction_errors_this_iteration = ();
            my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
            print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
            my $max = List::Util::max @subspace_construction_errors_this_iteration;
            my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
            my @cluster_fragments = cluster_split($clusters[$maxindex], 
                                                  $how_many_extra_clusters_needed + 1);
            my @newclusters;
            push @newclusters, @clusters[0 .. $maxindex-1];
            push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
            push @newclusters, @cluster_fragments;
            @newclusters = grep {@$_ != 0} @newclusters; 
            die "something went wrong with cluster fragmentation" 
                unless $self->{_KM} = @newclusters;
            @trailing_eigenvec_matrices_for_all_subspaces = ();
            @reference_vecs_for_all_subspaces = ();
            foreach my $cluster (@newclusters) {
                die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
                my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
                my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
                my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
                my @trailing_eigenvals =  @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
                my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
                push @subspace_construction_errors_this_iteration, $subspace_construction_error;
                my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                      scalar(@trailing_eigenvecs));
                foreach my $j (0..@trailing_eigenvecs-1) {
                    $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
                }
                push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
                push @reference_vecs_for_all_subspaces, $mean;
            }
            my %best_subspace_based_partition_of_data;
            foreach my $i (0..$self->{_KM}-1) {
                $best_subspace_based_partition_of_data{$i} = [];
            }
            foreach my $data_tag (@{$self->{_data_tags}}) {
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
                my @errors = map {reconstruction_error($data_vec,
                                      $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                      $reference_vecs_for_all_subspaces[$_])}
                             0 .. $self->{_KM}-1;
                my ($minval, $index_for_closest_subspace) = minimum(\@errors);
                $total_reconstruction_error_this_iteration += $minval;
                push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                                                                      $data_tag;
            }
            print "empty-cluster jag: total reconstruction error in this iteration: \n" .
                  "$total_reconstruction_error_this_iteration\n"
                if $self->{_debug};
            foreach my $i (0..$self->{_KM}-1) {
                $clusters[$i] = $best_subspace_based_partition_of_data{$i};
            }
            display_clusters(\@newclusters) if $self->{_terminal_output};
            @clusters = grep {@$_ != 0} @newclusters;
            die "linear manifold based algorithm does not appear to work in this case $!" 
                unless @clusters == $self->{_KM};
        }# end of foreach my $cluster (@clusters) ... loop  followed by if clause for empty clusters
        if ($self->{_data_dimensions} == 3) {
            $visualization_msg = "clustering_at_iteration_$iteration_index";
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
                if $self->{_visualize_each_iteration};
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
                if $self->{_make_png_for_each_iteration};
        }
        my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_], 
                                                      $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
        my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
        print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
        die "\n\nBailing out!\n" .
            "It does not look like these iterations will lead to a good clustering result.\n" .
            "Program terminating.  Try running again.\n" 
            if defined($previous_min_value_for_unimodality_quotient)
               && ($min_value_for_unimodality_quotient < 0.4)
               && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
        if ( $min_value_for_unimodality_quotient < 0.5 ) {
            $unimodal_correction_flag = 1;
            print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
            my @sorted_cluster_indexes = 
               sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
            my @newclusters;
            foreach my $cluster_index (0..@clusters - 1) {
                push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
            }
            @clusters = @newclusters;
            my $worst_cluster = pop @clusters;
            print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
            my $second_worst_cluster = pop @clusters;
            print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
            push @$worst_cluster, @$second_worst_cluster;
            fisher_yates_shuffle($worst_cluster);
            my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
            my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
            push @clusters, \@first_half;
            push @clusters, \@second_half;
            if ($self->{_terminal_output}) {
                print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
                display_clusters(\@clusters);      
            }
        }
        if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
            my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
            push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
            if (($last_recon_error - $total_reconstruction_error_this_iteration) 
                                                        < $self->{_delta_normalized_error}) {
                push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, 
                                                      $total_reconstruction_error_this_iteration;
                last;
            }
        }

lib/Algorithm/LinearManifoldDataClusterer.pm  view on Meta::CPAN

        my @array_of_partitioned_cluster_groups = (\@clusters);
        my @partitioned_cluster_groups;
        my $how_many_clusters_looking_for = $self->{_K};
        while (scalar(@final_clusters) < $self->{_K}) {
            @partitioned_cluster_groups = 
                   $self->graph_partition(shift @array_of_partitioned_cluster_groups, 
                                                             $how_many_clusters_looking_for );
            if (@{$partitioned_cluster_groups[0]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            } elsif (@{$partitioned_cluster_groups[1]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
            } else {
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            }
        }
        my @data_clustered;
        foreach my $cluster (@final_clusters) {
            push @data_clustered, @$cluster;
        }
        unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
            $self->{_final_clusters} = \@final_clusters;
            my %data_clustered = map {$_ => 1} @data_clustered;
            my @data_tags_not_clustered = 
                   grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
            if ($self->{_terminal_output}) {
                print "\n\nNot all data clustered.  The most reliable clusters found by graph partitioning:\n";
                display_clusters(\@final_clusters);
                print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
            }
            if ($self->{_data_dimensions} == 3) {
                $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
                    if $self->{_visualize_each_iteration};
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
                    if $self->{_make_png_for_each_iteration};
            }
            my %data_tags_to_cluster_label_hash;
            foreach my $i (0..@final_clusters-1) {
                map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
            }
            $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
            foreach my $tag (@data_tags_not_clustered) {
                my $which_cluster = $self->which_cluster_for_new_element($tag);
                $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
            }
            die "Some data elements are still missing from the final tally" 
              unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) == 
                scalar(@{$self->{_data_tags}});
            my @new_final_clusters;
            map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
                      push @{$new_final_clusters[$_]}, $ele 
                            if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ } 
                } 0..$self->{_K}-1;
            if ($self->{_debug}) {
                print "\ndisplaying the final clusters after accounting for unclustered data:\n"; 
                display_clusters(\@new_final_clusters);
            }
            $self->{_final_clusters} = \@new_final_clusters;
            @final_clusters = @new_final_clusters;
        }
    } else {
        @final_clusters = @clusters;
    }
    print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
    display_clusters(\@final_clusters) if $self->{_terminal_output};
    return \@final_clusters;
} 

sub display_reconstruction_errors_as_a_function_of_iterations {
    my $self = shift;            
    print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
    print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
          "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
}

sub set_termination_reconstruction_error_threshold {
    my $self = shift;        
    my $all_ref_vecs = shift;
    my @mean_vecs = @$all_ref_vecs;
    my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_; 
                                                      my @result = $result->as_list; 
                                                      sqrt($result[0])
                                                    } @mean_vecs;
    $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
    $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) * 
                                       $self->{_delta_reconstruction_error};
}

# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method.  It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters.  The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
sub which_cluster_for_new_element {
    my $self = shift;    
    my $data_tag = shift;
    # The following `unless' clause is called only the first time the current method
    # is called:
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@{$self->{_final_clusters}}) {
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                                 scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->{_final_trailing_eigenvec_matrices_for_all_subspaces} = 
                                        \@trailing_eigenvec_matrices_for_all_subspaces;
        $self->{_final_reference_vecs_for_all_subspaces} = \@reference_vecs_for_all_subspaces;
    }
    my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
    my @errors = map {$self->reconstruction_error($data_vec,
                          $self->{_final_trailing_eigenvec_matrices_for_all_subspaces}->[$_],
                          $self->{_final_reference_vecs_for_all_subspaces}->[$_])}
                         0 .. $self->{_K}-1;
    my ($minval, $index_for_closest_subspace) = minimum(\@errors);
    return $index_for_closest_subspace;
}

sub graph_partition {
    my $self = shift;
    my $clusters = shift;
    my $how_many_clusters_looking_for = shift;
    print "\n\nGraph partitioning looking for $how_many_clusters_looking_for clusters\n\n";
    my $num_nodes = scalar @$clusters;
    my $W = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $D->identity;
    my $neg_sqrt_of_D = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    $neg_sqrt_of_D->identity;
    my @subspace_construction_errors;
    my @trailing_eigenvec_matrices_for_all_subspaces;
    my @reference_vecs_for_all_subspaces;
    foreach my $cluster (@$clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
        my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
        my @trailing_eigenvals =  @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
        my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
        push @subspace_construction_errors, $subspace_construction_error;
        my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                             scalar(@trailing_eigenvecs));
        foreach my $j (0..@trailing_eigenvecs-1) {
            print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
            $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
        }
        push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
        push @reference_vecs_for_all_subspaces, $mean;
    }
    # We consider the similarity matrix W to be a sum of two parts W_recon_based and
    # W_dist_bet_means based. IMPORTANT: For coding reasons, we first store the two
    # similarity measures separately in W_recon_based and W_dist_bet_means based. Our
    # goal is to fill up these component matrices with the raw values while at the
    # same time collecting information needed for normalizing these two separate
    # measures of similarity.
    my $W_reconstruction_error_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my $W_dist_between_means_based = Math::GSL::Matrix->new($num_nodes,$num_nodes);
    my @all_pairwise_reconstruction_errors;
    my @all_dist_between_means_errors;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my ($recon_error_similarity, $dist_bet_means) = $self->pairwise_cluster_similarity(
                                                      $clusters->[$i], 
                                                      $trailing_eigenvec_matrices_for_all_subspaces[$i],
                                                      $reference_vecs_for_all_subspaces[$i],
                                                      $clusters->[$j],
                                                      $trailing_eigenvec_matrices_for_all_subspaces[$j],
                                                      $reference_vecs_for_all_subspaces[$j]);
            $W_reconstruction_error_based->set_elem($i, $j, $recon_error_similarity);
            $W_dist_between_means_based->set_elem($i, $j, $dist_bet_means);
            push @all_pairwise_reconstruction_errors, $recon_error_similarity;
            push @all_dist_between_means_errors, $dist_bet_means;
        }
    }
    my $recon_error_normalizer = (reduce {$a + $b} @all_pairwise_reconstruction_errors) / 
                                 (scalar @all_pairwise_reconstruction_errors);
    my $dist_bet_means_based_normalizer = (reduce {$a + $b} @all_dist_between_means_errors ) /
                                 (scalar @all_dist_between_means_errors );
    die "\n\nBailing out!\n" .
        "Dealing with highly defective clusters.  Try again.\n" 
        if ($recon_error_normalizer == 0) || ($dist_bet_means_based_normalizer == 0);
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
            my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
            $W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
            my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
            my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
            $W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
        }
    }
    $W = $W_reconstruction_error_based + $W_dist_between_means_based;
    if ($self->{_debug}) {
        print "\nDisplaying the similarity matrix W for the cluster graph:\n";
        display_matrix($W) if $self->{_terminal_output};
    }
    my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
    foreach my $col (0..$num_nodes-1) {
        $add_all_columns += $W->col($col);
    }
    foreach my $i (0..$num_nodes-1) {
        $D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
        $neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
    }
    # the Laplacian matrix:
    my $Laplacian = $D - $W;
    # the Symmetric Normalized Laplacian matrix A:
    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0:   $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i:   $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;   
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i:   @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A:   @largest_eigen_vec\n";
    }
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];        
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];       
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec:  @{$sorted_eigenvecs[$i]}       eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i]; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i]; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]}; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]}; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers.  Please run this program again.\n";
        }
    } else {
        my @positive_clusters;
        my @negative_clusters;

lib/Algorithm/LinearManifoldDataClusterer.pm  view on Meta::CPAN

    my $mean = shift;   
    my $delta = 0.4 * $self->{_scale_factor};  # Radius of the delta ball along each dimension
    my @mean = $mean->as_list;
    my @data_tags_for_range_tests;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        my $range = $max - $min;
        my $mean_along_this_dimen = $mean[$dimen];
        my @tags =  grep {$_} 
                    map { ( ($self->{_data_hash}->{$_}->[$dimen] > $mean_along_this_dimen - $delta * $range) 
                            && 
                            ($self->{_data_hash}->{$_}->[$dimen] < $mean_along_this_dimen + $delta * $range) ) 
                          ? $_ : undef }
                    @$cluster; 
        push @data_tags_for_range_tests, \@tags;
    }
    # Now find the intersection of the tag sets for each of the dimensions
    my %intersection_hash;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my %tag_hash_for_this_dimen  = map {$_ => 1} @{$data_tags_for_range_tests[$dimen]};
        if ($dimen == 0) {
            %intersection_hash = %tag_hash_for_this_dimen;
        } else {
            %intersection_hash = map {$_ => 1} grep {$tag_hash_for_this_dimen{$_}} 
                                 keys %intersection_hash;
        }
    }
    my @intersection_set = keys %intersection_hash;
    my $cluster_unimodality_index = scalar(@intersection_set) / scalar(@$cluster);
    return $cluster_unimodality_index;
}

sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;
    my $trailing_eigenvec_matrix = shift;
    my $mean = shift;        # a GSL marix ref
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {        
            push @coordinate_vec,  $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    my $recon_error_for_original_mean = shift @reconstruction_errors;
    my $smallest_error_randomly_selected_ref_vecs = List::Util::min(@reconstruction_errors);
    my $minindex = List::Util::first { $_ == $smallest_error_randomly_selected_ref_vecs }
                                                    @reconstruction_errors;
    my $refvec = $random_points[$minindex];
    return $refvec;
}

##  The reconstruction error relates to the size of the perpendicular from a data
##  point X to the hyperplane that defines a given subspace on the manifold.
sub reconstruction_error {
    my $self = shift;
    my $data_vec = shift;
    my $trailing_eigenvecs = shift;
    my $ref_vec = shift;    
    my $error_squared = transpose($data_vec - $ref_vec) * $trailing_eigenvecs *  
                                 transpose($trailing_eigenvecs) * ($data_vec - $ref_vec);
    my @error_squared_as_list = $error_squared->as_list();
    my $error_squared_as_scalar = shift @error_squared_as_list;
    return $error_squared_as_scalar;
}

# Returns a set of KM random integers.  These serve as indices to reach into the data
# array.  A data element whose index is one of the random numbers returned by this
# routine serves as an initial cluster center.  Note the quality check it runs on the
# list of the random integers constructed.  We first make sure that all the random
# integers returned are different.  Subsequently, we carry out a quality assessment
# of the random integers constructed.  This quality measure consists of the ratio of
# the values spanned by the random integers to the value of N, the total number of
# data points to be clustered.  Currently, if this ratio is less than 0.3, we discard
# the K integers and try again.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;   # This value is set to the parameter KM in the call to this subroutine
    my $data_store_size = shift;
    my @cluster_center_indices;
    while (1) {
        foreach my $i (0..$K-1) {
            $cluster_center_indices[$i] = int rand( $data_store_size );
            next if $i == 0;
            foreach my $j (0..$i-1) {
                while ( $cluster_center_indices[$j] == $cluster_center_indices[$i] ) {

lib/Algorithm/LinearManifoldDataClusterer.pm  view on Meta::CPAN

        } elsif ( $arr->[$i] < $min ) {
            $min = $arr->[$i];
        } elsif ( $arr->[$i] > $max ) {
            $max = $arr->[$i];
        }
    }
    return ($min, $max);
}

# Meant only for constructing a deep copy of an array of arrays:
sub deep_copy_AoA {
    my $ref_in = shift;
    my $ref_out;
    foreach my $i (0..@{$ref_in}-1) {
        foreach my $j (0..@{$ref_in->[$i]}-1) {
            $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
        }
    }
    return $ref_out;
}

# For displaying the individual clusters on a terminal screen.  Each cluster is
# displayed through the symbolic names associated with the data points.
sub display_clusters {
    my @clusters = @{shift @_};
    my $i = 0;
    foreach my $cluster (@clusters) {
        @$cluster = sort @$cluster;
        my $cluster_size = @$cluster;
        print "\n\nCluster $i ($cluster_size records):\n";
        foreach my $ele (@$cluster) {
            print "  $ele";
        }
        $i++
    }
    print "\n\n";
}

sub display_mean_and_covariance {
    my $mean = shift;
    my $covariance = shift;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}

sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        my $row = $matrix->row($i);
        my @row_as_list = $row->as_list;
#        print "@row_as_list\n";
        map { printf("%.4f ", $_) } @row_as_list;
        print "\n";
    }
    print "\n\n";
}

sub transpose {
    my $matrix = shift;
    my $num_rows = $matrix->rows();
    my $num_cols = $matrix->cols();
    my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
    foreach my $i (0..$num_rows-1) {
        my @row = $matrix->row($i)->as_list;
        $transpose->set_col($i, \@row );
    }
    return $transpose;
}

sub vector_subtract {
    my $vec1 = shift;
    my $vec2 = shift;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    my @result;
    foreach my $i (0..@$vec1-1){
        push @result, $vec1->[$i] - $vec2->[$i];
    }
    return @result;
}

# from perl docs:
sub fisher_yates_shuffle {                
    my $arr =  shift;                
    my $i = @$arr;                   
    while (--$i) {                   
        my $j = int rand( $i + 1 );  
        @$arr[$i, $j] = @$arr[$j, $i]; 
    }
}

#########################  Generating Synthetic Data for Manifold Clustering  ##########################

##################################      Class DataGenerator     ########################################

##  The embedded class defined below is for generating synthetic data for
##  experimenting with linear manifold clustering when the data resides on the
##  surface of a sphere.  See the script generate_data_on_a_sphere.pl in the
##  `examples' directory for how to specify the number of clusters and the spread of
##  each cluster in the data that is generated.

package DataGenerator;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if _check_for_illegal_params3(@params) == 0;   
    bless {
        _output_file                       =>   $args{output_file} 
                                                   || croak("name for output_file required"),
        _total_number_of_samples_needed    =>   $args{total_number_of_samples_needed} 
                                                   || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere      =>   $args{number_of_clusters_on_sphere}   || 3,
        _cluster_width                     =>   $args{cluster_width}   || 0.1,
        _show_hidden_in_3D_plots           =>   $args{show_hidden_in_3D_plots} || 1,
        _debug                             =>   $args{debug} || 0,
    }, $class;
}

sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  We first generate a set of points randomly on the unit sphere --- the number of
##  points being equal to the number of clusters desired.  These points will serve as
##  cluster means (or, as cluster centroids) subsequently when we ask
##  Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
##  points on the sphere.  The second argument is the cluster mean and the third
##  argument the cluster covariance.  For the synthetic data, we set the cluster
##  covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
##  variance along the azimuth direction and the (1,1) element corresponding to the
##  variance along the elevation direction.
##
##  When you generate the points in the 2D spherical coordinates of
##  (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
##  the multivariate-normal function whose azimuth angle is outside the interval
##  (0,360) and/or whose elevation angle is outside the interval (-90,90).
##
##  Note that the first of the two dimensions for which the multivariate-normal
##  function returns the points is for the azimuth angle and the second for the
##  elevation angle.
##
##  With regard to the relationship of the Cartesian coordinates to the spherical
##  (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
##  and z the vertical axis.  The elevation angle theta is measure with respect to
##  the XY-plane.  The highest point on the sphere (the Zenith) corresponds to the
##  elevation angle of +90 and the lowest points on the sphere (the Nadir)
##  corresponds to the elevation angle of -90.  The azimuth is measured with respect
##  X-axis.  The range of the azimuth is from 0 to 360 degrees.  The elevation is
##  measured from the XY plane and its range is (-90,90) degrees.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designated all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere.  This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        my $elevation =  rand(90) - 90; 
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2 
                                                                        if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2 
                                                                        if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }
        }
        push @covariances, $cluster_covariance;
    }
    if ($self->{_debug}) {
        foreach my $i (0..$K-1) {
            print "\n\nCluster center:  @{$cluster_centers[$i]}\n";
            print "\nCovariance:\n";
            foreach my $j (0..1) {
                foreach my $k (0..1) {
                    print "$covariances[$i]->[$j]->[$k]  ";
                }
                print "\n";
            }
        }
    }
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @cluster_centers};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
        if ($self->{_debug}) {
            print "\nThe points for cluster $i:\n";
            map { print "@$_   "; } @new_data;
            print "\n\n";        
        }
        my @wrapped_data;
        foreach my $d (@new_data) {
            my $wrapped_d;
            if ($d->[0] >= 360.0) {
                $wrapped_d->[0] = $d->[0] - 360.0;
            } elsif ($d->[0] < 0) {
                $wrapped_d->[0] = 360.0 - abs($d->[0]);
            }
            if ($d->[1] >= 90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
                $wrapped_d->[1] = 180.0 - $d->[1];
            } elsif ($d->[1] < -90.0) {
                $wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
                $wrapped_d->[1] = -180.0 - $d->[1];
            } 
            $wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
            $wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
            push @wrapped_data, $wrapped_d;
        }
        if ($self->{_debug}) {
            print "\nThe unwrapped points for cluster $i:\n";
            map { print "@$_   "; } @wrapped_data;
            print "\n\n";        
        }
        my $label = $point_labels[$i];
        my $j = 0;
        @new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
        push @data_dump, @new_data;
    }
    if ($self->{_debug}) {
        print "\n\nThe labeled points for clusters:\n";
        map { print "@$_\n"; } @data_dump;
    }
    fisher_yates_shuffle( \@data_dump );
    open OUTPUT, ">$output_file";
    my $total_num_of_points = $N * $K;
    print "Total number of data points that will be written out to the file: $total_num_of_points\n"
        if $self->{_debug};
    foreach my $ele (@data_dump) {
        my ($x,$y,$z);
        my $label = $ele->[0];
        my $azimuth = $ele->[1];
        my $elevation = $ele->[2];
        $x = cos($elevation) * cos($azimuth);
        $y = cos($elevation) * sin($azimuth); 
        $z = sin($elevation);
        my $csv_str = join ",", ($label,$x,$y,$z);
        print OUTPUT "$csv_str\n";
    }
    print "\n\n";
    print "Data written out to file $output_file\n" if $self->{_debug};
    close OUTPUT;
}

# This version for the embedded class for data generation
sub visualize_data_on_sphere {
    my $self = shift;
    my $datafile = shift;
    my $filename = File::Basename::basename($datafile);
    my $temp_file = "__temp_" . $filename;
    $temp_file =~ s/\.\w+$/\.txt/;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    open INPUT, "< $filename" or die "Unable to open $filename: $!";
    local $/ = undef;
    my @all_records = split /\s+/, <INPUT>;
    my %clusters;
    foreach my $record (@all_records) {    
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        $record_name =~ /(\w+?)_.*/;
        my $primary_cluster_label = $1;
        push @{$clusters{$primary_cluster_label}}, \@splits;
    }
    foreach my $key (sort {"\L$a" cmp "\L$b"} keys %clusters) {
        map {print OUTPUT "$_"} map {"@$_\n"} @{$clusters{$key}};
        print OUTPUT "\n\n";
    }
    my @sorted_cluster_keys = sort {"\L$a" cmp "\L$b"} keys %clusters;
    close OUTPUT;   
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    my $arg_string = "";
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set hidden3d" ) unless $self->{_show_hidden_in_3D_plots};
    $plot->gnuplot_cmd( "set pointsize 2" );
    $plot->gnuplot_cmd( "set parametric" );
    $plot->gnuplot_cmd( "set size ratio 1" );
    $plot->gnuplot_cmd( "set xlabel \"X\"" );
    $plot->gnuplot_cmd( "set ylabel \"Y\"" );
    $plot->gnuplot_cmd( "set zlabel \"Z\"" );
    # set the range for azimuth angles:
    $plot->gnuplot_cmd( "set urange [0:2*pi]" );
    # set the range for the elevation angles:
    $plot->gnuplot_cmd( "set vrange [-pi/2:pi/2]" );
    # Parametric functions for the sphere
    $plot->gnuplot_cmd( "r=1" );
    $plot->gnuplot_cmd( "fx(v,u) = r*cos(v)*cos(u)" );
    $plot->gnuplot_cmd( "fy(v,u) = r*cos(v)*sin(u)" );
    $plot->gnuplot_cmd( "fz(v)   = r*sin(v)" );
    my $sphere_arg_str = "fx(v,u),fy(v,u),fz(v) notitle with lines lt 0,";
    foreach my $i (0..scalar(keys %clusters)-1) {
        my $j = $i + 1;
        # The following statement puts the titles on the data points
        $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"$sorted_cluster_keys[$i] \" with points lt $j pt $j, ";
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
#    $plot->gnuplot_cmd( "splot $arg_string" );
    $plot->gnuplot_cmd( "splot $sphere_arg_str $arg_string" );
}



( run in 0.701 second using v1.01-cache-2.11-cpan-0bd6704ced7 )