flag results from the CPAN

Algorithm-LinearManifoldDataClusterer

view release on metacpan or search on metacpan

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN

package Algorithm::LinearManifoldDataClusterer;

#------------------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
# resides on a low-dimensional manifold in a high-dimensional measurement space.
# -----------------------------------------------------------------------------------

use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(reduce any);
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use POSIX (); 

our $VERSION = '1.01';

# Constructor:
sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params(@params) == 0;
    bless {
        _datafile                     =>   $args{datafile} || croak("datafile required"),
        _mask                         =>   $args{mask}     || croak("mask required"),
        _K                            =>   $args{K}        || 0,
        _P                            =>   $args{P}        || 0,
        _terminal_output              =>   $args{terminal_output} || 0,
        _max_iterations               =>   $args{max_iterations} || 0,
        _delta_reconstruction_error   =>   $args{delta_reconstruction_error} || 0.001,
        _delta_normalized_error       =>   undef,
        _cluster_search_multiplier    =>   $args{cluster_search_multiplier} || 1,
        _visualize_each_iteration     =>   $args{visualize_each_iteration} == 0 ? 0 : 1,
        _show_hidden_in_3D_plots      =>   $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
        _make_png_for_each_iteration  =>   $args{make_png_for_each_iteration} == 0 ? 0 : 1,
        _debug                        =>   $args{debug} || 0,
        _N                            =>   0,
        _KM                           =>   $args{K} * $args{cluster_search_multiplier},
        _data_hash                    =>   {},
        _data_tags                    =>   [],
        _data_dimensions              =>   0,
        _final_clusters               =>   [],
        _auto_retry_flag              =>   0,
        _num_iterations_actually_used =>   undef,
        _scale_factor                 =>   undef,
        _data_tags_to_cluster_label_hash  => {},
        _final_reference_vecs_for_all_subspaces => [],
        _reconstruction_error_as_a_function_of_iteration => [],
        _final_trailing_eigenvec_matrices_for_all_subspaces => [],
        _subspace_construction_error_as_a_function_of_iteration => [],
    }, $class;
}

sub get_data_from_csv {
    my $self = shift;
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n" if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data_hash} = \%data_hash;
    $self->{_data_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
}

sub estimate_mean_and_covariance {
    my $self = shift;
    my $tag_set = shift;
    my $cluster_size = @$tag_set;
    my @cluster_center = @{$self->add_point_coords($tag_set)};
    @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
    # for covariance calculation:
    my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
    print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
        if $self->{_debug};
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_tags}.  The actual
    # data for clustering is stored in a hash at $self->{_data_hash} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    $mean_vec->set_col(0, \@cluster_center);
    if ($self->{_debug}) {
        print "\nDisplaying the mean vector for the cluster:\n";
        display_matrix( $mean_vec ) if $self->{_terminal_output};
    }
    foreach my $j (0..$num_cols-1) {
        my $tag = $tag_set->[$j];            
        my $data = $self->{_data_hash}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN

    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance ) if $self->{_terminal_output};
    }
    return ($mean_vec, $covariance);
}

sub eigen_analysis_of_covariance {
    my $self = shift;
    my $covariance = shift;
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;   
            } else {
                die "Eigendecomposition produced a complex eigenvector -- " .
                    "which should not happen for a covariance matrix!";
            }
        }
        print "Eigenvector $i:   @eigenvec\n" if $self->{_debug};
        push @all_my_eigenvecs, \@eigenvec;
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];        
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];       
    }
    if ($self->{_debug}) {
        print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec:  @{$sorted_eigenvecs[$i]}       eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    return (\@sorted_eigenvecs, \@sorted_eigenvals);
}

sub auto_retry_clusterer {
    my $self = shift;    
    $self->{_auto_retry_flag} = 1;
    my $clusters;
    $@ = 1;
    my $retry_attempts = 1;
    while ($@) {
        eval {
            $clusters = $self->linear_manifold_clusterer();
        };
        if ($@) {
            if ($self->{_terminal_output}) {
                print "Clustering failed. Trying again. --- $@";
                print "\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n";
                print     "VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV\n\n";
            }
            $retry_attempts++;
        } else {
            print "\n\nNumber of retry attempts: $retry_attempts\n\n" if $self->{_terminal_output};
            return $clusters;
        }
    }
}

sub linear_manifold_clusterer {
    my $self = shift;
    my $KM = $self->{_KM};
    my @initial_cluster_center_tags;
    my $visualization_msg;
    my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
    print "Randomly selected indexes for cluster center tags:  @initial_cluster_center_indexes\n"
        if $self->{_debug};
    @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
    my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
    if ($self->{_debug}) {
        foreach my $centroid (@initial_cluster_center_coords) {
            print "Initial cluster center coords:  @{$centroid}\n";
        }
    }
    my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
    if ($self->{_data_dimensions} == 3) {
        $visualization_msg = "initial_clusters";
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters) 
            if $self->{_visualize_each_iteration};
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
            if $self->{_make_png_for_each_iteration};
    }
    foreach my $cluster (@$initial_clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        display_mean_and_covariance($mean, $covariance) if $self->{_debug};
    }
    my @clusters = @$initial_clusters;
    display_clusters(\@clusters) if $self->{_debug};
    my $iteration_index = 0;
    my $unimodal_correction_flag;
    my $previous_min_value_for_unimodality_quotient;
    while ($iteration_index < $self->{_max_iterations}) {
        print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
            if $self->{_terminal_output};
        my $total_reconstruction_error_this_iteration = 0;
        my @subspace_construction_errors_this_iteration;
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@clusters) {
            next if @$cluster == 0;
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            display_mean_and_covariance($mean, $covariance) if $self->{_debug};
            print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my @trailing_eigenvals =  @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
            push @subspace_construction_errors_this_iteration, $subspace_construction_error;
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                                 scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
        my %best_subspace_based_partition_of_data;
        foreach my $i (0..$self->{_KM}-1) {
            $best_subspace_based_partition_of_data{$i} = [];
        }
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my @errors = map {$self->reconstruction_error($data_vec,
                                  $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                  $reference_vecs_for_all_subspaces[$_])}
                         0 .. $self->{_KM}-1;
            my ($minval, $index_for_closest_subspace) = minimum(\@errors);
            $total_reconstruction_error_this_iteration += $minval;
            push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                                                                      $data_tag;
        }
        print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
              "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
              "the least reconstruction error.\n\n" .
              "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
                  if $self->{_terminal_output};
        foreach my $i (0..$self->{_KM}-1) {
            $clusters[$i] = $best_subspace_based_partition_of_data{$i};
        }
        display_clusters(\@clusters) if $self->{_terminal_output};
        # Check if any cluster has lost all its elements. If so, fragment the worst
        # existing cluster to create the additional clusters needed:
        if (any {@$_ == 0} @clusters) {
            die "empty cluster found" if $self->{_auto_retry_flag};
            print "\nOne or more clusters have become empty.  Will carve out the needed clusters\n" .
                  "from the cluster with the largest subspace construction error.\n\n";
            $total_reconstruction_error_this_iteration = 0;
            @subspace_construction_errors_this_iteration = ();
            my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
            print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
            my $max = List::Util::max @subspace_construction_errors_this_iteration;
            my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
            my @cluster_fragments = cluster_split($clusters[$maxindex], 
                                                  $how_many_extra_clusters_needed + 1);
            my @newclusters;
            push @newclusters, @clusters[0 .. $maxindex-1];
            push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
            push @newclusters, @cluster_fragments;
            @newclusters = grep {@$_ != 0} @newclusters; 
            die "something went wrong with cluster fragmentation" 
                unless $self->{_KM} = @newclusters;
            @trailing_eigenvec_matrices_for_all_subspaces = ();
            @reference_vecs_for_all_subspaces = ();
            foreach my $cluster (@newclusters) {
                die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
                my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
                my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
                my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
                my @trailing_eigenvals =  @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
                my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
                push @subspace_construction_errors_this_iteration, $subspace_construction_error;
                my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                      scalar(@trailing_eigenvecs));
                foreach my $j (0..@trailing_eigenvecs-1) {
                    $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
                }
                push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
                push @reference_vecs_for_all_subspaces, $mean;
            }
            my %best_subspace_based_partition_of_data;
            foreach my $i (0..$self->{_KM}-1) {
                $best_subspace_based_partition_of_data{$i} = [];
            }
            foreach my $data_tag (@{$self->{_data_tags}}) {
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
                my @errors = map {reconstruction_error($data_vec,
                                      $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                      $reference_vecs_for_all_subspaces[$_])}
                             0 .. $self->{_KM}-1;
                my ($minval, $index_for_closest_subspace) = minimum(\@errors);
                $total_reconstruction_error_this_iteration += $minval;
                push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                                                                      $data_tag;
            }
            print "empty-cluster jag: total reconstruction error in this iteration: \n" .
                  "$total_reconstruction_error_this_iteration\n"
                if $self->{_debug};
            foreach my $i (0..$self->{_KM}-1) {
                $clusters[$i] = $best_subspace_based_partition_of_data{$i};
            }
            display_clusters(\@newclusters) if $self->{_terminal_output};
            @clusters = grep {@$_ != 0} @newclusters;
            die "linear manifold based algorithm does not appear to work in this case $!" 
                unless @clusters == $self->{_KM};
        }# end of foreach my $cluster (@clusters) ... loop  followed by if clause for empty clusters
        if ($self->{_data_dimensions} == 3) {
            $visualization_msg = "clustering_at_iteration_$iteration_index";
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
                if $self->{_visualize_each_iteration};
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
                if $self->{_make_png_for_each_iteration};
        }
        my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_], 
                                                      $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
        my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
        print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
        die "\n\nBailing out!\n" .
            "It does not look like these iterations will lead to a good clustering result.\n" .
            "Program terminating.  Try running again.\n" 
            if defined($previous_min_value_for_unimodality_quotient)
               && ($min_value_for_unimodality_quotient < 0.4)
               && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
        if ( $min_value_for_unimodality_quotient < 0.5 ) {
            $unimodal_correction_flag = 1;
            print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
            my @sorted_cluster_indexes = 
               sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
            my @newclusters;
            foreach my $cluster_index (0..@clusters - 1) {
                push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
            }
            @clusters = @newclusters;
            my $worst_cluster = pop @clusters;
            print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
            my $second_worst_cluster = pop @clusters;
            print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
            push @$worst_cluster, @$second_worst_cluster;
            fisher_yates_shuffle($worst_cluster);
            my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
            my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
            push @clusters, \@first_half;
            push @clusters, \@second_half;
            if ($self->{_terminal_output}) {
                print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
                display_clusters(\@clusters);      
            }
        }
        if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
            my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
            push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
            if (($last_recon_error - $total_reconstruction_error_this_iteration) 
                                                        < $self->{_delta_normalized_error}) {
                push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, 
                                                      $total_reconstruction_error_this_iteration;
                last;
            }
        }
        push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, 
                                              $total_reconstruction_error_this_iteration;
        $iteration_index++;
        $previous_min_value_for_unimodality_quotient =  $min_value_for_unimodality_quotient;
    } # end of while loop on iteration_index
    $self->{_num_iterations_actually_used} = 
                                 scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
    if ($self->{_terminal_output}) {
        print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
        print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
              "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
        print "Total reconstruction error as a function of iterations: " .
                                       "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
    }
    # now merge sub-clusters if cluster_search_multiplier > 1
    my @final_clusters;
    if ($self->{_cluster_search_multiplier} > 1) {
        print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
        my @array_of_partitioned_cluster_groups = (\@clusters);
        my @partitioned_cluster_groups;
        my $how_many_clusters_looking_for = $self->{_K};
        while (scalar(@final_clusters) < $self->{_K}) {
            @partitioned_cluster_groups = 
                   $self->graph_partition(shift @array_of_partitioned_cluster_groups, 
                                                             $how_many_clusters_looking_for );
            if (@{$partitioned_cluster_groups[0]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[0]};

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN

        } elsif ( $arr->[$i] > $max ) {
            $max = $arr->[$i];
        }
    }
    return ($min, $max);
}

# Meant only for constructing a deep copy of an array of arrays:
sub deep_copy_AoA {
    my $ref_in = shift;
    my $ref_out;
    foreach my $i (0..@{$ref_in}-1) {
        foreach my $j (0..@{$ref_in->[$i]}-1) {
            $ref_out->[$i]->[$j] = $ref_in->[$i]->[$j];
        }
    }
    return $ref_out;
}

# For displaying the individual clusters on a terminal screen.  Each cluster is
# displayed through the symbolic names associated with the data points.
sub display_clusters {
    my @clusters = @{shift @_};
    my $i = 0;
    foreach my $cluster (@clusters) {
        @$cluster = sort @$cluster;
        my $cluster_size = @$cluster;
        print "\n\nCluster $i ($cluster_size records):\n";
        foreach my $ele (@$cluster) {
            print "  $ele";
        }
        $i++
    }
    print "\n\n";
}

sub display_mean_and_covariance {
    my $mean = shift;
    my $covariance = shift;
    print "\nDisplay the mean:\n";
    display_matrix($mean);
    print "\nDisplay the covariance:\n";
    display_matrix($covariance);
}

sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            P
                            terminal_output
                            cluster_search_multiplier
                            max_iterations
                            delta_reconstruction_error
                            visualize_each_iteration
                            show_hidden_in_3D_plots
                            make_png_for_each_iteration
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        my $row = $matrix->row($i);
        my @row_as_list = $row->as_list;
#        print "@row_as_list\n";
        map { printf("%.4f ", $_) } @row_as_list;
        print "\n";
    }
    print "\n\n";
}

sub transpose {
    my $matrix = shift;
    my $num_rows = $matrix->rows();
    my $num_cols = $matrix->cols();
    my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
    foreach my $i (0..$num_rows-1) {
        my @row = $matrix->row($i)->as_list;
        $transpose->set_col($i, \@row );
    }
    return $transpose;
}

sub vector_subtract {
    my $vec1 = shift;
    my $vec2 = shift;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    my @result;
    foreach my $i (0..@$vec1-1){
        push @result, $vec1->[$i] - $vec2->[$i];
    }
    return @result;
}

# from perl docs:
sub fisher_yates_shuffle {                
    my $arr =  shift;                
    my $i = @$arr;                   
    while (--$i) {                   
        my $j = int rand( $i + 1 );  
        @$arr[$i, $j] = @$arr[$j, $i]; 
    }
}

#########################  Generating Synthetic Data for Manifold Clustering  ##########################

##################################      Class DataGenerator     ########################################

##  The embedded class defined below is for generating synthetic data for
##  experimenting with linear manifold clustering when the data resides on the
##  surface of a sphere.  See the script generate_data_on_a_sphere.pl in the
##  `examples' directory for how to specify the number of clusters and the spread of
##  each cluster in the data that is generated.

package DataGenerator;

use strict;                                                         
use Carp;

sub new {                                                           
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if _check_for_illegal_params3(@params) == 0;   
    bless {
        _output_file                       =>   $args{output_file} 
                                                   || croak("name for output_file required"),
        _total_number_of_samples_needed    =>   $args{total_number_of_samples_needed} 
                                                   || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere      =>   $args{number_of_clusters_on_sphere}   || 3,
        _cluster_width                     =>   $args{cluster_width}   || 0.1,
        _show_hidden_in_3D_plots           =>   $args{show_hidden_in_3D_plots} || 1,
        _debug                             =>   $args{debug} || 0,
    }, $class;
}

sub _check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_file
                            total_number_of_samples_needed
                            number_of_clusters_on_sphere
                            cluster_width
                            show_hidden_in_3D_plots
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  We first generate a set of points randomly on the unit sphere --- the number of
##  points being equal to the number of clusters desired.  These points will serve as
##  cluster means (or, as cluster centroids) subsequently when we ask
##  Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
##  points on the sphere.  The second argument is the cluster mean and the third
##  argument the cluster covariance.  For the synthetic data, we set the cluster
##  covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
##  variance along the azimuth direction and the (1,1) element corresponding to the
##  variance along the elevation direction.
##
##  When you generate the points in the 2D spherical coordinates of
##  (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
##  the multivariate-normal function whose azimuth angle is outside the interval
##  (0,360) and/or whose elevation angle is outside the interval (-90,90).
##
##  Note that the first of the two dimensions for which the multivariate-normal
##  function returns the points is for the azimuth angle and the second for the
##  elevation angle.
##
##  With regard to the relationship of the Cartesian coordinates to the spherical
##  (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
##  and z the vertical axis.  The elevation angle theta is measure with respect to
##  the XY-plane.  The highest point on the sphere (the Zenith) corresponds to the
##  elevation angle of +90 and the lowest points on the sphere (the Nadir)
##  corresponds to the elevation angle of -90.  The azimuth is measured with respect
##  X-axis.  The range of the azimuth is from 0 to 360 degrees.  The elevation is
##  measured from the XY plane and its range is (-90,90) degrees.
sub gen_data_and_write_to_csv {
    my $self = shift;
    my $K = $self->{_number_of_clusters_on_sphere};
    # $N is the number of samples to be generated for each cluster:
    my $N = int($self->{_total_number_of_samples_needed} / $K);
    my $output_file = $self->{_output_file};
    # Designated all of the data elements in a cluster by a letter that is followed by
    # an integer that identifies a specific data element.
    my @point_labels = ('a'..'z');
    # Our first job is to define $K random points in the 2D space (azimuth,
    # elevation) to serve as cluster centers on the surface of the sphere.  This we
    # do by calling a uniformly distributed 1-D random number generator, first for
    # the azimuth and then for the elevation in the loop shown below:
    my @cluster_centers;
    my @covariances;
    foreach my $i (0..$K-1) {
        my $azimuth = rand(360);
        my $elevation =  rand(90) - 90; 
        my @mean = ($azimuth, $elevation);
        push @cluster_centers, \@mean;
        my $cluster_covariance;
        # The j-th dimension is for azimuth and k-th for elevation for the directions
        # to surface of the sphere:
        foreach my $j (0..1) {
            foreach my $k (0..1) {
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2 
                                                                        if $j == 0 && $k == 0;
                $cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2 
                                                                        if $j == 1 && $k == 1;
                $cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
            }

( run in 1.103 second using v1.01-cache-2.11-cpan-5735350b133 )