best results from the CPAN

best

Algorithm-LinearManifoldDataClusterer

view release on metacpan or search on metacpan

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN


sub linear_manifold_clusterer {
    my $self = shift;
    my $KM = $self->{_KM};
    my @initial_cluster_center_tags;
    my $visualization_msg;
    my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
    print "Randomly selected indexes for cluster center tags:  @initial_cluster_center_indexes\n"
        if $self->{_debug};
    @initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
    my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
    if ($self->{_debug}) {
        foreach my $centroid (@initial_cluster_center_coords) {
            print "Initial cluster center coords:  @{$centroid}\n";
        }
    }
    my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
    if ($self->{_data_dimensions} == 3) {
        $visualization_msg = "initial_clusters";
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters) 
            if $self->{_visualize_each_iteration};
        $self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
            if $self->{_make_png_for_each_iteration};
    }
    foreach my $cluster (@$initial_clusters) {
        my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
        display_mean_and_covariance($mean, $covariance) if $self->{_debug};
    }
    my @clusters = @$initial_clusters;
    display_clusters(\@clusters) if $self->{_debug};
    my $iteration_index = 0;
    my $unimodal_correction_flag;
    my $previous_min_value_for_unimodality_quotient;
    while ($iteration_index < $self->{_max_iterations}) {
        print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
            if $self->{_terminal_output};
        my $total_reconstruction_error_this_iteration = 0;
        my @subspace_construction_errors_this_iteration;
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;
        foreach my $cluster (@clusters) {
            next if @$cluster == 0;
            my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
            display_mean_and_covariance($mean, $covariance) if $self->{_debug};
            print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
            my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
            my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
            my @trailing_eigenvals =  @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
            my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
            push @subspace_construction_errors_this_iteration, $subspace_construction_error;
            my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                                 scalar(@trailing_eigenvecs));
            foreach my $j (0..@trailing_eigenvecs-1) {
                print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
                $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
            }
            push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
            push @reference_vecs_for_all_subspaces, $mean;
        }
        $self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
        my %best_subspace_based_partition_of_data;
        foreach my $i (0..$self->{_KM}-1) {
            $best_subspace_based_partition_of_data{$i} = [];
        }
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my @errors = map {$self->reconstruction_error($data_vec,
                                  $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                  $reference_vecs_for_all_subspaces[$_])}
                         0 .. $self->{_KM}-1;
            my ($minval, $index_for_closest_subspace) = minimum(\@errors);
            $total_reconstruction_error_this_iteration += $minval;
            push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                                                                      $data_tag;
        }
        print "Finished calculating the eigenvectors for the clusters produced by the previous\n" .
              "iteration and re-assigning the data samples to the new subspaces on the basis of\n".
              "the least reconstruction error.\n\n" .
              "Total reconstruction error in this iteration: $total_reconstruction_error_this_iteration\n"
                  if $self->{_terminal_output};
        foreach my $i (0..$self->{_KM}-1) {
            $clusters[$i] = $best_subspace_based_partition_of_data{$i};
        }
        display_clusters(\@clusters) if $self->{_terminal_output};
        # Check if any cluster has lost all its elements. If so, fragment the worst
        # existing cluster to create the additional clusters needed:
        if (any {@$_ == 0} @clusters) {
            die "empty cluster found" if $self->{_auto_retry_flag};
            print "\nOne or more clusters have become empty.  Will carve out the needed clusters\n" .
                  "from the cluster with the largest subspace construction error.\n\n";
            $total_reconstruction_error_this_iteration = 0;
            @subspace_construction_errors_this_iteration = ();
            my $how_many_extra_clusters_needed = $self->{_KM} - scalar(grep {@$_ != 0} @clusters);
            print "number of extra clusters needed at iteration $iteration_index: $how_many_extra_clusters_needed\n";
            my $max = List::Util::max @subspace_construction_errors_this_iteration;
            my $maxindex = List::Util::first {$_ == $max} @subspace_construction_errors_this_iteration;
            my @cluster_fragments = cluster_split($clusters[$maxindex], 
                                                  $how_many_extra_clusters_needed + 1);
            my @newclusters;
            push @newclusters, @clusters[0 .. $maxindex-1];
            push @newclusters, @clusters[$maxindex+1 .. $self->{_KM}-1];
            push @newclusters, @cluster_fragments;
            @newclusters = grep {@$_ != 0} @newclusters; 
            die "something went wrong with cluster fragmentation" 
                unless $self->{_KM} = @newclusters;
            @trailing_eigenvec_matrices_for_all_subspaces = ();
            @reference_vecs_for_all_subspaces = ();
            foreach my $cluster (@newclusters) {
                die "Linear Manifold Clustering did not work $!" if @$cluster == 0;
                my ($mean, $covariance) = estimate_mean_and_covariance($cluster);
                my ($eigenvecs, $eigenvals) = eigen_analysis_of_covariance($covariance);
                my @trailing_eigenvecs =  @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
                my @trailing_eigenvals =  @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
                my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
                push @subspace_construction_errors_this_iteration, $subspace_construction_error;
                my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions}, 
                                                      scalar(@trailing_eigenvecs));
                foreach my $j (0..@trailing_eigenvecs-1) {
                    $trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
                }
                push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
                push @reference_vecs_for_all_subspaces, $mean;
            }
            my %best_subspace_based_partition_of_data;
            foreach my $i (0..$self->{_KM}-1) {
                $best_subspace_based_partition_of_data{$i} = [];
            }
            foreach my $data_tag (@{$self->{_data_tags}}) {
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
                my @errors = map {reconstruction_error($data_vec,
                                      $trailing_eigenvec_matrices_for_all_subspaces[$_],
                                      $reference_vecs_for_all_subspaces[$_])}
                             0 .. $self->{_KM}-1;
                my ($minval, $index_for_closest_subspace) = minimum(\@errors);
                $total_reconstruction_error_this_iteration += $minval;
                push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
                                                                      $data_tag;
            }
            print "empty-cluster jag: total reconstruction error in this iteration: \n" .
                  "$total_reconstruction_error_this_iteration\n"
                if $self->{_debug};
            foreach my $i (0..$self->{_KM}-1) {
                $clusters[$i] = $best_subspace_based_partition_of_data{$i};
            }
            display_clusters(\@newclusters) if $self->{_terminal_output};
            @clusters = grep {@$_ != 0} @newclusters;
            die "linear manifold based algorithm does not appear to work in this case $!" 
                unless @clusters == $self->{_KM};
        }# end of foreach my $cluster (@clusters) ... loop  followed by if clause for empty clusters
        if ($self->{_data_dimensions} == 3) {
            $visualization_msg = "clustering_at_iteration_$iteration_index";
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters)
                if $self->{_visualize_each_iteration};
            $self->visualize_clusters_on_sphere($visualization_msg, \@clusters, "png")
                if $self->{_make_png_for_each_iteration};
        }
        my @cluster_unimodality_quotients = map {$self->cluster_unimodality_quotient($clusters[$_], 
                                                      $reference_vecs_for_all_subspaces[$_])} 0..@clusters-1;
        my $min_value_for_unimodality_quotient = List::Util::min @cluster_unimodality_quotients;
        print "\nCluster unimodality quotients: @cluster_unimodality_quotients\n" if $self->{_terminal_output};
        die "\n\nBailing out!\n" .
            "It does not look like these iterations will lead to a good clustering result.\n" .
            "Program terminating.  Try running again.\n" 
            if defined($previous_min_value_for_unimodality_quotient)
               && ($min_value_for_unimodality_quotient < 0.4)
               && ($min_value_for_unimodality_quotient < (0.5 * $previous_min_value_for_unimodality_quotient));
        if ( $min_value_for_unimodality_quotient < 0.5 ) {
            $unimodal_correction_flag = 1;
            print "\nApplying unimodality correction:\n\n" if $self->{_terminal_output};
            my @sorted_cluster_indexes = 
               sort {$cluster_unimodality_quotients[$b] <=> $cluster_unimodality_quotients[$a]} 0..@clusters-1;
            my @newclusters;
            foreach my $cluster_index (0..@clusters - 1) {
                push @newclusters, $clusters[$sorted_cluster_indexes[$cluster_index]];
            }
            @clusters = @newclusters;
            my $worst_cluster = pop @clusters;
            print "\nthe worst cluster: @$worst_cluster\n" if $self->{_terminal_output};
            my $second_worst_cluster = pop @clusters;
            print "\nthe second worst cluster: @$second_worst_cluster\n" if $self->{_terminal_output};
            push @$worst_cluster, @$second_worst_cluster;
            fisher_yates_shuffle($worst_cluster);
            my @first_half = @$worst_cluster[0 .. int(scalar(@$worst_cluster)/2) - 1];
            my @second_half = @$worst_cluster[int(scalar(@$worst_cluster)/2) .. @$worst_cluster - 1];
            push @clusters, \@first_half;
            push @clusters, \@second_half;
            if ($self->{_terminal_output}) {
                print "\n\nShowing the clusters obtained after applying the unimodality correction:\n";
                display_clusters(\@clusters);      
            }
        }
        if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
            my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
            push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
            if (($last_recon_error - $total_reconstruction_error_this_iteration) 
                                                        < $self->{_delta_normalized_error}) {
                push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, 
                                                      $total_reconstruction_error_this_iteration;
                last;
            }
        }
        push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, 
                                              $total_reconstruction_error_this_iteration;
        $iteration_index++;
        $previous_min_value_for_unimodality_quotient =  $min_value_for_unimodality_quotient;
    } # end of while loop on iteration_index
    $self->{_num_iterations_actually_used} = 
                                 scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
    if ($self->{_terminal_output}) {
        print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
        print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
              "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
        print "Total reconstruction error as a function of iterations: " .
                                       "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
    }
    # now merge sub-clusters if cluster_search_multiplier > 1
    my @final_clusters;
    if ($self->{_cluster_search_multiplier} > 1) {
        print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
        my @array_of_partitioned_cluster_groups = (\@clusters);
        my @partitioned_cluster_groups;
        my $how_many_clusters_looking_for = $self->{_K};
        while (scalar(@final_clusters) < $self->{_K}) {
            @partitioned_cluster_groups = 
                   $self->graph_partition(shift @array_of_partitioned_cluster_groups, 
                                                             $how_many_clusters_looking_for );
            if (@{$partitioned_cluster_groups[0]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            } elsif (@{$partitioned_cluster_groups[1]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
            } else {
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            }
        }
        my @data_clustered;
        foreach my $cluster (@final_clusters) {
            push @data_clustered, @$cluster;
        }
        unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
            $self->{_final_clusters} = \@final_clusters;
            my %data_clustered = map {$_ => 1} @data_clustered;
            my @data_tags_not_clustered = 
                   grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
            if ($self->{_terminal_output}) {
                print "\n\nNot all data clustered.  The most reliable clusters found by graph partitioning:\n";
                display_clusters(\@final_clusters);
                print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
            }
            if ($self->{_data_dimensions} == 3) {
                $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
                    if $self->{_visualize_each_iteration};
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
                    if $self->{_make_png_for_each_iteration};
            }
            my %data_tags_to_cluster_label_hash;
            foreach my $i (0..@final_clusters-1) {
                map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
            }
            $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
            foreach my $tag (@data_tags_not_clustered) {
                my $which_cluster = $self->which_cluster_for_new_element($tag);
                $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
            }
            die "Some data elements are still missing from the final tally" 
              unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) == 
                scalar(@{$self->{_data_tags}});
            my @new_final_clusters;
            map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
                      push @{$new_final_clusters[$_]}, $ele 
                            if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ } 
                } 0..$self->{_K}-1;
            if ($self->{_debug}) {
                print "\ndisplaying the final clusters after accounting for unclustered data:\n"; 
                display_clusters(\@new_final_clusters);
            }
            $self->{_final_clusters} = \@new_final_clusters;
            @final_clusters = @new_final_clusters;
        }
    } else {
        @final_clusters = @clusters;
    }
    print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
    display_clusters(\@final_clusters) if $self->{_terminal_output};
    return \@final_clusters;
} 

sub display_reconstruction_errors_as_a_function_of_iterations {
    my $self = shift;            
    print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
    print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
          "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
}

sub set_termination_reconstruction_error_threshold {
    my $self = shift;        
    my $all_ref_vecs = shift;
    my @mean_vecs = @$all_ref_vecs;
    my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_; 
                                                      my @result = $result->as_list; 
                                                      sqrt($result[0])
                                                    } @mean_vecs;
    $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
    $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) * 
                                       $self->{_delta_reconstruction_error};
}

# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method.  It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN

    my $A = $neg_sqrt_of_D * $Laplacian * $neg_sqrt_of_D;
    foreach my $i (0..$num_nodes-1) {
        foreach my $j (0..$num_nodes-1) {
            $A->set_elem($i,$j,0) if abs($A->get_elem($i,$j)) < 0.01;
        }
    }
    if ($self->{_terminal_output}) {
        print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
              "A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
        display_matrix( $A );
    }
    my ($eigenvalues, $eigenvectors) = $A->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    if ($self->{_debug2}) {
        print "Eigenvalue 0:   $eigenvalues->[0]\n";
        foreach my $i (1..$num_of_eigens-1) {
            $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
            $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
            print "Eigenvalue $i:   $eigenvalues->[$i]\n";
        }
        print "\nlargest eigen index: $largest_eigen_index\n";
        print "\nsmallest eigen index: $smallest_eigen_index\n\n";
    }
    my @all_my_eigenvecs;
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        my @eigenvec;
        foreach my $ele (@vec) {
            my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
            if ($theta eq "0") {
                push @eigenvec, $mag;
            } elsif ($theta eq "pi") {
                push @eigenvec, -1.0 * $mag;   
            } else {
                die "Eigendecomposition produced a complex eigenvector!";
            }
        }
        print "Eigenvector $i:   @eigenvec\n" if $self->{_debug2};
        push @all_my_eigenvecs, \@eigenvec;
    }
    if ($self->{_debug2}) {
        my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
        print "\nLargest eigenvector of A:   @largest_eigen_vec\n";
    }
    my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
    print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
    my @sorted_eigenvecs;
    my @sorted_eigenvals;
    foreach my $i (0..@sorted_eigenvec_indexes-1) {
        $sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];        
        $sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];       
    }
    if ($self->{_debug2}) {
        print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
        foreach my $i (0..@sorted_eigenvecs-1) {
            print "eigenvec:  @{$sorted_eigenvecs[$i]}       eigenvalue: $sorted_eigenvals[$i]\n";
        }
    }
    my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
    print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
    my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
    my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
    print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
        if $self->{_terminal_output};
    if ($how_many_clusters_looking_for <= 3) {
        my @merged_cluster;
        my $final_cluster;
        my @newclusters;
        if ($how_many_positive == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    $final_cluster = $clusters->[$i]; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_negative == 1) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    $final_cluster = $clusters->[$i]; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([$final_cluster], \@newclusters);
        } elsif ($how_many_positive <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] > 0) {
                    push @merged_cluster, @{$clusters->[$i]}; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } elsif ($how_many_negative <= $self->{_cluster_search_multiplier}) {
            foreach my $i (0..@$clusters-1) {
                if ($best_partitioning_eigenvec->[$i] < 0) {
                    push @merged_cluster, @{$clusters->[$i]}; 
                } else {
                    push @newclusters, $clusters->[$i];
                }
            }
            return ([\@merged_cluster], \@newclusters);
        } else {
            die "\n\nBailing out!\n\n" .
                "No consensus support for dominant clusters in the graph partitioning step\n" .
                "of the algorithm. This can be caused by bad random selection of initial\n" .
                "cluster centers.  Please run this program again.\n";
        }
    } else {
        my @positive_clusters;
        my @negative_clusters;
        foreach my $i (0..@$clusters-1) {
            if ($best_partitioning_eigenvec->[$i] > 0) {
                push @positive_clusters, $clusters->[$i]; 
            } else {
                    push @negative_clusters, $clusters->[$i];
            }
        }
        return (\@positive_clusters, \@negative_clusters);
    }
}

sub pairwise_cluster_similarity {
    my $self = shift;
    my $cluster1 = shift;
    my $trailing_eigenvec_matrix_cluster1 = shift;   
    my $reference_vec_cluster1 = shift;
    my $cluster2 = shift;
    my $trailing_eigenvec_matrix_cluster2 = shift;   
    my $reference_vec_cluster2 = shift;
    my $total_reconstruction_error_in_this_iteration = 0;
    my @errors_for_1_on_2 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster2,
                                                             $reference_vec_cluster2)} 
                                 @$cluster1;
    my @errors_for_2_on_1 = map {my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                                 $data_vec->set_col(0, $self->{_data_hash}->{$_});
                                 $self->reconstruction_error($data_vec,
                                                             $trailing_eigenvec_matrix_cluster1,
                                                             $reference_vec_cluster1)} 
                                 @$cluster2;
    my $type_1_error = reduce {abs($a) + abs($b)} @errors_for_1_on_2;
    my $type_2_error = reduce {abs($a) + abs($b)} @errors_for_2_on_1;
    my $total_reconstruction_error = $type_1_error + $type_2_error;
    my $diff_between_the_means = $reference_vec_cluster1 - $reference_vec_cluster2;
    my $dist_squared = transpose($diff_between_the_means) * $diff_between_the_means;
    my @dist_squared_as_list = $dist_squared->as_list();
    my $dist_between_means_based_error = shift @dist_squared_as_list;
    return ($total_reconstruction_error, $dist_between_means_based_error);
}

# delta ball
sub cluster_unimodality_quotient {
    my $self = shift;
    my $cluster = shift;
    my $mean = shift;   
    my $delta = 0.4 * $self->{_scale_factor};  # Radius of the delta ball along each dimension
    my @mean = $mean->as_list;
    my @data_tags_for_range_tests;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        my $range = $max - $min;
        my $mean_along_this_dimen = $mean[$dimen];
        my @tags =  grep {$_} 
                    map { ( ($self->{_data_hash}->{$_}->[$dimen] > $mean_along_this_dimen - $delta * $range) 
                            && 
                            ($self->{_data_hash}->{$_}->[$dimen] < $mean_along_this_dimen + $delta * $range) ) 
                          ? $_ : undef }
                    @$cluster; 
        push @data_tags_for_range_tests, \@tags;
    }
    # Now find the intersection of the tag sets for each of the dimensions
    my %intersection_hash;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my %tag_hash_for_this_dimen  = map {$_ => 1} @{$data_tags_for_range_tests[$dimen]};
        if ($dimen == 0) {
            %intersection_hash = %tag_hash_for_this_dimen;
        } else {
            %intersection_hash = map {$_ => 1} grep {$tag_hash_for_this_dimen{$_}} 
                                 keys %intersection_hash;
        }
    }
    my @intersection_set = keys %intersection_hash;
    my $cluster_unimodality_index = scalar(@intersection_set) / scalar(@$cluster);
    return $cluster_unimodality_index;
}

sub find_best_ref_vector {
    my $self = shift;
    my $cluster = shift;
    my $trailing_eigenvec_matrix = shift;
    my $mean = shift;        # a GSL marix ref
    my @min_bounds;
    my @max_bounds;
    my @ranges;
    foreach my $dimen (0..$self->{_data_dimensions}-1) {
        my @values = map {$_->[$dimen]} map {$self->{_data_hash}->{$_}} @$cluster;
        my ($min, $max) = (List::Util::min(@values), List::Util::max(@values));
        push @min_bounds, $min;
        push @max_bounds, $max;
        push @ranges, $max - $min;
    }
    print "min bounds are: @min_bounds\n";
    print "max bounds are: @max_bounds\n";
    my $max_iterations = 100;
    my @random_points;
    my $iteration = 0;
    while ($iteration++ < $max_iterations) {
        my @coordinate_vec;
        foreach my $dimen (0..$self->{_data_dimensions}-1) {        
            push @coordinate_vec,  $min_bounds[$dimen] + rand($ranges[$dimen]);
        }
        push @random_points, \@coordinate_vec;
    }
    if ($self->{_debug}) {
        print "\nrandom points\n";
        map {print "@$_\n"} @random_points;
    }
    my @mean = $mean->as_list;
    unshift @random_points, \@mean;
    my @reconstruction_errors;
    foreach my $candidate_ref_vec (@random_points) {
        my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $ref_vec->set_col(0, $candidate_ref_vec);
        my $reconstruction_error_for_a_ref_vec = 0;
        foreach my $data_tag (@{$self->{_data_tags}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col(0, $self->{_data_hash}->{$data_tag});
            my $error = $self->reconstruction_error($data_vec,$trailing_eigenvec_matrix,$ref_vec);
            $reconstruction_error_for_a_ref_vec += $error;
        }
        push @reconstruction_errors, $reconstruction_error_for_a_ref_vec;
    }
    my $recon_error_for_original_mean = shift @reconstruction_errors;
    my $smallest_error_randomly_selected_ref_vecs = List::Util::min(@reconstruction_errors);
    my $minindex = List::Util::first { $_ == $smallest_error_randomly_selected_ref_vecs }
                                                    @reconstruction_errors;
    my $refvec = $random_points[$minindex];
    return $refvec;
}

##  The reconstruction error relates to the size of the perpendicular from a data
##  point X to the hyperplane that defines a given subspace on the manifold.
sub reconstruction_error {
    my $self = shift;
    my $data_vec = shift;
    my $trailing_eigenvecs = shift;
    my $ref_vec = shift;    
    my $error_squared = transpose($data_vec - $ref_vec) * $trailing_eigenvecs *  
                                 transpose($trailing_eigenvecs) * ($data_vec - $ref_vec);
    my @error_squared_as_list = $error_squared->as_list();
    my $error_squared_as_scalar = shift @error_squared_as_list;
    return $error_squared_as_scalar;
}

# Returns a set of KM random integers.  These serve as indices to reach into the data
# array.  A data element whose index is one of the random numbers returned by this
# routine serves as an initial cluster center.  Note the quality check it runs on the
# list of the random integers constructed.  We first make sure that all the random
# integers returned are different.  Subsequently, we carry out a quality assessment
# of the random integers constructed.  This quality measure consists of the ratio of
# the values spanned by the random integers to the value of N, the total number of
# data points to be clustered.  Currently, if this ratio is less than 0.3, we discard
# the K integers and try again.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;   # This value is set to the parameter KM in the call to this subroutine
    my $data_store_size = shift;
    my @cluster_center_indices;
    while (1) {
        foreach my $i (0..$K-1) {
            $cluster_center_indices[$i] = int rand( $data_store_size );
            next if $i == 0;
            foreach my $j (0..$i-1) {
                while ( $cluster_center_indices[$j] == $cluster_center_indices[$i] ) {
                    my $old = $cluster_center_indices[$i];
                    $cluster_center_indices[$i] = int rand($data_store_size);
                }
            }
        }
        my ($min,$max) = minmax(\@cluster_center_indices );
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}

# The purpose of this routine is to form initial clusters by assigning the data
# samples to the initial clusters formed by the previous routine on the basis of the
# best proximity of the data samples to the different cluster centers.
sub assign_data_to_clusters_initial {
    my $self = shift;
    my @cluster_centers = @{ shift @_ };
    my @clusters;
    foreach my $ele (@{$self->{_data_tags}}) {
        my $best_cluster;
        my @dist_from_clust_centers;
        foreach my $center (@cluster_centers) {
            push @dist_from_clust_centers, $self->distance($ele, $center);
        }
        my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
        push @{$clusters[$best_center_index]}, $ele;
    }
    return \@clusters;
}    

# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
    my $self = shift;
    my $ele1_id = shift @_;            # symbolic name of data sample
    my @ele1 = @{$self->{_data_hash}->{$ele1_id}};
    my @ele2 = @{shift @_};
    die "wrong data types for distance calculation\n" if @ele1 != @ele2;
    my $how_many = @ele1;
    my $squared_sum = 0;
    foreach my $i (0..$how_many-1) {
        $squared_sum += ($ele1[$i] - $ele2[$i])**2;
    }    
    my $dist = sqrt $squared_sum;
    return $dist;
}

sub write_clusters_to_files {
    my $self = shift;
    my $clusters = shift;
    my @clusters = @$clusters;
    unlink glob "cluster*.txt";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        open FILEHANDLE, "| sort > $filename"  or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i]}) {        
            print FILEHANDLE "$ele ";
        }
        close FILEHANDLE;
    }
}

sub DESTROY {
    my $filename = basename($_[0]->{_datafile});
    $filename =~ s/\.\w+$/\.txt/;
    unlink "__temp_" . $filename;
}

##################################  Visualization Code ###################################

sub add_point_coords {
    my $self = shift;
    my @arr_of_ids = @{shift @_};      # array of data element names
    my @result;
    my $data_dimensionality = $self->{_data_dimensions};
    foreach my $i (0..$data_dimensionality-1) {
        $result[$i] = 0.0;
    }
    foreach my $id (@arr_of_ids) {
        my $ele = $self->{_data_hash}->{$id};
        my $i = 0;
        foreach my $component (@$ele) {
            $result[$i] += $component;
            $i++;

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN

  use Algorithm::LinearManifoldDataClusterer;

  my $output_file = "4_clusters_on_a_sphere_1000_samples.csv";

  my $training_data_gen = DataGenerator->new(
                             output_file => $output_file,
                             cluster_width => 0.015,
                             total_number_of_samples_needed => 1000,
                             number_of_clusters_on_sphere => 4,
                             show_hidden_in_3D_plots => 0,
                          );
  $training_data_gen->gen_data_and_write_to_csv();
  $training_data_gen->visualize_data_on_sphere($output_file);


=head1 CHANGES

Version 1.01: Typos and other errors removed in the documentation. Also included in
the documentation a link to a tutorial on data processing on manifolds.


=head1 DESCRIPTION

If you are new to machine learning and data clustering on linear and nonlinear
manifolds, your first question is likely to be: What is a manifold?  A manifold is a
space that is locally Euclidean. And a space is locally Euclidean if it allows for
the points in a small neighborhood to be represented by, say, the Cartesian
coordinates and if the distances between the points in the neighborhood are given by
the Euclidean metric.  For an example, the set of all points on the surface of a
sphere does NOT constitute a Euclidean space.  Nonetheless, if you confined your
attention to a small enough neighborhood around a point, the space would seem to be
locally Euclidean.  The surface of a sphere is a 2-dimensional manifold embedded in a
3-dimensional space.  A plane in a 3-dimensional space is also a 2-dimensional
manifold. You would think of the surface of a sphere as a nonlinear manifold, whereas
a plane would be a linear manifold.  However, note that any nonlinear manifold is
locally a linear manifold.  That is, given a sufficiently small neighborhood on a
nonlinear manifold, you can always think of it as a locally flat surface.

As to why we need machine learning and data clustering on manifolds, there exist many
important applications in which the measured data resides on a nonlinear manifold.
For example, when you record images of a human face from different angles, all the
image pixels taken together fall on a low-dimensional surface in a high-dimensional
measurement space. The same is believed to be true for the satellite images of a land
mass that are recorded with the sun at different angles with respect to the direction
of the camera.

Reducing the dimensionality of the sort of data mentioned above is critical to the
proper functioning of downstream classification algorithms, and the most popular
traditional method for dimensionality reduction is the Principal Components Analysis
(PCA) algorithm.  However, using PCA is tantamount to passing a linear least-squares
hyperplane through the surface on which the data actually resides.  As to why that
might be a bad thing to do, just imagine the consequences of assuming that your data
falls on a straight line when, in reality, it falls on a strongly curving arc.  This
is exactly what happens with PCA --- it gives you a linear manifold approximation to
your data that may actually reside on a curved surface.

That brings us to the purpose of this module, which is to cluster data that resides
on a nonlinear manifold.  Since a nonlinear manifold is locally linear, we can think
of each data cluster on a nonlinear manifold as falling on a locally linear portion
of the manifold, meaning on a hyperplane.  The logic of the module is based on
finding a set of hyperplanes that best describes the data, with each hyperplane
derived from a local data cluster.  This is like constructing a piecewise linear
approximation to data that falls on a curve as opposed to constructing a single
straight line approximation to all of the data.  So whereas the frequently used PCA
algorithm gives you a single hyperplane approximation to all your data, what this
module returns is a set of hyperplane approximations, with each hyperplane derived by
applying the PCA algorithm locally to a data cluster.

That brings us to the problem of how to actually discover the best set of hyperplane
approximations to the data.  What is probably the most popular algorithm today for
that purpose is based on the following key idea: Given a set of subspaces to which a
data element can be assigned, you assign it to that subspace for which the
B<reconstruction error> is the least.  But what do we mean by a B<subspace> and what
is B<reconstruction error>?

To understand the notions of B<subspace> and B<reconstruction-error>, let's revisit
the traditional approach of dimensionality reduction by the PCA algorithm.  The PCA
algorithm consists of: (1) Subtracting from each data element the global mean of the
data; (2) Calculating the covariance matrix of the data; (3) Carrying out an
eigendecomposition of the covariance matrix and ordering the eigenvectors according
to decreasing values of the corresponding eigenvalues; (4) Forming a B<subspace> by
discarding the trailing eigenvectors whose corresponding eigenvalues are relatively
small; and, finally, (5) projecting all the data elements into the subspace so
formed. The error incurred in representing a data element by its projection into the
subspace is known as the B<reconstruction error>.  This error is the projection of
the data element into the space spanned by the discarded trailing eigenvectors.

I<In linear-manifold based machine learning, instead of constructing a single
subspace in the manner described above, we construct a set of subspaces, one for each
data cluster on the nonlinear manifold.  After the subspaces have been constructed, a
data element is assigned to that subspace for which the reconstruction error is the
least.> On the face of it, this sounds like a chicken-and-egg sort of a problem.  You
need to have already clustered the data in order to construct the subspaces at
different places on the manifold so that you can figure out which cluster to place a
data element in.

Such problems, when they do possess a solution, are best tackled through iterative
algorithms in which you start with a guess for the final solution, you rearrange the
measured data on the basis of the guess, and you then use the new arrangement of the
data to refine the guess.  Subsequently, you iterate through the second and the third
steps until you do not see any discernible changes in the new arrangements of the
data.  This forms the basis of the clustering algorithm that is described under
B<Phase 1> in the section that follows.  This algorithm was first proposed in the
article "Dimension Reduction by Local Principal Component Analysis" by Kambhatla and
Leen that appeared in the journal Neural Computation in 1997.

Unfortunately, experiments show that the algorithm as proposed by Kambhatla and Leen
is much too sensitive to how the clusters are seeded initially.  To get around this
limitation of the basic clustering-by-minimization-of-reconstruction-error, this
module implements a two phased approach.  In B<Phase 1>, we introduce a multiplier
effect in our search for clusters by looking for C<M*K> clusters instead of the main
C<K> clusters.  In this manner, we increase the odds that each original cluster will
be visited by one or more of the C<M*K> randomly selected seeds at the beginning,
where C<M> is the integer value given to the constructor parameter
C<cluster_search_multiplier>.  Subsequently, we merge the clusters that belong
together in order to form the final C<K> clusters.  That work is done in B<Phase 2>
of the algorithm.

For the cluster merging operation in Phase 2, we model the C<M*K> clusters as the
nodes of an attributed graph in which the weight given to an edge connecting a pair
of nodes is a measure of the similarity between the two clusters corresponding to the
two nodes.  Subsequently, we use spectral clustering to merge the most similar nodes
in our quest to partition the data into C<K> clusters.  For that purpose, we use the
Shi-Malik normalized cuts algorithm.  The pairwise node similarity required by this
algorithm is measured by the C<pairwise_cluster_similarity()> method of the
C<LinearManifoldDataClusterer> class.  The smaller the overall reconstruction error
when all of the data elements in one cluster are projected into the other's subspace
and vice versa, the greater the similarity between two clusters.  Additionally, the
smaller the distance between the mean vectors of the clusters, the greater the
similarity between two clusters.  The overall similarity between a pair of clusters
is a combination of these two similarity measures.

For additional information regarding the theoretical underpinnings of the algorithm
implemented in this module, visit
L<https://engineering.purdue.edu/kak/Tutorials/ClusteringDataOnManifolds.pdf>


=head1 SUMMARY OF THE ALGORITHM

We now present a summary of the two phases of the algorithm implemented in this
module.  Note particularly the important role played by the constructor parameter
C<cluster_search_multiplier>.  It is only when the integer value given to this
parameter is greater than 1 that Phase 2 of the algorithm kicks in.

=over 4

=item B<Phase 1:>

Through iterative minimization of the total reconstruction error, this phase of the
algorithm returns C<M*K> clusters where C<K> is the actual number of clusters you
expect to find in your data and where C<M> is the integer value given to the
constructor parameter C<cluster_search_multiplier>.  As previously mentioned, on
account of the sensitivity of the reconstruction-error based clustering to how the
clusters are initially seeded, our goal is to look for C<M*K> clusters with the idea
of increasing the odds that each of the C<K> clusters will see at least one seed at
the beginning of the algorithm.

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN

This parameter names the data file that contains the multidimensional data records
you want the module to cluster.  This file must be in CSV format and each record in
the file must include a symbolic tag for the record.  Here are first few rows of such
a CSV file in the C<examples> directory:

    d_161,0.0739248630173239,0.231119293395665,-0.970112873251437
    a_59,0.459932215884786,0.0110216469739639,0.887885623314902
    a_225,0.440503220903039,-0.00543366086464691,0.897734586447273
    a_203,0.441656364946433,0.0437191337788422,0.896118459046532
    ...
    ...

What you see in the first column --- C<d_161>, C<a_59>, C<a_225>, C<a_203> --- are
the symbolic tags associated with four 3-dimensional data records.

=item C<mask>:

This parameter supplies the mask to be applied to the columns of your data file.  For
the data file whose first few records are shown above, the mask should be C<N111>
since the symbolic tag is in the first column of the CSV file and since, presumably,
you want to cluster the data in the next three columns.

=item C<K>:

This parameter supplies the number of clusters you are looking for.

=item C<P>:

This parameter specifies the dimensionality of the manifold on which the data resides.

=item C<cluster_search_multiplier>:

As should be clear from the C<Summary of the Algorithm> section, this parameter plays
a very important role in the successful clustering of your data.  As explained in
C<Description>, the basic algorithm used for clustering in Phase 1 --- clustering by
the minimization of the reconstruction error --- is sensitive to the choice of the
cluster seeds that are selected randomly at the beginning of the algorithm.  Should
it happen that the seeds miss one or more of the clusters, the clustering produced is
likely to not be correct.  By giving an integer value to C<cluster_search_multiplier>
that is greater than 1, you'll increase the odds that the randomly selected seeds
will see all clusters.  When you set C<cluster_search_multiplier> to C<M>, you ask
Phase 1 of the algorithm to construct C<M*K> clusters as opposed to just C<K>
clusters. Subsequently, in Phase 2, the module uses inter-cluster similarity based
graph partitioning to merge the C<M*K> clusters into C<K> clusters.

=item C<max_iterations>:

This hard limits the number of iterations in Phase 1 of the algorithm.  Ordinarily,
the iterations stop automatically when the change in the total reconstruction error
from one iteration to the next is less than the value specified by the parameter
C<delta_reconstruction_error>.

=item C<delta_reconstruction_error>:

It is this parameter that determines when the iterations will actually terminate in
Phase 1 of the algorithm.  When the difference in the total reconstruction error from
one iteration to the next is less than the value given to this parameter, the
iterations come to an end. B<IMPORTANT: I have noticed that the larger the number of
data samples that need to be clustered, the larger must be the value give to this
parameter.  That makes intuitive sense since the total reconstruction error is the
sum of all such errors for all of the data elements.> Unfortunately, the best value
for this parameter does NOT appear to depend linearly on the total number of data
records to be clustered.

=item C<terminal_output>:

When this parameter is set, you will see in your terminal window the different
clusters as lists of the symbolic tags associated with the data records.  You will
also see in your terminal window the output produced by the different steps of the
graph partitioning algorithm as smaller clusters are merged to form the final C<K>
clusters --- assuming that you set the parameter C<cluster_search_multiplier> to an
integer value that is greater than 1.

=item C<visualize_each_iteration>:

As its name implies, when this option is set to 1, you'll see 3D plots of the
clustering results for each iteration --- but only if your data is 3-dimensional.

=item C<show_hidden_in_3D_plots>:

This parameter is important for controlling the visualization of the clusters on the
surface of a sphere.  If the clusters are too spread out, seeing all of the clusters
all at once can be visually confusing.  When you set this parameter, the clusters on
the back side of the sphere will not be visible.  Note that no matter how you set
this parameter, you can interact with the 3D plot of the data and rotate it with your
mouse pointer to see all of the data that is output by the clustering code.

=item C<make_png_for_each_iteration>:

If you set this option to 1, the module will output a Gnuplot in the form of a PNG
image for each iteration in Phase 1 of the algorithm.  In Phase 2, the module will
output the clustering result produced by the graph partitioning algorithm.

=back

=over

=item B<get_data_from_csv()>:

    $clusterer->get_data_from_csv();

As you can guess from its name, the method extracts the data from the CSV file named
in the constructor.

=item B<linear_manifold_clusterer()>:

    $clusterer->linear_manifold_clusterer();   

    or 

    my $clusters = $clusterer->linear_manifold_clusterer();

This is the main call to the linear-manifold based clusterer.  The first call works
by side-effect, meaning that you will see the clusters in your terminal window and
they would be written out to disk files (depending on the constructor options you
have set).  The second call also returns the clusters as a reference to an array of
anonymous arrays, each holding the symbolic tags for a cluster.

=item B<display_reconstruction_errors_as_a_function_of_iterations()>:

    $clusterer->display_reconstruction_errors_as_a_function_of_iterations();

lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN

tags of the data samples in that cluster.

Assuming that the data dimensionality is 3, if you have set the constructor parameter
C<visualize_each_iteration>, the module will deposit in your directory printable PNG
files that are point plots of the different clusters in the different iterations of
the algorithm.  Such printable files are also generated for the initial clusters at
the beginning of the iterations and for the final clusters in Phase 1 of the
algorithm.  You will also see in your directory a PNG file for the clustering result
produced by graph partitioning in Phase 2.

Also, as mentioned previously, a call to C<linear_manifold_clusterer()> in your own
code will return the clusters to you directly.

=head1 REQUIRED

This module requires the following modules:

    List::Util
    File::Basename
    Math::Random
    Graphics::GnuplotIF
    Math::GSL::Matrix
    POSIX

=head1 THE C<examples> DIRECTORY

The C<examples> directory contains the following four scripts that you might want to
play with in order to become familiar with the module:

    example1.pl

    example2.pl

    example3.pl

    example4.pl

These scripts demonstrate linear-manifold based clustering on the 3-dimensional data
in the following three CSV files:

    3_clusters_on_a_sphere_498_samples.csv            (used in example1.pl and example4.pl)

    3_clusters_on_a_sphere_3000_samples.csv           (used in example2.pl)

    4_clusters_on_a_sphere_1000_samples.csv           (used in example3.pl)

Note that even though the first two of these files both contain exactly three
clusters, the clusters look very different in the two data files.  The clusters are
much more spread out in C<3_clusters_on_a_sphere_3000_samples.csv>.

The code in C<example4.pl> is special because it shows how you can call the
C<auto_retry_clusterer()> method of the module for automatic repeated invocations of
the clustering program until success is achieved.  The value of the constructor
parameter C<cluster_search_multiplier> is set to 1 in C<example4.pl>, implying that
when you execute C<example4.pl> you will not be invoking Phase 2 of the algorithm.
You might wish to change the value given to the parameter
C<cluster_search_multiplier> to see how it affects the number of attempts needed to
achieve success.

The C<examples> directory also includes PNG image files that show some intermediate
and the best final results that can be achieved by the first three examples, that
is, for the scripts C<example1.pl>, C<example2.pl>, and C<example3.pl>. If you are on
a Linux machine and if you have the C<ImageMagick> library installed, you can use the
C<display> command to see the results in the PNG images.  The results you get with
your own runs of the three example scripts may or may not look like what you see in
the outputs shown in the PNG files depending on how the module seeds the clusters.
Your best results should look like what you see in the PNG images.

The C<examples> directory also contains the following utility scripts:

For generating the data for experiments with clustering:

    generate_data_on_a_sphere.pl

For visualizing the data generated by the above script:

    data_visualizer.pl

For cleaning up the examples directory:

    cleanup_directory.pl

Invoking the C<cleanup_directory.pl> script will get rid of all the PNG image files
that are generated by the module when you run it with the constructor option
C<make_png_for_each_iteration> set to 1.

=head1 EXPORT

None by design.

=head1 CAVEATS

The performance of the algorithm depends much on the values you choose for the
constructor parameters.  And, even for the best choices for the parameters, the
algorithm is not theoretically guaranteed to return the best results.

Even after you have discovered the best choices for the constructor parameters, the
best way to use this module is to try it multiple times on any given data and accept
only those results that make the best intuitive sense.

The module was designed with the hope that it would rather fail than give you wrong
results. So if you see it failing a few times before it returns a good answer, that's
a good thing.  

However, if the module fails too often or is too quick to give you wrong answers,
that means the module is not working on your data.  If that happens, I'd love to hear
from you.  That might indicate to me how I should enhance the power of this module
for its future versions.

=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'LinearManifoldDataClusterer' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-LinearManifoldDataClusterer-1.01.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-LinearManifoldDataClusterer-1.01>.  Enter this directory and execute the following commands
for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on.  In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/


=head1 THANKS

I have learned much from my conversations with Donghun Kim whose research on face
recognition in the wild involves clustering image data on manifolds.  I have also had
several fruitful conversations with Bharath Kumar Comandur and Tanmay Prakash with
regard to the ideas that are incorporated in this module.

=head1 AUTHOR

( run in 1.013 second using v1.01-cache-2.11-cpan-5b529ec07f3 )