Algorithm-LinearManifoldDataClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/LinearManifoldDataClusterer.pm  view on Meta::CPAN

                display_clusters(\@clusters);      
            }
        }
        if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
            my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
            push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
            if (($last_recon_error - $total_reconstruction_error_this_iteration) 
                                                        < $self->{_delta_normalized_error}) {
                push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, 
                                                      $total_reconstruction_error_this_iteration;
                last;
            }
        }
        push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, 
                                              $total_reconstruction_error_this_iteration;
        $iteration_index++;
        $previous_min_value_for_unimodality_quotient =  $min_value_for_unimodality_quotient;
    } # end of while loop on iteration_index
    $self->{_num_iterations_actually_used} = 
                                 scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
    if ($self->{_terminal_output}) {
        print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
        print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
              "merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
        print "Total reconstruction error as a function of iterations: " .
                                       "@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
    }
    # now merge sub-clusters if cluster_search_multiplier > 1
    my @final_clusters;
    if ($self->{_cluster_search_multiplier} > 1) {
        print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
        my @array_of_partitioned_cluster_groups = (\@clusters);
        my @partitioned_cluster_groups;
        my $how_many_clusters_looking_for = $self->{_K};
        while (scalar(@final_clusters) < $self->{_K}) {
            @partitioned_cluster_groups = 
                   $self->graph_partition(shift @array_of_partitioned_cluster_groups, 
                                                             $how_many_clusters_looking_for );
            if (@{$partitioned_cluster_groups[0]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            } elsif (@{$partitioned_cluster_groups[1]} == 1) {
                my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
                push @final_clusters, $singular_cluster;
                $how_many_clusters_looking_for--;
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
            } else {
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
                push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
            }
        }
        my @data_clustered;
        foreach my $cluster (@final_clusters) {
            push @data_clustered, @$cluster;
        }
        unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
            $self->{_final_clusters} = \@final_clusters;
            my %data_clustered = map {$_ => 1} @data_clustered;
            my @data_tags_not_clustered = 
                   grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
            if ($self->{_terminal_output}) {
                print "\n\nNot all data clustered.  The most reliable clusters found by graph partitioning:\n";
                display_clusters(\@final_clusters);
                print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
            }
            if ($self->{_data_dimensions} == 3) {
                $visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
                    if $self->{_visualize_each_iteration};
                $self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
                    if $self->{_make_png_for_each_iteration};
            }
            my %data_tags_to_cluster_label_hash;
            foreach my $i (0..@final_clusters-1) {
                map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
            }
            $self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
            foreach my $tag (@data_tags_not_clustered) {
                my $which_cluster = $self->which_cluster_for_new_element($tag);
                $self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
            }
            die "Some data elements are still missing from the final tally" 
              unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) == 
                scalar(@{$self->{_data_tags}});
            my @new_final_clusters;
            map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
                      push @{$new_final_clusters[$_]}, $ele 
                            if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ } 
                } 0..$self->{_K}-1;
            if ($self->{_debug}) {
                print "\ndisplaying the final clusters after accounting for unclustered data:\n"; 
                display_clusters(\@new_final_clusters);
            }
            $self->{_final_clusters} = \@new_final_clusters;
            @final_clusters = @new_final_clusters;
        }
    } else {
        @final_clusters = @clusters;
    }
    print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
    display_clusters(\@final_clusters) if $self->{_terminal_output};
    return \@final_clusters;
} 

sub display_reconstruction_errors_as_a_function_of_iterations {
    my $self = shift;            
    print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
    print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
          "@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
}

sub set_termination_reconstruction_error_threshold {
    my $self = shift;        
    my $all_ref_vecs = shift;
    my @mean_vecs = @$all_ref_vecs;
    my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_; 
                                                      my @result = $result->as_list; 
                                                      sqrt($result[0])
                                                    } @mean_vecs;
    $self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
    $self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) * 
                                       $self->{_delta_reconstruction_error};
}

# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method.  It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters.  The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
sub which_cluster_for_new_element {
    my $self = shift;    
    my $data_tag = shift;
    # The following `unless' clause is called only the first time the current method
    # is called:
    unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
        my @trailing_eigenvec_matrices_for_all_subspaces;
        my @reference_vecs_for_all_subspaces;



( run in 1.831 second using v1.01-cache-2.11-cpan-5b529ec07f3 )