Algorithm-LinearManifoldDataClusterer
view release on metacpan or search on metacpan
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
display_clusters(\@clusters);
}
}
if (@{$self->{_reconstruction_error_as_a_function_of_iteration}} > 0) {
my $last_recon_error = pop @{$self->{_reconstruction_error_as_a_function_of_iteration}};
push @{$self->{_reconstruction_error_as_a_function_of_iteration}}, $last_recon_error;
if (($last_recon_error - $total_reconstruction_error_this_iteration)
< $self->{_delta_normalized_error}) {
push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
$total_reconstruction_error_this_iteration;
last;
}
}
push @{$self->{_reconstruction_error_as_a_function_of_iteration}},
$total_reconstruction_error_this_iteration;
$iteration_index++;
$previous_min_value_for_unimodality_quotient = $min_value_for_unimodality_quotient;
} # end of while loop on iteration_index
$self->{_num_iterations_actually_used} =
scalar @{$self->{_reconstruction_error_as_a_function_of_iteration}};
if ($self->{_terminal_output}) {
print "\nIterations of the main loop terminated at iteration number $iteration_index.\n";
print "Will now invoke graph partitioning to discover dominant clusters and to\n" .
"merge small clusters.\n\n" if $self->{_cluster_search_multiplier} > 1;
print "Total reconstruction error as a function of iterations: " .
"@{$self->{_reconstruction_error_as_a_function_of_iteration}}";
}
# now merge sub-clusters if cluster_search_multiplier > 1
my @final_clusters;
if ($self->{_cluster_search_multiplier} > 1) {
print "\n\nInvoking recursive graph partitioning to merge small clusters\n\n";
my @array_of_partitioned_cluster_groups = (\@clusters);
my @partitioned_cluster_groups;
my $how_many_clusters_looking_for = $self->{_K};
while (scalar(@final_clusters) < $self->{_K}) {
@partitioned_cluster_groups =
$self->graph_partition(shift @array_of_partitioned_cluster_groups,
$how_many_clusters_looking_for );
if (@{$partitioned_cluster_groups[0]} == 1) {
my $singular_cluster = shift @{$partitioned_cluster_groups[0]};
push @final_clusters, $singular_cluster;
$how_many_clusters_looking_for--;
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
} elsif (@{$partitioned_cluster_groups[1]} == 1) {
my $singular_cluster = shift @{$partitioned_cluster_groups[1]};
push @final_clusters, $singular_cluster;
$how_many_clusters_looking_for--;
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
} else {
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[0];
push @array_of_partitioned_cluster_groups, $partitioned_cluster_groups[1];
}
}
my @data_clustered;
foreach my $cluster (@final_clusters) {
push @data_clustered, @$cluster;
}
unless (scalar(@data_clustered) == scalar(@{$self->{_data_tags}})) {
$self->{_final_clusters} = \@final_clusters;
my %data_clustered = map {$_ => 1} @data_clustered;
my @data_tags_not_clustered =
grep {$_} map {exists $data_clustered{$_} ? undef : $_} @{$self->{_data_tags}};
if ($self->{_terminal_output}) {
print "\n\nNot all data clustered. The most reliable clusters found by graph partitioning:\n";
display_clusters(\@final_clusters);
print "\n\nData not yet clustered:\n\n@data_tags_not_clustered\n";
}
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "$self->{_K}_best_clusters_produced_by_graph_partitioning";
$self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters)
if $self->{_visualize_each_iteration};
$self->visualize_clusters_on_sphere($visualization_msg, \@final_clusters, "png")
if $self->{_make_png_for_each_iteration};
}
my %data_tags_to_cluster_label_hash;
foreach my $i (0..@final_clusters-1) {
map {$data_tags_to_cluster_label_hash{$_} = $i} @{$final_clusters[$i]};
}
$self->{_data_tags_to_cluster_label_hash} = \%data_tags_to_cluster_label_hash;
foreach my $tag (@data_tags_not_clustered) {
my $which_cluster = $self->which_cluster_for_new_element($tag);
$self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
}
die "Some data elements are still missing from the final tally"
unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
scalar(@{$self->{_data_tags}});
my @new_final_clusters;
map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
push @{$new_final_clusters[$_]}, $ele
if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
} 0..$self->{_K}-1;
if ($self->{_debug}) {
print "\ndisplaying the final clusters after accounting for unclustered data:\n";
display_clusters(\@new_final_clusters);
}
$self->{_final_clusters} = \@new_final_clusters;
@final_clusters = @new_final_clusters;
}
} else {
@final_clusters = @clusters;
}
print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
display_clusters(\@final_clusters) if $self->{_terminal_output};
return \@final_clusters;
}
sub display_reconstruction_errors_as_a_function_of_iterations {
my $self = shift;
print "\n\nNumber of iterations used in Phase 1: $self->{_num_iterations_actually_used}\n";
print "\nTotal reconstruction error as a function of iterations in Phase 1: " .
"@{$self->{_reconstruction_error_as_a_function_of_iteration}}\n";
}
sub set_termination_reconstruction_error_threshold {
my $self = shift;
my $all_ref_vecs = shift;
my @mean_vecs = @$all_ref_vecs;
my $sum_of_mean_magnitudes = reduce {$a+$b} map { my $result = transpose($_) * $_;
my @result = $result->as_list;
sqrt($result[0])
} @mean_vecs;
$self->{_scale_factor} = $sum_of_mean_magnitudes / @mean_vecs;
$self->{_delta_normalized_error} = ($sum_of_mean_magnitudes / @mean_vecs ) *
$self->{_delta_reconstruction_error};
}
# This method is called only in the `unless' clause at the end of the main
# linear_manifold_clusterer() method. It is called to find the cluster labels for
# those data elements that were left unclustered by the main part of the algorithm
# when graph partitioning is used to merge similar sub-clusters. The operating logic
# here is that graph partition yields K main clusters even though each main cluster
# may not yet be fully populated.
sub which_cluster_for_new_element {
my $self = shift;
my $data_tag = shift;
# The following `unless' clause is called only the first time the current method
# is called:
unless (@{$self->{_final_trailing_eigenvec_matrices_for_all_subspaces}} > 0) {
my @trailing_eigenvec_matrices_for_all_subspaces;
my @reference_vecs_for_all_subspaces;
( run in 1.831 second using v1.01-cache-2.11-cpan-5b529ec07f3 )