view release on metacpan or search on metacpan
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
_K => $args{K} || 0,
_P => $args{P} || 0,
_terminal_output => $args{terminal_output} || 0,
_max_iterations => $args{max_iterations} || 0,
_delta_reconstruction_error => $args{delta_reconstruction_error} || 0.001,
_delta_normalized_error => undef,
_cluster_search_multiplier => $args{cluster_search_multiplier} || 1,
_visualize_each_iteration => $args{visualize_each_iteration} == 0 ? 0 : 1,
_show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} == 0 ? 0 : 1,
_make_png_for_each_iteration => $args{make_png_for_each_iteration} == 0 ? 0 : 1,
_debug => $args{debug} || 0,
_N => 0,
_KM => $args{K} * $args{cluster_search_multiplier},
_data_hash => {},
_data_tags => [],
_data_dimensions => 0,
_final_clusters => [],
_auto_retry_flag => 0,
_num_iterations_actually_used => undef,
_scale_factor => undef,
_data_tags_to_cluster_label_hash => {},
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
sub estimate_mean_and_covariance {
my $self = shift;
my $tag_set = shift;
my $cluster_size = @$tag_set;
my @cluster_center = @{$self->add_point_coords($tag_set)};
@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
# for covariance calculation:
my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$tag_set));
print "\nThe data will be stuffed into a matrix of $num_rows rows and $num_cols columns\n\n"
if $self->{_debug};
my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
# All the record labels are stored in the array $self->{_data_tags}. The actual
# data for clustering is stored in a hash at $self->{_data_hash} whose keys are
# the record labels; the value associated with each key is the array holding the
# corresponding numerical multidimensional data.
$mean_vec->set_col(0, \@cluster_center);
if ($self->{_debug}) {
print "\nDisplaying the mean vector for the cluster:\n";
display_matrix( $mean_vec ) if $self->{_terminal_output};
}
foreach my $j (0..$num_cols-1) {
my $tag = $tag_set->[$j];
my $data = $self->{_data_hash}->{$tag};
my @diff_from_mean = vector_subtract($data, \@cluster_center);
$matrix->set_col($j, \@diff_from_mean);
}
my $transposed = transpose( $matrix );
my $covariance = $matrix * $transposed;
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for cluster:";
display_matrix( $covariance ) if $self->{_terminal_output};
}
return ($mean_vec, $covariance);
}
sub eigen_analysis_of_covariance {
my $self = shift;
my $covariance = shift;
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
my $num_of_eigens = @$eigenvalues;
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
foreach my $i (1..$num_of_eigens-1) {
$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
}
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
my @all_my_eigenvecs;
foreach my $i (0..$num_of_eigens-1) {
my @vec = $eigenvectors->[$i]->as_list;
my @eigenvec;
foreach my $ele (@vec) {
my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
if ($theta eq "0") {
push @eigenvec, $mag;
} elsif ($theta eq "pi") {
push @eigenvec, -1.0 * $mag;
} else {
die "Eigendecomposition produced a complex eigenvector -- " .
"which should not happen for a covariance matrix!";
}
}
print "Eigenvector $i: @eigenvec\n" if $self->{_debug};
push @all_my_eigenvecs, \@eigenvec;
}
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
my @sorted_eigenvecs;
my @sorted_eigenvals;
foreach my $i (0..@sorted_eigenvec_indexes-1) {
$sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
$sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
}
if ($self->{_debug}) {
print "\nHere come sorted eigenvectors --- from the largest to the smallest:\n";
foreach my $i (0..@sorted_eigenvecs-1) {
print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
}
}
return (\@sorted_eigenvecs, \@sorted_eigenvals);
}
sub auto_retry_clusterer {
my $self = shift;
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
}
}
sub linear_manifold_clusterer {
my $self = shift;
my $KM = $self->{_KM};
my @initial_cluster_center_tags;
my $visualization_msg;
my @initial_cluster_center_indexes = $self->initialize_cluster_centers($KM, $self->{_N});
print "Randomly selected indexes for cluster center tags: @initial_cluster_center_indexes\n"
if $self->{_debug};
@initial_cluster_center_tags = map {$self->{_data_tags}->[$_]} @initial_cluster_center_indexes;
my @initial_cluster_center_coords = map {$self->{_data_hash}->{$_}} @initial_cluster_center_tags;
if ($self->{_debug}) {
foreach my $centroid (@initial_cluster_center_coords) {
print "Initial cluster center coords: @{$centroid}\n";
}
}
my $initial_clusters = $self->assign_data_to_clusters_initial(\@initial_cluster_center_coords);
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "initial_clusters";
$self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters)
if $self->{_visualize_each_iteration};
$self->visualize_clusters_on_sphere($visualization_msg, $initial_clusters, "png")
if $self->{_make_png_for_each_iteration};
}
foreach my $cluster (@$initial_clusters) {
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
display_mean_and_covariance($mean, $covariance) if $self->{_debug};
}
my @clusters = @$initial_clusters;
display_clusters(\@clusters) if $self->{_debug};
my $iteration_index = 0;
my $unimodal_correction_flag;
my $previous_min_value_for_unimodality_quotient;
while ($iteration_index < $self->{_max_iterations}) {
print "\n\n========================== STARTING ITERATION $iteration_index =====================\n\n"
if $self->{_terminal_output};
my $total_reconstruction_error_this_iteration = 0;
my @subspace_construction_errors_this_iteration;
my @trailing_eigenvec_matrices_for_all_subspaces;
my @reference_vecs_for_all_subspaces;
foreach my $cluster (@clusters) {
next if @$cluster == 0;
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
display_mean_and_covariance($mean, $covariance) if $self->{_debug};
print "--------------end of displaying mean and covariance\n\n" if $self->{_debug};
my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
push @subspace_construction_errors_this_iteration, $subspace_construction_error;
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
$self->set_termination_reconstruction_error_threshold(\@reference_vecs_for_all_subspaces);
my %best_subspace_based_partition_of_data;
foreach my $i (0..$self->{_KM}-1) {
$best_subspace_based_partition_of_data{$i} = [];
}
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
$trailing_eigenvec_matrices_for_all_subspaces[$_],
$reference_vecs_for_all_subspaces[$_])}
0 .. $self->{_KM}-1;
my ($minval, $index_for_closest_subspace) = minimum(\@errors);
$total_reconstruction_error_this_iteration += $minval;
push @{$best_subspace_based_partition_of_data{$index_for_closest_subspace}},
$data_tag;
}
print "empty-cluster jag: total reconstruction error in this iteration: \n" .
"$total_reconstruction_error_this_iteration\n"
if $self->{_debug};
foreach my $i (0..$self->{_KM}-1) {
$clusters[$i] = $best_subspace_based_partition_of_data{$i};
}
display_clusters(\@newclusters) if $self->{_terminal_output};
@clusters = grep {@$_ != 0} @newclusters;
die "linear manifold based algorithm does not appear to work in this case $!"
unless @clusters == $self->{_KM};
}# end of foreach my $cluster (@clusters) ... loop followed by if clause for empty clusters
if ($self->{_data_dimensions} == 3) {
$visualization_msg = "clustering_at_iteration_$iteration_index";
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
$self->{_data_tags_to_cluster_label_hash}->{$tag} = $which_cluster;
}
die "Some data elements are still missing from the final tally"
unless scalar(keys %{$self->{_data_tags_to_cluster_label_hash}}) ==
scalar(@{$self->{_data_tags}});
my @new_final_clusters;
map { foreach my $ele (keys %{$self->{_data_tags_to_cluster_label_hash}}) {
push @{$new_final_clusters[$_]}, $ele
if $self->{_data_tags_to_cluster_label_hash}->{$ele} == $_ }
} 0..$self->{_K}-1;
if ($self->{_debug}) {
print "\ndisplaying the final clusters after accounting for unclustered data:\n";
display_clusters(\@new_final_clusters);
}
$self->{_final_clusters} = \@new_final_clusters;
@final_clusters = @new_final_clusters;
}
} else {
@final_clusters = @clusters;
}
print "\n\nDisplaying final clustering results:\n\n" if $self->{_terminal_output};
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
foreach my $cluster (@$clusters) {
my ($mean, $covariance) = $self->estimate_mean_and_covariance($cluster);
my ($eigenvecs, $eigenvals) = $self->eigen_analysis_of_covariance($covariance);
my @trailing_eigenvecs = @{$eigenvecs}[$self->{_P} .. $self->{_data_dimensions}-1];
my @trailing_eigenvals = @{$eigenvals}[$self->{_P} .. $self->{_data_dimensions}-1];
my $subspace_construction_error = reduce {abs($a) + abs($b)} @trailing_eigenvals;
push @subspace_construction_errors, $subspace_construction_error;
my $trailing_eigenvec_matrix = Math::GSL::Matrix->new($self->{_data_dimensions},
scalar(@trailing_eigenvecs));
foreach my $j (0..@trailing_eigenvecs-1) {
print "trailing eigenvec column: @{$trailing_eigenvecs[$j]}\n" if $self->{_debug};
$trailing_eigenvec_matrix->set_col($j, $trailing_eigenvecs[$j]);
}
push @trailing_eigenvec_matrices_for_all_subspaces,$trailing_eigenvec_matrix;
push @reference_vecs_for_all_subspaces, $mean;
}
# We consider the similarity matrix W to be a sum of two parts W_recon_based and
# W_dist_bet_means based. IMPORTANT: For coding reasons, we first store the two
# similarity measures separately in W_recon_based and W_dist_bet_means based. Our
# goal is to fill up these component matrices with the raw values while at the
# same time collecting information needed for normalizing these two separate
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
foreach my $j (0..$num_nodes-1) {
my $recon_val = $W_reconstruction_error_based->get_elem($i,$j);
my $new_recon_val = exp( -1.0 * $recon_val / $recon_error_normalizer );
$W_reconstruction_error_based->set_elem($i,$j,$new_recon_val);
my $mean_dist_val = $W_dist_between_means_based->get_elem($i,$j);
my $new_mean_dist_val = exp( -1.0 * $mean_dist_val / $dist_bet_means_based_normalizer );
$W_dist_between_means_based->set_elem($i,$j,$new_mean_dist_val);
}
}
$W = $W_reconstruction_error_based + $W_dist_between_means_based;
if ($self->{_debug}) {
print "\nDisplaying the similarity matrix W for the cluster graph:\n";
display_matrix($W) if $self->{_terminal_output};
}
my $add_all_columns = Math::GSL::Matrix->new($num_nodes,1);
foreach my $col (0..$num_nodes-1) {
$add_all_columns += $W->col($col);
}
foreach my $i (0..$num_nodes-1) {
$D->set_elem($i,$i, $add_all_columns->get_elem($i,0));
$neg_sqrt_of_D->set_elem($i,$i, 1.0 / sqrt($add_all_columns->get_elem($i,0)));
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
}
if ($self->{_terminal_output}) {
print "\nDisplaying the Symmetric Normalized Laplacian matrix A:\n" .
"A = neg_sqrt(D) * Laplacian_matrix * neg_sqrt(D)\n";
display_matrix( $A );
}
my ($eigenvalues, $eigenvectors) = $A->eigenpair;
my $num_of_eigens = @$eigenvalues;
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
if ($self->{_debug2}) {
print "Eigenvalue 0: $eigenvalues->[0]\n";
foreach my $i (1..$num_of_eigens-1) {
$largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
$smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
print "Eigenvalue $i: $eigenvalues->[$i]\n";
}
print "\nlargest eigen index: $largest_eigen_index\n";
print "\nsmallest eigen index: $smallest_eigen_index\n\n";
}
my @all_my_eigenvecs;
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
foreach my $ele (@vec) {
my ($mag, $theta) = $ele =~ /\[(\d*\.?\d*e?[+-]?\d*),(\S+)\]/;
if ($theta eq "0") {
push @eigenvec, $mag;
} elsif ($theta eq "pi") {
push @eigenvec, -1.0 * $mag;
} else {
die "Eigendecomposition produced a complex eigenvector!";
}
}
print "Eigenvector $i: @eigenvec\n" if $self->{_debug2};
push @all_my_eigenvecs, \@eigenvec;
}
if ($self->{_debug2}) {
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector of A: @largest_eigen_vec\n";
}
my @sorted_eigenvec_indexes = sort {$eigenvalues->[$b] <=> $eigenvalues->[$a]} 0..@all_my_eigenvecs-1;
print "sorted eigenvec indexes for A: @sorted_eigenvec_indexes\n" if $self->{_debug2};
my @sorted_eigenvecs;
my @sorted_eigenvals;
foreach my $i (0..@sorted_eigenvec_indexes-1) {
$sorted_eigenvecs[$i] = $all_my_eigenvecs[$sorted_eigenvec_indexes[$i]];
$sorted_eigenvals[$i] = $eigenvalues->[$sorted_eigenvec_indexes[$i]];
}
if ($self->{_debug2}) {
print "\nHere come sorted eigenvectors for A --- from the largest to the smallest:\n";
foreach my $i (0..@sorted_eigenvecs-1) {
print "eigenvec: @{$sorted_eigenvecs[$i]} eigenvalue: $sorted_eigenvals[$i]\n";
}
}
my $best_partitioning_eigenvec = $sorted_eigenvecs[@sorted_eigenvec_indexes-2];
print "\nBest graph partitioning eigenvector: @$best_partitioning_eigenvec\n" if $self->{_terminal_output};
my $how_many_positive = reduce {$a + $b} map {$_ > 0 ? 1 : 0 } @$best_partitioning_eigenvec;
my $how_many_negative = scalar(@$best_partitioning_eigenvec) - $how_many_positive;
print "Have $how_many_positive positive and $how_many_negative negative elements in the partitioning vec\n"
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
my $max_iterations = 100;
my @random_points;
my $iteration = 0;
while ($iteration++ < $max_iterations) {
my @coordinate_vec;
foreach my $dimen (0..$self->{_data_dimensions}-1) {
push @coordinate_vec, $min_bounds[$dimen] + rand($ranges[$dimen]);
}
push @random_points, \@coordinate_vec;
}
if ($self->{_debug}) {
print "\nrandom points\n";
map {print "@$_\n"} @random_points;
}
my @mean = $mean->as_list;
unshift @random_points, \@mean;
my @reconstruction_errors;
foreach my $candidate_ref_vec (@random_points) {
my $ref_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$ref_vec->set_col(0, $candidate_ref_vec);
my $reconstruction_error_for_a_ref_vec = 0;
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
mask
K
P
terminal_output
cluster_search_multiplier
max_iterations
delta_reconstruction_error
visualize_each_iteration
show_hidden_in_3D_plots
make_png_for_each_iteration
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
"--- perhaps a misspelling\n"
if _check_for_illegal_params3(@params) == 0;
bless {
_output_file => $args{output_file}
|| croak("name for output_file required"),
_total_number_of_samples_needed => $args{total_number_of_samples_needed}
|| croak("total_number_of_samples_needed required"),
_number_of_clusters_on_sphere => $args{number_of_clusters_on_sphere} || 3,
_cluster_width => $args{cluster_width} || 0.1,
_show_hidden_in_3D_plots => $args{show_hidden_in_3D_plots} || 1,
_debug => $args{debug} || 0,
}, $class;
}
sub _check_for_illegal_params3 {
my @params = @_;
my @legal_params = qw / output_file
total_number_of_samples_needed
number_of_clusters_on_sphere
cluster_width
show_hidden_in_3D_plots
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
foreach my $k (0..1) {
$cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 360.0) ** 2
if $j == 0 && $k == 0;
$cluster_covariance->[$j]->[$k] = ($self->{_cluster_width} * 180.0) ** 2
if $j == 1 && $k == 1;
$cluster_covariance->[$j]->[$k] = 0.0 if $j != $k;
}
}
push @covariances, $cluster_covariance;
}
if ($self->{_debug}) {
foreach my $i (0..$K-1) {
print "\n\nCluster center: @{$cluster_centers[$i]}\n";
print "\nCovariance:\n";
foreach my $j (0..1) {
foreach my $k (0..1) {
print "$covariances[$i]->[$j]->[$k] ";
}
print "\n";
}
}
}
my @data_dump;
foreach my $i (0..$K-1) {
my @m = @{shift @cluster_centers};
my @covar = @{shift @covariances};
my @new_data = Math::Random::random_multivariate_normal($N, @m, @covar);
if ($self->{_debug}) {
print "\nThe points for cluster $i:\n";
map { print "@$_ "; } @new_data;
print "\n\n";
}
my @wrapped_data;
foreach my $d (@new_data) {
my $wrapped_d;
if ($d->[0] >= 360.0) {
$wrapped_d->[0] = $d->[0] - 360.0;
} elsif ($d->[0] < 0) {
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
$wrapped_d->[0] = POSIX::fmod($d->[0] + 180.0, 360);
$wrapped_d->[1] = 180.0 - $d->[1];
} elsif ($d->[1] < -90.0) {
$wrapped_d->[0] = POSIX::fmod($d->[0] + 180, 360);
$wrapped_d->[1] = -180.0 - $d->[1];
}
$wrapped_d->[0] = $d->[0] unless defined $wrapped_d->[0];
$wrapped_d->[1] = $d->[1] unless defined $wrapped_d->[1];
push @wrapped_data, $wrapped_d;
}
if ($self->{_debug}) {
print "\nThe unwrapped points for cluster $i:\n";
map { print "@$_ "; } @wrapped_data;
print "\n\n";
}
my $label = $point_labels[$i];
my $j = 0;
@new_data = map {unshift @$_, $label."_".$j; $j++; $_} @wrapped_data;
push @data_dump, @new_data;
}
if ($self->{_debug}) {
print "\n\nThe labeled points for clusters:\n";
map { print "@$_\n"; } @data_dump;
}
fisher_yates_shuffle( \@data_dump );
open OUTPUT, ">$output_file";
my $total_num_of_points = $N * $K;
print "Total number of data points that will be written out to the file: $total_num_of_points\n"
if $self->{_debug};
foreach my $ele (@data_dump) {
my ($x,$y,$z);
my $label = $ele->[0];
my $azimuth = $ele->[1];
my $elevation = $ele->[2];
$x = cos($elevation) * cos($azimuth);
$y = cos($elevation) * sin($azimuth);
$z = sin($elevation);
my $csv_str = join ",", ($label,$x,$y,$z);
print OUTPUT "$csv_str\n";
}
print "\n\n";
print "Data written out to file $output_file\n" if $self->{_debug};
close OUTPUT;
}
# This version for the embedded class for data generation
sub visualize_data_on_sphere {
my $self = shift;
my $datafile = shift;
my $filename = File::Basename::basename($datafile);
my $temp_file = "__temp_" . $filename;
$temp_file =~ s/\.\w+$/\.txt/;