Algorithm-LinearManifoldDataClusterer
view release on metacpan or search on metacpan
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
package Algorithm::LinearManifoldDataClusterer;
#------------------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
# resides on a low-dimensional manifold in a high-dimensional measurement space.
# -----------------------------------------------------------------------------------
use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(reduce any);
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use POSIX ();
our $VERSION = '1.01';
# Constructor.
#
# Takes keyword arguments, validates their names via check_for_illegal_params(),
# and returns a blessed hash holding the clusterer state.
#
# Required keyword arguments (croaks if missing or false):
#   datafile, mask
# Optional keyword arguments and the defaults applied here:
#   K                            0
#   P                            0
#   terminal_output              0
#   max_iterations               0
#   delta_reconstruction_error   0.001
#   cluster_search_multiplier    1
#   visualize_each_iteration     0   (normalized to 0/1)
#   show_hidden_in_3D_plots      0   (normalized to 0/1)
#   make_png_for_each_iteration  0   (normalized to 0/1)
#   debug                        0
#
# Croaks when an unrecognized keyword is supplied.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params(@params) == 0;

    # Resolve the defaults once so that _KM is computed from the same values
    # that are stored in _K and _cluster_search_multiplier.  Multiplying the
    # raw arguments made _KM collapse to 0 whenever the multiplier was omitted.
    my $K                         = $args{K} || 0;
    my $cluster_search_multiplier = $args{cluster_search_multiplier} || 1;

    bless {
        _datafile                     => $args{datafile} || croak("datafile required"),
        _mask                         => $args{mask}     || croak("mask required"),
        _K                            => $K,
        _P                            => $args{P} || 0,
        _terminal_output              => $args{terminal_output} || 0,
        _max_iterations               => $args{max_iterations} || 0,
        _delta_reconstruction_error   => $args{delta_reconstruction_error} || 0.001,
        _delta_normalized_error       => undef,
        _cluster_search_multiplier    => $cluster_search_multiplier,
        # The "// 0" keeps an omitted flag from triggering an
        # "uninitialized value in numeric eq" warning; the result is unchanged.
        _visualize_each_iteration     => ($args{visualize_each_iteration} // 0) == 0 ? 0 : 1,
        _show_hidden_in_3D_plots      => ($args{show_hidden_in_3D_plots} // 0) == 0 ? 0 : 1,
        _make_png_for_each_iteration  => ($args{make_png_for_each_iteration} // 0) == 0 ? 0 : 1,
        _debug                        => $args{debug} || 0,
        _N                            => 0,
        _KM                           => $K * $cluster_search_multiplier,
        _data_hash                    => {},
        _data_tags                    => [],
        _data_dimensions              => 0,
        _final_clusters               => [],
        _auto_retry_flag              => 0,
        _num_iterations_actually_used => undef,
        _scale_factor                 => undef,
        _data_tags_to_cluster_label_hash                        => {},
        _final_reference_vecs_for_all_subspaces                 => [],
        _reconstruction_error_as_a_function_of_iteration        => [],
        _final_trailing_eigenvec_matrices_for_all_subspaces     => [],
        _subspace_construction_error_as_a_function_of_iteration => [],
    }, $class;
}
# Loads the records of the CSV file named in $self->{_datafile}.
#
# The file is split into whitespace-separated records; each record is split
# on commas, its first field becomes the record tag and the remaining fields
# are stored (as an array ref) under that tag.
#
# Side effects on $self:
#   _data_dimensions  number of '1' characters in $self->{_mask}
#   _data_hash        { tag => [ remaining fields ] }
#   _data_tags        tags in file order
#   _N                number of records read
#
# Dies if no data file was specified, if the file name does not end in
# ".csv", or if the file cannot be opened.
sub get_data_from_csv {
    my $self = shift;
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my @mask = split //, $self->{_mask};
    $self->{_data_dimensions} = scalar grep { $_ eq '1' } @mask;
    print "data dimensionality: $self->{_data_dimensions} \n" if $self->{_terminal_output};

    # Reject non-CSV names before opening anything.
    die("Aborted. get_data_from_csv() is only for CSV files") unless $filename =~ /\.csv$/;

    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as a mode, and the handle is closed explicitly.
    open my $fh, '<', $filename or die "Unable to open $filename: $!";
    my $contents = do { local $/ = undef; <$fh> };
    close $fh;
    $contents = '' unless defined $contents;

    # split ' ' discards leading whitespace, so a file that starts with a
    # blank line no longer yields an empty record with an undefined tag.
    my @all_data = split ' ', $contents;

    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {
        my ($record_name, @fields) = split /,/, $record;
        $data_hash{$record_name} = \@fields;
        push @data_tags, $record_name;
    }
    $self->{_data_hash} = \%data_hash;
    $self->{_data_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
}
sub estimate_mean_and_covariance {
my $self = shift;
my $tag_set = shift;
my $cluster_size = @$tag_set;
my @cluster_center = @{$self->add_point_coords($tag_set)};
@cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;
lib/Algorithm/LinearManifoldDataClusterer.pm view on Meta::CPAN
map { printf("%.4f ", $_) } @row_as_list;
print "\n";
}
print "\n\n";
}
# Returns a new Math::GSL::Matrix that is the transpose of the argument.
# Row i of the input is copied into column i of the result; the input
# matrix is only read through rows(), cols() and row().
sub transpose {
    my ($source) = @_;
    my $row_count = $source->rows();
    my $col_count = $source->cols();
    my $flipped = Math::GSL::Matrix->new($col_count, $row_count);
    for my $index (0 .. $row_count - 1) {
        my @values = $source->row($index)->as_list;
        $flipped->set_col($index, \@values);
    }
    return $flipped;
}
# Element-wise difference of two vectors given as array references.
# Returns the list ($vec1->[0] - $vec2->[0], $vec1->[1] - $vec2->[1], ...).
# Dies when the two arrays do not have the same number of elements.
sub vector_subtract {
    my ($minuend, $subtrahend) = @_;
    die "wrong data types for vector subtract calculation\n"
        unless @$minuend == @$subtrahend;
    my @difference = map { $minuend->[$_] - $subtrahend->[$_] } 0 .. $#{$minuend};
    return @difference;
}
# Fisher-Yates shuffle, adapted from the Perl documentation.
#
# Randomly permutes, in place, the array referenced by the single argument.
# An empty array is left untouched: without the guard below the pre-decrement
# would take $i from 0 to -1 (which is true) and the slice assignment to
# index -1 of an empty array would die.
sub fisher_yates_shuffle {
    my $arr = shift;
    return unless @$arr;
    my $i = @$arr;
    while (--$i) {
        # Pick a partner anywhere in 0..$i and swap it into position $i.
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}
######################### Generating Synthetic Data for Manifold Clustering ##########################
################################## Class DataGenerator ########################################
## The embedded class defined below is for generating synthetic data for
## experimenting with linear manifold clustering when the data resides on the
## surface of a sphere. See the script generate_data_on_a_sphere.pl in the
## `examples' directory for how to specify the number of clusters and the spread of
## each cluster in the data that is generated.
package DataGenerator;
use strict;
use Carp;
# Constructor for DataGenerator.
#
# Takes keyword arguments, checks their names with
# _check_for_illegal_params3(), and returns a blessed hash.
#
# Required keyword arguments (croaks if missing or false):
#   output_file, total_number_of_samples_needed
# Optional keyword arguments read here and their defaults:
#   number_of_clusters_on_sphere   3
#   cluster_width                  0.1
#   show_hidden_in_3D_plots        1
#   debug                          0
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                    => $args{output_file}
                                           || croak("name for output_file required"),
        _total_number_of_samples_needed => $args{total_number_of_samples_needed}
                                           || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere   => $args{number_of_clusters_on_sphere} || 3,
        _cluster_width                  => $args{cluster_width} || 0.1,
        # Defined-or rather than "||": with "|| 1" an explicit 0 from the
        # caller was silently turned back into 1, so the option could never
        # be switched off.
        _show_hidden_in_3D_plots        => $args{show_hidden_in_3D_plots} // 1,
        _debug                          => $args{debug} || 0,
    }, $class;
}
# Validates the keyword names passed to the DataGenerator constructor.
#
# Takes the list of keyword names and returns 1 when every name is a
# recognized constructor keyword, 0 as soon as an unrecognized name is found.
# An empty list is valid and returns 1 (previously it returned undef, which
# the caller's "== 0" test treated as an illegal name, with a warning).
#
# 'debug' is included because the constructor reads $args{debug}; without it
# any caller supplying that option was rejected.
sub _check_for_illegal_params3 {
    my @params = @_;
    my %is_legal = map { $_ => 1 } qw(
        output_file
        total_number_of_samples_needed
        number_of_clusters_on_sphere
        cluster_width
        show_hidden_in_3D_plots
        debug
    );
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    return 1;
}
## We first generate a set of points randomly on the unit sphere --- the number of
## points being equal to the number of clusters desired. These points will serve as
## cluster means (or, as cluster centroids) subsequently when we ask
## Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
## points on the sphere. The second argument is the cluster mean and the third
## argument the cluster covariance. For the synthetic data, we set the cluster
## covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
## variance along the azimuth direction and the (1,1) element corresponding to the
## variance along the elevation direction.
##
## When you generate the points in the 2D spherical coordinates of
## (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
## the multivariate-normal function whose azimuth angle is outside the interval
## (0,360) and/or whose elevation angle is outside the interval (-90,90).
##
## Note that the first of the two dimensions for which the multivariate-normal
## function returns the points is for the azimuth angle and the second for the
## elevation angle.
##
## With regard to the relationship of the Cartesian coordinates to the spherical
## (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
## and z the vertical axis. The elevation angle theta is measured with respect to
## the XY-plane. The highest point on the sphere (the Zenith) corresponds to the
## elevation angle of +90 and the lowest point on the sphere (the Nadir)
## corresponds to the elevation angle of -90. The azimuth is measured with respect
## to the X-axis. The range of the azimuth is from 0 to 360 degrees. The elevation
## is measured from the XY plane and its range is (-90,90) degrees.
( run in 0.847 second using v1.01-cache-2.11-cpan-ceb78f64989 )