Algorithm-LinearManifoldDataClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/LinearManifoldDataClusterer.pm  view on Meta::CPAN

package Algorithm::LinearManifoldDataClusterer;

#------------------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::LinearManifoldDataClusterer is a Perl module for clustering data that
# resides on a low-dimensional manifold in a high-dimensional measurement space.
# -----------------------------------------------------------------------------------

use 5.10.0;
use strict;
use warnings;
use Carp;
use List::Util qw(reduce any);
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use POSIX (); 

our $VERSION = '1.01';

# Constructor.
#
# Accepts keyword arguments and returns a blessed hash that holds the
# clustering parameters together with the (initially empty) result containers.
#
#   datafile, mask              -- required; the constructor croaks if either
#                                  is missing or false
#   K, P, max_iterations        -- default to 0
#   terminal_output, debug      -- default to 0
#   delta_reconstruction_error  -- defaults to 0.001
#   cluster_search_multiplier   -- defaults to 1
#   visualize_each_iteration, show_hidden_in_3D_plots,
#   make_png_for_each_iteration -- stored as 0 when omitted or numerically
#                                  zero, and as 1 otherwise
#
# Croaks when check_for_illegal_params() returns 0 for the supplied keywords.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params(@params) == 0;

    # Resolve the defaults first so that the derived _KM entry is computed from
    # the same values that are stored in _K and _cluster_search_multiplier.
    # Using the raw arguments would give _KM == 0 whenever the multiplier is
    # omitted, although its documented default is 1.
    my $K                         = $args{K} || 0;
    my $cluster_search_multiplier = $args{cluster_search_multiplier} || 1;

    # The on/off switches keep their numeric comparison against zero, but an
    # omitted switch is treated as 0 instead of comparing undef numerically.
    my ($visualize, $show_hidden, $make_png) =
        map { ($args{$_} // 0) == 0 ? 0 : 1 }
            qw( visualize_each_iteration
                show_hidden_in_3D_plots
                make_png_for_each_iteration );

    bless {
        _datafile                     =>   $args{datafile} || croak("datafile required"),
        _mask                         =>   $args{mask}     || croak("mask required"),
        _K                            =>   $K,
        _P                            =>   $args{P}        || 0,
        _terminal_output              =>   $args{terminal_output} || 0,
        _max_iterations               =>   $args{max_iterations} || 0,
        _delta_reconstruction_error   =>   $args{delta_reconstruction_error} || 0.001,
        _delta_normalized_error       =>   undef,
        _cluster_search_multiplier    =>   $cluster_search_multiplier,
        _visualize_each_iteration     =>   $visualize,
        _show_hidden_in_3D_plots      =>   $show_hidden,
        _make_png_for_each_iteration  =>   $make_png,
        _debug                        =>   $args{debug} || 0,
        _N                            =>   0,
        _KM                           =>   $K * $cluster_search_multiplier,
        _data_hash                    =>   {},
        _data_tags                    =>   [],
        _data_dimensions              =>   0,
        _final_clusters               =>   [],
        _auto_retry_flag              =>   0,
        _num_iterations_actually_used =>   undef,
        _scale_factor                 =>   undef,
        _data_tags_to_cluster_label_hash  => {},
        _final_reference_vecs_for_all_subspaces => [],
        _reconstruction_error_as_a_function_of_iteration => [],
        _final_trailing_eigenvec_matrices_for_all_subspaces => [],
        _subspace_construction_error_as_a_function_of_iteration => [],
    }, $class;
}

# Loads the records to be clustered from the CSV file named by _datafile.
#
# The file is read in one gulp and split on whitespace; every resulting token
# is one record of comma-separated fields.  The first field of a record is used
# as its tag and the remaining fields are stored, in order, under that tag.
#
# Side effects on the object:
#   _data_dimensions  -- number of '1' characters in the mask string
#   _data_hash        -- tag => array ref of the remaining fields
#   _data_tags        -- tags in file order
#   _N                -- number of records read
#
# Returns the number of records read.  Dies if no data file was specified, if
# the file name does not end in ".csv", or if the file cannot be opened.
sub get_data_from_csv {
    my $self = shift;
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my @mask = split //, $self->{_mask};
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n" if $self->{_terminal_output};

    # Reject a wrong file type before touching the file system, so that no
    # handle is left open when we bail out.
    die("Aborted. get_data_from_csv() is only for CSV files") unless $filename =~ /\.csv$/;

    # Three-argument open with a lexical handle: the file name can no longer be
    # interpreted as an open mode, and the handle cannot clash with a global.
    open my $fh, '<', $filename or die "Unable to open $filename: $!";
    my $contents = do { local $/; <$fh> };
    close $fh;

    my %data_hash = ();
    my @data_tags = ();
    # split ' ' discards leading whitespace, so a file that starts with a blank
    # line does not yield an empty bogus record.
    foreach my $record (split ' ', $contents // '') {
        my @splits = split /,/, $record;
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data_hash} = \%data_hash;
    $self->{_data_tags} = \@data_tags;
    return $self->{_N} = scalar @data_tags;
}

sub estimate_mean_and_covariance {
    my $self = shift;
    my $tag_set = shift;
    my $cluster_size = @$tag_set;
    my @cluster_center = @{$self->add_point_coords($tag_set)};
    @cluster_center = map {my $x = $_/$cluster_size; $x} @cluster_center;

lib/Algorithm/LinearManifoldDataClusterer.pm  view on Meta::CPAN

        map { printf("%.4f ", $_) } @row_as_list;
        print "\n";
    }
    print "\n\n";
}

# Builds and returns a new Math::GSL::Matrix whose i-th column holds the
# values of the i-th row of the matrix passed in.  The argument itself is only
# read through its rows(), cols() and row() methods.
sub transpose {
    my ($original) = @_;
    my $row_count    = $original->rows();
    my $column_count = $original->cols();
    my $flipped = Math::GSL::Matrix->new($column_count, $row_count);
    # Copy row number $_ of the input into column number $_ of the result.
    $flipped->set_col($_, [ $original->row($_)->as_list ])
        for 0 .. $row_count - 1;
    return $flipped;
}

# Element-wise difference of two vectors given as array references.
# Returns the list (first[i] - second[i]) for every index i.
# Dies when the two arrays do not have the same number of elements.
sub vector_subtract {
    my ($minuend, $subtrahend) = @_;
    die "wrong data types for vector subtract calculation\n"
        unless @$minuend == @$subtrahend;
    my @difference =
        map { $minuend->[$_] - $subtrahend->[$_] } 0 .. $#{$minuend};
    return @difference;
}

# Fisher-Yates shuffle, adapted from the Perl documentation (perlfaq4).
#
# Takes a reference to an array and permutes its elements in place.
# An empty array is left untouched.  Returns nothing.
sub fisher_yates_shuffle {
    my $arr = shift;
    # Must not be empty: with zero elements the pre-decrement below would
    # start at -1 (a true value) and the slice assignment would die.
    return unless @$arr;
    my $i = @$arr;
    while (--$i) {
        my $j = int rand( $i + 1 );      # random index in 0 .. $i
        @$arr[$i, $j] = @$arr[$j, $i];   # swap the two elements
    }
    return;
}

#########################  Generating Synthetic Data for Manifold Clustering  ##########################

##################################      Class DataGenerator     ########################################

##  The embedded class defined below is for generating synthetic data for
##  experimenting with linear manifold clustering when the data resides on the
##  surface of a sphere.  See the script generate_data_on_a_sphere.pl in the
##  `examples' directory for how to specify the number of clusters and the spread of
##  each cluster in the data that is generated.

package DataGenerator;

use strict;                                                         
use Carp;

##  Constructor for the synthetic-data generator.  Accepts keyword arguments and
##  returns a blessed hash with the generator settings:
##
##    output_file                     -- required; croaks if missing or false
##    total_number_of_samples_needed  -- required; croaks if missing or false
##    number_of_clusters_on_sphere    -- defaults to 3
##    cluster_width                   -- defaults to 0.1
##    show_hidden_in_3D_plots         -- defaults to 1; an explicit 0 is honored
##    debug                           -- defaults to 0
##
##  Croaks when _check_for_illegal_params3() returns 0 for the supplied keywords.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params3(@params) == 0;
    bless {
        _output_file                       =>   $args{output_file}
                                                   || croak("name for output_file required"),
        _total_number_of_samples_needed    =>   $args{total_number_of_samples_needed}
                                                   || croak("total_number_of_samples_needed required"),
        _number_of_clusters_on_sphere      =>   $args{number_of_clusters_on_sphere}   || 3,
        _cluster_width                     =>   $args{cluster_width}   || 0.1,
        # Defined-or rather than ||: with || a caller-supplied 0 was replaced
        # by 1, so the option could never be switched off.
        _show_hidden_in_3D_plots           =>   $args{show_hidden_in_3D_plots} // 1,
        _debug                             =>   $args{debug} || 0,
    }, $class;
}

##  Validates the keyword names handed to the DataGenerator constructor.
##  Returns 1 when every supplied name is a legal keyword (including the case
##  of no names at all) and 0 as soon as one name is not recognized.
sub _check_for_illegal_params3 {
    my @params = @_;
    ##  `debug' is listed because the constructor reads $args{debug}; without it
    ##  a caller asking for debug output would be rejected as a misspelling.
    my %is_legal = map { $_ => 1 } qw / output_file
                                        total_number_of_samples_needed
                                        number_of_clusters_on_sphere
                                        cluster_width
                                        show_hidden_in_3D_plots
                                        debug
                                      /;
    my $illegal_count = grep { !$is_legal{$_} } @params;
    return $illegal_count ? 0 : 1;
}

##  We first generate a set of points randomly on the unit sphere --- the number of
##  points being equal to the number of clusters desired.  These points will serve as
##  cluster means (or, as cluster centroids) subsequently when we ask
##  Math::Random::random_multivariate_normal($N, @m, @covar) to return $N number of
##  points on the sphere.  The second argument is the cluster mean and the third
##  argument the cluster covariance.  For the synthetic data, we set the cluster
##  covariance to a 2x2 diagonal matrix, with the (0,0) element corresponding to the
##  variance along the azimuth direction and the (1,1) element corresponding to the
##  variance along the elevation direction.
##
##  When you generate the points in the 2D spherical coordinates of
##  (azimuth,elevation), you also need `wrap-around' logic for those points yielded by
##  the multivariate-normal function whose azimuth angle is outside the interval
##  (0,360) and/or whose elevation angle is outside the interval (-90,90).
##
##  Note that the first of the two dimensions for which the multivariate-normal
##  function returns the points is for the azimuth angle and the second for the
##  elevation angle.
##
##  With regard to the relationship of the Cartesian coordinates to the spherical
##  (azimuth, elevation) coordinates, we assume that (x,y) is the horizontal plane
##  and z the vertical axis.  The elevation angle theta is measured with respect to
##  the XY-plane.  The highest point on the sphere (the Zenith) corresponds to the
##  elevation angle of +90 and the lowest point on the sphere (the Nadir)
##  corresponds to the elevation angle of -90.  The azimuth is measured with respect
##  to the X-axis.  The range of the azimuth is from 0 to 360 degrees.  The elevation
##  is measured from the XY plane and its range is (-90,90) degrees.



( run in 0.847 second using v1.01-cache-2.11-cpan-ceb78f64989 )