Algorithm-KMeans

 view release on metacpan or  search on metacpan

lib/Algorithm/KMeans.pm  view on Meta::CPAN

package Algorithm::KMeans;

#------------------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::KMeans is a Perl module for clustering multidimensional data.
# -----------------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;


our $VERSION = '2.05';

# from Perl docs:
my $_num_regex =  '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$'; 

# Constructor:
# Constructor: validates the keyword arguments and returns a blessed hash that
# carries both the user-supplied options (datafile, mask, cluster seeding
# choice, etc.) and the internal state slots that clustering will fill in.
sub new {
    my ($class, %args) = @_;
    # Reject any keyword that is not in the documented set.
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params(keys %args) == 0;
    # Required options croak when absent; optional ones fall back to defaults.
    my %self = (
        _datafile                 => $args{datafile} || croak("datafile required"),
        _mask                     => $args{mask}     || croak("mask required"),
        _K                        => $args{K}        || 0,
        _K_min                    => $args{Kmin}     || 'unknown',
        _K_max                    => $args{Kmax}     || 'unknown',
        _cluster_seeding          => $args{cluster_seeding} || croak("must choose smart or random ".
                                                                     "for cluster seeding"),
        _var_normalize            => $args{do_variance_normalization} || 0,
        _use_mahalanobis_metric   => $args{use_mahalanobis_metric}    || 0,
        _clusters_2_files         => $args{write_clusters_to_files}   || 0,
        _terminal_output          => $args{terminal_output}           || 0,
        _debug                    => $args{debug}                     || 0,
        # Internal state, populated while reading data and clustering:
        _N                        => 0,
        _K_best                   => 'unknown',
        _original_data            => {},
        _data                     => {},
        _data_id_tags             => [],
        _QoC_values               => {},
        _clusters                 => [],
        _cluster_centers          => [],
        _clusters_hash            => {},
        _cluster_centers_hash     => {},
        _cluster_covariances_hash => {},
        _data_dimensions          => 0,
    );
    return bless \%self, $class;
}

# Dispatches to the CSV or DAT reader based on the datafile's extension.
#
# Fixes over the previous version: the dot in the extension patterns was
# unescaped, so `/.csv$/' matched any character before "csv" (e.g. a file
# named "mycsv" with no extension); the dot is now escaped.  An unrecognized
# extension previously fell through silently, leaving the object with no data
# and causing obscure failures later; it now dies with a clear message.
sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    if ($filename =~ /\.csv$/) {
        $self->read_data_from_file_csv();
    } elsif ($filename =~ /\.dat$/) {
        $self->read_data_from_file_dat();
    } else {
        die "Aborted. The datafile must carry a `.csv' or `.dat' extension";
    }
}

# Reads the CSV datafile named in the object, filling _original_data, _data,
# _data_id_tags and _N.  Each whitespace-delimited token of the file is one
# record; the first comma-separated field is the record's symbolic name and
# the remaining fields are its coordinates.
# NOTE(review): this sub is truncated in this view --- the statement at the
# bottom continues beyond the excerpt.
sub read_data_from_file_csv {
    my $self = shift;
    # NOTE(review): $numregex is declared but never used in the visible portion.
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    # Data dimensionality = number of '1' entries in the mask.
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    # NOTE(review): two-argument open with a bareword filehandle; a lexical
    # three-argument open would be safer.  The `.csv' extension check below
    # would also be better placed before the open.
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;    # slurp mode: read the whole file in one go
    # Each whitespace-delimited token is one record, so CSV rows must not
    # contain embedded spaces.
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        # Naive comma split: quoted fields containing commas are NOT supported.
        my @splits = split /,/, $record;
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file: $!"
            unless scalar(@mask) == scalar(@splits);
        # First field is the record's symbolic name; the rest are coordinates.
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_original_data} = \%data_hash;
    $self->{_data_id_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
    # Work either on variance-normalized data or on a deep copy of the
    # original, so the original records are never mutated.
    if ($self->{_var_normalize}) {
        $self->{_data} =  variance_normalization( $self->{_original_data} ); 
    } else {
        $self->{_data} = deep_copy_hash( $self->{_original_data} );
    }
    # Need to make the following call to set the global mean and covariance:
    # my $covariance =  $self->estimate_mean_and_covariance(\@data_tags);
    # Need to make the following call to set the global eigenvec eigenval sets:
    # $self->eigen_analysis_of_covariance($covariance);
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .

lib/Algorithm/KMeans.pm  view on Meta::CPAN

# assessment of the K random integers constructed.  This quality measure consists of
# the ratio of the values spanned by the random integers to the value of N, the total
# number of data points to be clustered.  Currently, if this ratio is less than 0.3,
# we discard the K integers and try again.
# Returns K distinct random indices into the data array; these serve as the
# seeds for 'random' cluster seeding.  As a crude quality check, the span of
# the chosen indices must exceed 30% of N, otherwise the draw is repeated.
#
# Fixes over the previous version:
#   * Duplicate elimination now guarantees distinct indices --- the old
#     rescan could reintroduce a duplicate against an earlier entry --- and
#     the unused $old variable is gone.
#   * K <= 1 no longer loops forever (a single index has zero span, so it
#     could never pass the 30% spread test).
# Note: K must not exceed N, otherwise K distinct indices cannot exist.
sub initialize_cluster_centers {
    my $self = shift;
    my $K = shift;
    my $data_store_size = $self->{_N};
    my @cluster_center_indices;
    while (1) {
        my %already_chosen;
        foreach my $i (0..$K-1) {
            my $candidate = int rand( $data_store_size );
            # Redraw until this index differs from every earlier choice.
            $candidate = int rand( $data_store_size ) while $already_chosen{$candidate};
            $already_chosen{$candidate} = 1;
            $cluster_center_indices[$i] = $candidate;
        }
        last if $K <= 1;
        # Spread check: the chosen indices should span a good part of the data.
        my ($min, $max) = ($cluster_center_indices[0]) x 2;
        foreach my $idx (@cluster_center_indices) {
            $min = $idx if $idx < $min;
            $max = $idx if $idx > $max;
        }
        my $quality = ($max - $min) / $data_store_size;
        last if $quality > 0.3;
    }
    return @cluster_center_indices;
}

# This function is used when you set the "cluster_seeding" option to 'random' in the
# constructor.  This routine merely reaches into the data array with the random
# integers, as constructed by the previous routine, serving as indices and fetching
# values corresponding to those indices.  The fetched data samples serve as the
# initial cluster centers.
# Invoked for 'random' cluster seeding: fetches the data samples located at
# the K random indices produced by initialize_cluster_centers() and returns a
# ref to the list of those samples; they become the initial cluster centers.
sub get_initial_cluster_centers {
    my ($self, $K) = @_;
    my @seed_indices = $self->initialize_cluster_centers($K);
    my @initial_centers =
        map { $self->{_data}->{ $self->{_data_id_tags}[$_] } } @seed_indices;
    return \@initial_centers;
}

# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor.  It subjects the data to a principal components analysis
# to figure out the direction of maximal variance.  Subsequently, it tries to locate
# K peaks in a smoothed histogram of the data points projected onto the maximal
# variance direction.
sub get_initial_cluster_centers_smart {
    my $self = shift;
    my $K = shift;
    # 1-D special case: no PCA needed --- histogram the lone coordinate
    # directly and seed the centers at the K highest peaks.
    if ($self->{_data_dimensions} == 1) {
        my @one_d_data;
        foreach my $j (0..$self->{_N}-1) {
            my $tag = $self->{_data_id_tags}[$j];     
            push @one_d_data, $self->{_data}->{$tag}->[0];
        }
        my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
        print "highest points at data values: @peak_points\n" if $self->{_debug};
        my @cluster_centers;
        foreach my $peakpoint (@peak_points) {
            push @cluster_centers, [$peakpoint];
        }
        return \@cluster_centers;
    }
    # Multidimensional case: lay the data out as the columns of a d x N matrix.
    my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_id_tags}.  The
    # actual data for clustering is stored in a hash at $self->{_data} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        $matrix->set_col($j, $data);
    }
    if ($self->{_debug}) {
        print "\nDisplaying the original data as a matrix:";
        display_matrix( $matrix ); 
    }
    # Global mean vector = average of the columns.
    foreach my $j (0..$num_cols-1) {
        $mean_vec += $matrix->col($j);
    }
    $mean_vec *=  1.0 / $num_cols;
    if ($self->{_debug}) {
        print "Displaying the mean vector for the data:";
        display_matrix( $mean_vec );
    }
    # Center the data by subtracting the mean from every column.
    foreach my $j (0..$num_cols-1) {
        my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
        $matrix->set_col($j, \@new_col);
    }
    if ($self->{_debug}) {
        print "Displaying mean subtracted data as a matrix:";
        display_matrix( $matrix ); 
    }
    my $transposed = transpose( $matrix );
    if ($self->{_debug}) {
        print "Displaying transposed data matrix:";
        display_matrix( $transposed );
    }
    # Covariance of the centered data: (1/N) * M * M'.
    my $covariance = matrix_multiply( $matrix, $transposed );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for your data:";
        display_matrix( $covariance );
    }
    # Eigen-decompose the covariance; the eigenvector of the largest
    # eigenvalue is the direction of maximal variance.
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        print "Eigenvector $i:   @vec\n" if $self->{_debug};
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @max_var_direction;
    # Each element of the array @largest_eigen_vec is a Math::Complex object
    # stringified as "[magnitude,phase]"; a real component has phase '0'
    # (positive) or 'pi' (negative).
    # NOTE(review): the magnitude pattern \d*\.\d+ requires a decimal point,
    # so a component that stringifies without one (e.g. "1") would fail to
    # match --- verify against Math::Complex's stringification.
    foreach my $k (0..@largest_eigen_vec-1) {
        my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
        if ($theta eq '0') {
            $max_var_direction[$k] = $mag;
        } elsif ($theta eq 'pi') {
            $max_var_direction[$k] = -1.0 * $mag;
        } else {
            die "eigendecomposition of covariance matrix produced a complex " .
                "eigenvector --- something is wrong";
        }
    }
    print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
    # We now project all data points on the largest eigenvector.
    # Each projection will yield a single point on the eigenvector.
    my @projections;
    foreach my $j (0..$self->{_N}-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        die "Dimensionality of the largest eigenvector does not "
            . "match the dimensionality of the data" 
          unless @max_var_direction == $self->{_data_dimensions};
        my $projection = vector_multiply($data, \@max_var_direction);
        push @projections, $projection;
    }
    print "All projection points: @projections\n" if $self->{_debug};
    # Seed the centers at the K tallest histogram peaks along the projection
    # axis, lifted back into the full data space.
    my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
    print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
    my @cluster_centers;
    foreach my $peakpoint (@peak_points) {
        my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
        push @cluster_centers, \@actual_peak_coords;
    }
    return \@cluster_centers;
}

# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor.  It is called by the previous method to locate K peaks
# in a smoothed histogram of the data points projected onto the maximal variance
# direction.
# Called by get_initial_cluster_centers_smart() to locate the $how_many
# tallest peaks in a smoothed histogram of the (1-D) projected data points,
# returned in the original (unnormalized) coordinate.
#
# Fixes over the previous version:
#   * The normalization loop stopped at the second-to-last element, so the
#     largest data value was never scaled into [0,1] and was effectively
#     dropped from the histogram.  All elements are now normalized.
#   * A running average of consecutive gaps was computed but never used;
#     that dead code (and the unused $num_points) has been removed.
#   * Zero spread (all points identical) previously caused a
#     division-by-zero; it is now handled explicitly.
sub find_peak_points_in_given_direction {
    my $dataref = shift;
    my $how_many = shift;
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my $scale = $max - $min;
    # Degenerate case: no spread at all, so the lone "peak" is that value.
    return ($min) if $scale == 0;
    my @sorted_data = sort {$a <=> $b} @data;
    # Normalize every point into [0,1].
    foreach my $index (0..$#sorted_data) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    # Build a 1000-bin histogram; each point votes for its own bin and for
    # the 40 bins on either side, which acts as a box-filter smoothing.
    my $delta = 1.0 / 1000.0;
    my $smoothness = 40;
    my @accumulator = (0) x 1000;
    foreach my $point (@sorted_data) {
        my $cell_index = int($point / $delta);
        for my $cell ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $cell < 0 || $cell > 999;
            $accumulator[$cell]++;
        }
    }
    # Suppress non-maxima, then report the $how_many tallest surviving peaks,
    # mapped back to the original coordinate.
    my $peaks_array = non_maximum_suppression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        push @K_highest_peak_locations,
             $min + $peaks_index_hash->{$peak} * $scale * $delta;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}

# The purpose of this routine is to form initial clusters by assigning the data
# samples to the initial clusters formed by the previous routine on the basis of the
# best proximity of the data samples to the different cluster centers.
sub assign_data_to_clusters_initial {
    my $self = shift;
    my @cluster_centers = @{ shift @_ };
    my @clusters;
    foreach my $ele (@{$self->{_data_id_tags}}) {
        my $best_cluster;

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
	$clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}

# After each reassignment pass under the Mahalanobis metric, recomputes every
# cluster's center (the coordinate-wise mean of its members) and its
# covariance matrix.  Returns a two-element array ref:
# [\@new_cluster_centers, \@new_cluster_covariances].
#
# Fix over the previous version: an unused Math::GSL::Matrix ($mean_vec) was
# allocated per cluster; it has been removed.
sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my @new_cluster_covariances;
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster
            if !defined($largest_cluster) || @$cluster > @$largest_cluster;
    }
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K.  Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        # New center = mean of the member coordinates.
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {$_ / $cluster_size} @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
        # Covariance: columns of $matrix are the mean-subtracted member
        # vectors, so (1/n) * M * M' is the population covariance.
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        # Each cluster member is a record tag; its coordinates live in the
        # hash at $self->{_data} keyed by that tag.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        my $covariance = matrix_multiply( $matrix, transpose($matrix) );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}

# After each new assignment of the data points to the clusters on the basis of the
# current values for the cluster centers, we call the routine shown here for updating
# the values of the cluster centers.
# After each reassignment of the data points to the clusters, recomputes each
# cluster center as the coordinate-wise mean of the cluster's members.
# Should any cluster have gone empty, it is re-seeded with a member stolen
# from the largest cluster.  Returns a ref to the list of new centers.
sub update_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my @clusters = @$clusters_ref;
    # Locate the most populous cluster (ignoring undefined entries).
    my $largest_cluster;
    for my $this_cluster (@clusters) {
        next unless defined $this_cluster;
        $largest_cluster = $this_cluster
            if !defined($largest_cluster) || @$this_cluster > @$largest_cluster;
    }
    # Re-seed any empty (or undefined) cluster with one stolen member.
    for my $this_cluster (@clusters) {
        push @$this_cluster, pop @$largest_cluster
            if !defined($this_cluster) || @$this_cluster == 0;
    }
    my @updated_centers;
    for my $this_cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K.  Try again. \n" if !defined $this_cluster;
        my $member_count = scalar @$this_cluster;
        die "Cluster size is zero --- untenable.\n" if $member_count == 0;
        my $coord_sums = $self->add_point_coords( $this_cluster );
        push @updated_centers, [ map { $_ / $member_count } @$coord_sums ];
    }
    return \@updated_centers;
}

# Returns the id of the cluster whose center is closest (by distance2) to the
# given data element (a ref to an array of coordinates).
# NOTE(review): the closing brace of this sub lies beyond this excerpt.
sub which_cluster_for_new_data_element {
    my $self = shift;
    my $ele = shift;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %distance_to_new_ele_hash;
    # Distance from the new element to every known cluster center.
    foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $distance_to_new_ele_hash{$cluster_id} = $self->distance2($ele, 
                                             $self->{_cluster_centers_hash}->{$cluster_id});
    }
    my @values = values %distance_to_new_ele_hash;
    my ($min,$max) = minmax(\@values);
    my $answer;
    # Pick the cluster achieving the minimum distance.  NOTE(review): on a
    # tie, whichever tied cluster is visited last in the (unordered) hash
    # iteration wins.
    foreach my $cluster_id (keys %distance_to_new_ele_hash) {
        $answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
    }
    return $answer;

lib/Algorithm/KMeans.pm  view on Meta::CPAN

    } else {
        die "covariance matrix may not have an inverse";
    }
    my $determinant = $covariance_inverse->det();
    my $distance = $transposed * $covariance_inverse * $vec;
    my @distance = $distance->as_list;
    $distance = $distance[0];
    return sqrt $distance;
}
# Same as the previous method except that the first argument can be the actual
# coordinates of the data element.  Our goal is to find the Mahalanobis distance
# from a given data element to a cluster whose center and covariance are known.  As
# for the previous method, it requires three arguments.
# Same as the previous method except that the first argument is the actual
# coordinates of the data element.  Computes the Mahalanobis distance
# sqrt( (x-m)' C^{-1} (x-m) ) from the element to a cluster with center
# $cluster_center and covariance $cluster_covariance.  Returns undef when no
# covariance is supplied; dies when the covariance is (near-)singular.
#
# Fixes over the previous version: two unused variables ($det_of_covar and
# $determinant) were removed, and det() is now computed only once instead of
# twice.
sub distance_mahalanobis2 {
    my $self = shift;
    my $ele = shift;              # ref to the array of coords for a point
    my $cluster_center = shift;
    my $cluster_covariance = shift;
    return undef unless defined $cluster_covariance;
    # Crude singularity test: refuse to invert a near-singular covariance.
    die "covariance matrix may not have an inverse"
        unless $cluster_covariance->det() > 0.001;
    my $covariance_inverse = $cluster_covariance->inverse;
    my @diff_from_mean = vector_subtract($ele, $cluster_center);
    my $vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
    $vec->set_col(0, \@diff_from_mean);
    my $transposed = transpose( $vec );
    # (x-m)' C^{-1} (x-m) is a 1x1 matrix; extract its single entry.
    my ($distance_squared) = ($transposed * $covariance_inverse * $vec)->as_list;
    return sqrt $distance_squared;
}

# Estimates the (population) covariance matrix of one cluster: the columns of
# a d x n matrix are the mean-subtracted member vectors, and the covariance
# is (1/n) * M * M'.  Returns the Math::GSL::Matrix covariance.
#
# Fix over the previous version: an unused Math::GSL::Matrix ($mean_vec) was
# allocated; it has been removed.
sub estimate_cluster_covariance {
    my $self = shift;
    my $cluster = shift;
    my $cluster_size = @$cluster;
    # Cluster mean = coordinate-wise average of the members.
    my @cluster_center = @{$self->add_point_coords( $cluster )};
    @cluster_center = map {$_ / $cluster_size} @cluster_center;
    my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    # Each cluster member is a record tag; its coordinates live in the hash
    # at $self->{_data} keyed by that tag.
    foreach my $j (0..$num_cols-1) {
        my $tag = $cluster->[$j];
        my $data = $self->{_data}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);
    }
    my $covariance = $matrix * transpose( $matrix );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}

# Writes each cluster's member tags to its own file --- cluster0.txt,
# cluster1.txt, ... --- sorted and space-separated on a single line.
#
# Fixes over the previous version:
#   * Leftover files were removed with the pattern "cluster*.dat" even though
#     the files are written as "cluster<i>.txt", so stale .txt files were
#     never cleaned up; both patterns are removed now.
#   * The tags were piped through the external `sort' command via a shell;
#     the sorting is now done in Perl (portable, no shell involved) and a
#     checked lexical filehandle replaces the bareword one.
sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    unlink glob "cluster*.dat";
    unlink glob "cluster*.txt";
    foreach my $i (0..$#clusters) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        open my $fh, '>', $filename or die "Unable to open file: $!";
        foreach my $ele (sort @{$clusters[$i]}) {
            print {$fh} "$ele ";
        }
        close $fh or die "Unable to close file: $!";
    }
}

# Accessor for the best K found by an iterative (K=0) clustering run.  Croaks
# when no such run has stored a value yet.
sub get_K_best {
    my $self = shift;
    my $best_K = $self->{_K_best};
    croak "You need to run the clusterer with K=0 option " .
          "before you can call this method" if $best_K eq 'unknown';
    print "\nThe best value of K: $best_K\n" if $self->{_terminal_output};
    return $best_K;
}

# Prints the K => QoC table recorded by an iterative (K=0) clustering run.
# Croaks when no such run has been made yet.
#
# Fix over the previous version: the K keys are integers but were sorted
# lexically, which would list "10" before "2"; the sort is now numeric.
sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method" 
                            if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    foreach my $K (sort {$a <=> $b} keys %{$self->{_QoC_values}} ) {
        print " $K  =>  $self->{_QoC_values}->{$K}\n" if defined $self->{_QoC_values}->{$K};
    }
}

# Destructor: removes the temporary scratch files that the clusterer may have
# created for this datafile.  unlink() is a harmless no-op for files that
# were never created.
sub DESTROY {
    my $self = shift;
    my $base = basename($self->{_datafile});
    unlink map { $_ . $base } ("__temp_", "__temp_data_", "__temp_normed_data_");
}

##################################  Visualization Code ###################################

#  It makes sense to call visualize_clusters() only AFTER you have called kmeans().
#
#  The visualize_clusters() implementation automatically figures out whether it
#  should do a 2D plot or a 3D plot.  If the number of on bits in the mask that is
#  supplied as one of the arguments is greater than 2, it does a 3D plot for the
#  first three data coordinates.  That is, the clusters will be displayed in the 3D
#  space formed by the first three data coordinates. On the other hand, if the number
#  of on bits in the mask is exactly 2, it does a 2D plot.  Should it happen that
#  only one on bit is specified for the mask, visualize_clusters() aborts.
#
#  The visualization code consists of first accessing each of clusters created by the
#  kmeans() subroutine.  Note that the clusters contain only the symbolic names for

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        my $j = int rand( $i + 1 );  
        @$arr[$i, $j] = @$arr[$j, $i]; 
    }
}

# Scales each coordinate of every data point by the standard deviation of
# that coordinate across the whole dataset, so all coordinates have
# comparable spreads.  Takes a ref to the {label => [coords]} hash and
# returns a NEW hash ref of the same shape (the input is not modified).
#
# Fixes over the previous version: two unused variables ($dimensions and a
# redundantly recomputed $new) removed, and a zero-variance coordinate (all
# data identical in that dimension) no longer triggers a division-by-zero ---
# such a coordinate is left unscaled.
sub variance_normalization {
    my %data_hash = %{shift @_};
    my @all_data_points = values %data_hash;
    # Regroup by coordinate: $data_projections[$i] collects the i-th
    # coordinate of every data point.
    my @data_projections;
    foreach my $data_point (@all_data_points) {
        my $i = 0;
        foreach my $proj (@$data_point) {
            push @{$data_projections[$i++]}, $proj;
        }
    }
    # Per-coordinate standard deviations.
    my @std_dev_vec;
    foreach my $projection (@data_projections) {
        my (undef, $variance) = mean_and_variance( $projection );
        push @std_dev_vec, sqrt $variance;
    }
    my %new_data_hash;
    while (my ($label, $data) = each %data_hash) {
        my @new_data = map {
            $std_dev_vec[$_] > 0  ?  $data->[$_] / $std_dev_vec[$_]  :  $data->[$_]
        } 0..$#{$data};
        $new_data_hash{$label} = \@new_data;
    }
    return \%new_data_hash;
}

# Single-pass running computation of the mean and the POPULATION variance of
# a list of numbers (ref to array).  Each step folds one more sample into the
# running mean, then updates the variance against the freshly updated mean:
#     mean_n = ((n-1)*mean_{n-1} + x_n) / n
#     var_n  = ((n-1)/n)*var_{n-1} + (x_n - mean_n)^2 / (n-1)
# Returns ($mean, $variance); both are undef for an empty list.
sub mean_and_variance {
    my @samples = @{ shift @_ };
    my ($running_mean, $running_var);
    my $count = 0;
    foreach my $x (@samples) {
        $count++;
        if ($count == 1) {
            ($running_mean, $running_var) = ($x, 0);
        } else {
            $running_mean = (($count - 1) / $count) * $running_mean  +  $x / $count;
            $running_var  = (($count - 1) / $count) * $running_var
                            + ($x - $running_mean)**2 / ($count - 1);
        }
    }
    return ($running_mean, $running_var);
}

# Verifies that every keyword argument passed to the constructor is one of
# the legal parameter names.  Returns 1 when all names are legal (including
# when no names are given at all), and 0 as soon as an unknown name is seen.
#
# Fixes over the previous version: the flag variable was left uninitialized
# when called with an empty parameter list, so the sub returned undef and the
# caller's `== 0' comparison treated that as an illegal-parameter condition
# (with an uninitialized-value warning).  The nested linear scan has also
# been replaced by a hash lookup.
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            Kmin
                            Kmax
                            terminal_output
                            write_clusters_to_files
                            do_variance_normalization
                            cluster_seeding
                            use_mahalanobis_metric
                            debug
                          /;
    my %is_legal = map { $_ => 1 } @legal_params;
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    return 1;
}

# Maps each strictly-positive value in the array to its index.  When the same
# value occurs more than once, the LAST index wins (later pairs overwrite
# earlier ones).  Returns a hash ref {value => index}.
sub get_value_index_hash {
    my $arr = shift;
    my %value_to_index =
        map { $arr->[$_] > 0 ? ($arr->[$_] => $_) : () } 0..$#{$arr};
    return \%value_to_index;
}

# 1-D non-maximum suppression for the smoothed histogram built in
# find_peak_points_in_given_direction().  The input array is decomposed into
# "runs" of equal consecutive values; a run strictly taller than both of its
# neighboring runs is a peak, and the output array keeps the input value at
# the center cell of each peak run, with zeros everywhere else.
# NOTE(review): this sub is truncated in this view --- its closing brace lies
# beyond the excerpt.  @final_output and %hash are never used in the visible
# portion.  If the input consisted of a single run, the boundary checks below
# would access nonexistent runs ($array_of_runs[-2] / $array_of_runs[1]) ---
# presumably callers always supply at least two runs; verify.
sub non_maximum_suppression {
    my $arr = shift;
    my @output = (0) x @$arr;
    my @final_output = (0) x @$arr;
    my %hash;
    # Decompose the array into runs of identical consecutive values.
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {  
            push @array_of_runs, [$arr->[$index]];
        }
    }
    # Interior runs: a peak must strictly exceed both neighboring runs.
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] > 
            $array_of_runs[$run_index-1]->[0]  &&
            $array_of_runs[$run_index]->[0] > 
            $array_of_runs[$run_index+1]->[0]) {
            # Keep the input value at the center cell of the peak run.
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    # Boundary runs need only beat their single neighbor.
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;



( run in 1.155 second using v1.01-cache-2.11-cpan-0bd6704ced7 )