Algorithm-KMeans
view release on metacpan or search on metacpan
examples/which_cluster_for_new_data.pl view on Meta::CPAN
#my $mask = "N11"; # for mydatafile3.dat
# Construct the clusterer.  The named arguments below are constructor options of
# Algorithm::KMeans; K is the number of clusters requested (3 here).
# NOTE(review): $datafile and $mask are assigned earlier in this script, outside
# the lines shown here -- confirm their values match the data file being used.
my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
mask => $mask,
K => 3,
cluster_seeding => 'random', # also try 'smart'
use_mahalanobis_metric => 1, # also try '0'
terminal_output => 1,
write_clusters_to_files => 1,
debug => 0,
);
# Presumably loads the records named by 'datafile' into the clusterer before
# clustering -- confirm against read_data_from_file() in KMeans.pm.
$clusterer->read_data_from_file();
# kmeans() is called in list context and its two return values are used as
# references: $clusters_hash is dereferenced as a hash keyed by cluster id just
# below; $cluster_centers_hash presumably maps the same ids to cluster centers.
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();
# ACCESSING THE CLUSTERS AND CLUSTER CENTERS IN YOUR SCRIPT:
print "\nDisplaying clusters in the terminal window:\n";
foreach my $cluster_id (sort keys %{$clusters_hash}) {
lib/Algorithm/KMeans.pm view on Meta::CPAN
_mask => $args{mask} || croak("mask required"),
_K => $args{K} || 0,
_K_min => $args{Kmin} || 'unknown',
_K_max => $args{Kmax} || 'unknown',
_cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
"for cluster seeding"),
_var_normalize => $args{do_variance_normalization} || 0,
_use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
_clusters_2_files => $args{write_clusters_to_files} || 0,
_terminal_output => $args{terminal_output} || 0,
_debug => $args{debug} || 0,
_N => 0,
_K_best => 'unknown',
_original_data => {},
_data => {},
_data_id_tags => [],
_QoC_values => {},
_clusters => [],
_cluster_centers => [],
_clusters_hash => {},
_cluster_centers_hash => {},
lib/Algorithm/KMeans.pm view on Meta::CPAN
# Computes the initial cluster centers for the 'smart' cluster seeding mode.
#
# For one-dimensional data the peak finder is applied directly to the data
# values.  For multidimensional data the records are arranged as the columns of
# a matrix, mean-centered, and used to form the covariance matrix.  The
# eigenvector with the largest eigenvalue gives the direction of maximum
# variance; the (non-centered) records in $self->{_data} are projected on that
# direction, peaks are located among the projections, and every peak is mapped
# back into the data space by scaling the direction vector.
#
# Parameters:
#   $K  -- number of peaks requested; passed unchanged to
#          find_peak_points_in_given_direction().
#
# Returns:
#   A reference to an array of array references, one coordinate list for each
#   peak point returned by find_peak_points_in_given_direction().
#
# Dies:
#   - when there are no data records (multidimensional case),
#   - when a component of the largest eigenvector cannot be parsed,
#   - when a component of the largest eigenvector is not real,
#   - when the eigenvector length differs from the data dimensionality.
sub get_initial_cluster_centers_smart {
    my $self = shift;
    my $K = shift;
    if ($self->{_data_dimensions} == 1) {
        my @one_d_data;
        foreach my $j (0..$self->{_N}-1) {
            my $tag = $self->{_data_id_tags}[$j];
            push @one_d_data, $self->{_data}->{$tag}->[0];
        }
        my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
        print "highest points at data values: @peak_points\n" if $self->{_debug};
        my @cluster_centers;
        foreach my $peakpoint (@peak_points) {
            push @cluster_centers, [$peakpoint];
        }
        return \@cluster_centers;
    }
    my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
    # Without this guard an empty data set only surfaces later as a matrix
    # allocation failure or a division by zero when averaging over $num_cols.
    die "no data records available for computing the initial cluster centers"
        unless $num_cols > 0;
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_id_tags}.  The
    # actual data for clustering is stored in a hash at $self->{_data} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $self->{_data_id_tags}[$j];
        my $data = $self->{_data}->{$tag};
        $matrix->set_col($j, $data);
    }
    if ($self->{_debug}) {
        print "\nDisplaying the original data as a matrix:";
        display_matrix( $matrix );
    }
    foreach my $j (0..$num_cols-1) {
        $mean_vec += $matrix->col($j);
    }
    $mean_vec *=  1.0 / $num_cols;
    if ($self->{_debug}) {
        print "Displaying the mean vector for the data:";
        display_matrix( $mean_vec );
    }
    # Mean-center every record (column) before forming the covariance matrix.
    foreach my $j (0..$num_cols-1) {
        my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
        $matrix->set_col($j, \@new_col);
    }
    if ($self->{_debug}) {
        print "Displaying mean subtracted data as a matrix:";
        display_matrix( $matrix );
    }
    my $transposed = transpose( $matrix );
    if ($self->{_debug}) {
        print "Displaying transposed data matrix:";
        display_matrix( $transposed );
    }
    my $covariance = matrix_multiply( $matrix, $transposed );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for your data:";
        display_matrix( $covariance );
    }
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        print "Eigenvector $i:   @vec\n" if $self->{_debug};
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @max_var_direction;
    # Each element of the array @largest_eigen_vec is a Math::Complex object that
    # is parsed here from its "[magnitude,angle]" string form.  The magnitude is
    # not always printed with a fractional part: values such as "1", "0" or
    # "1e-05" are legal, so the pattern accepts integers, decimals and exponents.
    my $magnitude_re = qr/(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?/;
    foreach my $k (0..@largest_eigen_vec-1) {
        my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[($magnitude_re),(\S+)\]/;
        # Report an unparsable component as such instead of letting undefined
        # values fall through to the "complex eigenvector" branch below.
        die "unable to parse component $k of the largest eigenvector: " .
            "'$largest_eigen_vec[$k]'"
            unless defined $mag && defined $theta;
        if ($theta eq '0') {
            # angle 0: positive real component
            $max_var_direction[$k] = $mag;
        } elsif ($theta eq 'pi') {
            # angle pi: negative real component
            $max_var_direction[$k] = -1.0 * $mag;
        } else {
            die "eigendecomposition of covariance matrix produced a complex " .
                "eigenvector --- something is wrong";
        }
    }
    print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
    # The dimensionality check does not depend on the record being projected,
    # so it is carried out once before the projection loop.
    die "Dimensionality of the largest eigenvector does not "
        . "match the dimensionality of the data"
        unless @max_var_direction == $self->{_data_dimensions};
    # We now project all data points on the largest eigenvector.
    # Each projection will yield a single point on the eigenvector.
    my @projections;
    foreach my $j (0..$self->{_N}-1) {
        my $tag = $self->{_data_id_tags}[$j];
        my $data = $self->{_data}->{$tag};
        my $projection = vector_multiply($data, \@max_var_direction);
        push @projections, $projection;
    }
    print "All projection points: @projections\n" if $self->{_debug};
    my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
    print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
    my @cluster_centers;
    # Map every 1-D peak position back to coordinates in the data space by
    # scaling the maximum-variance direction vector.
    foreach my $peakpoint (@peak_points) {
        my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
        push @cluster_centers, \@actual_peak_coords;
    }
    return \@cluster_centers;
}
# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor. It is called by the previous method to locate K peaks
lib/Algorithm/KMeans.pm view on Meta::CPAN
# corresponding numerical multidimensional data.
foreach my $j (0..$num_cols-1) {
my $tag = $cluster->[$j];
my $data = $self->{_data}->{$tag};
my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
$matrix->set_col($j, \@diff_from_mean);
}
my $transposed = transpose( $matrix );
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for cluster:";
display_matrix( $covariance );
}
push @new_cluster_covariances, $covariance;
}
return [\@new_cluster_centers, \@new_cluster_covariances];
}
# After each new assignment of the data points to the clusters on the basis of the
# current values for the cluster centers, we call the routine shown here for updating
lib/Algorithm/KMeans.pm view on Meta::CPAN
# corresponding numerical multidimensional data.
foreach my $j (0..$num_cols-1) {
my $tag = $cluster->[$j];
my $data = $self->{_data}->{$tag};
my @diff_from_mean = vector_subtract($data, \@cluster_center);
$matrix->set_col($j, \@diff_from_mean);
}
my $transposed = transpose( $matrix );
my $covariance = $matrix * $transposed;
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "\nDisplaying the Covariance Matrix for cluster:";
display_matrix( $covariance );
}
return $covariance;
}
sub write_clusters_to_files {
my $self = shift;
my @clusters = @{$self->{_clusters}};
unlink glob "cluster*.dat";
lib/Algorithm/KMeans.pm view on Meta::CPAN
my @legal_params = qw / datafile
mask
K
Kmin
Kmax
terminal_output
write_clusters_to_files
do_variance_normalization
cluster_seeding
use_mahalanobis_metric
debug
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
( run in 1.043 second using v1.01-cache-2.11-cpan-49f99fa48dc )