Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
package Algorithm::KMeans;
#------------------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::KMeans is a Perl module for clustering multidimensional data.
# -----------------------------------------------------------------------------------
#use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
# Version string of this module.
our $VERSION = '2.05';
# from Perl docs:
# Pattern source, kept as a plain string, for a whole-string decimal number:
# an optional sign that may be followed by spaces, then either digits with an
# optional fractional part or a bare fraction such as ".5", then an optional
# exponent introduced by "e" or "E".
my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
# Constructor:
# Build a new clustering object from keyword arguments.
#
# Required keywords: datafile, mask, cluster_seeding.
# Optional keywords: K, Kmin, Kmax, do_variance_normalization,
# use_mahalanobis_metric, write_clusters_to_files, terminal_output, debug.
#
# Croaks when check_for_illegal_params() reports an unrecognised keyword or
# when one of the required keywords is missing or false.
# Returns a hash reference blessed into $class.
sub new {
    my ($class, %args) = @_;

    # Vet the keyword names before looking at any of their values.
    my $keywords_ok = check_for_illegal_params(keys %args);
    if ($keywords_ok == 0) {
        croak "\nYou have used a wrong name for a keyword argument " .
              "--- perhaps a misspelling\n";
    }

    # Required settings, checked in the same order as they appear in the object.
    my $datafile = $args{datafile} || croak("datafile required");
    my $mask     = $args{mask}     || croak("mask required");
    my $seeding  = $args{cluster_seeding}
                   || croak("must choose smart or random ".
                            "for cluster seeding");

    # Values taken from the caller, with a fallback for every optional one.
    my %settings = (
        _datafile               => $datafile,
        _mask                   => $mask,
        _K                      => $args{K}                         || 0,
        _K_min                  => $args{Kmin}                      || 'unknown',
        _K_max                  => $args{Kmax}                      || 'unknown',
        _cluster_seeding        => $seeding,
        _var_normalize          => $args{do_variance_normalization} || 0,
        _use_mahalanobis_metric => $args{use_mahalanobis_metric}    || 0,
        _clusters_2_files       => $args{write_clusters_to_files}   || 0,
        _terminal_output        => $args{terminal_output}           || 0,
        _debug                  => $args{debug}                     || 0,
    );

    # Slots that start out empty; each one gets its own fresh container.
    my %initial_state = (
        _N                        => 0,
        _K_best                   => 'unknown',
        _original_data            => {},
        _data                     => {},
        _data_id_tags             => [],
        _QoC_values               => {},
        _clusters                 => [],
        _cluster_centers          => [],
        _clusters_hash            => {},
        _cluster_centers_hash     => {},
        _cluster_covariances_hash => {},
        _data_dimensions          => 0,
    );

    my $self = { %settings, %initial_state };
    return bless $self, $class;
}
# Load the data file named at construction time, choosing the reader from the
# file-name extension.
#
# Reads $self->{_datafile}. Names ending in ".csv" are handed to
# read_data_from_file_csv(); names ending in ".dat" are handed to
# read_data_from_file_dat(). Returns whatever the selected reader returns.
#
# Croaks when the name has neither extension, so that an unsupported or
# mistyped file name is reported here instead of leaving the object without
# any data.
sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    # The dot is escaped so that only a real extension matches; an unescaped
    # "." stands for any character and would also accept a name like "datacsv".
    return $self->read_data_from_file_csv() if $filename =~ /\.csv$/;
    return $self->read_data_from_file_dat() if $filename =~ /\.dat$/;
    croak "Unsupported data file '$filename': the file name must end in .csv or .dat";
}
sub read_data_from_file_csv {
my $self = shift;
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
open FILEIN, $filename or die "Unable to open $filename: $!";
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
local $/ = undef;
my @all_data = split /\s+/, <FILEIN>;
my %data_hash = ();
my @data_tags = ();
foreach my $record (@all_data) {
my @splits = split /,/, $record;
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
"the size of at least one of the data records in the file: $!"
unless scalar(@mask) == scalar(@splits);
my $record_name = shift @splits;
$data_hash{$record_name} = \@splits;
push @data_tags, $record_name;
}
$self->{_original_data} = \%data_hash;
$self->{_data_id_tags} = \@data_tags;
( run in 2.004 seconds using v1.01-cache-2.11-cpan-bbb979687b5 )