Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
sub write_clusters_to_files {
my $self = shift;
my @clusters = @{$self->{_clusters}};
unlink glob "cluster*.dat";
foreach my $i (0..@clusters-1) {
my $filename = "cluster" . $i . ".txt";
print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
foreach my $ele (@{$clusters[$i]}) {
print FILEHANDLE "$ele ";
}
close FILEHANDLE;
}
}
sub get_K_best {
my $self = shift;
croak "You need to run the clusterer with K=0 option " .
"before you can call this method" if $self->{_K_best} eq 'unknown';
print "\nThe best value of K: $self->{_K_best}\n" if $self->{_terminal_output};
return $self->{_K_best};
}
sub show_QoC_values {
my $self = shift;
croak "\nYou need to run the clusterer with K=0 option before you can call this method"
if $self->{_K_best} eq 'unknown';
print "\nShown below are K on the left and the QoC values on the right (the smaller " .
"the QoC, the better it is)\n";
foreach my $key (sort keys %{$self->{_QoC_values}} ) {
print " $key => $self->{_QoC_values}->{$key}\n" if defined $self->{_QoC_values}->{$key};
}
}
sub DESTROY {
unlink "__temp_" . basename($_[0]->{_datafile});
unlink "__temp_data_" . basename($_[0]->{_datafile});
unlink "__temp_normed_data_" . basename($_[0]->{_datafile});
}
################################## Visualization Code ###################################
# It makes sense to call visualize_clusters() only AFTER you have called kmeans().
#
# The visualize_clusters() implementation automatically figures out whether it
# should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
# supplied as one of the arguments is greater than 2, it does a 3D plot for the
# first three data coordinates. That is, the clusters will be displayed in the 3D
# space formed by the first three data coordinates. On the other hand, if the number
# of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
# only one on bit is specified for the mask, visualize_clusters() aborts.
#
# The visualization code consists of first accessing each of clusters created by the
# kmeans() subroutine. Note that the clusters contain only the symbolic names for
# the individual records in the source data file. We therefore next reach into the
# $self->{_original_data} hash and get the data coordinates associated with each
# symbolic label in a cluster. The numerical data thus generated is then written
# out to a temp file. When doing so we must remember to insert TWO BLANK LINES
# between the data blocks corresponding to the different clusters. This constraint
# is imposed on us by Gnuplot when plotting data from the same file since we want to
# use different point styles for the data points in different cluster files.
#
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
# module to plot the data clusters.
sub visualize_clusters {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || croak "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || croak "visualization mask missing";
$pause_time = shift;
} else {
croak "visualize_clusters() called with wrong args";
}
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
croak "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
my @fields = @$data;
croak "\nABORTED: Visualization mask size exceeds data record size\n"
if $#v_mask > $#fields;
my @data_fields;
foreach my $i (0..@fields-1) {
if ($v_mask[$i] eq '0') {
next;
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
croak "Misformed visualization mask. It can only have 1s and 0s\n";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my @all_data_ids = @{$self->{_data_id_tags}};
my $K = scalar @{$self->{_clusters}};
my $filename = basename($master_datafile);
my $temp_file = "__temp_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!\n";
foreach my $cluster (@{$self->{_clusters}}) {
foreach my $item (@$cluster) {
print OUTPUT "@{$visualization_data{$item}}";
print OUTPUT "\n";
}
print OUTPUT "\n\n";
}
close OUTPUT;
my $plot;
my $hardcopy_plot;
( run in 1.685 second using v1.01-cache-2.11-cpan-140bd7fdf52 )