view release on metacpan or search on metacpan
examples/cluster_and_visualize.pl view on Meta::CPAN
# VISUALIZATION:
# Visualization mask:
# In most cases, you would not change the value of the mask between clustering and
# visualization. But, if you are clustering multi-dimensional data and you wish to
# visualize the projection of the data on each plane separately, you can do so by
# changing the value of the visualization mask. The number of on bits in the
# visualization mask must not exceed the number of on bits in the original data mask.
my $vmask = "111"; # for mydatafile1.dat and mydatafile2.dat
#my $vmask = "11"; # for mydatafile3.dat
$clusterer->visualize_clusters( $vmask );
examples/cluster_and_visualize_with_data_visualization.pl view on Meta::CPAN
## IMPORTANT: Read the 6 point customization of a script like this in the
## file
## cluster_and_visualize.pl
## The focus of this script is on the visualization steps after the data has been
## clustered. This script makes calls for the visualization of the data that was
## used for clustering --- both the original data and the data after it is normed
## for variance normalization (assuming you choose the variance normalization step,
## which is not always a good thing).
use strict;
use Algorithm::KMeans;
#my $datafile = "mydatafile1.dat";
my $datafile = "sphericaldata.csv";
examples/cluster_and_visualize_with_data_visualization.pl view on Meta::CPAN
# Visualization mask:
# Read the comment block in cluster_and_visualize.pl that is associated with the
# setting up of the visualization mask.
my $visualization_mask = "111";
# In order to see the effects of variance normalization of the data (each data
# coordinate is normalized by the standard-deviation along that coordinate axis), it
# is sometimes useful to see both the raw data and its normalized form. The
# following two calls accomplish that:
$clusterer->visualize_data($visualization_mask, 'original');
$clusterer->visualize_data($visualization_mask, 'normed');
# Finally, you can visualize the clusters. BUT NOTE THAT THE VISUALIZATION MASK FOR
# CLUSTER VISUALIZATION WILL, IN GENERAL, BE INDEPENDENT OF THE VISUALIZATION MASK
# FOR VIEWING THE DATA:
$clusterer->visualize_clusters($visualization_mask);
examples/find_best_K_and_cluster.pl view on Meta::CPAN
# VISUALIZATION:
# Visualization mask:
# In most cases, you would not change the value of the mask between clustering and
# visualization. But, if you are clustering multi-dimensional data and you wish to
# visualize the projection of the data on each plane separately, you can do so by
# changing the value of the visualization mask. The number of on bits in the
# visualization mask must not exceed the number of on bits in the original data mask.
my $visualization_mask = "111"; # for both mydatafile1.dat and mydatafile2.dat
#my $visualization_mask = "11";
$clusterer->visualize_clusters($visualization_mask);
examples/find_best_K_in_range_and_cluster.pl view on Meta::CPAN
$clusterer->show_QoC_values();
# VISUALIZATION:
# Visualization mask:
# In most cases, you would not change the value of the mask between clustering and
# visualization. But, if you are clustering multi-dimensional data and you wish to
# visualize the projection of the data on each plane separately, you can do so by
# changing the value of the visualization mask. The number of on bits in the
# visualization mask must not exceed the number of on bits in the original data mask.
my $visualization_mask = "111";
$clusterer->visualize_clusters($visualization_mask);
lib/Algorithm/KMeans.pm view on Meta::CPAN
_K_max => $args{Kmax} || 'unknown',
_cluster_seeding => $args{cluster_seeding} || croak("must choose smart or random ".
"for cluster seeding"),
_var_normalize => $args{do_variance_normalization} || 0,
_use_mahalanobis_metric => $args{use_mahalanobis_metric} || 0,
_clusters_2_files => $args{write_clusters_to_files} || 0,
_terminal_output => $args{terminal_output} || 0,
_debug => $args{debug} || 0,
_N => 0,
_K_best => 'unknown',
_original_data => {},
_data => {},
_data_id_tags => [],
_QoC_values => {},
_clusters => [],
_cluster_centers => [],
_clusters_hash => {},
_cluster_centers_hash => {},
_cluster_covariances_hash => {},
_data_dimensions => 0,
lib/Algorithm/KMeans.pm view on Meta::CPAN
my @data_tags = ();
foreach my $record (@all_data) {
my @splits = split /,/, $record;
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
"the size of at least one of the data records in the file: $!"
unless scalar(@mask) == scalar(@splits);
my $record_name = shift @splits;
$data_hash{$record_name} = \@splits;
push @data_tags, $record_name;
}
$self->{_original_data} = \%data_hash;
$self->{_data_id_tags} = \@data_tags;
$self->{_N} = scalar @data_tags;
if ($self->{_var_normalize}) {
$self->{_data} = variance_normalization( $self->{_original_data} );
} else {
$self->{_data} = deep_copy_hash( $self->{_original_data} );
}
# Need to make the following call to set the global mean and covariance:
# my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
# Need to make the following call to set the global eigenvec eigenval sets:
# $self->eigen_analysis_of_covariance($covariance);
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
"the number of clusters requested for the clusters to be " .
"meaningful $!"
lib/Algorithm/KMeans.pm view on Meta::CPAN
next;
} elsif ($mask[$i] eq 'N') {
$record_id = $fields[$i];
} elsif ($mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "misformed mask for reading the data file\n";
}
}
my @nums = map {/$_num_regex/;$_} @data_fields;
$self->{_original_data}->{ $record_id } = \@nums;
}
if ($self->{_var_normalize}) {
$self->{_data} = variance_normalization( $self->{_original_data} );
} else {
$self->{_data} = deep_copy_hash( $self->{_original_data} );
}
my @all_data_ids = keys %{$self->{_data}};
$self->{_data_id_tags} = \@all_data_ids;
$self->{_N} = scalar @all_data_ids;
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
"the number of clusters requested for the clusters to be " .
"meaningful $!"
if ( $self->{_N} < (2 * $self->{_K} ** 2) );
lib/Algorithm/KMeans.pm view on Meta::CPAN
# All the record labels are stored in the array $self->{_data_id_tags}. The
# actual data for clustering is stored in a hash at $self->{_data} whose keys are
# the record labels; the value associated with each key is the array holding the
# corresponding numerical multidimensional data.
foreach my $j (0..$num_cols-1) {
my $tag = $self->{_data_id_tags}[$j];
my $data = $self->{_data}->{$tag};
$matrix->set_col($j, $data);
}
if ($self->{_debug}) {
print "\nDisplaying the original data as a matrix:";
display_matrix( $matrix );
}
foreach my $j (0..$num_cols-1) {
$mean_vec += $matrix->col($j);
}
$mean_vec *= 1.0 / $num_cols;
if ($self->{_debug}) {
print "Displaying the mean vector for the data:";
display_matrix( $mean_vec );
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
# should do a 2D plot or a 3D plot. If the number of on bits in the mask that is
# supplied as one of the arguments is greater than 2, it does a 3D plot for the
# first three data coordinates. That is, the clusters will be displayed in the 3D
# space formed by the first three data coordinates. On the other hand, if the number
# of on bits in the mask is exactly 2, it does a 2D plot. Should it happen that
# only one on bit is specified for the mask, visualize_clusters() aborts.
#
# The visualization code consists of first accessing each of the clusters created by the
# kmeans() subroutine. Note that the clusters contain only the symbolic names for
# the individual records in the source data file. We therefore next reach into the
# $self->{_original_data} hash and get the data coordinates associated with each
# symbolic label in a cluster. The numerical data thus generated is then written
# out to a temp file. When doing so we must remember to insert TWO BLANK LINES
# between the data blocks corresponding to the different clusters. This constraint
# is imposed on us by Gnuplot when plotting data from the same file since we want to
# use different point styles for the data points in different cluster files.
#
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
# module to plot the data clusters.
sub visualize_clusters {
my $self = shift;
lib/Algorithm/KMeans.pm view on Meta::CPAN
$v_mask = shift || croak "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || croak "visualization mask missing";
$pause_time = shift;
} else {
croak "visualize_clusters() called with wrong args";
}
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
croak "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
my @fields = @$data;
croak "\nABORTED: Visualization mask size exceeds data record size\n"
if $#v_mask > $#fields;
my @data_fields;
foreach my $i (0..@fields-1) {
if ($v_mask[$i] eq '0') {
next;
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
lib/Algorithm/KMeans.pm view on Meta::CPAN
$hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
} elsif ($visualization_data_field_width == 1) {
croak "No provision for plotting 1-D data\n";
}
}
# It makes sense to call visualize_data() only AFTER you have called the method
# read_data_from_file().
#
# The visualize_data() is meant for the visualization of the original data in its
# various 2D or 3D subspaces. The method can also be used to visualize the normed
# data in a similar manner. Recall the normed data is the original data after each
# data dimension is normalized by the standard-deviation along that dimension.
#
# Whether you see the original data or the normed data depends on the second
# argument supplied in the method call. It must be either the string 'original' or
# the string 'normed'.
sub visualize_data {
my $self = shift;
my $v_mask = shift || croak "visualization mask missing";
my $datatype = shift; # must be either 'original' or 'normed'
croak "\n\nABORTED: You called visualize_data() for normed data " .
"but without first turning on data normalization in the " .
"in the KMeans constructor"
if ($datatype eq 'normed') && ! $self->{_var_normalize};
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
croak "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
my $data_source;
if ($datatype eq 'original') {
$data_source = $self->{_original_data};
} elsif ($datatype eq 'normed') {
$data_source = $self->{_data};
} else {
croak "\n\nABORTED: improper call to visualize_data()";
}
while ( my ($record_id, $data) = each %{$data_source} ) {
my @fields = @$data;
croak "\nABORTED: Visualization mask size exceeds data record size\n"
if $#v_mask > $#fields;
my @data_fields;
lib/Algorithm/KMeans.pm view on Meta::CPAN
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
croak "Misformed visualization mask. It can only have 1s and 0s\n";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my $filename = basename($master_datafile);
my $temp_file;
if ($datatype eq 'original') {
$temp_file = "__temp_data_" . $filename;
} elsif ($datatype eq 'normed') {
$temp_file = "__temp_normed_data_" . $filename;
} else {
croak "ABORTED: Improper call to visualize_data()";
}
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!\n";
foreach my $datapoint (values %visualization_data) {
print OUTPUT "@$datapoint";
print OUTPUT "\n";
}
close OUTPUT;
my $plot = Graphics::GnuplotIF->new( persist => 1 );
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
my $arg_string ;
if ($visualization_data_field_width > 2) {
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 2) {
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 1 ) {
$arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
}
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "splot $arg_string" );
lib/Algorithm/KMeans.pm view on Meta::CPAN
my $ele = $self->{_data}->{$id};
my $i = 0;
foreach my $component (@$ele) {
$result[$i] += $component;
$i++;
}
}
return \@result;
}
# Component-wise sum of the coordinate vectors of a set of records, taken from
# the un-normalized data held in $self->{_original_data}.
#
# Argument: a reference to an array of record names (the keys of the
#           _original_data hash).
# Returns:  a reference to an array of sums.  The array starts out with
#           $self->{_data_dimensions} zeros, and the k-th coordinate of every
#           named record is added to its k-th slot.
sub add_point_coords_from_original_data {
    my ($self, $ids_ref) = @_;
    my @record_ids = @{$ids_ref};    # private copy of the list of record names

    # One zeroed accumulator slot per data dimension.
    my @sums = (0.0) x $self->{_data_dimensions};

    foreach my $record_id (@record_ids) {
        my $coords = $self->{_original_data}->{$record_id};
        my $slot = 0;
        # Walk the record's coordinates, adding each to the matching slot.
        $sums[$slot++] += $_ for @{$coords};
    }
    return \@sums;
}
################################### Support Routines ########################################
lib/Algorithm/KMeans.pm view on Meta::CPAN
return $ref_out;
}
# Prints, for every cluster, its index, its number of records, and its center.
# The center is obtained by asking add_point_coords_from_original_data() for the
# coordinate sums of the cluster's records and dividing each sum by the cluster
# size; each coordinate is shown with four digits after the decimal point.
#
# Argument: a reference to an array of clusters, each cluster being a reference
#           to an array of record names.
#
# NOTE(review): a cluster with zero records would divide by zero here if the
# summing helper hands back a non-empty array -- confirm that empty clusters
# can never reach this routine.
sub display_cluster_centers {
    my ($self, $clusters_ref) = @_;
    my @cluster_list = @{$clusters_ref};
    for my $index (0 .. $#cluster_list) {
        my $members      = $cluster_list[$index];
        my $member_count = scalar @{$members};
        my $coord_sums   = $self->add_point_coords_from_original_data($members);
        # Turn the per-coordinate sums into per-coordinate means.
        my @center    = map { $_ / $member_count } @{$coord_sums};
        my @formatted = map { sprintf('%.4f', $_) } @center;
        print "\ncluster $index ($member_count records):\n";
        print "cluster center $index: " . "@formatted\n";
    }
}
# For displaying the individual clusters on a terminal screen. Each cluster is
# displayed through the symbolic names associated with the data points.
lib/Algorithm/KMeans.pm view on Meta::CPAN
print "\n$cluster_id => @{$clusters_hash->{$cluster_id}}\n";
}
foreach my $cluster_id (sort keys %{$cluster_centers_hash}) {
print "\n$cluster_id => @{$cluster_centers_hash->{$cluster_id}}\n";
}
# CLUSTER VISUALIZATION:
# You must first set the mask for cluster visualization. This mask tells the module
# which 2D or 3D subspace of the original data space you wish to visualize the
# clusters in:
my $visualization_mask = "111";
$clusterer->visualize_clusters($visualization_mask);
# SYNTHETIC DATA GENERATION:
# The module has been provided with a class method for generating multivariate data
# for experimenting with clustering. The data generation is controlled by the
lib/Algorithm/KMeans.pm view on Meta::CPAN
(Quality-of-Clustering) values in the right column. Note that this call makes sense
only if you either supply the C<K=0> option to the constructor, or if you specify
values for the C<Kmin> and C<Kmax> options.
=item B<visualize_clusters()>
$clusterer->visualize_clusters( $visualization_mask )
The visualization mask here does not have to be identical to the one used for
clustering, but must be a subset of that mask. This is convenient for visualizing
the clusters in two- or three-dimensional subspaces of the original space.
=item B<visualize_data()>
$clusterer->visualize_data($visualization_mask, 'original');
$clusterer->visualize_data($visualization_mask, 'normed');
This method requires a second argument and, as shown, it must be either the string
C<original> or the string C<normed>, the former for the visualization of the raw data
and the latter for the visualization of the data after its different dimensions are
normalized by the standard-deviations along those directions. If you call the method
with the second argument set to C<normed>, but do so without turning on the
C<do_variance_normalization> option in the KMeans constructor, it will let you know.
=item B<which_cluster_for_new_data_element()>
If you wish to determine the cluster membership of a new data sample after you have
created the clusters with the existing data samples, you would need to call this