best results from the CPAN

Algorithm-KMeans
#!/usr/bin/perl -w

#use lib '../blib/lib', '../blib/arch';


##  cluster_and_visualize.pl

##  This is the most basic script in the `examples' directory of the Algorithm::KMeans
##  module.  This script shows how the module is supposed to be called for clustering 
##  your data file.  You must experiment with all of the different options at the
##  six locations mentioned below in order to become more familiar with the capabilities
##  of the module.

##  Needs customization at SIX locations:
##
##      1) First choose which data file you want to use for clustering
##
##
##      2) Next, choose the data mask to apply to the columns of the data file.  The
##           position of the letter `N' in the mask indicates the column that
##           contains a symbolic name for each data record.  If the symbolic name for
##           each data record is in the first column and you want to cluster 3D data
##           that is in the next three columns, your data mask will be N111.  On the
##           other hand, if for the same data file, you want to carry out 2D
##           clustering on the last two columns, your data mask will be N011.
##
##      3) Next, you need to decide how many clusters you want the program to return.
##           If you want the program to figure out on its own how many clusters to 
##           partition the data into, see the script find_best_K_and_cluster.pl in this
##           directory.
##
##      4) Next you need to decide whether you want `random' seeding or `smart'
##           seeding.  Bear in mind that `smart' seeding may produce worse results
##           than `random' seeding, depending on how the data clusters are actually
##           distributed.  
##
##      5) Next you need to decide whether or not you want to use the Mahalanobis
##           distance metric for clustering.  The default is the Euclidean metric.
##
##      6) Finally, you need to choose a mask for visualization.  Here is a reason
##           for why the visualization mask is set independently of the data mask
##           that was specified in Step 2: Let's say your datafile has 8 columns and
##           you are choosing to cluster the data records using 4 of those.
##           Subsequently, you may want to visually examine the quality of clustering
##           by examining some 2D or 3D subspace of the 4-dimensional space
##           used for clustering.


use strict;
use Algorithm::KMeans;


# Location 1 --- the data file to cluster.
#
# Keep exactly one of these assignments active.  The note at the end of each
# line gives the K, mask, and visualization mask (vmask) suggested for that file.

my $datafile = 'mydatafile2.dat';           #  use:  K = 2,  mask = "N111",  vmask = "N111"
#my $datafile = 'sphericaldata.csv';        #  use:  K = 3,  mask = "N111",  vmask = "N111"
#my $datafile = 'mydatafile1.dat';          #  use:  K = 3,  mask = "N111",  vmask = "N111"
#my $datafile = 'mydatafile3.dat';          #  use:  K = 2,  mask = "N11" ,  vmask = "N11"


# Location 2 --- the data mask (a more detailed restatement of Item 2 in the
# header comment).
#
# The mask tells the module what to do with each column of the data file:
#
#     `N'   the column that holds the symbolic ID tag ("Name") of a data point
#     `1'   a column that is to be used for clustering
#     `0'   a column that is to be ignored
#
# Example: with the ID tag in the first column of a four-column file, "N111"
# clusters the 3D data in the remaining columns, whereas "N011" leaves out the
# first data coordinate (the second column of the file).  The ID tag may sit in
# any column; just put the `N' at the matching position in the mask.

my $mask = 'N111';         # for mydatafile1.dat, mydatafile2.dat, and sphericaldata.csv
#my $mask = 'N011';        # for mydatafile1.dat --- use only the last two columns
#my $mask = 'N100';        # for mydatafile1.dat --- use only the first coordinate
#my $mask = 'N11';         # for mydatafile3.dat


# Locations 3, 4 and 5 --- number of clusters, seeding strategy, and distance
# metric.  The options are collected in an ordered list so that each one sits
# on its own line and can be edited or commented out independently.

my @kmeans_options = (
    datafile                => $datafile,
    mask                    => $mask,
    K                       => 2,           # number of clusters to return
    cluster_seeding         => 'random',    # also try 'smart'
#   use_mahalanobis_metric  => 1,           # also try '0'; Euclidean is the default metric
    terminal_output         => 1,
    write_clusters_to_files => 1,
);

my $clusterer = Algorithm::KMeans->new(@kmeans_options);

# Load the data, then cluster it.  kmeans() hands back two values; judging by
# the names they are stored under, these are presumably references to a hash of
# clusters and a hash of cluster centers --- confirm against the module's docs.
$clusterer->read_data_from_file;
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans;
( run in 0.749 second using v1.01-cache-2.11-cpan-9581c071862 )