Algorithm-KMeans
view release on metacpan or search on metacpan
examples/cluster_and_visualize.pl view on Meta::CPAN
# Data file to cluster. Keep exactly one of the lines below active. The trailing
# comment on each line lists the K, mask, and vmask settings that go with that
# file; note that the visualization mask (vmask) carries no `N' field.
my $datafile = "mydatafile2.dat"; # use: K = 2, mask = "N111", vmask = "111"
#my $datafile = "sphericaldata.csv"; # use: K = 3, mask = "N111", vmask = "111"
#my $datafile = "mydatafile1.dat"; # use: K = 3, mask = "N111", vmask = "111"
#my $datafile = "mydatafile3.dat"; # use: K = 2, mask = "N11" , vmask = "11"
# Mask: (For emphasis, this is a slightly more detailed repetition of the comment
# made above in Item 2)
# The mask tells the module which columns of the data file are to be used for
# clustering, which columns are to be ignored, and which column contains a symbolic
# ID tag for a data point. If the ID tag is in the first column and you are
# clustering 3D data in a file that has just four columns, the mask would be "N111".
# Note that the first character in the mask in this case is `N' for "Name". If, on the
# other hand, you wanted to ignore the first data coordinate (which is in the second
# column of the data file) for clustering, the mask would be "N011". The symbolic ID
# can be in any column --- you just have to place the character `N' at the right
# place:
# Clustering mask. Keep exactly one of these alternatives active:
my $mask = 'N111';      # for mydatafile1.dat, mydatafile2.dat, and sphericaldata.csv
#my $mask = 'N011';     # for mydatafile1.dat --- use only the last two coordinates
#my $mask = 'N100';     # for mydatafile1.dat --- use only the first coordinate
#my $mask = 'N11';      # for mydatafile3.dat

# Constructor arguments, held in an ordered list so that they are handed to
# new() exactly as written here.
my @clusterer_args = (
    datafile                => $datafile,
    mask                    => $mask,
    K                       => 2,
    cluster_seeding         => 'random',    # also try 'smart'
#   use_mahalanobis_metric  => 1,           # also try '0'
    terminal_output         => 1,
    write_clusters_to_files => 1,
);
my $clusterer = Algorithm::KMeans->new(@clusterer_args);

# Read the data file named above, then run the clustering. kmeans() is called
# in list context and its two return values are kept for use further down.
$clusterer->read_data_from_file;
my ( $clusters_hash, $cluster_centers_hash ) = $clusterer->kmeans;
# ACCESSING THE CLUSTERS AND CLUSTER CENTERS IN YOUR SCRIPT:
# Both results are dereferenced as hashes keyed by cluster ID, with an array
# reference as each value. The array elements are interpolated into the
# output line, so they are joined by the list separator $" (a space unless
# it has been changed).
print "\nDisplaying clusters in the terminal window:\n";
for my $id ( sort keys %$clusters_hash ) {
    my @members = @{ $clusters_hash->{$id} };
    print "\n$id => @members\n";
}

print "\nDisplaying cluster centers in the terminal window:\n";
for my $id ( sort keys %$cluster_centers_hash ) {
    my @center = @{ $cluster_centers_hash->{$id} };
    print "\n$id => @center\n";
}
# VISUALIZATION:
# Visualization mask:
# In most cases, you would not change the value of the mask between clustering and
# visualization. But, if you are clustering multi-dimensional data and you wish to
# visualize the projection of the data on each plane separately, you can do so by
# changing the value of the visualization mask. The number of on bits in the
# visualization mask must not exceed the number of on bits in the original data mask.
# Choose the visualization mask that goes with the data file in use, then pass
# it to visualize_clusters().
my $vmask = '111';      # for mydatafile1.dat and mydatafile2.dat
#my $vmask = '11';      # for mydatafile3.dat
$clusterer->visualize_clusters($vmask);
( run in 0.832 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )