Algorithm-KMeans

 view release on metacpan or  search on metacpan

examples/cluster_and_visualize.pl  view on Meta::CPAN

# Data file to be clustered.  Enable exactly one of the lines below.  The note
# at the end of each line lists the K, data mask, and visualization mask that
# go with that file; the visualization-mask values follow the settings
# used for $vmask in the VISUALIZATION section of this script.
my $datafile = "mydatafile2.dat";           #  use:  K = 2,  mask = "N111",  vmask = "111"
#my $datafile = "sphericaldata.csv";        #  use:  K = 3,  mask = "N111",  vmask = "111"
#my $datafile = "mydatafile1.dat";          #  use:  K = 3,  mask = "N111",  vmask = "111"
#my $datafile = "mydatafile3.dat";          #  use:  K = 2,  mask = "N11" ,  vmask = "11"


# Mask: (For emphasis, this is a slightly more detailed repetition of the comment
# made above in Item 2)

# The mask tells the module which columns of the data file are to be used for
# clustering, which columns are to be ignored, and which column contains a symbolic
# ID tag for a data point.  If the ID tag is in the first column and you are
# clustering 3D data in a file that has just four columns, the mask would be "N111".
# Note the first character in the mask in this case is `N' for "Name".  If, on the
# other hand, you wanted to ignore the first data coordinate (which is in the second
# column of the data file) for clustering, the mask would be "N011".  The symbolic ID
# can be in any column --- you just have to place the character `N' at the right
# place:


# Data mask handed to the Algorithm::KMeans constructor.  Enable exactly one
# of the lines below, matching the data file selected in $datafile.
my $mask = "N111";         # for mydatafile1.dat, mydatafile2.dat, and sphericaldata.csv
#my $mask = "N011";        # for mydatafile1.dat --- use only the last two coordinates
#my $mask = "N100";        # for mydatafile1.dat --- use only the first coordinate
#my $mask = "N11";         # for mydatafile3.dat


# Construct the clusterer for the selected data file and data mask.  The value
# of K is fixed here and has to agree with the data file in use (see the
# per-file notes next to $datafile).
my $clusterer = Algorithm::KMeans->new(
    datafile                => $datafile,
    mask                    => $mask,
    K                       => 2,
    cluster_seeding         => 'random',    # also try 'smart'
#   use_mahalanobis_metric  => 1,           # also try '0'
    terminal_output         => 1,
    write_clusters_to_files => 1,
);

# Load the data file first, then run the clustering.  kmeans() is called in
# list context; its first two return values are kept as $clusters_hash and
# $cluster_centers_hash, both of which are used as hash references below.
$clusterer->read_data_from_file();
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();


# ACCESSING THE CLUSTERS AND CLUSTER CENTERS IN YOUR SCRIPT:

# Print one line per cluster: the cluster ID followed by the elements of the
# array stored under that ID.  IDs are visited in default (string) sort order.
print "\nDisplaying clusters in the terminal window:\n";
for my $id (sort keys %$clusters_hash) {
    my $members = $clusters_hash->{$id};
    print "\n$id   =>   @$members\n";
}

# Same layout for the cluster centers: ID followed by the elements of the
# array stored under that ID.
print "\nDisplaying cluster centers in the terminal window:\n";
for my $id (sort keys %$cluster_centers_hash) {
    my $center = $cluster_centers_hash->{$id};
    print "\n$id   =>   @$center\n";
}


# VISUALIZATION:

# Visualization mask:

# In most cases, you would not change the value of the mask between clustering and
# visualization.  But, if you are clustering multi-dimensional data and you wish to
# visualize the projection of the data on each plane separately, you can do so by
# changing the value of the visualization mask.  The number of on bits in the
# visualization mask must not exceed the number of on bits in the original data mask.

# Visualization mask passed to visualize_clusters().  Enable the line that
# matches the data file selected in $datafile.
my $vmask = "111";                 # for mydatafile1.dat and mydatafile2.dat
#my $vmask = "11";                 # for mydatafile3.dat

# Show the clustering result; $vmask chooses what is displayed (see the
# "Visualization mask" notes above).
$clusterer->visualize_clusters( $vmask );



( run in 0.832 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )