Algorithm-KMeans

 view release on metacpan or  search on metacpan

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        my $diff = $sorted_data[$index+1] - $sorted_data[$index];
        $avg_diff += ($diff - $avg_diff) / ($index + 1);
    }
    my $delta = 1.0 / 1000.0;
    my @accumulator = (0) x 1000;
    foreach my $index (0..@sorted_data-1) {
        my $cell_index = int($sorted_data[$index] / $delta);
        my $smoothness = 40;
        for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $index < 0 || $index > 999;
            $accumulator[$index]++;
        }
    }
    my $peaks_array = non_maximum_suppression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        my $unscaled_peak_point = $min + $peaks_index_hash->{$peak} * $scale * $delta;
        push @K_highest_peak_locations, $unscaled_peak_point
            if $k < $how_many;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}

# The purpose of this routine is to form initial clusters by assigning the data
# samples to the initial clusters formed by the previous routine on the basis of the
# best proximity of the data samples to the different cluster centers.
sub assign_data_to_clusters_initial {
    my $self = shift;
    my @cluster_centers = @{ shift @_ };
    my @clusters;
    foreach my $ele (@{$self->{_data_id_tags}}) {
        my $best_cluster;
        my @dist_from_clust_centers;
        foreach my $center (@cluster_centers) {
            push @dist_from_clust_centers, $self->distance($ele, $center);
        }
        my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
        push @{$clusters[$best_center_index]}, $ele;
    }
    return \@clusters;
}    

sub assign_data_to_clusters_initial_mahalanobis {
    my $self = shift;
    my @cluster_centers = @{ shift @_ };
    my @clusters;
    foreach my $ele (@{$self->{_data_id_tags}}) {
        my $best_cluster;
        my @dist_from_clust_centers;
        foreach my $center (@cluster_centers) {
            push @dist_from_clust_centers, $self->distance($ele, $center);
        }
        my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
        push @{$clusters[$best_center_index]}, $ele if defined $best_center_index;
    }
    # Since a cluster center may not correspond to any particular sample, it is possible
    # for one of the elements of the array @clusters to be null using the above 
    # strategy for populating the initial clusters.  Let's say there are five cluster
    # centers in the array @cluster_centers.  The $best_center_index may populate the
    # the elements of the array @clusters for the indices 0, 1, 2, 4, which would leave
    # $clusters[3] as undefined.  So, in what follows, we must first check if all of
    # the elements of @clusters are defined.
    my @determinants;
    foreach my $cluster(@clusters) {
        die "The clustering program started with bad initialization.  Please start over" 
            unless defined $cluster;
        my $covariance = $self->estimate_cluster_covariance($cluster);
        my $determinant = $covariance->det();
        push @determinants, $determinant;
    }
    return [\@clusters, \@determinants];
}    

# This is the main routine that along with the update_cluster_centers() routine
# constitute the two key steps of the K-Means algorithm.  In most cases, the infinite
# while() loop will terminate automatically when the cluster assignments of the data
# points remain unchanged. For the sake of safety, we keep track of the number of
# iterations. If this number reaches 100, we exit the while() loop anyway.  In most
# cases, this limit will not be reached.
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers, 
                               $self->distance($ele, $center);
                }
                my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    $assignment_changed_flag = 1;             
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or  (@$newcluster == 0));
        }
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;

lib/Algorithm/KMeans.pm  view on Meta::CPAN

            $result += $row[$j] * $col[$j];
        }
        return $result;
    }
    my $product = Math::GSL::Matrix->new($nrows1, $nrows1);
    foreach my $i (0..$nrows1-1) {
        my $row = $matrix1->row($i);
        my @product_row;
        foreach my $j (0..$ncols2-1) {
            my $col = $matrix2->col($j);
            my $row_times_col = matrix_multiply($row, $col);
            push @product_row, $row_times_col;
        }
        $product->set_row($i, \@product_row);
    }
    return $product;
}

1;

=pod

=head1 NAME

Algorithm::KMeans - for clustering multidimensional data

=head1 SYNOPSIS

  # You now have four different choices for clustering your data with this module:
  #
  #     1)  With Euclidean distances and with random cluster seeding
  #    
  #     2)  With Mahalanobis distances and with random cluster seeding
  #   
  #     3)  With Euclidean distances and with smart cluster seeding
  #
  #     4)  With Mahalanobis distances and with smart cluster seeding
  #
  # Despite the qualifier 'smart' in 'smart cluster seeding', it may not always
  # produce results that are superior to those obtained with random seeding.  (If you
  # also factor in the choice regarding variance normalization, you actually have
  # eight different choices for data clustering with this module.)
  #
  # In all cases, you'd obviously begin with

  use Algorithm::KMeans;

  # You'd then name the data file:

  my $datafile = "mydatafile.csv";

  # Next, set the mask to indicate which columns of the datafile to use for
  # clustering and which column contains a symbolic ID for each data record. For
  # example, if the symbolic name is in the first column, you want the second column
  # to be ignored, and you want the next three columns to be used for 3D clustering,
  # you'd set the mask to:

  my $mask = "N0111";

  # Now construct an instance of the clusterer.  The parameter K controls the number
  # of clusters.  If you know how many clusters you want (let's say 3), call

  my $clusterer = Algorithm::KMeans->new( datafile        => $datafile,
                                          mask            => $mask,
                                          K               => 3,
                                          cluster_seeding => 'random',
                                          terminal_output => 1,
                                          write_clusters_to_files => 1,
                                        );

  # By default, this constructor call will set you up for clustering based on
  # Euclidean distances.  If you want the module to use Mahalanobis distances, your
  # constructor call will look like:

  my $clusterer = Algorithm::KMeans->new( datafile        => $datafile,
                                          mask            => $mask,
                                          K               => 3,
                                          cluster_seeding => 'random',
                                          use_mahalanobis_metric => 1,
                                          terminal_output => 1,
                                          write_clusters_to_files => 1,
                                        );

  # For both constructor calls shown above, you can use smart seeding of the clusters
  # by changing 'random' to 'smart' for the cluster_seeding option.  See the
  # explanation of smart seeding in the Methods section of this documentation.

  # If your data is such that its variability along the different dimensions of the
  # data space is significantly different, you may get better clustering if you first
  # normalize your data by setting the constructor parameter
  # do_variance_normalization as shown below:

  my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
                                          mask     => $mask,
                                          K        => 3,
                                          cluster_seeding => 'smart',    # or 'random'
                                          terminal_output => 1,
                                          do_variance_normalization => 1,
                                          write_clusters_to_files => 1,
                                        );

  # But bear in mind that such data normalization may actually decrease the
  # performance of the clusterer if the variability in the data is more a result of
  # the separation between the means than a consequence of intra-cluster variance.

  # Set K to 0 if you want the module to figure out the optimum number of clusters
  # from the data. (It is best to run this option with the terminal_output set to 1
  # so that you can see the different value of QoC for the different K):

  my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
                                          mask     => $mask,
                                          K        => 0,
                                          cluster_seeding => 'random',    # or 'smart'
                                          terminal_output => 1,
                                          write_clusters_to_files => 1,
                                        );

  # Although not shown above, you can obviously set the 'do_variance_normalization'
  # flag here also if you wish.

  # For very large data files, setting K to 0 will result in searching through too

lib/Algorithm/KMeans.pm  view on Meta::CPAN


    my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
                                            mask     => "N111",   
                                            K        => 2,        
                                            cluster_seeding => 'smart',   # try 'random' also
                                            terminal_output => 1,
                                            do_variance_normalization => 1,
                    );

=back

=head2 Constructor Parameters

=over 8

=item C<datafile>:

This parameter names the data file that contains the multidimensional data records
you want the module to cluster.

=item C<mask>:

This parameter supplies the mask to be applied to the columns of your data file. See
the explanation in Synopsis for what this mask looks like.

=item C<K>:

This parameter supplies the number of clusters you are looking for.  If you set this
option to 0, that means that you want the module to search for the best value for
C<K>.  (Keep in mind the fact that searching for the best C<K> may take a long time
for large data files.)

=item C<Kmin>:

If you supply an integer value for C<Kmin>, the search for the best C<K> will begin
with that value.

=item C<Kmax>:

If you supply an integer value for C<Kmax>, the search for the best C<K> will end at
that value.

=item C<cluster_seeding>:

This parameter must be set to either C<random> or C<smart>.  Depending on your data,
you may get superior clustering with the C<random> option.  The choice C<smart> means
that the clusterer will (1) subject the data to principal components analysis to
determine the maximum variance direction; (2) project the data onto this direction;
(3) find peaks in a smoothed histogram of the projected points; and (4) use the
locations of the highest peaks as seeds for cluster centers.  If the C<smart> option
produces bizarre results, try C<random>.

=item C<use_mahalanobis_metric>:

When set to 1, this option causes Mahalanobis distances to be used for clustering.
The default is 0 for this parameter. By default, the module uses the Euclidean
distances for clustering.  In general, Mahalanobis distance based clustering will
fail if your data resides on a lower-dimensional hyperplane in the data space, if you
seek too many clusters, and if you do not have a sufficient number of samples in your
data file.  A necessary requirement for the module to be able to compute Mahalanobis
distances is that the cluster covariance matrices be non-singular. (Let's say your
data dimensionality is C<D> and the module is considering a cluster that has only
C<d> samples in it where C<d> is less than C<D>.  In this case, the covariance matrix
will be singular since its rank will not exceed C<d>.  For the covariance matrix to
be non-singular, it must be of full rank, that is, its rank must be C<D>.)

=item C<do_variance_normalization>:

When set, the module will first normalize the data variance along the different
dimensions of the data space before attempting clustering.  Depending on your data,
this option may or may not result in better clustering.

=item C<terminal_output>:

This boolean parameter, when not supplied in the call to C<new()>, defaults to 0.
When set, you will see in your terminal window the different clusters as lists of the
symbolic IDs and their cluster centers. You will also see the QoC (Quality of
Clustering) values for the clusters displayed.

=item C<write_clusters_to_files>:

This parameter is also boolean.  When set to 1, the clusters are written out to files
that are named in the following manner:

     cluster0.txt 
     cluster1.txt 
     cluster2.txt
     ...
     ...

Before the clusters are written to these files, the module destroys all files with
such names in the directory in which you call the module.


=back

=over

=item B<read_data_from_file()>

    $clusterer->read_data_from_file()

=item B<kmeans()>

    $clusterer->kmeans();

    or 

    my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();

The first call above works solely by side-effect.  The second call also returns the
clusters and the cluster centers. See the C<cluster_and_visualize.pl> script in the
C<examples> directory for how you can in your own code extract the clusters and the
cluster centers from the variables C<$clusters_hash> and C<$cluster_centers_hash>.

=item B<get_K_best()>

    $clusterer->get_K_best();

This call makes sense only if you supply either the C<K=0> option to the constructor,
or if you specify values for the C<Kmin> and C<Kmax> options. The C<K=0> and the

lib/Algorithm/KMeans.pm  view on Meta::CPAN


The module also leaves in your directory a printable `.png' file that is a point plot
of the different clusters. The name of this file is always C<clustering_results.png>.

Also, as mentioned previously, a call to C<kmeans()> in your own code will return the
clusters and the cluster centers.

=head1 REQUIRED

This module requires the following three modules:

   Math::Random
   Graphics::GnuplotIF
   Math::GSL

With regard to the third item above, what is actually required is the
C<Math::GSL::Matrix> module.  However, that module is a part of the C<Math::GSL>
distribution. The acronym GSL stands for the GNU Scientific Library.  C<Math::GSL> is
a Perl interface to the GSL C-based library.


=head1 THE C<examples> DIRECTORY

The C<examples> directory contains several scripts to help you become familiar with
this module.  The following script is an example of how the module can be expected to
be used most of the time. It calls for clustering to be carried out with a fixed
C<K>:

        cluster_and_visualize.pl

The more time you spend with this script, the more comfortable you will become with
the use of this module. The script file contains a large comment block that mentions
six locations in the script where you have to make decisions about how to use the
module.

See the following script if you do not know what value to use for C<K> for clustering
your data:

        find_best_K_and_cluster.pl

This script uses the C<K=0> option in the constructor that causes the module to
search for the best C<K> for your data.  Since this search is virtually unbounded ---
limited only by the number of samples in your data file --- the script may take a
long time to run for a large data file.  Hence the next script.

If your datafile is too large, you may need to range limit the values of C<K> that
are searched through, as in the following script:

        find_best_K_in_range_and_cluster.pl

If you also want to include data normalization (it may reduce the performance of the
clusterer in some cases), see the following script:

        cluster_after_data_normalization.pl

When you include the data normalization step and you would like to visualize the data
before and after normalization, see the following script:

        cluster_and_visualize_with_data_visualization.pl*

After you are done clustering, let's say you want to find the cluster membership of a
new data sample. To see how you can do that, see the script:

        which_cluster_for_new_data.pl

This script returns two answers for which cluster a new data sample belongs to: one
using the Euclidean metric to calculate the distances between the new data sample and
the cluster centers, and the other using the Mahalanobis metric.  If the clusters are
strongly elliptical in shape, you are likely to get better results with the
Mahalanobis metric.  (To see that you can get two different answers using the two
different distance metrics, run the C<which_cluster_for_new_data.pl> script on the
data in the file C<mydatafile3.dat>.  To make this run, note that you have to comment
out and uncomment the lines at four different locations in the script.)

The C<examples> directory also contains the following support scripts:

For generating the data for experiments with clustering:

        data_generator.pl

For cleaning up the examples directory:

        cleanup_directory.pl

The examples directory also includes a parameter file, C<param.txt>, for generating
synthetic data for clustering.  Just edit this file if you would like to generate
your own multivariate data for clustering.  The parameter file is for the 3D case,
but you can generate data with any dimensionality through appropriate entries in the
parameter file.

=head1 EXPORT

None by design.

=head1 CAVEATS

K-Means based clustering usually does not work well when the clusters are strongly
overlapping and when the extent of variability along the different dimensions is
different for the different clusters.  The module does give you the ability to
normalize the variability in your data with the constructor option
C<do_variance_normalization>.  However, as described elsewhere, this may actually
reduce the performance of the clusterer if the data variability along a direction is
more a result of the separation between the means than because of intra-cluster
variability.  For better clustering with difficult-to-cluster data, you could try
using the author's C<Algorithm::ExpectationMaximization> module.

=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'KMeans' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-KMeans-2.05.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-KMeans-2.05>.  Enter this directory and execute the following commands
for a standard install of the module if you have root privileges:



( run in 0.456 second using v1.01-cache-2.11-cpan-483215c6ad5 )