PUT results from the CPAN

Algorithm-KMeans

view release on metacpan or search on metacpan

lib/Algorithm/KMeans.pm view on Meta::CPAN


sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    $self->read_data_from_file_csv() if $filename =~ /.csv$/;
    $self->read_data_from_file_dat() if $filename =~ /.dat$/;
}

sub read_data_from_file_csv {
    my $self = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        my @splits = split /,/, $record;
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file: $!"
            unless scalar(@mask) == scalar(@splits);
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_original_data} = \%data_hash;
    $self->{_data_id_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
    if ($self->{_var_normalize}) {
        $self->{_data} =  variance_normalization( $self->{_original_data} ); 
    } else {
        $self->{_data} = deep_copy_hash( $self->{_original_data} );
    }
    # Need to make the following call to set the global mean and covariance:
    # my $covariance =  $self->estimate_mean_and_covariance(\@data_tags);
    # Need to make the following call to set the global eigenvec eigenval sets:
    # $self->eigen_analysis_of_covariance($covariance);
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!" 
                         if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
}

sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    open INPUT, $datafile or die "unable to open file $datafile: $!\n";
    chomp( my @raw_data = <INPUT> );
    close INPUT;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;
        next if $record =~ /^#/;
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {
                next;
            } elsif ($mask[$i] eq 'N') {
                $record_id = $fields[$i];
            } elsif ($mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "misformed mask for reading the data file\n";
            }
        }
        my @nums = map {/$_num_regex/;$_} @data_fields;
        $self->{_original_data}->{ $record_id } = \@nums;
    }
    if ($self->{_var_normalize}) {
        $self->{_data} =  variance_normalization( $self->{_original_data} ); 
    } else {
        $self->{_data} = deep_copy_hash( $self->{_original_data} );
    }
    my @all_data_ids = keys %{$self->{_data}};
    $self->{_data_id_tags} = \@all_data_ids;
    $self->{_N} = scalar @all_data_ids;
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!" 
                         if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
}

sub kmeans {
    my $self = shift;
    my $K = $self->{_K};
    if ( ($K == 0) ||
              ( ($self->{_K_min} ne 'unknown') &&
                ($self->{_K_max} ne 'unknown') ) ) {
        eval {
            $self->iterate_through_K();   
        };
        die "in kmeans(): $@" if ($@);
    } elsif ( $K =~ /\d+/) {
        eval {
            $self->cluster_for_fixed_K_multiple_random_tries($K) 
                                            if $self->{_cluster_seeding} eq 'random';
            $self->cluster_for_fixed_K_single_smart_try($K) 
                                            if $self->{_cluster_seeding} eq 'smart';
        };
        die "in kmeans(): $@" if ($@);
    } else {
        croak "Incorrect call syntax used.  See documentation.\n";

lib/Algorithm/KMeans.pm view on Meta::CPAN

#  of on bits in the mask is exactly 2, it does a 2D plot.  Should it happen that
#  only one on bit is specified for the mask, visualize_clusters() aborts.
#
#  The visualization code consists of first accessing each of clusters created by the
#  kmeans() subroutine.  Note that the clusters contain only the symbolic names for
#  the individual records in the source data file.  We therefore next reach into the
#  $self->{_original_data} hash and get the data coordinates associated with each
#  symbolic label in a cluster.  The numerical data thus generated is then written
#  out to a temp file.  When doing so we must remember to insert TWO BLANK LINES
#  between the data blocks corresponding to the different clusters.  This constraint
#  is imposed on us by Gnuplot when plotting data from the same file since we want to
#  use different point styles for the data points in different cluster files.
#
#  Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
#  module to plot the data clusters.
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || croak "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || croak "visualization mask missing";    
        $pause_time = shift;
    } else {
        croak "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my @all_data_ids = @{$self->{_data_id_tags}};
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    my $hardcopy_plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
        $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1 title \"Cluster $i\" with points lt $j pt $j, ";
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "splot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $hardcopy_plot->gnuplot_cmd( "plot $arg_string" ) unless defined $pause_time;
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}

#  It makes sense to call visualize_data() only AFTER you have called the method
#  read_data_from_file().
#
#  The visualize_data() is meant for the visualization of the original data in its
#  various 2D or 3D subspaces.  The method can also be used to visualize the normed
#  data in a similar manner.  Recall the normed data is the original data after each
#  data dimension is normalized by the standard-deviation along that dimension.
#
#  Whether you see the original data or the normed data depends on the second
#  argument supplied in the method call.  It must be either the string 'original' or
#  the string 'normed'.
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || croak "visualization mask missing";
    my $datatype = shift;    # must be either 'original' or 'normed'
    croak "\n\nABORTED: You called visualize_data() for normed data " .
          "but without first turning on data normalization in the " .
          "in the KMeans constructor"
          if ($datatype eq 'normed') && ! $self->{_var_normalize}; 
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    my $data_source;
    if ($datatype eq 'original') {
        $data_source  =  $self->{_original_data};
    } elsif ($datatype eq 'normed') {
        $data_source  =  $self->{_data};
    } else {
        croak "\n\nABORTED: improper call to visualize_data()";
    }
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    if ($datatype eq 'original') {
        $temp_file = "__temp_data_" . $filename;
    } elsif ($datatype eq 'normed') {
        $temp_file = "__temp_normed_data_" . $filename;
    } else {
        croak "ABORTED: Improper call to visualize_data()";
    }
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  $datatype eq 'original' ? '"data"' : '"normed data"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        $arg_string = "\"$temp_file\" using 1 notitle with points lt -1 pt 1";
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        croak "No provision for plotting 1-D data\n";
    }
}

###########################  Generating Synthetic Data for Clustering  ##############################

#  The data generated corresponds to a multivariate distribution.  The mean and the
#  covariance of each Gaussian in the distribution are specified individually in a
#  parameter file.  See the example parameter file param.txt in the examples
#  directory.  Just edit this file for your own needs.
#
#  The multivariate random numbers are generated by calling the Math::Random module.
#  As you would expect, that module will insist that the covariance matrix you
#  specify be symmetric and positive definite.
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open INPUT, $input_parameter_file || "unable to open parameter file: $!";
        @all_params = <INPUT>;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
    @cluster_strings = grep  $_, @cluster_strings;
    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    my @point_labels = ('a'..'z');
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        my @num_strings = split /  /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums != 
                                      ($data_dimension ** 2);
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {        
                $cluster_covariance->[$j]->[$k] = 
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellojello' );
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
        push @data_dump, @new_data;     
    }
    fisher_yates_shuffle( \@data_dump );
    open OUTPUT, ">$output_file";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print OUTPUT "$coord ";
        }
        print OUTPUT "\n";
    }
    print "Data written out to file $output_file\n";
    close OUTPUT;
}

sub add_point_coords {
    my $self = shift;
    my @arr_of_ids = @{shift @_};      # array of data element names
    my @result;
    my $data_dimensionality = $self->{_data_dimensions};
    foreach my $i (0..$data_dimensionality-1) {
        $result[$i] = 0.0;
    }
    foreach my $id (@arr_of_ids) {
        my $ele = $self->{_data}->{$id};
        my $i = 0;
        foreach my $component (@$ele) {
            $result[$i] += $component;
            $i++;
        }
    }
    return \@result;
}

sub add_point_coords_from_original_data {
    my $self = shift;
    my @arr_of_ids = @{shift @_};      # array of data element names
    my @result;
    my $data_dimensionality = $self->{_data_dimensions};
    foreach my $i (0..$data_dimensionality-1) {
        $result[$i] = 0.0;
    }
    foreach my $id (@arr_of_ids) {
        my $ele = $self->{_original_data}->{$id};
        my $i = 0;
        foreach my $component (@$ele) {
            $result[$i] += $component;
            $i++;
        }
    }
    return \@result;
}

###################################   Support Routines  ########################################

sub get_index_at_value {
    my $value = shift;
    my @array = @{shift @_};
    foreach my $i (0..@array-1) {
        return $i if $value == $array[$i];
    }
    return -1;
}

# This routine is really not necessary in light of the new `~~' operator in Perl.
# Will use the new operator in the next version.
sub vector_equal {
    my $vec1 = shift;
    my $vec2 = shift;
    die "wrong data types for vector-equal predicate\n" if @$vec1 != @$vec2;
    foreach my $i (0..@$vec1-1){
        return 0 if $vec1->[$i] != $vec2->[$i];
    }

lib/Algorithm/KMeans.pm view on Meta::CPAN

=item B<visualize_clusters()>

    $clusterer->visualize_clusters( $visualization_mask )

The visualization mask here does not have to be identical to the one used for
clustering, but must be a subset of that mask.  This is convenient for visualizing
the clusters in two- or three-dimensional subspaces of the original space.

=item B<visualize_data()>

    $clusterer->visualize_data($visualization_mask, 'original');

    $clusterer->visualize_data($visualization_mask, 'normed');

This method requires a second argument and, as shown, it must be either the string
C<original> or the string C<normed>, the former for the visualization of the raw data
and the latter for the visualization of the data after its different dimensions are
normalized by the standard-deviations along those directions.  If you call the method
with the second argument set to C<normed>, but do so without turning on the
C<do_variance_normalization> option in the KMeans constructor, it will let you know.


=item  B<which_cluster_for_new_data_element()>

If you wish to determine the cluster membership of a new data sample after you have
created the clusters with the existing data samples, you would need to call this
method. The C<which_cluster_for_new_data.pl> script in the C<examples> directory
shows how to use this method.


=item  B<which_cluster_for_new_data_element_mahalanobis()>

This does the same thing as the previous method, except that it determines the
cluster membership using the Mahalanobis distance metric.  As for the previous
method, see the C<which_cluster_for_new_data.pl> script in the C<examples> directory
for how to use this method.


=item  B<cluster_data_generator()>

    Algorithm::KMeans->cluster_data_generator(
                            input_parameter_file => $parameter_file,
                            output_datafile => $out_datafile,
                            number_data_points_per_cluster => 20 );

for generating multivariate data for clustering if you wish to play with synthetic
data for clustering.  The input parameter file contains the means and the variances
for the different Gaussians you wish to use for the synthetic data.  See the file
C<param.txt> provided in the examples directory.  It will be easiest for you to just
edit this file for your data generation needs.  In addition to the format of the
parameter file, the main constraint you need to observe in specifying the parameters
is that the dimensionality of the covariance matrix must correspond to the
dimensionality of the mean vectors.  The multivariate random numbers are generated by
calling the C<Math::Random> module.  As you would expect, this module requires that
the covariance matrices you specify in your parameter file be symmetric and positive
definite.  Should the covariances in your parameter file not obey this condition, the
C<Math::Random> module will let you know.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

When the option C<terminal_output> is set in the call to the constructor, the
clusters are displayed on the terminal screen.

When the option C<write_clusters_to_files> is set in the call to the constructor, the
module dumps the clusters in files named

    cluster0.txt
    cluster1.txt
    cluster2.txt
    ...
    ...

in the directory in which you execute the module.  The number of such files will
equal the number of clusters formed.  All such existing files in the directory are
destroyed before any fresh ones are created.  Each cluster file contains the symbolic
ID tags of the data samples in that cluster.

The module also leaves in your directory a printable `.png' file that is a point plot
of the different clusters. The name of this file is always C<clustering_results.png>.

Also, as mentioned previously, a call to C<kmeans()> in your own code will return the
clusters and the cluster centers.

=head1 REQUIRED

This module requires the following three modules:

   Math::Random
   Graphics::GnuplotIF
   Math::GSL

With regard to the third item above, what is actually required is the
C<Math::GSL::Matrix> module.  However, that module is a part of the C<Math::GSL>
distribution. The acronym GSL stands for the GNU Scientific Library.  C<Math::GSL> is
a Perl interface to the GSL C-based library.


=head1 THE C<examples> DIRECTORY

The C<examples> directory contains several scripts to help you become familiar with
this module.  The following script is an example of how the module can be expected to
be used most of the time. It calls for clustering to be carried out with a fixed
C<K>:

        cluster_and_visualize.pl

The more time you spend with this script, the more comfortable you will become with
the use of this module. The script file contains a large comment block that mentions
six locations in the script where you have to make decisions about how to use the
module.

See the following script if you do not know what value to use for C<K> for clustering
your data:

        find_best_K_and_cluster.pl

This script uses the C<K=0> option in the constructor that causes the module to
search for the best C<K> for your data.  Since this search is virtually unbounded ---
limited only by the number of samples in your data file --- the script may take a

( run in 0.554 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )