Algorithm-KMeans
view release on metacpan or search on metacpan
lib/Algorithm/KMeans.pm view on Meta::CPAN
}
}
# Reads the data file named by $self->{_datafile} and walks its records
# under control of the column mask in $self->{_mask}.
#
# The mask is split into single characters, one per whitespace-separated
# column of a record; the count of '1' characters is stored in
# $self->{_data_dimensions} (and echoed when $self->{_terminal_output} is
# set).
#
# Dies if the file cannot be opened, or if a record's field count differs
# from the mask length.
sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode/pipe characters or surrounding whitespace are
    # interpreted) and no global bareword handle is clobbered.
    open my $input_fh, '<', $datafile or die "unable to open file $datafile: $!\n";
    chomp( my @raw_data = <$input_fh> );
    close $input_fh;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        # Skip records that are empty (or otherwise false, e.g. a lone "0")
        # and comment records that start with '#'.
        next unless $record;
        next if $record =~ /^#/;
        my @data_fields;
        my @fields = split /\s+/, $record;
        # Every column of the record must have a corresponding mask character.
        die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {
lib/Algorithm/KMeans.pm view on Meta::CPAN
# Any mask character that was not accepted above is a fatal error.
croak "Misformed visualization mask. It can only have 1s and 0s\n";
}
}
# Remember the fields collected for this record, keyed by its record id.
$visualization_data{ $record_id } = \@data_fields;
}
my @all_data_ids = @{$self->{_data_id_tags}};
# $K is the number of clusters currently held in $self->{_clusters}.
my $K = scalar @{$self->{_clusters}};
# The temp file is named after the basename of the data file and is opened
# by relative name, so it is created in the current working directory.
my $filename = basename($master_datafile);
my $temp_file = "__temp_" . $filename;
# Remove any stale temp file left behind under the same name.
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!\n";
# Write one line per cluster member: its visualization fields joined by the
# list separator (a single space by default). Each cluster is followed by
# two extra newlines.
# NOTE(review): presumably the blank lines let gnuplot treat each cluster as
# a separate data block -- confirm against the plot commands that follow.
foreach my $cluster (@{$self->{_clusters}}) {
foreach my $item (@$cluster) {
print OUTPUT "@{$visualization_data{$item}}";
print OUTPUT "\n";
}
print OUTPUT "\n\n";
}
close OUTPUT;
my $plot;
my $hardcopy_plot;
# Without a pause time: one gnuplot session created with persist => 1, plus a
# second session whose terminal is png and whose output file is
# clustering_results.png. With a pause time: a single session created
# without the persist option.
if (!defined $pause_time) {
$plot = Graphics::GnuplotIF->new( persist => 1 );
$hardcopy_plot = Graphics::GnuplotIF->new();
$hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
} else {
$plot = Graphics::GnuplotIF->new();
}
$plot->gnuplot_cmd( "set noclip" );
lib/Algorithm/KMeans.pm view on Meta::CPAN
# Choose the temp-file name from the requested data type; only 'original'
# and 'normed' are accepted, anything else aborts.
my $filename = basename($master_datafile);
my $temp_file;
if ($datatype eq 'original') {
$temp_file = "__temp_data_" . $filename;
} elsif ($datatype eq 'normed') {
$temp_file = "__temp_normed_data_" . $filename;
} else {
croak "ABORTED: Improper call to visualize_data()";
}
# Remove any stale temp file left behind under the same name.
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!\n";
# Write one line per data point: its fields joined by the list separator
# (a single space by default). Iteration is over the hash values, so the
# order of the lines is whatever order the hash yields.
foreach my $datapoint (values %visualization_data) {
print OUTPUT "@$datapoint";
print OUTPUT "\n";
}
close OUTPUT;
# A gnuplot session created with persist => 1.
my $plot = Graphics::GnuplotIF->new( persist => 1 );
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
# The title keeps its own double quotes because it is interpolated verbatim
# into the gnuplot argument string below.
my $plot_title = $datatype eq 'original' ? '"data"' : '"normed data"';
my $arg_string ;
# Build the gnuplot argument string from the number of visualization fields:
# more than two fields use columns 1:2:3, exactly two use columns 1:2.
if ($visualization_data_field_width > 2) {
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 2) {
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 1 ) {
lib/Algorithm/KMeans.pm view on Meta::CPAN
# Class method; must be invoked on the class name 'Algorithm::KMeans',
# otherwise it croaks.
#
# Named arguments read here:
#   input_parameter_file            optional path of a parameter file
#   output_datafile                 name of the file the data is written to
#   number_data_points_per_cluster  stored in $N
#
# When a parameter file is given, its lines are read, lines whose first
# non-space character is '#' are dropped, and the remainder is joined with
# single spaces into one parameter string. Without a parameter file a
# built-in three-cluster parameter string is used. The parameter string is
# then split on the word "cluster".
#
# Dies if the parameter file cannot be opened.
sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        # Low-precedence "or" so that the check applies to open() itself;
        # a failed open must abort rather than silently yield an empty
        # parameter list.
        open my $param_fh, '<', $input_parameter_file
            or die "unable to open parameter file $input_parameter_file: $!\n";
        @all_params = <$param_fh>;
        close $param_fh;
        # Drop comment lines (optional leading spaces followed by '#').
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5 1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;
lib/Algorithm/KMeans.pm view on Meta::CPAN
# Generate $N samples for each of the $K clusters, consuming one mean vector
# and one covariance matrix per cluster, and tag every sample with an ID.
foreach my $i (0..$K-1) {
    my @m = @{shift @means};
    my @covar = @{shift @covariances};
    my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
    my $label = $point_labels[$i];
    # Prepend the ID (cluster label followed by a running index) to each
    # sample. The running index starts at the cluster index $i, which keeps
    # the generated IDs identical to what this routine has always produced,
    # without modifying the loop variable itself.
    my $tag_index = $i;
    unshift @$_, $label . $tag_index++ foreach @new_data;
    push @data_dump, @new_data;
}
# Mix the samples of all clusters before writing them out.
fisher_yates_shuffle( \@data_dump );
# Abort if the output file cannot be created instead of silently printing
# to an unopened handle and reporting success.
open my $output_fh, '>', $output_file
    or die "unable to open output file $output_file: $!\n";
# One record per line; every field (ID first) is followed by a single space.
foreach my $ele (@data_dump) {
    foreach my $coord ( @$ele ) {
        print {$output_fh} "$coord ";
    }
    print {$output_fh} "\n";
}
# Buffered write errors only surface at close, so check it before
# announcing that the data was written.
close $output_fh
    or die "unable to finish writing output file $output_file: $!\n";
print "Data written out to file $output_file\n";
}
# Takes a reference to an array of data element names. The visible part of
# the routine copies that array and prepares @result with one 0.0 entry per
# data dimension (as given by $self->{_data_dimensions}).
# NOTE(review): presumably the coordinates of the named elements are then
# summed into @result per dimension -- confirm against the rest of the body.
sub add_point_coords {
my $self = shift;
my @arr_of_ids = @{shift @_}; # array of data element names
my @result;
my $data_dimensionality = $self->{_data_dimensions};
# Zero-initialise one slot per dimension.
foreach my $i (0..$data_dimensionality-1) {
$result[$i] = 0.0;
}
lib/Algorithm/KMeans.pm view on Meta::CPAN
parameter file, the main constraint you need to observe in specifying the parameters
is that the dimensionality of the covariance matrix must correspond to the
dimensionality of the mean vectors. The multivariate random numbers are generated by
calling the C<Math::Random> module. As you would expect, this module requires that
the covariance matrices you specify in your parameter file be symmetric and positive
definite. Should the covariances in your parameter file not obey this condition, the
C<Math::Random> module will let you know.
=back
=head1 HOW THE CLUSTERS ARE OUTPUT
When the option C<terminal_output> is set in the call to the constructor, the
clusters are displayed on the terminal screen.
When the option C<write_clusters_to_files> is set in the call to the constructor, the
module dumps the clusters in files named
cluster0.txt
cluster1.txt
cluster2.txt
( run in 0.417 second using v1.01-cache-2.11-cpan-4e96b696675 )