Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
_K => $args{K} || croak("number of clusters required"),
_terminal_output => $args{terminal_output} || 0,
_seeding => $args{seeding} || 'random',
_seed_tags => $args{seed_tags} || [],
_max_em_iterations=> $args{max_em_iterations} || 100,
_class_priors => $args{class_priors} || [],
_debug => $args{debug} || 0,
_N => 0,
_data => {},
_data_id_tags => [],
_clusters => [],
_cluster_centers => [],
_data_dimensions => 0,
_cluster_normalizers => [],
_cluster_means => [],
_cluster_covariances => [],
_class_labels_for_data => {},
_class_probs_at_each_data_point => {},
_expected_class_probs => {},
_old_priors => [],
_old_old_priors => [],
_fisher_quality_vs_iteration => [],
_mdl_quality_vs_iterations => [],
}, $class;
}
sub read_data_from_file {
my $self = shift;
my $filename = $self->{_datafile};
$self->read_data_from_file_csv() if $filename =~ /.csv$/;
$self->read_data_from_file_dat() if $filename =~ /.dat$/;
}
sub read_data_from_file_csv {
my $self = shift;
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
open FILEIN, $filename or die "Unable to open $filename: $!";
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
local $/ = undef;
my @all_data = split /\s+/, <FILEIN>;
my %data_hash = ();
my @data_tags = ();
foreach my $record (@all_data) {
my @splits = split /,/, $record;
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
"the size of at least one of the data records in the file.\n"
unless scalar(@mask) == scalar(@splits);
my $record_name = shift @splits;
$data_hash{$record_name} = \@splits;
push @data_tags, $record_name;
}
$self->{_data} = \%data_hash;
$self->{_data_id_tags} = \@data_tags;
$self->{_N} = scalar @data_tags;
# Need to make the following call to set the global mean and covariance:
# my $covariance = $self->estimate_mean_and_covariance(\@data_tags);
# Need to make the following call to set the global eigenvec eigenval sets:
# $self->eigen_analysis_of_covariance($covariance);
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
"the number of clusters requested for the clusters to be " .
"meaningful $!"
if ( $self->{_N} < (2 * $self->{_K} ** 2) );
print "\n\n\n";
}
}
sub read_data_from_file_dat {
my $self = shift;
my $datafile = $self->{_datafile};
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n"
if $self->{_terminal_output};
open INPUT, $datafile
or die "unable to open file $datafile: $!";
chomp( my @raw_data = <INPUT> );
close INPUT;
# Transform strings into number data
foreach my $record (@raw_data) {
next unless $record;
next if $record =~ /^#/;
my @data_fields;
my @fields = split /\s+/, $record;
die "\nABORTED: Mask size does not correspond to row record size"
if $#fields != $#mask;
my $record_id;
foreach my $i (0..@fields-1) {
if ($mask[$i] eq '0') {
next;
} elsif ($mask[$i] eq 'N') {
$record_id = $fields[$i];
} elsif ($mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "misformed mask for reading the data file";
}
}
my @nums = map {/$_num_regex/;$_} @data_fields;
$self->{_data}->{ $record_id } = \@nums;
}
my @all_data_ids = keys %{$self->{_data}};
$self->{_data_id_tags} = \@all_data_ids;
$self->{_N} = scalar @all_data_ids;
if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
"points must satisfy the relation N > 2xK**2 where K is " .
"the number of clusters requested for the clusters to be " .
"meaningful $!"
if ( $self->{_N} < (2 * $self->{_K} ** 2) );
}
}
# This is the heart of the module --- in the sense that this method implements the EM
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
transpose($datavec_minus_mean),
matrix_vector_multiply($covar_inverses[$cluster_index],
$datavec_minus_mean ) );
} else {
my @var_inverse = $covar_inverses[$cluster_index]->as_list;
my $var_inverse_val = $var_inverse[0];
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
}
next if $covar->det() < 0;
my $coefficient = 1.0 /
( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions}
* sqrt($covar->det()) );
my $prob = $coefficient * exp($exponent);
$likelihood_for_each_tag +=
$prob * $self->{_class_priors}->[$cluster_index];
}
$log_likelihood += log( $likelihood_for_each_tag );
}
# Now calculate the model complexity penalty. $L is the total number of
# parameters it takes to specify a mixture of K Gaussians. If d is the
# dimensionality of the data space, the covariance matrix of each Gaussian takes
# (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
# then you need d mean value parameters, and one prior probability parameter
# for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
# is subtracted is to account for the normalization on the class priors.
my $L = (0.5 * $self->{_K} *
($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
return $mdl_criterion;
}
# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
# is the within-class scatter matrix, more commonly denoted S_w, and SB the
# between-class scatter matrix, more commonly denoted S_b (the underscore means
# subscript). This measure can be thought of as the normalized average distance
# between the clusters, the normalization being provided by average cluster
# covariance SW^-1. Therefore, the larger the value of this quality measure, the
# better the separation between the clusters. Since this measure has its roots in
# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
# name of the quality measure. Note that this measure is good only when the clusters
# are disjoint. When the clusters exhibit significant overlap, the numbers produced
# by this quality measure tend to be generally meaningless. As an extreme case,
# let's say your data was produced by a set of Gaussians, all with the same mean
# vector, but each with a distinct covariance. For this extreme case, this measure
# will produce a value close to zero --- depending on the accuracy with which the
# means are estimated --- even when your clusterer is doing a good job of identifying
# the individual clusters.
sub clustering_quality_fisher {
my $self = shift;
my @cluster_quality_indices;
my $fisher_trace = 0;
my $S_w =
Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
$S_w->zero;
my $S_b =
Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
$S_b->zero;
my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$global_mean->zero;
foreach my $cluster_index(0..$self->{_K}-1) {
$global_mean = $self->{_class_priors}->[$cluster_index] *
$self->{_cluster_means}->[$cluster_index];
}
foreach my $cluster_index(0..$self->{_K}-1) {
$S_w += $self->{_cluster_covariances}->[$cluster_index] *
$self->{_class_priors}->[$cluster_index];
my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
- $global_mean;
my $outer_product = outer_product( $class_mean_minus_global_mean,
$class_mean_minus_global_mean );
$S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
}
my $fisher = matrix_multiply($S_w->inverse, $S_b);
return $fisher unless defined blessed($fisher);
return matrix_trace($fisher);
}
sub display_seeding_stats {
my $self = shift;
foreach my $cluster_index(0..$self->{_K}-1) {
print "\nSeeding for cluster $cluster_index:\n";
my $mean = $self->{_cluster_means}->[$cluster_index];
display_matrix("The mean is: ", $mean);
my $covariance = $self->{_cluster_covariances}->[$cluster_index];
display_matrix("The covariance is: ", $covariance);
}
}
sub display_fisher_quality_vs_iterations {
my $self = shift;
print "\n\nFisher Quality vs. Iterations: " .
"@{$self->{_fisher_quality_vs_iteration}}\n\n";
}
sub display_mdl_quality_vs_iterations {
my $self = shift;
print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}
sub find_prob_at_each_datapoint_for_given_mean_and_covar {
my $self = shift;
my $mean_vec_ref = shift;
my $covar_ref = shift;
foreach my $data_id (keys %{$self->{_data}}) {
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col( 0, $self->{_data}->{$data_id});
if ($self->{_debug}) {
display_matrix("data vec in find prob function", $data_vec);
display_matrix("mean vec in find prob function", $mean_vec_ref);
display_matrix("covariance in find prob function", $covar_ref);
}
my $datavec_minus_mean = $data_vec - $mean_vec_ref;
display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
my $exponent = undef;
if ($self->{_data_dimensions} > 1) {
$exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
} elsif (defined blessed($covar_ref)) {
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
my @covar_as_matrix = $covar_ref->as_list;
my $covar_val = $covar_as_matrix[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
} else {
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
}
print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
my $coefficient = undef;
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
next if !defined $cluster;
$largest_cluster = $cluster if !defined $largest_cluster;
if (@$cluster > @$largest_cluster) {
$largest_cluster = $cluster;
}
}
foreach my $cluster (@clusters) {
if ( (!defined $cluster) || (@$cluster == 0) ) {
push @$cluster, pop @$largest_cluster;
}
}
foreach my $cluster (@clusters) {
die "Cluster became empty --- untenable condition " .
"for a given K. Try again. " if !defined $cluster;
my $cluster_size = @$cluster;
die "Cluster size is zero --- untenable." if $cluster_size == 0;
my @new_cluster_center = @{$self->add_point_coords( $cluster )};
@new_cluster_center = map {my $x = $_/$cluster_size; $x}
@new_cluster_center;
push @new_cluster_centers, \@new_cluster_center;
}
return \@new_cluster_centers;
}
# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
my $self = shift;
my $ele1_id = shift @_; # symbolic name of data sample
my @ele1 = @{$self->{_data}->{$ele1_id}};
my @ele2 = @{shift @_};
die "wrong data types for distance calculation" if @ele1 != @ele2;
my $how_many = @ele1;
my $squared_sum = 0;
foreach my $i (0..$how_many-1) {
$squared_sum += ($ele1[$i] - $ele2[$i])**2;
}
my $dist = sqrt $squared_sum;
return $dist;
}
# The following routine does the same as above but now both
# arguments are expected to be arrays of numbers:
sub distance2 {
my $self = shift;
my @ele1 = @{shift @_};
my @ele2 = @{shift @_};
die "wrong data types for distance calculation" if @ele1 != @ele2;
my $how_many = @ele1;
my $squared_sum = 0;
foreach my $i (0..$how_many-1) {
$squared_sum += ($ele1[$i] - $ele2[$i])**2;
}
return sqrt $squared_sum;
}
sub write_naive_bayes_clusters_to_files {
my $self = shift;
my @clusters = @{$self->{_clusters}};
unlink glob "naive_bayes_cluster*.txt";
foreach my $i (1..@clusters) {
my $filename = "naive_bayes_cluster" . $i . ".txt";
print "Writing cluster $i to file $filename\n"
if $self->{_terminal_output};
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
foreach my $ele (@{$clusters[$i-1]}) {
print FILEHANDLE "$ele\n";
}
close FILEHANDLE;
}
}
sub write_posterior_prob_clusters_above_threshold_to_files {
my $self = shift;
my $theta = shift;
my @class_distributions;
foreach my $cluster_index (0..$self->{_K}-1) {
push @class_distributions, [];
}
foreach my $data_tag (@{$self->{_data_id_tags}}) {
foreach my $cluster_index (0..$self->{_K}-1) {
push @{$class_distributions[$cluster_index]}, $data_tag
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
> $theta;
}
}
unlink glob "posterior_prob_cluster*.txt";
foreach my $i (1..@class_distributions) {
my $filename = "posterior_prob_cluster" . $i . ".txt";
print "Writing posterior prob cluster $i to file $filename\n"
if $self->{_terminal_output};
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
foreach my $ele (@{$class_distributions[$i-1]}) {
print FILEHANDLE "$ele\n";
}
close FILEHANDLE;
}
}
sub DESTROY {
unlink "__temp_" . basename($_[0]->{_datafile});
unlink "__temp_data_" . basename($_[0]->{_datafile});
unlink "__temp2_" . basename($_[0]->{_datafile});
unlink glob "__temp1dhist*";
unlink glob "__contour*";
}
############################# Visualization Code ###############################
# The visualize_clusters() implementation displays as a plot in your terminal window
# the clusters constructed by the EM algorithm. It can show either 2D plots or
# 3D plots that you can rotate interactively for better visualization. For
# multidimensional data, as to which 2D or 3D dimensions are used for visualization
# is controlled by the mask you must supply as an argument to the method. Should it
# happen that only one on bit is specified for the mask, visualize_clusters()
# aborts.
#
# The visualization code consists of first accessing each of clusters created by the
# EM() subroutine. Note that the clusters contain only the symbolic names for the
# individual records in the source data file. We therefore next reach into the
# $self->{_data} hash and get the data coordinates associated with each symbolic
# label in a cluster. The numerical data thus generated is then written out to a
# temp file. When doing so we must remember to insert TWO BLANK LINES between the
# data blocks corresponding to the different clusters. This constraint is imposed
# on us by Gnuplot when plotting data from the same file since we want to use
# different point styles for the data points in different cluster files.
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
# module to plot the data clusters.
sub visualize_clusters {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || die "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || die "visualization mask missing";
$pause_time = shift;
} else {
die "visualize_clusters() called with wrong args";
}
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
# The following section is for the superimposed one-Mahalanobis-distance-unit
# ellipses that are shown only for 2D plots:
if ($visualization_data_field_width == 2) {
foreach my $cluster_index (0..$self->{_K}-1) {
my $contour_filename = "__contour_" . $cluster_index . ".dat";
my $mean = $self->{_cluster_means}->[$cluster_index];
my $covariance = $self->{_cluster_covariances}->[$cluster_index];
my ($mux,$muy) = $mean->as_list();
my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
my ($sigmayx,$vary) = $covariance->row(1)->as_list();
die "Your covariance matrix does not look right"
unless $sigmaxy == $sigmayx;
my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
( run in 2.299 seconds using v1.01-cache-2.11-cpan-56fb94df46f )