Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
package Algorithm::ExpectationMaximization;
#---------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::ExpectationMaximization is a pure Perl implementation for
# Expectation-Maximization based clustering of multi-dimensional data that can be
# modeled as a Gaussian mixture.
# ---------------------------------------------------------------------------
use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use Scalar::Util 'blessed';
our $VERSION = '1.22';
# from perl docs:
my $_num_regex = '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$';
# Constructor:
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument " .
"--- perhaps a misspelling\n"
if check_for_illegal_params(@params) == 0;
bless {
_datafile => $args{datafile} || croak("datafile required"),
_mask => $args{mask} || croak("mask required"),
_K => $args{K} || croak("number of clusters required"),
_terminal_output => $args{terminal_output} || 0,
_seeding => $args{seeding} || 'random',
_seed_tags => $args{seed_tags} || [],
_max_em_iterations=> $args{max_em_iterations} || 100,
_class_priors => $args{class_priors} || [],
_debug => $args{debug} || 0,
_N => 0,
_data => {},
_data_id_tags => [],
_clusters => [],
_cluster_centers => [],
_data_dimensions => 0,
_cluster_normalizers => [],
_cluster_means => [],
_cluster_covariances => [],
_class_labels_for_data => {},
_class_probs_at_each_data_point => {},
_expected_class_probs => {},
_old_priors => [],
_old_old_priors => [],
_fisher_quality_vs_iteration => [],
_mdl_quality_vs_iterations => [],
}, $class;
}
sub read_data_from_file {
my $self = shift;
my $filename = $self->{_datafile};
$self->read_data_from_file_csv() if $filename =~ /.csv$/;
$self->read_data_from_file_dat() if $filename =~ /.dat$/;
}
sub read_data_from_file_csv {
my $self = shift;
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
my $mask = $self->{_mask};
my @mask = split //, $mask;
$self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
print "data dimensionality: $self->{_data_dimensions} \n"if $self->{_terminal_output};
open FILEIN, $filename or die "Unable to open $filename: $!";
die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
local $/ = undef;
my @all_data = split /\s+/, <FILEIN>;
my %data_hash = ();
my @data_tags = ();
foreach my $record (@all_data) {
my @splits = split /,/, $record;
die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
"the size of at least one of the data records in the file.\n"
unless scalar(@mask) == scalar(@splits);
my $record_name = shift @splits;
$data_hash{$record_name} = \@splits;
push @data_tags, $record_name;
}
$self->{_data} = \%data_hash;
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
$prob * $self->{_class_priors}->[$cluster_index];
}
$log_likelihood += log( $likelihood_for_each_tag );
}
# Now calculate the model complexity penalty. $L is the total number of
# parameters it takes to specify a mixture of K Gaussians. If d is the
# dimensionality of the data space, the covariance matrix of each Gaussian takes
# (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
# then you need d mean value parameters, and one prior probability parameter
# for the Gaussian. So $L = K[1 + d + d(d+1)/2] - 1 where the final '1' that
# is subtracted is to account for the normalization on the class priors.
my $L = (0.5 * $self->{_K} *
($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
return $mdl_criterion;
}
# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
# is the within-class scatter matrix, more commonly denoted S_w, and SB the
# between-class scatter matrix, more commonly denoted S_b (the underscore means
# subscript). This measure can be thought of as the normalized average distance
# between the clusters, the normalization being provided by average cluster
# covariance SW^-1. Therefore, the larger the value of this quality measure, the
# better the separation between the clusters. Since this measure has its roots in
# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
# name of the quality measure. Note that this measure is good only when the clusters
# are disjoint. When the clusters exhibit significant overlap, the numbers produced
# by this quality measure tend to be generally meaningless. As an extreme case,
# let's say your data was produced by a set of Gaussians, all with the same mean
# vector, but each with a distinct covariance. For this extreme case, this measure
# will produce a value close to zero --- depending on the accuracy with which the
# means are estimated --- even when your clusterer is doing a good job of identifying
# the individual clusters.
sub clustering_quality_fisher {
my $self = shift;
my @cluster_quality_indices;
my $fisher_trace = 0;
my $S_w =
Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
$S_w->zero;
my $S_b =
Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
$S_b->zero;
my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$global_mean->zero;
foreach my $cluster_index(0..$self->{_K}-1) {
$global_mean = $self->{_class_priors}->[$cluster_index] *
$self->{_cluster_means}->[$cluster_index];
}
foreach my $cluster_index(0..$self->{_K}-1) {
$S_w += $self->{_cluster_covariances}->[$cluster_index] *
$self->{_class_priors}->[$cluster_index];
my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index]
- $global_mean;
my $outer_product = outer_product( $class_mean_minus_global_mean,
$class_mean_minus_global_mean );
$S_b += $self->{_class_priors}->[$cluster_index] * $outer_product;
}
my $fisher = matrix_multiply($S_w->inverse, $S_b);
return $fisher unless defined blessed($fisher);
return matrix_trace($fisher);
}
sub display_seeding_stats {
my $self = shift;
foreach my $cluster_index(0..$self->{_K}-1) {
print "\nSeeding for cluster $cluster_index:\n";
my $mean = $self->{_cluster_means}->[$cluster_index];
display_matrix("The mean is: ", $mean);
my $covariance = $self->{_cluster_covariances}->[$cluster_index];
display_matrix("The covariance is: ", $covariance);
}
}
sub display_fisher_quality_vs_iterations {
my $self = shift;
print "\n\nFisher Quality vs. Iterations: " .
"@{$self->{_fisher_quality_vs_iteration}}\n\n";
}
sub display_mdl_quality_vs_iterations {
my $self = shift;
print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}
sub find_prob_at_each_datapoint_for_given_mean_and_covar {
my $self = shift;
my $mean_vec_ref = shift;
my $covar_ref = shift;
foreach my $data_id (keys %{$self->{_data}}) {
my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
$data_vec->set_col( 0, $self->{_data}->{$data_id});
if ($self->{_debug}) {
display_matrix("data vec in find prob function", $data_vec);
display_matrix("mean vec in find prob function", $mean_vec_ref);
display_matrix("covariance in find prob function", $covar_ref);
}
my $datavec_minus_mean = $data_vec - $mean_vec_ref;
display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
my $exponent = undef;
if ($self->{_data_dimensions} > 1) {
$exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
} elsif (defined blessed($covar_ref)) {
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
my @covar_as_matrix = $covar_ref->as_list;
my $covar_val = $covar_as_matrix[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
} else {
my @data_minus_mean = $datavec_minus_mean->as_list;
my $data_minus_mean_val = $data_minus_mean[0];
$exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
}
print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
my $coefficient = undef;
if ($self->{_data_dimensions} > 1) {
$coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
$covar_ref->det()) ;
} elsif (!defined blessed($covar_ref)) {
$coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
} else {
my @covar_as_matrix = $covar_ref->as_list;
my $covar_val = $covar_as_matrix[0];
$coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
}
my $prob = $coefficient * exp($exponent);
push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
}
}
sub find_expected_classes_at_each_datapoint {
my $self = shift;
my @priors = @{$self->{_class_priors}};
foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
my $numerator =
vector_2_vector_multiply(
$self->{_class_probs_at_each_data_point}->{$data_id},
$self->{_class_priors} );
my $sum = 0;
foreach my $part (@$numerator) {
$sum += $part;
}
$self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
}
}
sub initialize_class_priors {
my $self = shift;
if (@{$self->{_class_priors}} == 0) {
my $prior = 1.0 / $self->{_K};
foreach my $class_index (0..$self->{_K}-1) {
push @{$self->{_class_priors}}, $prior;
}
}
die "Mismatch between number of values for class priors " .
"and the number of clusters expected"
unless @{$self->{_class_priors}} == $self->{_K};
my $sum = 0;
foreach my $prior (@{$self->{_class_priors}}) {
$sum += $prior;
}
die "Your priors in the constructor call do not add up to 1"
unless abs($sum - 1) < 0.001;
print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
}
sub estimate_class_priors {
my $self = shift;
foreach my $datatag (keys %{$self->{_data}}) {
my $class_label = $self->{_class_labels}->{$datatag};
$self->{_class_priors}[$class_label]++;
}
foreach my $prior (@{$self->{_class_priors}}) {
$prior /= $self->{_total_number_data_tuples};
}
foreach my $prior (@{$self->{_class_priors}}) {
print "class priors: @{$self->{_class_priors}}\n";
}
}
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub get_value_index_hash {
my $arr = shift;
my %hash;
foreach my $index (0..@$arr-1) {
$hash{$arr->[$index]} = $index if $arr->[$index] > 0;
}
return \%hash;
}
sub non_maximum_supression {
my $arr = shift;
my @output = (0) x @$arr;
my @final_output = (0) x @$arr;
my %hash;
my @array_of_runs = ([$arr->[0]]);
foreach my $index (1..@$arr-1) {
if ($arr->[$index] == $arr->[$index-1]) {
push @{$array_of_runs[-1]}, $arr->[$index];
} else {
push @array_of_runs, [$arr->[$index]];
}
}
my $runstart_index = 0;
foreach my $run_index (1..@array_of_runs-2) {
$runstart_index += @{$array_of_runs[$run_index-1]};
if ($array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index-1]->[0] &&
$array_of_runs[$run_index]->[0] >
$array_of_runs[$run_index+1]->[0]) {
my $run_center = @{$array_of_runs[$run_index]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
}
if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
$runstart_index += @{$array_of_runs[-2]};
my $run_center = @{$array_of_runs[-1]} / 2;
my $assignment_index = $runstart_index + $run_center;
$output[$assignment_index] = $arr->[$assignment_index];
}
if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
my $run_center = @{$array_of_runs[0]} / 2;
$output[$run_center] = $arr->[$run_center];
}
return \@output;
}
sub display_matrix {
my $message = shift;
my $matrix = shift;
if (!defined blessed($matrix)) {
print "display_matrix called on a scalar value: $matrix\n";
return;
}
my $nrows = $matrix->rows();
my $ncols = $matrix->cols();
print "$message ($nrows rows and $ncols columns)\n";
foreach my $i (0..$nrows-1) {
my $row = $matrix->row($i);
my @row_as_list = $row->as_list;
print "@row_as_list\n";
}
print "\n";
}
sub transpose {
my $matrix = shift;
my $num_rows = $matrix->rows();
my $num_cols = $matrix->cols();
my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
foreach my $i (0..$num_rows-1) {
my @row = $matrix->row($i)->as_list;
$transpose->set_col($i, \@row );
}
return $transpose;
}
sub vector_multiply {
my $vec1 = shift;
my $vec2 = shift;
die "vec_multiply called with two vectors of different sizes"
unless @$vec1 == @$vec2;
my $result = 0;
foreach my $i (0..@$vec1-1) {
$result += $vec1->[$i] * $vec2->[$i];
}
return $result;
}
sub vector_2_vector_multiply {
my $vec1 = shift;
my $vec2 = shift;
die "vec_multiply called with two vectors of different sizes"
unless @$vec1 == @$vec2;
my @result_vec;
foreach my $i (0..@$vec1-1) {
$result_vec[$i] = $vec1->[$i] * $vec2->[$i];
}
return \@result_vec;
}
sub matrix_multiply {
my $matrix1 = shift;
my $matrix2 = shift;
my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
die "matrix multiplication called with non-matching matrix arguments"
# unless $ncols1 == $nrows2;
unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
if ($nrows1 == 1) {
my @row = $matrix1->row(0)->as_list;
( run in 1.334 second using v1.01-cache-2.11-cpan-5b529ec07f3 )