Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

package Algorithm::ExpectationMaximization;

#---------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::ExpectationMaximization is a pure Perl implementation for
# Expectation-Maximization based clustering of multi-dimensional data that can be
# modeled as a Gaussian mixture.
# ---------------------------------------------------------------------------

use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use Scalar::Util 'blessed';

our $VERSION = '1.22';

# from perl docs:
my $_num_regex =  '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$'; 

# Constructor:
sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params(@params) == 0;
    bless {
        _datafile         =>   $args{datafile} || croak("datafile required"),
        _mask             =>   $args{mask}     || croak("mask required"),
        _K                =>   $args{K}       || croak("number of clusters required"),
        _terminal_output  =>   $args{terminal_output}   || 0,
        _seeding          =>   $args{seeding}           || 'random',
        _seed_tags        =>   $args{seed_tags}         || [],
        _max_em_iterations=>   $args{max_em_iterations} || 100,
        _class_priors     =>   $args{class_priors}      || [],
        _debug            =>   $args{debug}             || 0,
        _N                =>   0,
        _data             =>   {},
        _data_id_tags     =>   [],
        _clusters         =>   [],
        _cluster_centers  =>   [],
        _data_dimensions  =>   0,
        _cluster_normalizers            => [],
        _cluster_means                  => [],
        _cluster_covariances            => [],
        _class_labels_for_data          => {},
        _class_probs_at_each_data_point => {},
        _expected_class_probs           => {}, 
        _old_priors                     => [],
        _old_old_priors                 => [],
        _fisher_quality_vs_iteration    => [],
        _mdl_quality_vs_iterations      => [],
    }, $class;
}


sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    $self->read_data_from_file_csv() if $filename =~ /.csv$/;
    $self->read_data_from_file_dat() if $filename =~ /.dat$/;
}

sub read_data_from_file_csv {
    my $self = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        my @splits = split /,/, $record;
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file.\n"
            unless scalar(@mask) == scalar(@splits);
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data} = \%data_hash;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

                              $prob * $self->{_class_priors}->[$cluster_index];
        }
        $log_likelihood += log( $likelihood_for_each_tag );
    }
    # Now calculate the model complexity penalty. $L is the total number of
    # parameters it takes to specify a mixture of K Gaussians. If d is the
    # dimensionality of the data space, the covariance matrix of each Gaussian takes
    # (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
    # then you need d mean value parameters, and one prior probability parameter
    # for the Gaussian. So   $L = K[1 + d + d(d+1)/2] - 1  where the final '1' that
    # is subtracted is to account for the normalization on the class priors.
    my $L = (0.5 * $self->{_K} * 
             ($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
    my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
    my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
    return $mdl_criterion;
}

# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
# is the within-class scatter matrix, more commonly denoted S_w, and SB the
# between-class scatter matrix, more commonly denoted S_b (the underscore means
# subscript).  This measure can be thought of as the normalized average distance
# between the clusters, the normalization being provided by average cluster
# covariance SW^-1. Therefore, the larger the value of this quality measure, the
# better the separation between the clusters.  Since this measure has its roots in
# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
# name of the quality measure.  Note that this measure is good only when the clusters
# are disjoint.  When the clusters exhibit significant overlap, the numbers produced
# by this quality measure tend to be generally meaningless.  As an extreme case,
# let's say your data was produced by a set of Gaussians, all with the same mean
# vector, but each with a distinct covariance. For this extreme case, this measure
# will produce a value close to zero --- depending on the accuracy with which the
# means are estimated --- even when your clusterer is doing a good job of identifying
# the individual clusters.
sub clustering_quality_fisher {
    my $self = shift;
    my @cluster_quality_indices;
    my $fisher_trace = 0;
    my $S_w = 
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_w->zero;
    my $S_b = 
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_b->zero;
    my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);    
    $global_mean->zero;
    foreach my $cluster_index(0..$self->{_K}-1) { 
        $global_mean = $self->{_class_priors}->[$cluster_index] *
                                    $self->{_cluster_means}->[$cluster_index];
    }
    foreach my $cluster_index(0..$self->{_K}-1) {      
        $S_w +=  $self->{_cluster_covariances}->[$cluster_index] * 
                            $self->{_class_priors}->[$cluster_index];
        my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index] 
                                                               - $global_mean;
        my $outer_product = outer_product( $class_mean_minus_global_mean, 
                                               $class_mean_minus_global_mean );
        $S_b +=  $self->{_class_priors}->[$cluster_index] * $outer_product;
    }
    my $fisher = matrix_multiply($S_w->inverse, $S_b);
    return $fisher unless defined blessed($fisher);
    return matrix_trace($fisher);
}

sub display_seeding_stats {
    my $self = shift;
    foreach my $cluster_index(0..$self->{_K}-1) {      
        print "\nSeeding for cluster $cluster_index:\n";
        my $mean = $self->{_cluster_means}->[$cluster_index];
        display_matrix("The mean is: ", $mean);        
        my $covariance = $self->{_cluster_covariances}->[$cluster_index];
        display_matrix("The covariance is: ", $covariance);
    }
}

sub display_fisher_quality_vs_iterations {
    my $self = shift;
    print "\n\nFisher Quality vs. Iterations: " .
                      "@{$self->{_fisher_quality_vs_iteration}}\n\n";
}

sub display_mdl_quality_vs_iterations {
    my $self = shift;
    print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}

sub find_prob_at_each_datapoint_for_given_mean_and_covar {
    my $self = shift;
    my $mean_vec_ref = shift;
    my $covar_ref = shift;
    foreach my $data_id (keys %{$self->{_data}}) {
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);        
        $data_vec->set_col( 0, $self->{_data}->{$data_id});
        if ($self->{_debug}) {
            display_matrix("data vec in find prob function", $data_vec);
            display_matrix("mean vec in find prob function", $mean_vec_ref);
            display_matrix("covariance in find prob function", $covar_ref);
        }
        my $datavec_minus_mean = $data_vec - $mean_vec_ref;
        display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
        my $exponent = undef;
        if ($self->{_data_dimensions} > 1) {
            $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
                matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
        } elsif (defined blessed($covar_ref)) {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
        } else {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
        }
        print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
        my $coefficient = undef;
        if ($self->{_data_dimensions} > 1) {
            $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) * 
                                                                                $covar_ref->det()) ;
        } elsif (!defined blessed($covar_ref)) {
            $coefficient =  1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
        } else {         
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $coefficient =  1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
        }
        my $prob = $coefficient * exp($exponent);
        push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
    }
}

sub find_expected_classes_at_each_datapoint {
    my $self = shift;
    my @priors = @{$self->{_class_priors}};
    foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
        my $numerator = 
          vector_2_vector_multiply( 
                           $self->{_class_probs_at_each_data_point}->{$data_id}, 
                           $self->{_class_priors} );
        my $sum = 0;
        foreach my $part (@$numerator) {
            $sum += $part;
        }
        $self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
    }
}

sub initialize_class_priors {
    my $self = shift;
    if (@{$self->{_class_priors}} == 0) {
        my $prior = 1.0 / $self->{_K};
        foreach my $class_index (0..$self->{_K}-1) {
            push @{$self->{_class_priors}}, $prior;
        }
    }
    die "Mismatch between number of values for class priors " .
          "and the number of clusters expected"
        unless @{$self->{_class_priors}} == $self->{_K};
    my $sum = 0;
    foreach my $prior (@{$self->{_class_priors}}) {
        $sum += $prior;
    }
    die "Your priors in the constructor call do not add up to 1"  
                       unless abs($sum - 1) < 0.001;
    print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
}

sub estimate_class_priors {
    my $self = shift;
    foreach my $datatag (keys %{$self->{_data}}) {
        my $class_label = $self->{_class_labels}->{$datatag};
        $self->{_class_priors}[$class_label]++;
    }
    foreach my $prior (@{$self->{_class_priors}}) {
        $prior /= $self->{_total_number_data_tuples};
    }
    foreach my $prior (@{$self->{_class_priors}}) {
        print "class priors: @{$self->{_class_priors}}\n";
    }
}    

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub get_value_index_hash {
    my $arr = shift;
    my %hash;
    foreach my $index (0..@$arr-1) {
        $hash{$arr->[$index]} = $index if $arr->[$index] > 0;
    }
    return \%hash;
}

sub non_maximum_supression {
    my $arr = shift;
    my @output = (0) x @$arr;
    my @final_output = (0) x @$arr;
    my %hash;
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {  
            push @array_of_runs, [$arr->[$index]];
        }
    }
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] > 
            $array_of_runs[$run_index-1]->[0]  &&
            $array_of_runs[$run_index]->[0] > 
            $array_of_runs[$run_index+1]->[0]) {
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;
}

sub display_matrix {
    my $message = shift;
    my $matrix = shift;
    if (!defined blessed($matrix)) {
        print "display_matrix called on a scalar value: $matrix\n";
        return;
    }
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "$message ($nrows rows and $ncols columns)\n";
    foreach my $i (0..$nrows-1) {
        my $row = $matrix->row($i);
        my @row_as_list = $row->as_list;
        print "@row_as_list\n";
    }
    print "\n";
}

sub transpose {
    my $matrix = shift;
    my $num_rows = $matrix->rows();
    my $num_cols = $matrix->cols();
    my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
    foreach my $i (0..$num_rows-1) {
        my @row = $matrix->row($i)->as_list;
        $transpose->set_col($i, \@row );
    }
    return $transpose;
}

sub vector_multiply {
    my $vec1 = shift;
    my $vec2 = shift;
    die "vec_multiply called with two vectors of different sizes"
        unless @$vec1 == @$vec2;
    my $result = 0;
    foreach my $i (0..@$vec1-1) {
        $result += $vec1->[$i] * $vec2->[$i];
    }
    return $result;
}

sub vector_2_vector_multiply {
    my $vec1 = shift;
    my $vec2 = shift;
    die "vec_multiply called with two vectors of different sizes"
        unless @$vec1 == @$vec2;
    my @result_vec;
    foreach my $i (0..@$vec1-1) {
        $result_vec[$i] = $vec1->[$i] * $vec2->[$i];
    }
    return \@result_vec;
}

sub matrix_multiply {
    my $matrix1 = shift;
    my $matrix2 = shift;
    my ($nrows1, $ncols1) = ($matrix1->rows(), $matrix1->cols());
    my ($nrows2, $ncols2) = ($matrix2->rows(), $matrix2->cols());
    die "matrix multiplication called with non-matching matrix arguments"
#        unless $ncols1 == $nrows2;
        unless $nrows1 == $ncols2 && $ncols1 == $nrows2;
    if ($nrows1 == 1) {
        my @row = $matrix1->row(0)->as_list;



( run in 1.334 second using v1.01-cache-2.11-cpan-5b529ec07f3 )