Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

package Algorithm::ExpectationMaximization;

#---------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::ExpectationMaximization is a pure Perl implementation for
# Expectation-Maximization based clustering of multi-dimensional data that can be
# modeled as a Gaussian mixture.
# ---------------------------------------------------------------------------

use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Math::Random;
use Graphics::GnuplotIF;
use Math::GSL::Matrix;
use Scalar::Util 'blessed';

our $VERSION = '1.22';

# from perl docs:
my $_num_regex =  '^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$'; 

# Constructor:
sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if check_for_illegal_params(@params) == 0;
    bless {
        _datafile         =>   $args{datafile} || croak("datafile required"),
        _mask             =>   $args{mask}     || croak("mask required"),
        _K                =>   $args{K}       || croak("number of clusters required"),
        _terminal_output  =>   $args{terminal_output}   || 0,
        _seeding          =>   $args{seeding}           || 'random',
        _seed_tags        =>   $args{seed_tags}         || [],
        _max_em_iterations=>   $args{max_em_iterations} || 100,
        _class_priors     =>   $args{class_priors}      || [],
        _debug            =>   $args{debug}             || 0,
        _N                =>   0,
        _data             =>   {},
        _data_id_tags     =>   [],
        _clusters         =>   [],
        _cluster_centers  =>   [],
        _data_dimensions  =>   0,
        _cluster_normalizers            => [],
        _cluster_means                  => [],
        _cluster_covariances            => [],
        _class_labels_for_data          => {},
        _class_probs_at_each_data_point => {},
        _expected_class_probs           => {}, 
        _old_priors                     => [],
        _old_old_priors                 => [],
        _fisher_quality_vs_iteration    => [],
        _mdl_quality_vs_iterations      => [],
    }, $class;
}


sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    $self->read_data_from_file_csv() if $filename =~ /.csv$/;
    $self->read_data_from_file_dat() if $filename =~ /.dat$/;
}

sub read_data_from_file_csv {
    my $self = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        my @splits = split /,/, $record;
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file.\n"
            unless scalar(@mask) == scalar(@splits);
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data} = \%data_hash;
    $self->{_data_id_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
    # Need to make the following call to set the global mean and covariance:
    # my $covariance =  $self->estimate_mean_and_covariance(\@data_tags);
    # Need to make the following call to set the global eigenvec eigenval sets:
    # $self->eigen_analysis_of_covariance($covariance);
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

            if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {
                next;
            } elsif ($mask[$i] eq 'N') {
                $record_id = $fields[$i];
            } elsif ($mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "misformed mask for reading the data file";
            }
        }
        my @nums = map {/$_num_regex/;$_} @data_fields;
        $self->{_data}->{ $record_id } = \@nums;
    }
    my @all_data_ids = keys %{$self->{_data}};
    $self->{_data_id_tags} = \@all_data_ids;
    $self->{_N} = scalar @all_data_ids;
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!" 
                         if ( $self->{_N} < (2 * $self->{_K} ** 2) );
    }
}


# This is the heart of the module --- in the sense that this method implements the EM
# algorithm for the estimating the parameters of a Gaussian mixture model for the
# data. In the implementation shown below, we declare convergence for the EM
# algorithm when the change in the class priors over three iterations falls below a
# threshold.  The current value of this threshold, as can be seen in the function
# compare_array_floats(), is 0.00001.
sub EM {
    my $self = shift;
    $self->initialize_class_priors();
    for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations}; 
                                                             $em_iteration++) {
        if ($em_iteration == 0) {
            print "\nSeeding the EM algorithm with:\n";                     
            $self->display_seeding_stats();
            print "\nFinished displaying the seeding information\n";
            print "\nWill print out a dot for each iteration of EM:\n\n";
        }
        my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
        print $progress_indicator;
        foreach my $data_id (@{$self->{_data_id_tags}}) {
            $self->{_class_probs_at_each_data_point}->{$data_id} = [];
            $self->{_expected_class_probs}->{$data_id} = [];
        }
        # Calculate prob(x | C_i) --- this is the prob of data point x as
        # a member of class C_i. You must do this for all K classes:
        foreach my $cluster_index(0..$self->{_K}-1) {
            $self->find_prob_at_each_datapoint_for_given_mean_and_covar(
                $self->{_cluster_means}->[$cluster_index],
                $self->{_cluster_covariances}->[$cluster_index] );
        }
        $self->{_cluster_normalizers} = [];
        if ($self->{_debug}) {
            print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
            foreach my $data_id (sort keys %{$self->{_data}}) {
                my $class_probs_at_a_point = 
                      $self->{_class_probs_at_each_data_point}->{$data_id};
                print "Class probs for $data_id: @$class_probs_at_a_point\n"
            }
        }
        # Calculate prob(C_i | x) which is the posterior prob of class
        # considered as a r.v. to be C_i at a given point x.  For a given
        # x, the sum of such probabilities over all C_i must add up to 1:
        $self->find_expected_classes_at_each_datapoint();
        if ($self->{_debug}) {
            print "\n\nDisplaying expected class probs at each data point:\n\n";
            foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
                my $expected_classes_at_a_point = 
                                   $self->{_expected_class_probs}->{$data_id};
                print "Expected classes $data_id: @$expected_classes_at_a_point\n";
            }
        }
        # 1. UPDATE MEANS:        
        my @new_means;
        foreach my $cluster_index(0..$self->{_K}-1) {         
            $new_means[$cluster_index] = 
                       Math::GSL::Matrix->new($self->{_data_dimensions},1);        
            $new_means[$cluster_index]->zero();
            foreach my $data_id (keys %{$self->{_data}}) {                        
                my $data_record = $self->{_data}->{$data_id};
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0,$data_record);
                $new_means[$cluster_index] +=  
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
                    $data_vec->copy();
                $self->{_cluster_normalizers}->[$cluster_index] +=
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index];
            }
            $new_means[$cluster_index] *= 1.0 / 
                                 $self->{_cluster_normalizers}->[$cluster_index];
        }
        if ($self->{_debug}) {
            foreach my $meanvec (@new_means) {
                display_matrix("At EM Iteration $em_iteration, new mean vector is", 
                                                                          $meanvec);
            }
        }
        $self->{_cluster_means} = \@new_means;
        # 2. UPDATE COVARIANCES:
        my @new_covariances;
        foreach my $cluster_index(0..$self->{_K}-1) {         
            $new_covariances[$cluster_index] = 
                  Math::GSL::Matrix->new($self->{_data_dimensions},
                                         $self->{_data_dimensions});
            $new_covariances[$cluster_index]->zero();
            my $normalizer = 0;
            foreach my $data_id (keys %{$self->{_data}}) {                        
                my $data_record = $self->{_data}->{$data_id};
                my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
                $data_vec->set_col(0,$data_record);
                my $mean_subtracted_data = 
                    $data_vec - $self->{_cluster_means}->[$cluster_index];
                my $outer_product = outer_product($mean_subtracted_data, 
                                                  $mean_subtracted_data);
                $new_covariances[$cluster_index] +=  
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
                    $outer_product;
            }
            $new_covariances[$cluster_index] *= 
                                   1.0 / 
                                   $self->{_cluster_normalizers}->[$cluster_index];
        }
        $self->{_cluster_covariances} = \@new_covariances;
        # 3. UPDATE PRIORS:
        $self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
                                    if @{$self->{_old_priors}} > 0;
        $self->{_old_priors} = deep_copy_array( $self->{_class_priors} ); 
        foreach my $cluster_index(0..$self->{_K}-1) {      
            $self->{_class_priors}->[$cluster_index] = 
                 $self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
        }
        my @priors = @{$self->{_class_priors}};
        print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
        push @{$self->{_fisher_quality_vs_iteration}}, 
                                                 $self->clustering_quality_fisher();
        push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
        if ( ($em_iteration > 5 && $self->reached_convergence()) 
             || ($em_iteration ==  $self->{_max_em_iterations} - 1) ) {
            my @old_old_priors = @{$self->{_old_old_priors}};  
            my @old_priors = @{$self->{_old_priors}};
            print "\n\nPrevious to previous priors:       @old_old_priors\n";
            print "Previous priors:                   @old_priors\n";
            print "Current class priors:              @{$self->{_class_priors}}\n";
            print "\n\nCONVERGENCE ACHIEVED AT ITERATION $em_iteration\n\n"
                if $em_iteration < $self->{_max_em_iterations} - 1; 
            last;
        }
    }
    print "\n\n\n";
}

sub reached_convergence {
    my $self = shift;
    return 1 if compare_array_floats($self->{_old_old_priors}, 
                                     $self->{_old_priors}) 
                &&
                compare_array_floats($self->{_old_priors}, 
                                     $self->{_class_priors});
    return 0;
}

# Classify the data into disjoint clusters using the Naive Bayes' classification:
sub run_bayes_classifier {
    my $self = shift;
    $self->classify_all_data_tuples_bayes($self->{_cluster_means}, 
                                         $self->{_cluster_covariances});
}

# Should NOT be called before run_bayes_classifier is run
sub return_disjoint_clusters {
    my $self = shift;
    return $self->{_clusters};
}

sub return_clusters_with_posterior_probs_above_threshold {
    my $self = shift;
    my $theta = shift;
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $data_tag
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] 
                                            > $theta;
        }
    }
    return \@class_distributions;
}    

sub return_individual_class_distributions_above_given_threshold {
    my $self = shift;
    my $theta = shift;
    my @probability_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @probability_distributions, [];
    }
    foreach my $cluster_index (0..$self->{_K}-1) {
        my $mean_vec = $self->{_cluster_means}->[$cluster_index];
        my $covar = $self->{_cluster_covariances}->[$cluster_index];
        foreach my $data_id (keys %{$self->{_data}}) {
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col( 0, $self->{_data}->{$data_id});
            my $datavec_minus_mean = $data_vec - $mean_vec;
            display_matrix( "datavec minus mean is ", $datavec_minus_mean )
                                    if $self->{_debug};
            my $exponent = undef;
            if ($self->{_data_dimensions} > 1) {
                $exponent = -0.5 * vector_matrix_multiply( 
                                               transpose($datavec_minus_mean),
                                matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
            } else {
                my @var_inverse = $covar->inverse()->as_list;
                my $var_inverse_val = $var_inverse[0];
                my @data_minus_mean = $datavec_minus_mean->as_list;
                my $data_minus_mean_val = $data_minus_mean[0];
                $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
            }
            print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
            my $coefficient = 1.0 / \
                 ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} * sqrt($covar->det()) );
            my $prob = $coefficient * exp($exponent);
            push @{$probability_distributions[$cluster_index]}, $data_id 
                if $prob > $theta;
        }
    }
    return \@probability_distributions;
}

sub return_estimated_priors {
    my $self = shift;
    return $self->{_class_priors};
}

# Calculates the MDL (Minimum Description Length) clustering criterion according to
# Rissanen.  (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978,
# and "A Universal Prior for Integers and Estimation by Minimum Description Length,"
# Annals of Statistics, 1983.)  The MDL criterion is a difference of a log-likelihood
# term for all of the observed data and a model-complexity penalty term. In general,
# both the log-likelihood and the model-complexity terms increase as the number of
# clusters increases.  The form of the MDL criterion used in the implementation below
# uses for the penalty term the Bayesian Information Criterion (BIC) of G. Schwartz,
# "Estimating the Dimensions of a Model," The Annals of Statistics, 1978.  In
# general, the smaller the value of the MDL quality measure calculated below, the
# better the clustering of the data.
sub clustering_quality_mdl {
    my $self = shift;
    # Calculate the inverses of all of the covariance matrices in order to avoid
    # having to calculate them repeatedly inside the inner 'foreach' loop in the
    # main part of this method.  Here we go:
    my @covar_inverses;
    foreach my $cluster_index (0..$self->{_K}-1) {
        my $covar = $self->{_cluster_covariances}->[$cluster_index];
        push @covar_inverses, $covar->inverse();
    }
    # For the clustering quality, first calculate the log-likelihood of all the
    # observed data:
    my $log_likelihood = 0;
    foreach my $tag (@{$self->{_data_id_tags}}) {
        my $likelihood_for_each_tag = 0;
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $mean_vec = $self->{_cluster_means}->[$cluster_index];
            my $covar = $self->{_cluster_covariances}->[$cluster_index];
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col( 0, $self->{_data}->{$tag});
            my $datavec_minus_mean = $data_vec - $mean_vec;
            my $exponent = undef;
            if ($self->{_data_dimensions} > 1) {
                $exponent = -0.5 * vector_matrix_multiply( 
                                       transpose($datavec_minus_mean),
                                            matrix_vector_multiply($covar_inverses[$cluster_index], 
                                                       $datavec_minus_mean ) );
            } else {
                my @var_inverse = $covar_inverses[$cluster_index]->as_list;
                my $var_inverse_val = $var_inverse[0];
                my @data_minus_mean = $datavec_minus_mean->as_list;
                my $data_minus_mean_val = $data_minus_mean[0];
                $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
            }

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

# the individual clusters.
sub clustering_quality_fisher {
    my $self = shift;
    my @cluster_quality_indices;
    my $fisher_trace = 0;
    my $S_w = 
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_w->zero;
    my $S_b = 
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_b->zero;
    my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);    
    $global_mean->zero;
    foreach my $cluster_index(0..$self->{_K}-1) { 
        $global_mean = $self->{_class_priors}->[$cluster_index] *
                                    $self->{_cluster_means}->[$cluster_index];
    }
    foreach my $cluster_index(0..$self->{_K}-1) {      
        $S_w +=  $self->{_cluster_covariances}->[$cluster_index] * 
                            $self->{_class_priors}->[$cluster_index];
        my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index] 
                                                               - $global_mean;
        my $outer_product = outer_product( $class_mean_minus_global_mean, 
                                               $class_mean_minus_global_mean );
        $S_b +=  $self->{_class_priors}->[$cluster_index] * $outer_product;
    }
    my $fisher = matrix_multiply($S_w->inverse, $S_b);
    return $fisher unless defined blessed($fisher);
    return matrix_trace($fisher);
}

sub display_seeding_stats {
    my $self = shift;
    foreach my $cluster_index(0..$self->{_K}-1) {      
        print "\nSeeding for cluster $cluster_index:\n";
        my $mean = $self->{_cluster_means}->[$cluster_index];
        display_matrix("The mean is: ", $mean);        
        my $covariance = $self->{_cluster_covariances}->[$cluster_index];
        display_matrix("The covariance is: ", $covariance);
    }
}

sub display_fisher_quality_vs_iterations {
    my $self = shift;
    print "\n\nFisher Quality vs. Iterations: " .
                      "@{$self->{_fisher_quality_vs_iteration}}\n\n";
}

sub display_mdl_quality_vs_iterations {
    my $self = shift;
    print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}

sub find_prob_at_each_datapoint_for_given_mean_and_covar {
    my $self = shift;
    my $mean_vec_ref = shift;
    my $covar_ref = shift;
    foreach my $data_id (keys %{$self->{_data}}) {
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);        
        $data_vec->set_col( 0, $self->{_data}->{$data_id});
        if ($self->{_debug}) {
            display_matrix("data vec in find prob function", $data_vec);
            display_matrix("mean vec in find prob function", $mean_vec_ref);
            display_matrix("covariance in find prob function", $covar_ref);
        }
        my $datavec_minus_mean = $data_vec - $mean_vec_ref;
        display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
        my $exponent = undef;
        if ($self->{_data_dimensions} > 1) {
            $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
                matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
        } elsif (defined blessed($covar_ref)) {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
        } else {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
        }
        print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
        my $coefficient = undef;
        if ($self->{_data_dimensions} > 1) {
            $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) * 
                                                                                $covar_ref->det()) ;
        } elsif (!defined blessed($covar_ref)) {
            $coefficient =  1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
        } else {         
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $coefficient =  1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
        }
        my $prob = $coefficient * exp($exponent);
        push @{$self->{_class_probs_at_each_data_point}->{$data_id}}, $prob;
    }
}

sub find_expected_classes_at_each_datapoint {
    my $self = shift;
    my @priors = @{$self->{_class_priors}};
    foreach my $data_id (sort keys %{$self->{_class_probs_at_each_data_point}}) {
        my $numerator = 
          vector_2_vector_multiply( 
                           $self->{_class_probs_at_each_data_point}->{$data_id}, 
                           $self->{_class_priors} );
        my $sum = 0;
        foreach my $part (@$numerator) {
            $sum += $part;
        }
        $self->{_expected_class_probs}->{$data_id} = [map $_/$sum, @{$numerator}];
    }
}

sub initialize_class_priors {
    my $self = shift;
    if (@{$self->{_class_priors}} == 0) {
        my $prior = 1.0 / $self->{_K};
        foreach my $class_index (0..$self->{_K}-1) {
            push @{$self->{_class_priors}}, $prior;
        }
    }
    die "Mismatch between number of values for class priors " .
          "and the number of clusters expected"
        unless @{$self->{_class_priors}} == $self->{_K};
    my $sum = 0;
    foreach my $prior (@{$self->{_class_priors}}) {
        $sum += $prior;
    }
    die "Your priors in the constructor call do not add up to 1"  
                       unless abs($sum - 1) < 0.001;
    print "\nInitially assumed class priors are: @{$self->{_class_priors}}\n";
}

sub estimate_class_priors {
    my $self = shift;
    foreach my $datatag (keys %{$self->{_data}}) {
        my $class_label = $self->{_class_labels}->{$datatag};
        $self->{_class_priors}[$class_label]++;
    }
    foreach my $prior (@{$self->{_class_priors}}) {
        $prior /= $self->{_total_number_data_tuples};

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        print "\nFor cluster $cluster_index: rows: $num_rows   and cols: $num_cols\n";
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);  
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        my $col_index = 0;
        foreach my $ele (@{$clusters->[$cluster_index]}) {
            $matrix->set_col($col_index++, $ele);
        }
        # display_matrix( "Displaying cluster matrix", $matrix );
        foreach my $j (0..$num_cols-1) {
            $mean_vec += $matrix->col($j);
        }
        $mean_vec *=  1.0 / $num_cols;
        push @cluster_mean_vecs, $mean_vec;
        display_matrix( "Displaying the mean vector",  $mean_vec );
        foreach my $j (0..$num_cols-1) {
            my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
            $matrix->set_col($j, \@new_col);
        }
        # display_matrix("Displaying mean subtracted data as a matrix",  $matrix );
        my $transposed = transpose( $matrix );
        # display_matrix("Displaying transposed matrix",$transposed);
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        push @cluster_covariances, $covariance;
        display_matrix("Displaying the cluster covariance",  $covariance );
    }
    return (\@cluster_mean_vecs, \@cluster_covariances);
}

sub find_data_dimensionality {
    my $clusters = shift;
    my @first_cluster = @{$clusters->[0]};
    my @first_data_element = @{$first_cluster[0]};
    return scalar(@first_data_element);
}

sub find_seed_centered_covariances {
    my $self = shift;
    my $seed_tags = shift;
    my (@seed_mean_vecs, @seed_based_covariances);
    foreach my $seed_tag (@$seed_tags) {
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, $self->{_N});
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);  
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        $mean_vec->set_col(0, $self->{_data}->{$seed_tag});
        push @seed_mean_vecs, $mean_vec;
        display_matrix( "Displaying the seed mean vector",  $mean_vec );
        my $col_index = 0;
        foreach my $tag (@{$self->{_data_id_tags}}) {
            $matrix->set_col($col_index++, $self->{_data}->{$tag});
        }
        foreach my $j (0..$num_cols-1) {
            my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
            $matrix->set_col($j, \@new_col);
        }
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        push @seed_based_covariances, $covariance;
        display_matrix("Displaying the seed covariance",  $covariance )
            if $self->{_debug};
    }
    return (\@seed_mean_vecs, \@seed_based_covariances);
}

# The most popular seeding mode for EM is random. We include two other seeding modes
# --- kmeans and manual --- since they do produce good results for specialized cases.
# For example, when the clusters in your data are non-overlapping and not too
# anisotropic, the kmeans based seeding should work at least as well as the random
# seeding.  In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
# the advantage of avoiding the getting stuck in a local-maximum problem of the EM
# algorithm.
sub seed_the_clusters {
    my $self = shift;
    if ($self->{_seeding} eq 'random') {
        my @covariances;
        my @means;
        my @all_tags = @{$self->{_data_id_tags}}; 
        my @seed_tags;
        foreach my $i (0..$self->{_K}-1) {
            push @seed_tags, $all_tags[int rand( $self->{_N} )];
        }
        print "Random Seeding: Randomly selected seeding tags are  @seed_tags\n\n";
        my ($seed_means, $seed_covars) = 
                        $self->find_seed_centered_covariances(\@seed_tags);
        $self->{_cluster_means} = $seed_means;
        $self->{_cluster_covariances} = $seed_covars;
    } elsif ($self->{_seeding} eq 'kmeans') {
        $self->kmeans();
        my $clusters = $self->{_clusters};
        my @dataclusters;
        foreach my $index (0..@$clusters-1) {
            push @dataclusters, [];
        }
        foreach my $cluster_index (0..$self->{_K}-1) {
            foreach my $tag (@{$clusters->[$cluster_index]}) {
                my $data = $self->{_data}->{$tag};
                push @{$dataclusters[$cluster_index]}, deep_copy_array($data);
            }
        }
        ($self->{_cluster_means}, $self->{_cluster_covariances}) =
                           find_cluster_means_and_covariances(\@dataclusters);
    } elsif ($self->{_seeding} eq 'manual') {
        die "You have not supplied the seeding tags for the option \"manual\""
            unless @{$self->{_seed_tags}} > 0;
        print "Manual Seeding: Seed tags are @{$self->{_seed_tags}}\n\n";
        foreach my $tag (@{$self->{_seed_tags}}) {
            die "invalid tag used for manual seeding" 
                unless exists $self->{_data}->{$tag};
        }
        my ($seed_means, $seed_covars) = 
            $self->find_seed_centered_covariances($self->{_seed_tags});
        $self->{_cluster_means} = $seed_means;
        $self->{_cluster_covariances} = $seed_covars;
    } else {
        die "Incorrect call syntax used.  See documentation.";
    }
}

# This is the top-level method for kmeans based initialization of the EM
# algorithm. The means and the covariances returned by kmeans are used to seed the EM
# algorithm.
sub kmeans {
    my $self = shift;
    my $K = $self->{_K};
    $self->cluster_for_fixed_K_single_smart_try($K);
    if ((defined $self->{_clusters}) && (defined $self->{_cluster_centers})){
        return ($self->{_clusters}, $self->{_cluster_centers});
    } else {
        die "kmeans clustering failed.";
    }
}

# Used by the kmeans algorithm for the initialization of the EM iterations.  We do
# initial kmeans cluster seeding by subjecting the data to principal components
# analysis in order to discover the direction of maximum variance in the data space.
# Subsequently, we try to find the K largest peaks along this direction.  The
# coordinates of these peaks serve as the seeds for the K clusters.
sub cluster_for_fixed_K_single_smart_try {
    my $self = shift;
    my $K = shift;
    print "Clustering for K = $K\n" if $self->{_terminal_output};
    my ($clusters, $cluster_centers) =
                              $self->cluster_for_given_K($K);
    $self->{_clusters} = $clusters;
    $self->{_cluster_centers} = $cluster_centers;  
}

# Used by the kmeans part of the code for the initialization of the EM algorithm:
sub cluster_for_given_K {
    my $self = shift;
    my $K = shift;
    my $cluster_centers = $self->get_initial_cluster_centers($K);
    my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);  
    my $cluster_nonexistant_flag = 0;
    foreach my $trial (0..2) {
        ($clusters, $cluster_centers) =
                         $self->assign_data_to_clusters( $clusters, $K );
        my $num_of_clusters_returned = @$clusters;
        foreach my $cluster (@$clusters) {
            $cluster_nonexistant_flag = 1 if ((!defined $cluster) 
                                             ||  (@$cluster == 0));
        }
        last unless $cluster_nonexistant_flag;
    }
    return ($clusters, $cluster_centers);
}

# Used by the kmeans part of the code for the initialization of the EM algorithm:
sub get_initial_cluster_centers {
    my $self = shift;
    my $K = shift;
    if ($self->{_data_dimensions} == 1) {
        my @one_d_data;
        foreach my $j (0..$self->{_N}-1) {
            my $tag = $self->{_data_id_tags}[$j];     
            push @one_d_data, $self->{_data}->{$tag}->[0];
        }
        my @peak_points = 
                    find_peak_points_in_given_direction(\@one_d_data,$K);
        print "highest points at data values: @peak_points\n" 
                                                         if $self->{_debug};
        my @cluster_centers;
        foreach my $peakpoint (@peak_points) {
            push @cluster_centers, [$peakpoint];
        }
        return \@cluster_centers;
    }
    my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_id_tags}.
    # The actual data for clustering is stored in a hash at $self->{_data}
    # whose keys are the record labels; the value associated with each
    # key is the array holding the corresponding numerical multidimensional
    # data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        $matrix->set_col($j, $data);
    }
    if ($self->{_debug}) {
        print "\nDisplaying the original data as a matrix:";
        display_matrix( $matrix ); 
    }
    foreach my $j (0..$num_cols-1) {
        $mean_vec += $matrix->col($j);
    }
    $mean_vec *=  1.0 / $num_cols;
    if ($self->{_debug}) {
        print "Displaying the mean vector for the data:";
        display_matrix( $mean_vec );
    }
    foreach my $j (0..$num_cols-1) {
        my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
        $matrix->set_col($j, \@new_col);
    }
    if ($self->{_debug}) {
        print "Displaying mean subtracted data as a matrix:";
        display_matrix( $matrix ); 
    }
    my $transposed = transpose( $matrix );
    if ($self->{_debug}) {
        print "Displaying transposed data matrix:";
        display_matrix( $transposed );
    }
    my $covariance = matrix_multiply( $matrix, $transposed );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for your data:";
        display_matrix( $covariance );
    }
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > 
                                     $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < 
                                     $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" 
                                                          if $self->{_debug};
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        print "Eigenvector $i:   @vec\n" if $self->{_debug};
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @max_var_direction;
    # Each element of the array @largest_eigen_vec is a Math::Complex object
    foreach my $k (0..@largest_eigen_vec-1) {
        my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
        if ($theta eq '0') {
            $max_var_direction[$k] = $mag;
        } elsif ($theta eq 'pi') {
            $max_var_direction[$k] = -1.0 * $mag;
        } else {
            die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
        }
    }
    # "Maximum variance direction: @max_var_direction
    print "\nMaximum Variance Direction: @max_var_direction\n\n" 
                                                 if $self->{_debug};
    # We now project all data points on the largest eigenvector.
    # Each projection will yield a single point on the eigenvector.
    my @projections;
    foreach my $j (0..$self->{_N}-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        die "Dimensionality of the largest eigenvector does not "
            . "match the dimensionality of the data" 
          unless @max_var_direction == $self->{_data_dimensions};
        my $projection = vector_multiply($data, \@max_var_direction);
        push @projections, $projection;
    }
    print "All projection points: @projections\n" if $self->{_debug};
    my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
    print "highest points at points along largest eigenvec: @peak_points\n"
                                              if $self->{_debug};
    my @cluster_centers;
    foreach my $peakpoint (@peak_points) {
        my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
        push @cluster_centers, \@actual_peak_coords;
    }
    return \@cluster_centers;
}

# Used by the kmeans part of the code: This method is called by the previous method
# to locate K peaks in a smoothed histogram of the data points projected onto the
# maximal variance direction.
sub find_peak_points_in_given_direction {
    my $dataref = shift;
    my $how_many = shift;
    my @data = @$dataref;
    my ($min, $max) = minmax(\@data);
    my $num_points = @data;
    my @sorted_data = sort {$a <=> $b} @data;
    #print "\n\nSorted data: @sorted_data\n";
    my $scale = $max - $min;
    foreach my $index (0..$#sorted_data-1) {
        $sorted_data[$index] = ($sorted_data[$index] - $min) / $scale;
    }
    my $avg_diff = 0;
    foreach my $index (0..$#sorted_data-1) {
        my $diff = $sorted_data[$index+1] - $sorted_data[$index];
        $avg_diff += ($diff - $avg_diff) / ($index + 1);
    }
    my $delta = 1.0 / 1000.0;
    #    It would be nice to set the delta adaptively, but I must
    #    change the number of cells in the next foreach loop accordingly
    #    my $delta = $avg_diff / 20;
    my @accumulator = (0) x 1000;
    foreach my $index (0..@sorted_data-1) {
        my $cell_index = int($sorted_data[$index] / $delta);
        my $smoothness = 40;
        for my $index ($cell_index-$smoothness..$cell_index+$smoothness) {
            next if $index < 0 || $index > 999;
            $accumulator[$index]++;
        }
    }
    my $peaks_array = non_maximum_supression( \@accumulator );
    my $peaks_index_hash = get_value_index_hash( $peaks_array );
    my @K_highest_peak_locations;
    my $k = 0;
    foreach my $peak (sort {$b <=> $a} keys %$peaks_index_hash) {
        my $unscaled_peak_point = 
                  $min + $peaks_index_hash->{$peak} * $scale * $delta;
        push @K_highest_peak_locations, $unscaled_peak_point
            if $k < $how_many;
        last if ++$k == $how_many;
    }
    return @K_highest_peak_locations;
}

# Used by the kmeans part of the code: The purpose of this routine is to form initial
# clusters by assigning the data samples to the initial clusters formed by the
# previous routine on the basis of the best proximity of the data samples to the
# different cluster centers.
sub assign_data_to_clusters_initial {

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

}

# Meant only for constructing a deep copy of a hash in which each value is an
# anonymous array of numbers:
sub deep_copy_hash {
    my $ref_in = shift;
    my $ref_out;
    while ( my ($key, $value) = each( %$ref_in ) ) {
        $ref_out->{$key} = deep_copy_array( $value );
    }
    return $ref_out;
}

# Meant only for an array of numbers:
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out;
    foreach my $i (0..@{$ref_in}-1) {
        $ref_out->[$i] = $ref_in->[$i];
    }
    return $ref_out;
}

# from perl docs:
sub fisher_yates_shuffle {                
    my $arr =  shift;                
    my $i = @$arr;                   
    while (--$i) {                   
        my $j = int rand( $i + 1 );  
        @$arr[$i, $j] = @$arr[$j, $i]; 
    }
}

sub mean_and_variance {
    my @data = @{shift @_};
    my ($mean, $variance);
    foreach my $i (1..@data) {
        if ($i == 1) {
            $mean = $data[0];
            $variance = 0;
        } else {
            # data[$i-1] because of zero-based indexing of vector
            $mean = ( (($i-1)/$i) * $mean ) + $data[$i-1] / $i;
            $variance = ( (($i-1)/$i) * $variance ) 
                           + ($data[$i-1]-$mean)**2 / ($i-1);
        }
    }
    return ($mean, $variance);
}

sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / datafile
                            mask
                            K
                            terminal_output
                            max_em_iterations
                            seeding
                            class_priors
                            seed_tags
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub get_value_index_hash {
    my $arr = shift;
    my %hash;
    foreach my $index (0..@$arr-1) {
        $hash{$arr->[$index]} = $index if $arr->[$index] > 0;
    }
    return \%hash;
}

sub non_maximum_supression {
    my $arr = shift;
    my @output = (0) x @$arr;
    my @final_output = (0) x @$arr;
    my %hash;
    my @array_of_runs = ([$arr->[0]]);
    foreach my $index (1..@$arr-1) {
        if ($arr->[$index] == $arr->[$index-1]) {
            push @{$array_of_runs[-1]}, $arr->[$index];
        } else {  
            push @array_of_runs, [$arr->[$index]];
        }
    }
    my $runstart_index = 0;
    foreach my $run_index (1..@array_of_runs-2) {
        $runstart_index += @{$array_of_runs[$run_index-1]};
        if ($array_of_runs[$run_index]->[0] > 
            $array_of_runs[$run_index-1]->[0]  &&
            $array_of_runs[$run_index]->[0] > 
            $array_of_runs[$run_index+1]->[0]) {
            my $run_center = @{$array_of_runs[$run_index]} / 2;
            my $assignment_index = $runstart_index + $run_center;
            $output[$assignment_index] = $arr->[$assignment_index];
        }
    }
    if ($array_of_runs[-1]->[0] > $array_of_runs[-2]->[0]) {
        $runstart_index += @{$array_of_runs[-2]};
        my $run_center = @{$array_of_runs[-1]} / 2;
        my $assignment_index = $runstart_index + $run_center;
        $output[$assignment_index] = $arr->[$assignment_index];
    }
    if ($array_of_runs[0]->[0] > $array_of_runs[1]->[0]) {
        my $run_center = @{$array_of_runs[0]} / 2;
        $output[$run_center] = $arr->[$run_center];
    }
    return \@output;



( run in 0.569 second using v1.01-cache-2.11-cpan-0bd6704ced7 )