examples/canned_example1.pl
# The mask describes the columns of the data file: 'N' marks the column
# holding each record's symbolic tag, and '1' marks a numeric column
# to be used for clustering.
my $mask = "N11";
my $clusterer = Algorithm::ExpectationMaximization->new(
                    datafile          => $datafile,
                    mask              => $mask,
                    K                 => 3,
                    max_em_iterations => 300,
                    seeding           => 'random',
                    terminal_output   => 1,
                    debug             => 0,
                );
$clusterer->read_data_from_file();
# For visualizing the raw data:
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
srand(time);
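The mask and the data file must agree: each character of the mask describes one column of the data file, as explained in the comment above. Below is a minimal sketch of a data file consistent with the mask "N11"; the file name and the values are made up, and whitespace-separated columns are assumed here, so consult the module's documentation for the exact file format it expects.

open my $out, '>', 'mydatafile.dat' or die "cannot open file: $!";
print $out "a1  2.31  4.07\n";
print $out "a2  2.18  4.55\n";
print $out "b1  7.92  9.13\n";
close $out;
# $datafile in the example above would then be set to 'mydatafile.dat'.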
examples/canned_example2.pl
my $mask = "N11";
my $clusterer = Algorithm::ExpectationMaximization->new(
                    datafile          => $datafile,
                    mask              => $mask,
                    K                 => 2,
                    max_em_iterations => 300,
                    seeding           => 'random',
                    # seeding          => 'kmeans',
                    terminal_output   => 1,
                    debug             => 0,
                );
$clusterer->read_data_from_file();
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
srand(time);
$clusterer->seed_the_clusters();
examples/canned_example3.pl
my $mask = "N11";
my $clusterer = Algorithm::ExpectationMaximization->new(
                    datafile          => $datafile,
                    mask              => $mask,
                    K                 => 3,
                    max_em_iterations => 300,
                    seeding           => 'manual',
                    seed_tags         => ['a7', 'b3', 'c4'],
                    terminal_output   => 1,
                    debug             => 0,
                );
$clusterer->read_data_from_file();
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
srand(time);
$clusterer->seed_the_clusters();
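With seeding set to 'manual', the module is given one tag per cluster in seed_tags (three tags for K = 3 here), and each tag must be the symbolic tag --- the 'N' column of the mask --- of some record in the data file. A hypothetical data file fragment consistent with the tags used above (the numeric values are made up):

a7  1.02  2.31
b3  5.47  6.08
c4  9.11  0.95

Each of the three tagged records then serves as the starting point for one of the three clusters.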
examples/canned_example4.pl
my $mask = "N111";
my $clusterer = Algorithm::ExpectationMaximization->new(
                    datafile          => $datafile,
                    mask              => $mask,
                    K                 => 3,
                    max_em_iterations => 300,
                    seeding           => 'random',
                    terminal_output   => 1,
                    debug             => 0,
                );
$clusterer->read_data_from_file();
my $data_visualization_mask = "111";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
srand(time);
$clusterer->seed_the_clusters();
examples/canned_example5.pl
my $mask = "N111";
my $clusterer = Algorithm::ExpectationMaximization->new(
                    datafile          => $datafile,
                    mask              => $mask,
                    K                 => 3,
                    max_em_iterations => 300,
                    seeding           => 'random',
                    terminal_output   => 1,
                    debug             => 0,
                );
$clusterer->read_data_from_file();
my $data_visualization_mask = "111";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
srand(time);
$clusterer->seed_the_clusters();
examples/canned_example6.pl
my $mask = "N1";
my $clusterer = Algorithm::ExpectationMaximization->new(
                    datafile          => $datafile,
                    mask              => $mask,
                    K                 => 2,
                    max_em_iterations => 300,
                    seeding           => 'random',
                    terminal_output   => 1,
                    debug             => 0,
                );
$clusterer->read_data_from_file();
my $data_visualization_mask = "1";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);
srand(time);
$clusterer->seed_the_clusters();
lib/Algorithm/ExpectationMaximization.pm
if check_for_illegal_params(@params) == 0;
bless {
    _datafile              => $args{datafile} || croak("datafile required"),
    _mask                  => $args{mask} || croak("mask required"),
    _K                     => $args{K} || croak("number of clusters required"),
    _terminal_output       => $args{terminal_output} || 0,
    _seeding               => $args{seeding} || 'random',
    _seed_tags             => $args{seed_tags} || [],
    _max_em_iterations     => $args{max_em_iterations} || 100,
    _class_priors          => $args{class_priors} || [],
    _debug                 => $args{debug} || 0,
    _N                     => 0,
    _data                  => {},
    _data_id_tags          => [],
    _clusters              => [],
    _cluster_centers       => [],
    _data_dimensions       => 0,
    _cluster_normalizers   => [],
    _cluster_means         => [],
    _cluster_covariances   => [],
    _class_labels_for_data => {},
lib/Algorithm/ExpectationMaximization.pm
$self->{_expected_class_probs}->{$data_id} = [];
}
# Calculate prob(x | C_i) --- this is the prob of data point x as
# a member of class C_i.  You must do this for all K classes:
foreach my $cluster_index (0..$self->{_K}-1) {
    $self->find_prob_at_each_datapoint_for_given_mean_and_covar(
        $self->{_cluster_means}->[$cluster_index],
        $self->{_cluster_covariances}->[$cluster_index] );
}
$self->{_cluster_normalizers} = [];
if ($self->{_debug}) {
    print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
    foreach my $data_id (sort keys %{$self->{_data}}) {
        my $class_probs_at_a_point =
            $self->{_class_probs_at_each_data_point}->{$data_id};
        print "Class probs for $data_id: @$class_probs_at_a_point\n";
    }
}
# Calculate prob(C_i | x), which is the posterior prob of the class ---
# considered as a random variable --- being C_i at a given point x.  For
# a given x, these probabilities must sum to 1 over all C_i:
$self->find_expected_classes_at_each_datapoint();
if ($self->{_debug}) {
    print "\n\nDisplaying expected class probs at each data point:\n\n";
    foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
        my $expected_classes_at_a_point =
            $self->{_expected_class_probs}->{$data_id};
        print "Expected classes $data_id: @$expected_classes_at_a_point\n";
    }
}
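# For reference, the posterior that find_expected_classes_at_each_datapoint()
# produces at each data point x is Bayes' rule applied to the K
# class-conditional probabilities computed above:
#
#     p(C_i | x) = p(x | C_i) * p(C_i) / sum_over_j( p(x | C_j) * p(C_j) )
#
# which is why, at any given x, the expected class probabilities printed in
# the debug block above sum to 1.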
# 1. UPDATE MEANS:
my @new_means;
foreach my $cluster_index(0..$self->{_K}-1) {
lib/Algorithm/ExpectationMaximization.pm
        $data_vec->set_col(0, $data_record);
        $new_means[$cluster_index] +=
            $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
            $data_vec->copy();
        $self->{_cluster_normalizers}->[$cluster_index] +=
            $self->{_expected_class_probs}->{$data_id}->[$cluster_index];
    }
    $new_means[$cluster_index] *= 1.0 /
        $self->{_cluster_normalizers}->[$cluster_index];
}
if ($self->{_debug}) {
    foreach my $meanvec (@new_means) {
        display_matrix("At EM Iteration $em_iteration, new mean vector is",
                       $meanvec);
    }
}
$self->{_cluster_means} = \@new_means;
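# In equation form, the update just performed is the posterior-weighted mean
#
#     mu_i = sum_over_x( p(C_i | x) * x ) / sum_over_x( p(C_i | x) )
#
# with the denominators accumulated in _cluster_normalizers for reuse below.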
# 2. UPDATE COVARIANCES:
my @new_covariances;
foreach my $cluster_index(0..$self->{_K}-1) {
$new_covariances[$cluster_index] =
lib/Algorithm/ExpectationMaximization.pm
$self->{_cluster_covariances} = \@new_covariances;
# 3. UPDATE PRIORS:
$self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
    if @{$self->{_old_priors}} > 0;
$self->{_old_priors} = deep_copy_array( $self->{_class_priors} );
foreach my $cluster_index (0..$self->{_K}-1) {
    $self->{_class_priors}->[$cluster_index] =
        $self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
}
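# The update above is pi_i = N_i / N, where N_i = sum_over_x( p(C_i | x) ) is
# the i-th cluster normalizer accumulated during the mean update; since the
# N_i sum to N, the K priors sum to 1.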
my @priors = @{$self->{_class_priors}};
print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
push @{$self->{_fisher_quality_vs_iteration}},
    $self->clustering_quality_fisher();
push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
if ( ($em_iteration > 5 && $self->reached_convergence())
     || ($em_iteration == $self->{_max_em_iterations} - 1) ) {
    my @old_old_priors = @{$self->{_old_old_priors}};
    my @old_priors = @{$self->{_old_priors}};
    print "\n\nPrevious to previous priors: @old_old_priors\n";
    print "Previous priors: @old_priors\n";
    print "Current class priors: @{$self->{_class_priors}}\n";
lib/Algorithm/ExpectationMaximization.pm
    push @probability_distributions, [];
}
foreach my $cluster_index (0..$self->{_K}-1) {
    my $mean_vec = $self->{_cluster_means}->[$cluster_index];
    my $covar = $self->{_cluster_covariances}->[$cluster_index];
    foreach my $data_id (keys %{$self->{_data}}) {
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $data_vec->set_col( 0, $self->{_data}->{$data_id} );
        my $datavec_minus_mean = $data_vec - $mean_vec;
        display_matrix( "datavec minus mean is ", $datavec_minus_mean )
            if $self->{_debug};
        my $exponent = undef;
        if ($self->{_data_dimensions} > 1) {
            $exponent = -0.5 * vector_matrix_multiply(
                transpose($datavec_minus_mean),
                matrix_vector_multiply($covar->inverse(), $datavec_minus_mean) );
        } else {
            my @var_inverse = $covar->inverse()->as_list;
            my $var_inverse_val = $var_inverse[0];
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
        }
        print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
        my $coefficient = 1.0 /
            sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
                  $covar->det() );
        my $prob = $coefficient * exp($exponent);
        push @{$probability_distributions[$cluster_index]}, $data_id
            if $prob > $theta;
    }
}
return \@probability_distributions;
}
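For reference, both the loop above and find_prob_at_each_datapoint_for_given_mean_and_covar() in the next excerpt evaluate the multivariate Gaussian density

    N(x; mu, Sigma) = exp( -(1/2) * (x - mu)' * inverse(Sigma) * (x - mu) ) / sqrt( (2*pi)**d * det(Sigma) )

where d is the data dimensionality. The one-dimensional branches are the scalar special case of this formula, with Sigma reduced to the variance.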
lib/Algorithm/ExpectationMaximization.pm
print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}
sub find_prob_at_each_datapoint_for_given_mean_and_covar {
    my $self = shift;
    my $mean_vec_ref = shift;
    my $covar_ref = shift;
    foreach my $data_id (keys %{$self->{_data}}) {
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
        $data_vec->set_col( 0, $self->{_data}->{$data_id} );
        if ($self->{_debug}) {
            display_matrix("data vec in find prob function", $data_vec);
            display_matrix("mean vec in find prob function", $mean_vec_ref);
            display_matrix("covariance in find prob function", $covar_ref);
        }
        my $datavec_minus_mean = $data_vec - $mean_vec_ref;
        display_matrix( "datavec minus mean is ", $datavec_minus_mean )
            if $self->{_debug};
        my $exponent = undef;
        if ($self->{_data_dimensions} > 1) {
            $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
                matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
        } elsif (defined blessed($covar_ref)) {
            # One-dimensional case with the variance wrapped in a 1x1 matrix object:
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
        } else {
            # One-dimensional case with the variance supplied as a plain scalar:
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
        }
        print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
        my $coefficient = undef;
        if ($self->{_data_dimensions} > 1) {
            $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) *
                                       $covar_ref->det() );
        } elsif (!defined blessed($covar_ref)) {
            $coefficient = 1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
        } else {
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $coefficient = 1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);
lib/Algorithm/ExpectationMaximization.pm
    }
    foreach my $j (0..$num_cols-1) {
        my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
        $matrix->set_col($j, \@new_col);
    }
    my $transposed = transpose( $matrix );
    my $covariance = matrix_multiply( $matrix, $transposed );
    $covariance *= 1.0 / $num_cols;
    push @seed_based_covariances, $covariance;
    display_matrix("Displaying the seed covariance", $covariance)
        if $self->{_debug};
}
return (\@seed_mean_vecs, \@seed_based_covariances);
}
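# For reference, the covariance assembled above is the standard sample
# estimate: with the n mean-subtracted data vectors as the columns of a
# matrix M,
#
#     C = (1/n) * M * transpose(M)
#
# The same construction is repeated over the full data set in the kmeans
# seeding code below.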
# The most popular seeding mode for EM is random.  We include two other
# seeding modes --- kmeans and manual --- since they can produce good results
# in specialized cases.  For example, when the clusters in your data are
# non-overlapping and not too anisotropic, kmeans-based seeding should work
# at least as well as random seeding.  In such cases --- AND ONLY IN SUCH
# CASES --- kmeans-based seeding has the advantage of avoiding the
# local-maximum problem of the EM algorithm.
lib/Algorithm/ExpectationMaximization.pm
my $K = shift;
if ($self->{_data_dimensions} == 1) {
    my @one_d_data;
    foreach my $j (0..$self->{_N}-1) {
        my $tag = $self->{_data_id_tags}[$j];
        push @one_d_data, $self->{_data}->{$tag}->[0];
    }
    my @peak_points =
        find_peak_points_in_given_direction(\@one_d_data, $K);
    print "highest points at data values: @peak_points\n"
        if $self->{_debug};
    my @cluster_centers;
    foreach my $peakpoint (@peak_points) {
        push @cluster_centers, [$peakpoint];
    }
    return \@cluster_centers;
}
my ($num_rows, $num_cols) = ($self->{_data_dimensions}, $self->{_N});
my $matrix = Math::GSL::Matrix->new($num_rows, $num_cols);
my $mean_vec = Math::GSL::Matrix->new($num_rows, 1);
# All the record labels are stored in the array $self->{_data_id_tags}.
# The actual data for clustering is stored in a hash at $self->{_data}
# whose keys are the record labels; the value associated with each
# key is the array holding the corresponding numerical multidimensional
# data.
foreach my $j (0..$num_cols-1) {
    my $tag = $self->{_data_id_tags}[$j];
    my $data = $self->{_data}->{$tag};
    $matrix->set_col($j, $data);
}
if ($self->{_debug}) {
    print "\nDisplaying the original data as a matrix:";
    display_matrix( $matrix );
}
foreach my $j (0..$num_cols-1) {
    $mean_vec += $matrix->col($j);
}
$mean_vec *= 1.0 / $num_cols;
if ($self->{_debug}) {
    print "Displaying the mean vector for the data:";
    display_matrix( $mean_vec );
}
foreach my $j (0..$num_cols-1) {
    my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
    $matrix->set_col($j, \@new_col);
}
if ($self->{_debug}) {
    print "Displaying mean subtracted data as a matrix:";
    display_matrix( $matrix );
}
my $transposed = transpose( $matrix );
if ($self->{_debug}) {
    print "Displaying transposed data matrix:";
    display_matrix( $transposed );
}
my $covariance = matrix_multiply( $matrix, $transposed );
$covariance *= 1.0 / $num_cols;
if ($self->{_debug}) {
    print "\nDisplaying the Covariance Matrix for your data:";
    display_matrix( $covariance );
}
my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
my $num_of_eigens = @$eigenvalues;
my $largest_eigen_index = 0;
my $smallest_eigen_index = 0;
print "Eigenvalue 0: $eigenvalues->[0]\n" if $self->{_debug};
foreach my $i (1..$num_of_eigens-1) {
    $largest_eigen_index = $i
        if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
    $smallest_eigen_index = $i
        if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
    print "Eigenvalue $i: $eigenvalues->[$i]\n" if $self->{_debug};
}
print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
foreach my $i (0..$num_of_eigens-1) {
    my @vec = $eigenvectors->[$i]->as_list;
    print "Eigenvector $i: @vec\n" if $self->{_debug};
}
my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
print "\nLargest eigenvector: @largest_eigen_vec\n" if $self->{_debug};
my @max_var_direction;
# Each element of the array @largest_eigen_vec is a Math::Complex object
foreach my $k (0..@largest_eigen_vec-1) {
    my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
    if ($theta eq '0') {
        $max_var_direction[$k] = $mag;
    } elsif ($theta eq 'pi') {
        $max_var_direction[$k] = -1.0 * $mag;
    } else {
        die "eigendecomposition of covariance matrix produced a complex " .
            "eigenvector --- something is wrong";
    }
}
# "Maximum variance direction: @max_var_direction
print "\nMaximum Variance Direction: @max_var_direction\n\n"
if $self->{_debug};
# We now project all data points on the largest eigenvector.
# Each projection will yield a single point on the eigenvector.
my @projections;
foreach my $j (0..$self->{_N}-1) {
my $tag = $self->{_data_id_tags}[$j];
my $data = $self->{_data}->{$tag};
die "Dimensionality of the largest eigenvector does not "
. "match the dimensionality of the data"
unless @max_var_direction == $self->{_data_dimensions};
my $projection = vector_multiply($data, \@max_var_direction);
push @projections, $projection;
}
print "All projection points: @projections\n" if $self->{_debug};
my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
print "highest points at points along largest eigenvec: @peak_points\n"
if $self->{_debug};
my @cluster_centers;
foreach my $peakpoint (@peak_points) {
my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
push @cluster_centers, \@actual_peak_coords;
}
return \@cluster_centers;
}
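# A worked instance of the peak-to-center mapping above, with made-up numbers:
# if the unit-length largest eigenvector is (0.8, 0.6) and a peak in the
# smoothed histogram of the projections sits at 5.0, the corresponding
# cluster center is (5.0 * 0.8, 5.0 * 0.6) = (4.0, 3.0).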
# Used by the kmeans part of the code: This method is called by the previous method
# to locate K peaks in a smoothed histogram of the data points projected onto the
lib/Algorithm/ExpectationMaximization.pm
sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw/ datafile
                           mask
                           K
                           terminal_output
                           max_em_iterations
                           seeding
                           class_priors
                           seed_tags
                           debug
                         /;
    my $found_match_flag;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
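Since the excerpt ends inside the matching loop, here is a minimal sketch --- not the module's code --- of the same membership test written with a hash lookup, which makes the flag variable unnecessary:

sub check_for_illegal_params_sketch {
    my @params = @_;
    my %legal = map { $_ => 1 }
                qw/ datafile mask K terminal_output max_em_iterations
                    seeding class_priors seed_tags debug /;
    # Reject as soon as an unrecognized keyword argument is seen:
    foreach my $param (@params) {
        return 0 unless exists $legal{$param};
    }
    return 1;
}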