Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
# parameter file. The parameter file must also state the prior probabilities to be
# associated with each Gaussian. See the example parameter file param1.txt in the
# examples directory. Just edit this file for your own needs.
#
# The multivariate random numbers are generated by calling the Math::Random module.
# As you would expect, that module will insist that the covariance matrix you
# specify be symmetric and positive definite.
sub cluster_data_generator {
my $class = shift;
die "illegal call of a class method"
unless $class eq 'Algorithm::ExpectationMaximization';
my %args = @_;
my $input_parameter_file = $args{input_parameter_file};
my $output_file = $args{output_datafile};
my $N = $args{total_number_of_data_points};
my @all_params;
my $param_string;
if (defined $input_parameter_file) {
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
chomp @all_params;
$param_string = join ' ', @all_params;
} else {
# Just for testing. Used in t/test.t
$param_string = "priors 0.34 0.33 0.33 " .
"cluster 5 0 0 1 0 0 0 1 0 0 0 1 " .
"cluster 0 5 0 1 0 0 0 1 0 0 0 1 " .
"cluster 0 0 5 1 0 0 0 1 0 0 0 1";
}
my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
croak "You did not specify the prior probabilities in the parameter file"
unless $priors_string;
my @priors = split /\s+/, $priors_string;
@priors = grep {/$_num_regex/; $_} @priors;
my $sum = 0;
foreach my $prior (@priors) {
$sum += $prior;
}
croak "Your priors in the parameter file do not add up to 1" unless $sum == 1;
my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
@cluster_strings = grep $_, @cluster_strings;
my $K = @cluster_strings;
croak "Too many clusters requested" if $K > 12;
croak "Mismatch between the number of values for priors and the number " .
"of clusters" unless $K == @priors;
my @point_labels = ('a'..'z');
print "Prior probabilities recorded from param file: @priors\n";
print "Number of Gaussians used for the synthetic data: $K\n";
my @means;
my @covariances;
my $data_dimension;
foreach my $i (0..$K-1) {
my @num_strings = split / /, $cluster_strings[$i];
my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
$data_dimension = @cluster_mean;
push @means, \@cluster_mean;
my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
croak "dimensionality error" if @covariance_nums !=
($data_dimension ** 2);
my $cluster_covariance;
foreach my $j (0..$data_dimension-1) {
foreach my $k (0..$data_dimension-1) {
$cluster_covariance->[$j]->[$k] =
$covariance_nums[$j*$data_dimension + $k];
}
}
push @covariances, $cluster_covariance;
}
random_seed_from_phrase( 'hellomellojello' );
my @data_dump;
foreach my $i (0..$K-1) {
my @m = @{shift @means};
my @covar = @{shift @covariances};
my @new_data = Math::Random::random_multivariate_normal(
int($N * $priors[$i]), @m, @covar );
my $p = 0;
my $label = $point_labels[$i];
@new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
push @data_dump, @new_data;
}
fisher_yates_shuffle( \@data_dump );
open OUTPUT, ">$output_file";
print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
if $input_parameter_file;
print OUTPUT "\#Total number of data points in this file: $N\n";
print OUTPUT "\#Prior class probabilities for this data: @priors\n";
foreach my $ele (@data_dump) {
foreach my $coord ( @$ele ) {
print OUTPUT "$coord ";
}
print OUTPUT "\n";
}
print "Data written out to file $output_file\n";
close OUTPUT;
}
sub add_point_coords {
my $self = shift;
my @arr_of_ids = @{shift @_}; # array of data element names
my @result;
my $data_dimensionality = $self->{_data_dimensions};
foreach my $i (0..$data_dimensionality-1) {
$result[$i] = 0.0;
}
foreach my $id (@arr_of_ids) {
my $ele = $self->{_data}->{$id};
my $i = 0;
foreach my $component (@$ele) {
$result[$i] += $component;
$i++;
}
}
return \@result;
}
###################### Support Routines ########################
# computer the outer product of two column vectors
( run in 0.737 second using v1.01-cache-2.11-cpan-fa01517f264 )