Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

#  parameter file. The parameter file must also state the prior probabilities to be
#  associated with each Gaussian.  See the example parameter file param1.txt in the
#  examples directory.  Just edit this file for your own needs.
#
#  The multivariate random numbers are generated by calling the Math::Random module.
#  As you would expect, that module will insist that the covariance matrix you
#  specify be symmetric and positive definite.
sub cluster_data_generator {
    my $class = shift;
    die "illegal call of a class method" 
        unless $class eq 'Algorithm::ExpectationMaximization';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{total_number_of_data_points};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open INPUT, $input_parameter_file || "unable to open parameter file: $!";
        @all_params = <INPUT>;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "priors 0.34 0.33 0.33 " .
                        "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }
    my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
    croak "You did not specify the prior probabilities in the parameter file"
        unless $priors_string;
    my @priors = split /\s+/, $priors_string;
    @priors = grep {/$_num_regex/; $_} @priors;
    my $sum = 0;
    foreach my $prior (@priors) {
        $sum += $prior;
    }
    croak "Your priors in the parameter file do not add up to 1"  unless $sum == 1;
    my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
    my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
    @cluster_strings = grep  $_, @cluster_strings;

    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    croak "Mismatch between the number of values for priors and the number " . 
        "of clusters" unless $K == @priors;
    my @point_labels = ('a'..'z');
    print "Prior probabilities recorded from param file: @priors\n";
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        my @num_strings = split /  /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums != 
                                      ($data_dimension ** 2);
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {        
                $cluster_covariance->[$j]->[$k] = 
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellomellojello' );
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( 
                           int($N * $priors[$i]), @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
        push @data_dump, @new_data;     
    }
    fisher_yates_shuffle( \@data_dump );
    open OUTPUT, ">$output_file";
    print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
        if $input_parameter_file;
    print OUTPUT "\#Total number of data points in this file: $N\n";
    print OUTPUT "\#Prior class probabilities for this data: @priors\n";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print OUTPUT "$coord ";
        }
        print OUTPUT "\n";
    }
    print "Data written out to file $output_file\n";
    close OUTPUT;
}

sub add_point_coords {
    my $self = shift;
    my @arr_of_ids = @{shift @_};      # array of data element names
    my @result;
    my $data_dimensionality = $self->{_data_dimensions};
    foreach my $i (0..$data_dimensionality-1) {
        $result[$i] = 0.0;
    }
    foreach my $id (@arr_of_ids) {
        my $ele = $self->{_data}->{$id};
        my $i = 0;
        foreach my $component (@$ele) {
            $result[$i] += $component;
            $i++;
        }
    }
    return \@result;
}

######################   Support Routines  ########################

# computer the outer product of two column vectors



( run in 0.737 second using v1.01-cache-2.11-cpan-fa01517f264 )