Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        _K                =>   $args{K}       || croak("number of clusters required"),
        _terminal_output  =>   $args{terminal_output}   || 0,
        _seeding          =>   $args{seeding}           || 'random',
        _seed_tags        =>   $args{seed_tags}         || [],
        _max_em_iterations=>   $args{max_em_iterations} || 100,
        _class_priors     =>   $args{class_priors}      || [],
        _debug            =>   $args{debug}             || 0,
        _N                =>   0,
        _data             =>   {},
        _data_id_tags     =>   [],
        _clusters         =>   [],
        _cluster_centers  =>   [],
        _data_dimensions  =>   0,
        _cluster_normalizers            => [],
        _cluster_means                  => [],
        _cluster_covariances            => [],
        _class_labels_for_data          => {},
        _class_probs_at_each_data_point => {},
        _expected_class_probs           => {}, 
        _old_priors                     => [],
        _old_old_priors                 => [],
        _fisher_quality_vs_iteration    => [],
        _mdl_quality_vs_iterations      => [],
    }, $class;
}


sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    $self->read_data_from_file_csv() if $filename =~ /.csv$/;
    $self->read_data_from_file_dat() if $filename =~ /.dat$/;
}

sub read_data_from_file_csv {
    my $self = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        my @splits = split /,/, $record;
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file.\n"
            unless scalar(@mask) == scalar(@splits);
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data} = \%data_hash;
    $self->{_data_id_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
    # Need to make the following call to set the global mean and covariance:
    # my $covariance =  $self->estimate_mean_and_covariance(\@data_tags);
    # Need to make the following call to set the global eigenvec eigenval sets:
    # $self->eigen_analysis_of_covariance($covariance);
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!" 
                         if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
}

sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"
	if $self->{_terminal_output};
    open INPUT, $datafile
        or die "unable to open file $datafile: $!";
    chomp( my @raw_data = <INPUT> );
    close INPUT;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;
        next if $record =~ /^#/;
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size" 
            if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {
                next;
            } elsif ($mask[$i] eq 'N') {
                $record_id = $fields[$i];
            } elsif ($mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "misformed mask for reading the data file";
            }
        }
        my @nums = map {/$_num_regex/;$_} @data_fields;
        $self->{_data}->{ $record_id } = \@nums;
    }
    my @all_data_ids = keys %{$self->{_data}};
    $self->{_data_id_tags} = \@all_data_ids;
    $self->{_N} = scalar @all_data_ids;
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!" 
                         if ( $self->{_N} < (2 * $self->{_K} ** 2) );
    }
}


# This is the heart of the module --- in the sense that this method implements the EM

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

                                       transpose($datavec_minus_mean),
                                            matrix_vector_multiply($covar_inverses[$cluster_index], 
                                                       $datavec_minus_mean ) );
            } else {
                my @var_inverse = $covar_inverses[$cluster_index]->as_list;
                my $var_inverse_val = $var_inverse[0];
                my @data_minus_mean = $datavec_minus_mean->as_list;
                my $data_minus_mean_val = $data_minus_mean[0];
                $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
            }
            next if $covar->det() < 0;
            my $coefficient = 1.0 / 
                ( (2 * $Math::GSL::Const::M_PI)**$self->{_data_dimensions} 
                                                       * sqrt($covar->det()) );
            my $prob = $coefficient * exp($exponent);
            $likelihood_for_each_tag += 
                              $prob * $self->{_class_priors}->[$cluster_index];
        }
        $log_likelihood += log( $likelihood_for_each_tag );
    }
    # Now calculate the model complexity penalty. $L is the total number of
    # parameters it takes to specify a mixture of K Gaussians. If d is the
    # dimensionality of the data space, the covariance matrix of each Gaussian takes
    # (d**2 -d)/2 + d = d(d+1)/2 parameters since this matrix must be symmetric. And
    # then you need d mean value parameters, and one prior probability parameter
    # for the Gaussian. So   $L = K[1 + d + d(d+1)/2] - 1  where the final '1' that
    # is subtracted is to account for the normalization on the class priors.
    my $L = (0.5 * $self->{_K} * 
             ($self->{_data_dimensions}**2 + 3*$self->{_data_dimensions} + 2) ) - 1;
    my $model_complexity_penalty = 0.5 * $L * log( $self->{_N} );
    my $mdl_criterion = -1 * $log_likelihood + $model_complexity_penalty;
    return $mdl_criterion;
}

# For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW
# is the within-class scatter matrix, more commonly denoted S_w, and SB the
# between-class scatter matrix, more commonly denoted S_b (the underscore means
# subscript).  This measure can be thought of as the normalized average distance
# between the clusters, the normalization being provided by average cluster
# covariance SW^-1. Therefore, the larger the value of this quality measure, the
# better the separation between the clusters.  Since this measure has its roots in
# the Fisher linear discriminant function, we incorporate the word 'fisher' in the
# name of the quality measure.  Note that this measure is good only when the clusters
# are disjoint.  When the clusters exhibit significant overlap, the numbers produced
# by this quality measure tend to be generally meaningless.  As an extreme case,
# let's say your data was produced by a set of Gaussians, all with the same mean
# vector, but each with a distinct covariance. For this extreme case, this measure
# will produce a value close to zero --- depending on the accuracy with which the
# means are estimated --- even when your clusterer is doing a good job of identifying
# the individual clusters.
sub clustering_quality_fisher {
    my $self = shift;
    my @cluster_quality_indices;
    my $fisher_trace = 0;
    my $S_w = 
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_w->zero;
    my $S_b = 
        Math::GSL::Matrix->new($self->{_data_dimensions}, $self->{_data_dimensions});
    $S_b->zero;
    my $global_mean = Math::GSL::Matrix->new($self->{_data_dimensions},1);    
    $global_mean->zero;
    foreach my $cluster_index(0..$self->{_K}-1) { 
        $global_mean = $self->{_class_priors}->[$cluster_index] *
                                    $self->{_cluster_means}->[$cluster_index];
    }
    foreach my $cluster_index(0..$self->{_K}-1) {      
        $S_w +=  $self->{_cluster_covariances}->[$cluster_index] * 
                            $self->{_class_priors}->[$cluster_index];
        my $class_mean_minus_global_mean = $self->{_cluster_means}->[$cluster_index] 
                                                               - $global_mean;
        my $outer_product = outer_product( $class_mean_minus_global_mean, 
                                               $class_mean_minus_global_mean );
        $S_b +=  $self->{_class_priors}->[$cluster_index] * $outer_product;
    }
    my $fisher = matrix_multiply($S_w->inverse, $S_b);
    return $fisher unless defined blessed($fisher);
    return matrix_trace($fisher);
}

sub display_seeding_stats {
    my $self = shift;
    foreach my $cluster_index(0..$self->{_K}-1) {      
        print "\nSeeding for cluster $cluster_index:\n";
        my $mean = $self->{_cluster_means}->[$cluster_index];
        display_matrix("The mean is: ", $mean);        
        my $covariance = $self->{_cluster_covariances}->[$cluster_index];
        display_matrix("The covariance is: ", $covariance);
    }
}

sub display_fisher_quality_vs_iterations {
    my $self = shift;
    print "\n\nFisher Quality vs. Iterations: " .
                      "@{$self->{_fisher_quality_vs_iteration}}\n\n";
}

sub display_mdl_quality_vs_iterations {
    my $self = shift;
    print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}

sub find_prob_at_each_datapoint_for_given_mean_and_covar {
    my $self = shift;
    my $mean_vec_ref = shift;
    my $covar_ref = shift;
    foreach my $data_id (keys %{$self->{_data}}) {
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);        
        $data_vec->set_col( 0, $self->{_data}->{$data_id});
        if ($self->{_debug}) {
            display_matrix("data vec in find prob function", $data_vec);
            display_matrix("mean vec in find prob function", $mean_vec_ref);
            display_matrix("covariance in find prob function", $covar_ref);
        }
        my $datavec_minus_mean = $data_vec - $mean_vec_ref;
        display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
        my $exponent = undef;
        if ($self->{_data_dimensions} > 1) {
            $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
                matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
        } elsif (defined blessed($covar_ref)) {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
        } else {
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
        }
        print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
        my $coefficient = undef;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster; 
        }
    }        
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K.  Try again. " if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable." if $cluster_size == 0;
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} 
                                  @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
    }        
    return \@new_cluster_centers;
}

# The following routine is for computing the distance between a data point specified
# by its symbolic name in the master datafile and a point (such as the center of a
# cluster) expressed as a vector of coordinates:
sub distance {
    my $self = shift;
    my $ele1_id = shift @_;            # symbolic name of data sample
    my @ele1 = @{$self->{_data}->{$ele1_id}};
    my @ele2 = @{shift @_};
    die "wrong data types for distance calculation" if @ele1 != @ele2;
    my $how_many = @ele1;
    my $squared_sum = 0;
    foreach my $i (0..$how_many-1) {
        $squared_sum += ($ele1[$i] - $ele2[$i])**2;
    }    
    my $dist = sqrt $squared_sum;
    return $dist;
}

# The following routine does the same as above but now both
# arguments are expected to be arrays of numbers:
sub distance2 {
    my $self = shift;
    my @ele1 = @{shift @_};
    my @ele2 = @{shift @_};
    die "wrong data types for distance calculation" if @ele1 != @ele2;
    my $how_many = @ele1;
    my $squared_sum = 0;
    foreach my $i (0..$how_many-1) {
        $squared_sum += ($ele1[$i] - $ele2[$i])**2;
    }    
    return sqrt $squared_sum;
}

sub write_naive_bayes_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    unlink glob "naive_bayes_cluster*.txt";
    foreach my $i (1..@clusters) {
        my $filename = "naive_bayes_cluster" . $i . ".txt";
        print "Writing cluster $i to file $filename\n"
                            if $self->{_terminal_output};
        open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i-1]}) {        
            print FILEHANDLE "$ele\n";
        }
        close FILEHANDLE;
    }
}

sub write_posterior_prob_clusters_above_threshold_to_files {
    my $self = shift;
    my $theta = shift;
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $data_tag
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] 
                                            > $theta;
        }
    }
    unlink glob "posterior_prob_cluster*.txt";
    foreach my $i (1..@class_distributions) {
        my $filename = "posterior_prob_cluster" . $i . ".txt";
        print "Writing posterior prob cluster $i to file $filename\n"
                            if $self->{_terminal_output};
        open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$class_distributions[$i-1]}) {        
            print FILEHANDLE "$ele\n";
        }
        close FILEHANDLE;
    }
}

sub DESTROY {
    unlink "__temp_" . basename($_[0]->{_datafile});
    unlink "__temp_data_" . basename($_[0]->{_datafile});
    unlink "__temp2_" . basename($_[0]->{_datafile});
    unlink glob "__temp1dhist*";
    unlink glob "__contour*";
}

#############################  Visualization Code ###############################

#  The visualize_clusters() implementation displays as a plot in your terminal window
#  the clusters constructed by the EM algorithm.  It can show either 2D plots or
#  3D plots that you can rotate interactively for better visualization.  For
#  multidimensional data, as to which 2D or 3D dimensions are used for visualization
#  is controlled by the mask you must supply as an argument to the method.  Should it
#  happen that only one on bit is specified for the mask, visualize_clusters()
#  aborts.
#
#  The visualization code consists of first accessing each of clusters created by the
#  EM() subroutine.  Note that the clusters contain only the symbolic names for the
#  individual records in the source data file.  We therefore next reach into the
#  $self->{_data} hash and get the data coordinates associated with each symbolic
#  label in a cluster.  The numerical data thus generated is then written out to a
#  temp file.  When doing so we must remember to insert TWO BLANK LINES between the
#  data blocks corresponding to the different clusters.  This constraint is imposed
#  on us by Gnuplot when plotting data from the same file since we want to use
#  different point styles for the data points in different cluster files.
#  Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
#  module to plot the data clusters.
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # The following section is for the superimposed one-Mahalanobis-distance-unit 
    # ellipses that are shown only for 2D plots:
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";



( run in 2.299 seconds using v1.01-cache-2.11-cpan-56fb94df46f )