Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    my ($clusters, $cluster_centers) =
                              $self->cluster_for_given_K($K);
    $self->{_clusters} = $clusters;
    $self->{_cluster_centers} = $cluster_centers;  
}

# Used by the kmeans part of the code for the initialization of the EM algorithm:
sub cluster_for_given_K {
    my $self = shift;
    my $K = shift;
    my $cluster_centers = $self->get_initial_cluster_centers($K);
    my $clusters = $self->assign_data_to_clusters_initial($cluster_centers);  
    my $cluster_nonexistant_flag = 0;
    foreach my $trial (0..2) {
        ($clusters, $cluster_centers) =
                         $self->assign_data_to_clusters( $clusters, $K );
        my $num_of_clusters_returned = @$clusters;
        foreach my $cluster (@$clusters) {
            $cluster_nonexistant_flag = 1 if ((!defined $cluster) 
                                             ||  (@$cluster == 0));
        }
        last unless $cluster_nonexistant_flag;
    }
    return ($clusters, $cluster_centers);
}

# Used by the kmeans part of the code for the initialization of the EM algorithm:
sub get_initial_cluster_centers {
    my $self = shift;
    my $K = shift;
    if ($self->{_data_dimensions} == 1) {
        my @one_d_data;
        foreach my $j (0..$self->{_N}-1) {
            my $tag = $self->{_data_id_tags}[$j];     
            push @one_d_data, $self->{_data}->{$tag}->[0];
        }
        my @peak_points = 
                    find_peak_points_in_given_direction(\@one_d_data,$K);
        print "highest points at data values: @peak_points\n" 
                                                         if $self->{_debug};
        my @cluster_centers;
        foreach my $peakpoint (@peak_points) {
            push @cluster_centers, [$peakpoint];
        }
        return \@cluster_centers;
    }
    my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_id_tags}.
    # The actual data for clustering is stored in a hash at $self->{_data}
    # whose keys are the record labels; the value associated with each
    # key is the array holding the corresponding numerical multidimensional
    # data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        $matrix->set_col($j, $data);
    }
    if ($self->{_debug}) {
        print "\nDisplaying the original data as a matrix:";
        display_matrix( $matrix ); 
    }
    foreach my $j (0..$num_cols-1) {
        $mean_vec += $matrix->col($j);
    }
    $mean_vec *=  1.0 / $num_cols;
    if ($self->{_debug}) {
        print "Displaying the mean vector for the data:";
        display_matrix( $mean_vec );
    }
    foreach my $j (0..$num_cols-1) {
        my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
        $matrix->set_col($j, \@new_col);
    }
    if ($self->{_debug}) {
        print "Displaying mean subtracted data as a matrix:";
        display_matrix( $matrix ); 
    }
    my $transposed = transpose( $matrix );
    if ($self->{_debug}) {
        print "Displaying transposed data matrix:";
        display_matrix( $transposed );
    }
    my $covariance = matrix_multiply( $matrix, $transposed );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for your data:";
        display_matrix( $covariance );
    }
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > 
                                     $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < 
                                     $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" 
                                                          if $self->{_debug};
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        print "Eigenvector $i:   @vec\n" if $self->{_debug};
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @max_var_direction;
    # Each element of the array @largest_eigen_vec is a Math::Complex object
    foreach my $k (0..@largest_eigen_vec-1) {
        my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
        if ($theta eq '0') {
            $max_var_direction[$k] = $mag;
        } elsif ($theta eq 'pi') {
            $max_var_direction[$k] = -1.0 * $mag;
        } else {
            die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

                                            > $theta;
        }
    }
    unlink glob "posterior_prob_cluster*.txt";
    foreach my $i (1..@class_distributions) {
        my $filename = "posterior_prob_cluster" . $i . ".txt";
        print "Writing posterior prob cluster $i to file $filename\n"
                            if $self->{_terminal_output};
        open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
        foreach my $ele (@{$class_distributions[$i-1]}) {        
            print FILEHANDLE "$ele\n";
        }
        close FILEHANDLE;
    }
}

sub DESTROY {
    unlink "__temp_" . basename($_[0]->{_datafile});
    unlink "__temp_data_" . basename($_[0]->{_datafile});
    unlink "__temp2_" . basename($_[0]->{_datafile});
    unlink glob "__temp1dhist*";
    unlink glob "__contour*";
}

#############################  Visualization Code ###############################

#  The visualize_clusters() implementation displays as a plot in your terminal window
#  the clusters constructed by the EM algorithm.  It can show either 2D plots or
#  3D plots that you can rotate interactively for better visualization.  For
#  multidimensional data, as to which 2D or 3D dimensions are used for visualization
#  is controlled by the mask you must supply as an argument to the method.  Should it
#  happen that only one on bit is specified for the mask, visualize_clusters()
#  aborts.
#
#  The visualization code consists of first accessing each of clusters created by the
#  EM() subroutine.  Note that the clusters contain only the symbolic names for the
#  individual records in the source data file.  We therefore next reach into the
#  $self->{_data} hash and get the data coordinates associated with each symbolic
#  label in a cluster.  The numerical data thus generated is then written out to a
#  temp file.  When doing so we must remember to insert TWO BLANK LINES between the
#  data blocks corresponding to the different clusters.  This constraint is imposed
#  on us by Gnuplot when plotting data from the same file since we want to use
#  different point styles for the data points in different cluster files.
#  Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
#  module to plot the data clusters.
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    # The following section is for the superimposed one-Mahalanobis-distance-unit 
    # ellipses that are shown only for 2D plots:
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    }
}

# This subroutine is the same as above except that it makes PNG plots (for hardcopy
# printing) of the clusters.
sub plot_hardcopy_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
#            $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png color',
                            'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd('set terminal png',
                           'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd('set terminal png',
                           'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

# This method is for the visualization of the posterior class distributions.  In
# other words, this method allows us to see the soft clustering produced by the EM
# algorithm.  While much of the gnuplot logic here is the same as in the
# visualize_clusters() method, there are significant differences in how the data is
# pooled for the purpose of display.
sub visualize_distributions {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_distributions() called with wrong args";
    }
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour2_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
         }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
#            $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    }
}

# This method is basically the same as the previous method, except that it is
# intended for making PNG files from the distributions.
sub plot_hardcopy_distributions {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_distributions() called with wrong args";
    }
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour2_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
         }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;

    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

#  The method shown below should be called only AFTER you have called the method
#  read_data_from_file().  The visualize_data() is meant for the visualization of the
#  original data in its various 2D or 3D subspaces.
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || die "visualization mask missing";

    my $master_datafile = $self->{_datafile};

    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    my $data_source = $self->{_data};
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    $temp_file = "__temp_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  '"original data provided for EM"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_} @all_data;
        @all_data = grep $_, @all_data;
        my ($minval,$maxval) = minmax(\@all_data);
        my $delta = ($maxval - $minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        my $localfilename = basename($filename);
        my $temp_file = "__temp1dhist_" .  $localfilename;
        unlink $temp_file if -e $temp_file;
        open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
        print OUTPUT "Xstep histval\n";
        my @histogram = (0) x 100;
        foreach my $i (0..@all_data-1) {
            $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
        }
        foreach my $i (0..@histogram-1) {
            print OUTPUT "$i $histogram[$i]\n";        
        }
        $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
        close OUTPUT;
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

# This method is the same as the one shown above, except that it is meant for
# creating PNG files of the displays.
sub plot_hardcopy_data {
    my $self = shift;
    my $v_mask = shift || die "visualization mask missing";
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    my $data_source = $self->{_data};
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    $temp_file = "__temp_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;

    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  '"original data provided for EM"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_} @all_data;
        @all_data = grep $_, @all_data;
        my ($minval,$maxval) = minmax(\@all_data);
        my $delta = ($maxval - $minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        my $localfilename = basename($filename);
        my $temp_file = "__temp1dhist_" .  $localfilename;
        unlink $temp_file if -e $temp_file;
        open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
        print OUTPUT "Xstep histval\n";
        my @histogram = (0) x 100;
        foreach my $i (0..@all_data-1) {
            $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
        }
        foreach my $i (0..@histogram-1) {
            print OUTPUT "$i $histogram[$i]\n";        
        }
        $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
        close OUTPUT;
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}


###################  Generating Synthetic Data for Clustering  ###################

#  The data generated corresponds to a multivariate distribution.  The mean and the
#  covariance of each Gaussian in the distribution are specified individually in a
#  parameter file. The parameter file must also state the prior probabilities to be
#  associated with each Gaussian.  See the example parameter file param1.txt in the
#  examples directory.  Just edit this file for your own needs.
#
#  The multivariate random numbers are generated by calling the Math::Random module.

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

  $clusterer->seed_the_clusters();
  $clusterer->EM();
  $clusterer->run_bayes_classifier();
  my $clusters = $clusterer->return_disjoint_clusters();

  #  where the call to `EM()' is the invocation of the expectation-maximization
  #  algorithm.  The call to `srand(time)' is to seed the pseudo random number
  #  generator afresh for each run of the cluster seeding procedure.  If you want to
  #  see repeatable results from one run to another of the algorithm with random
  #  seeding, you would obviously not invoke `srand(time)'.

  #  The call `run_bayes_classifier()' shown above carries out a disjoint clustering
  #  of all the data points using the naive Bayes' classifier. And the call
  #  `return_disjoint_clusters()' returns the clusters thus formed to you.  Once you
  #  have obtained access to the clusters in this manner, you can display them in
  #  your terminal window by

  foreach my $index (0..@$clusters-1) {
      print "Cluster $index (Naive Bayes):   @{$clusters->[$index]}\n\n"
  }

  #  If you would like to also see the clusters purely on the basis of the posterior
  #  class probabilities exceeding a threshold, call

  my $theta1 = 0.2;
  my $posterior_prob_clusters =
           $clusterer->return_clusters_with_posterior_probs_above_threshold($theta1);

  #  where you can obviously set the threshold $theta1 to any value you wish.  Note
  #  that now you may end up with clusters that overlap.  You can display them in
  #  your terminal window in the same manner as shown above for the naive Bayes'
  #  clusters.

  #  You can write the naive Bayes' clusters out to files, one cluster per file, by
  #  calling

  $clusterer->write_naive_bayes_clusters_to_files();  

  #  The clusters are placed in files with names like

         naive_bayes_cluster1.txt
         naive_bayes_cluster2.txt
         ...

  #  In the same manner, you can write out the posterior probability based possibly
  #  overlapping clusters to files by calling:

  $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);

  #  where the threshold $theta1 sets the probability threshold for deciding which
  #  data elements to place in a cluster.  These clusters are placed in files with
  #  names like

         posterior_prob_cluster1.txt
         posterior_prob_cluster2.txt
         ...

  # CLUSTER VISUALIZATION:

  #  You must first set the mask for cluster visualization. This mask tells the 
  #  module which 2D or 3D subspace of the original data space you wish to visualize 
  #  the clusters in:

  my $visualization_mask = "111";
  $clusterer->visualize_clusters($visualization_mask);
  $clusterer->visualize_distributions($visualization_mask);
  $clusterer->plot_hardcopy_clusters($visualization_mask);
  $clusterer->plot_hardcopy_distributions($visualization_mask);

  #  where the last two invocations are for writing out the PNG plots of the
  #  visualization displays to disk files.  The PNG image of the posterior
  #  probability distributions is written out to a file named posterior_prob_plot.png
  #  and the PNG image of the disjoint clusters to a file called cluster_plot.png.

  # SYNTHETIC DATA GENERATION:

  #  The module has been provided with a class method for generating multivariate
  #  data for experimenting with the EM algorithm.  The data generation is controlled
  #  by the contents of a parameter file that is supplied as an argument to the data
  #  generator method.  The priors, the means, and the covariance matrices in the
  #  parameter file must be according to the syntax shown in the `param1.txt' file in
  #  the `examples' directory. It is best to edit a copy of this file for your
  #  synthetic data generation needs.

  my $parameter_file = "param1.txt";
  my $out_datafile = "mydatafile1.dat";
  Algorithm::ExpectationMaximization->cluster_data_generator(
                          input_parameter_file => $parameter_file,
                          output_datafile => $out_datafile,
                          total_number_of_data_points => $N );

  #  where the value of $N is the total number of data points you would like to see
  #  generated for all of the Gaussians.  How this total number is divided up amongst
  #  the Gaussians is decided by the prior probabilities for the Gaussian components
  #  as declared in input parameter file.  The synthetic data may be visualized in a
  #  terminal window and the visualization written out as a PNG image to a diskfile
  #  by

  my $data_visualization_mask = "11";                                            
  $clusterer->visualize_data($data_visualization_mask);                          
  $clusterer->plot_hardcopy_data($data_visualization_mask);


=head1 CHANGES

Version 1.22 should work with data in CSV files.

Version 1.21 incorporates minor code clean up.  Overall, the module implementation
remains unchanged.

Version 1.2 allows the module to also be used for 1-D data.  The visualization code
for 1-D shows the clusters through their histograms.

Version 1.1 incorporates much cleanup of the documentation associated with the
module.  Both the top-level module documentation, especially the Description part,
and the comments embedded in the code were revised for better utilization of the
module.  The basic implementation code remains unchanged.


=head1 DESCRIPTION

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

know the data to be perfectly multimodal Gaussian, EM is probably the most magical
approach to clustering multidimensional data.  Consider the case of clustering
three-dimensional data.  Each Gaussian cluster in 3D space is characterized by the
following 10 variables: the 6 unique elements of the C<3x3> covariance matrix (which
must be symmetric positive-definite), the 3 unique elements of the mean, and the
prior associated with the Gaussian. Now let's say you expect to see six Gaussians in
your data.  What that means is that you would want the values for 59 variables
(remember the unit-summation constraint on the class priors which reduces the overall
number of variables by one) to be estimated by the algorithm that seeks to discover
the clusters in your data.  What's amazing is that, despite the large number of
variables that must be optimized simultaneously, the EM algorithm will very likely
give you a good approximation to the right answer.

At its core, EM depends on the notion of unobserved data and the averaging of the
log-likelihood of the data actually observed over all admissible probabilities for
the unobserved data.  But what is unobserved data?  While in some cases where EM is
used, the unobserved data is literally the missing data, in others, it is something
that cannot be seen directly but that nonetheless is relevant to the data actually
observed. For the case of clustering multidimensional numerical data that can be
modeled as a Gaussian mixture, it turns out that the best way to think of the
unobserved data is in terms of a sequence of random variables, one for each observed
data point, whose values dictate the selection of the Gaussian for that data point.
This point is explained in great detail in my on-line tutorial at
L<https://engineering.purdue.edu/kak/Tutorials/ExpectationMaximization.pdf>.

The EM algorithm in our context reduces to an iterative invocation of the following
steps: (1) Given the current guess for the means and the covariances of the different
Gaussians in our mixture model, use Bayes' Rule to update the posterior class
probabilities at each of the data points; (2) Using the updated posterior class
probabilities, first update the class priors; (3) Using the updated class priors,
update the class means and the class covariances; and go back to Step (1).  Ideally,
the iterations should terminate when the expected log-likelihood of the observed data
has reached a maximum and does not change with any further iterations.  The stopping
rule used in this module is the detection of no change over three consecutive
iterations in the values calculated for the priors.

This module provides three different choices for seeding the clusters: (1) random,
(2) kmeans, and (3) manual.  When random seeding is chosen, the algorithm randomly
selects C<K> data elements as cluster seeds.  That is, the data vectors associated
with these seeds are treated as initial guesses for the means of the Gaussian
distributions.  The covariances are then set to the values calculated from the entire
dataset with respect to the means corresponding to the seeds. With kmeans seeding, on
the other hand, the means and the covariances are set to whatever values are returned
by the kmeans algorithm.  And, when seeding is set to manual, you are allowed to
choose C<K> data elements --- by specifying their tag names --- for the seeds.  The
rest of the EM initialization for the manual mode is the same as for the random mode.
The algorithm allows for the initial priors to be specified for the manual mode of
seeding.

Much of code for the kmeans based seeding of EM was drawn from the
C<Algorithm::KMeans> module by me. The code from that module used here corresponds to
the case when the C<cluster_seeding> option in the C<Algorithm::KMeans> module is set
to C<smart>.  The C<smart> option for KMeans consists of subjecting the data to a
principal components analysis (PCA) to discover the direction of maximum variance in
the data space.  The data points are then projected on to this direction and a
histogram constructed from the projections.  Centers of the C<K> largest peaks in
this smoothed histogram are used to seed the KMeans based clusterer.  As you'd
expect, the output of the KMeans used to seed the EM algorithm.

This module uses two different criteria to measure the quality of the clustering
achieved. The first is the Minimum Description Length (MDL) proposed originally by
Rissanen (J. Rissanen: "Modeling by Shortest Data Description," Automatica, 1978, and
"A Universal Prior for Integers and Estimation by Minimum Description Length," Annals
of Statistics, 1983.)  The MDL criterion is a difference of a log-likelihood term for
all of the observed data and a model-complexity penalty term. In general, both the
log-likelihood and the model-complexity terms increase as the number of clusters
increases.  The form of the MDL criterion in this module uses for the penalty term
the Bayesian Information Criterion (BIC) of G. Schwartz, "Estimating the Dimensions
of a Model," The Annals of Statistics, 1978.  In general, the smaller the value of
MDL quality measure, the better the clustering of the data.

For our second measure of clustering quality, we use `trace( SW^-1 . SB)' where SW is
the within-class scatter matrix, more commonly denoted S_w, and SB the between-class
scatter matrix, more commonly denoted S_b (the underscore means subscript).  This
measure can be thought of as the normalized average distance between the clusters,
the normalization being provided by average cluster covariance SW^-1. Therefore, the
larger the value of this quality measure, the better the separation between the
clusters.  Since this measure has its roots in the Fisher linear discriminant
function, we incorporate the word C<fisher> in the name of the quality measure.
I<Note that this measure is good only when the clusters are disjoint.> When the
clusters exhibit significant overlap, the numbers produced by this quality measure
tend to be generally meaningless.

=head1 METHODS

The module provides the following methods for EM based
clustering, for cluster visualization, for data
visualization, and for the generation of data for testing a
clustering algorithm:

=over

=item B<new():>

A call to C<new()> constructs a new instance of the
C<Algorithm::ExpectationMaximization> class.  A typical form
of this call when you want to use random option for seeding
the algorithm is:

    my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile            => $datafile,
                                mask                => $mask,
                                K                   => 3,
                                max_em_iterations   => 300,
                                seeding             => 'random',
                                terminal_output     => 1,
                    );

where C<K> is the expected number of clusters and
C<max_em_iterations> the maximum number of EM iterations
that you want to allow until convergence is achieved.
Depending on your dataset and on the choice of the initial
seeds, the actual number of iterations used could be as few
as 10 and as many as reaching 300.  The output produced by
the algorithm shows the actual number of iterations used to
arrive at convergence.

The data file supplied through the C<datafile> option is
expected to contain entries in the following format

   c20  0  10.7087017086940  9.63528386251712  10.9512155258108  ...

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

This method returns a reference to an array of C<K>
anonymous arrays, each consisting of the symbolic names for
the data records where the posterior class probability
exceeds the threshold as specified by C<$theta1>.
Subsequently, you can access each of these
posterior-probability based clusters through a loop
construct such as

    foreach my $index (0..@$posterior_prob_clusters-1) {
        print "Cluster $index (based on posterior probs exceeding $theta1): " .
              "@{$posterior_prob_clusters->[$index]}\n\n"
    }

=item B<write_posterior_prob_clusters_above_threshold_to_files($theta1):>

    $clusterer->write_posterior_prob_clusters_above_threshold_to_files($theta1);

This call writes out the posterior-probability based soft
clusters to disk files.  As in the previous method, the
threshold C<$theta1> sets the probability threshold for
deciding which data elements belong to a cluster.  These
clusters are placed in files with names like

    posterior_prob_cluster1.txt
    posterior_prob_cluster2.txt
    ...

=item B<return_individual_class_distributions_above_given_threshold($theta):>

    my $theta2 = 0.00001;
    my $class_distributions =
      $clusterer->return_individual_class_distributions_above_given_threshold($theta2);

This is the method to call if you wish to see the individual
Gaussians in your own script. The method returns a reference
to an array of anonymous arrays, with each anonymous array
representing data membership in each Gaussian.  Only those
data points are included in each Gaussian where the
probability exceeds the threshold C<$theta2>. Note that the
larger the covariance and the higher the data
dimensionality, the smaller this threshold must be for you
to see any of the data points in a Gaussian.  After you have
accessed the Gaussian mixture in this manner, you can
display the data membership in each Gaussian through the
following sort of a loop:

    foreach my $index (0..@$class_distributions-1) {
        print "Gaussian Distribution $index (only shows data elements " .
              "whose probabilities exceed the threshold $theta2:  " .
              "@{$class_distributions->[$index]}\n\n"
    }

=item B<visualize_clusters($visualization_mask):>

    my $visualization_mask = "11";
    $clusterer->visualize_clusters($visualization_mask);

The visualization mask here does not have to be identical to
the one used for clustering, but must be a subset of that
mask.  This is convenient for visualizing the clusters in
two- or three-dimensional subspaces of the original space.
The subset is specified by placing `0's in the positions
corresponding to the dimensions you do NOT want to see
through visualization.  Depending on the mask used, this
method creates a 2D or a 3D scatter plot of the clusters
obtained through the naive Bayes' classification rule.

=item B<visualize_distributions($visualization_mask):>

    $clusterer->visualize_distributions($visualization_mask);

This is the method to call if you want to visualize the soft
clustering corresponding to the posterior class
probabilities exceeding the threshold specified in the call
to
C<return_clusters_with_posterior_probs_above_threshold($theta1)>.
Again, depending on the visualization mask used, you will
see either a 2D plot or a 3D scatter plot.

=item B<plot_hardcopy_clusters($visualization_mask):>

    $clusterer->plot_hardcopy_clusters($visualization_mask);

This method create a PNG file from the C<gnuplot> created
display of the naive Bayes' clusters obtained from the data.
The plotting functionality of C<gnuplot> is accessed through
the Perl wrappers provided by the C<Graphics::GnuplotIF>
module.

=item B<plot_hardcopy_distributions($visualization_mask):>

    $clusterer->plot_hardcopy_distributions($visualization_mask);

This method create a PNG file from the C<gnuplot> created
display of the clusters that correspond to the posterior
class probabilities exceeding a specified threshold. The
plotting functionality of C<gnuplot> is accessed through the
Perl wrappers provided by the C<Graphics::GnuplotIF> module.

=item B<display_fisher_quality_vs_iterations():>

    $clusterer->display_fisher_quality_vs_iterations();

This method measures the quality of clustering by
calculating the normalized average squared distance between
the cluster centers, the normalization being provided by the
average cluster covariance. See the Description for further
details.  In general, this measure is NOT useful for
overlapping clusters.

=item B<display_mdl_quality_vs_iterations():>

    $clusterer->display_mdl_quality_vs_iterations();

At the end of each iteration, this method measures the
quality of clustering my calculating its MDL (Minimum
Description Length).  As stated earlier in Description, the
MDL measure is a difference of a log-likelihood term for all
of the observed data and a model-complexity penalty term.
The smaller the value returned by this method, the better
the clustering.



( run in 0.489 second using v1.01-cache-2.11-cpan-adec679a428 )