Algorithm-ExpectationMaximization
view release on metacpan or search on metacpan
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
foreach my $ele (@{$clusters[$i-1]}) {
print FILEHANDLE "$ele\n";
}
close FILEHANDLE;
}
}
sub write_posterior_prob_clusters_above_threshold_to_files {
my $self = shift;
my $theta = shift;
my @class_distributions;
foreach my $cluster_index (0..$self->{_K}-1) {
push @class_distributions, [];
}
foreach my $data_tag (@{$self->{_data_id_tags}}) {
foreach my $cluster_index (0..$self->{_K}-1) {
push @{$class_distributions[$cluster_index]}, $data_tag
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index]
> $theta;
}
}
unlink glob "posterior_prob_cluster*.txt";
foreach my $i (1..@class_distributions) {
my $filename = "posterior_prob_cluster" . $i . ".txt";
print "Writing posterior prob cluster $i to file $filename\n"
if $self->{_terminal_output};
open FILEHANDLE, "| sort > $filename" or die "Unable to open file: $!";
foreach my $ele (@{$class_distributions[$i-1]}) {
print FILEHANDLE "$ele\n";
}
close FILEHANDLE;
}
}
sub DESTROY {
unlink "__temp_" . basename($_[0]->{_datafile});
unlink "__temp_data_" . basename($_[0]->{_datafile});
unlink "__temp2_" . basename($_[0]->{_datafile});
unlink glob "__temp1dhist*";
unlink glob "__contour*";
}
############################# Visualization Code ###############################
# The visualize_clusters() implementation displays as a plot in your terminal window
# the clusters constructed by the EM algorithm. It can show either 2D plots or
# 3D plots that you can rotate interactively for better visualization. For
# multidimensional data, as to which 2D or 3D dimensions are used for visualization
# is controlled by the mask you must supply as an argument to the method. Should it
# happen that only one on bit is specified for the mask, visualize_clusters()
# aborts.
#
# The visualization code consists of first accessing each of clusters created by the
# EM() subroutine. Note that the clusters contain only the symbolic names for the
# individual records in the source data file. We therefore next reach into the
# $self->{_data} hash and get the data coordinates associated with each symbolic
# label in a cluster. The numerical data thus generated is then written out to a
# temp file. When doing so we must remember to insert TWO BLANK LINES between the
# data blocks corresponding to the different clusters. This constraint is imposed
# on us by Gnuplot when plotting data from the same file since we want to use
# different point styles for the data points in different cluster files.
# Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
# module to plot the data clusters.
sub visualize_clusters {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || die "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || die "visualization mask missing";
$pause_time = shift;
} else {
die "visualize_clusters() called with wrong args";
}
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
# The following section is for the superimposed one-Mahalanobis-distance-unit
# ellipses that are shown only for 2D plots:
if ($visualization_data_field_width == 2) {
foreach my $cluster_index (0..$self->{_K}-1) {
my $contour_filename = "__contour_" . $cluster_index . ".dat";
my $mean = $self->{_cluster_means}->[$cluster_index];
my $covariance = $self->{_cluster_covariances}->[$cluster_index];
my ($mux,$muy) = $mean->as_list();
my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
my ($sigmayx,$vary) = $covariance->row(1)->as_list();
die "Your covariance matrix does not look right"
unless $sigmaxy == $sigmayx;
my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2
exponent(x,y) = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
}
my $K = scalar @{$self->{_clusters}};
my $filename = basename($master_datafile);
my $temp_file = "__temp_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!";
foreach my $cluster (@{$self->{_clusters}}) {
foreach my $item (@$cluster) {
print OUTPUT "@{$visualization_data{$item}}";
print OUTPUT "\n";
}
print OUTPUT "\n\n";
}
close OUTPUT;
my $plot;
if (!defined $pause_time) {
$plot = Graphics::GnuplotIF->new( persist => 1 );
} else {
$plot = Graphics::GnuplotIF->new();
}
my $arg_string = "";
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd("set noclip");
$plot->gnuplot_cmd("set pointsize 2");
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
}
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd("set noclip");
$plot->gnuplot_cmd("set pointsize 2");
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
my $ellipse_filename = "__contour_" . $i . ".dat";
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
}
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
my $all_joined_data = join ':', @all_data;
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
foreach my $i (0..@separated-1) {
$separated[$i] =~ s/SEPERATOR//g;
my @cluster_for_hist = split /:/, $separated[$i];
@cluster_for_hist = grep $_, @cluster_for_hist;
my ($minval,$maxval) = minmax(\@cluster_for_hist);
push @all_minvals, $minval;
push @all_maxvals, $maxval;
push @all_clusters_for_hist, \@cluster_for_hist;
}
push @all_minmaxvals, @all_minvals;
push @all_minmaxvals, @all_maxvals;
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
my $delta = ($abs_maxval - $abs_minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
foreach my $cindex (0..@all_clusters_for_hist-1) {
my $filename = basename($master_datafile);
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
close OUTPUT;
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "splot $arg_string" );
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "plot $arg_string" );
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd( "plot $arg_string" );
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
}
# This subroutine is the same as above except that it makes PNG plots (for hardcopy
# printing) of the clusters.
sub plot_hardcopy_clusters {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || die "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || die "visualization mask missing";
$pause_time = shift;
} else {
die "visualize_clusters() called with wrong args";
}
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
if ($visualization_data_field_width == 2) {
foreach my $cluster_index (0..$self->{_K}-1) {
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
}
my $K = scalar @{$self->{_clusters}};
my $filename = basename($master_datafile);
my $temp_file = "__temp_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!";
foreach my $cluster (@{$self->{_clusters}}) {
foreach my $item (@$cluster) {
print OUTPUT "@{$visualization_data{$item}}";
print OUTPUT "\n";
}
print OUTPUT "\n\n";
}
close OUTPUT;
my $plot;
if (!defined $pause_time) {
$plot = Graphics::GnuplotIF->new( persist => 1 );
} else {
$plot = Graphics::GnuplotIF->new();
}
my $arg_string = "";
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
}
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$K-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
my $ellipse_filename = "__contour_" . $i . ".dat";
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
}
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
my $all_joined_data = join ':', @all_data;
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
foreach my $i (0..@separated-1) {
$separated[$i] =~ s/SEPERATOR//g;
my @cluster_for_hist = split /:/, $separated[$i];
@cluster_for_hist = grep $_, @cluster_for_hist;
my ($minval,$maxval) = minmax(\@cluster_for_hist);
push @all_minvals, $minval;
push @all_maxvals, $maxval;
push @all_clusters_for_hist, \@cluster_for_hist;
}
push @all_minmaxvals, @all_minvals;
push @all_minmaxvals, @all_maxvals;
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
my $delta = ($abs_maxval - $abs_minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
foreach my $cindex (0..@all_clusters_for_hist-1) {
my $filename = basename($master_datafile);
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
close OUTPUT;
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( 'set terminal png color',
'set output "cluster_plot.png"');
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd('set terminal png',
'set output "cluster_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd('set terminal png',
'set output "cluster_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
}
}
# This method is for the visualization of the posterior class distributions. In
# other words, this method allows us to see the soft clustering produced by the EM
# algorithm. While much of the gnuplot logic here is the same as in the
# visualize_clusters() method, there are significant differences in how the data is
# pooled for the purpose of display.
sub visualize_distributions {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || die "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || die "visualization mask missing";
$pause_time = shift;
} else {
die "visualize_distributions() called with wrong args";
}
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
}
foreach my $data_tag (@{$self->{_data_id_tags}}) {
foreach my $cluster_index (0..$self->{_K}-1) {
push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
}
}
foreach my $distribution (@class_distributions) {
foreach my $item (@$distribution) {
print OUTPUT "@$item";
print OUTPUT "\n";
}
print OUTPUT "\n\n";
}
close OUTPUT;
my $plot;
if (!defined $pause_time) {
$plot = Graphics::GnuplotIF->new( persist => 1 );
} else {
$plot = Graphics::GnuplotIF->new();
}
my $arg_string = "";
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$self->{_K}-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
}
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$self->{_K}-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
my $ellipse_filename = "__contour2_" . $i . ".dat";
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
}
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
my $all_joined_data = join ':', @all_data;
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
foreach my $i (0..@separated-1) {
$separated[$i] =~ s/SEPERATOR//g;
my @cluster_for_hist = split /:/, $separated[$i];
@cluster_for_hist = grep $_, @cluster_for_hist;
my ($minval,$maxval) = minmax(\@cluster_for_hist);
push @all_minvals, $minval;
push @all_maxvals, $maxval;
push @all_clusters_for_hist, \@cluster_for_hist;
}
push @all_minmaxvals, @all_minvals;
push @all_minmaxvals, @all_maxvals;
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
my $delta = ($abs_maxval - $abs_minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
foreach my $cindex (0..@all_clusters_for_hist-1) {
my $localfilename = basename($filename);
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
# $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
close OUTPUT;
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "splot $arg_string" );
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "plot $arg_string" );
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd( "plot $arg_string" );
$plot->gnuplot_pause( $pause_time ) if defined $pause_time;
}
}
# This method is basically the same as the previous method, except that it is
# intended for making PNG files from the distributions.
sub plot_hardcopy_distributions {
my $self = shift;
my $v_mask;
my $pause_time;
if (@_ == 1) {
$v_mask = shift || die "visualization mask missing";
} elsif (@_ == 2) {
$v_mask = shift || die "visualization mask missing";
$pause_time = shift;
} else {
die "visualize_distributions() called with wrong args";
}
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
if ($visualization_data_field_width == 2) {
lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN
}
foreach my $data_tag (@{$self->{_data_id_tags}}) {
foreach my $cluster_index (0..$self->{_K}-1) {
push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
}
}
foreach my $distribution (@class_distributions) {
foreach my $item (@$distribution) {
print OUTPUT "@$item";
print OUTPUT "\n";
}
print OUTPUT "\n\n";
}
close OUTPUT;
my $plot;
if (!defined $pause_time) {
$plot = Graphics::GnuplotIF->new( persist => 1 );
} else {
$plot = Graphics::GnuplotIF->new();
}
my $arg_string = "";
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$self->{_K}-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
}
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
foreach my $i (0..$self->{_K}-1) {
my $j = $i + 1;
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
my $ellipse_filename = "__contour2_" . $i . ".dat";
$arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
}
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
my $all_joined_data = join ':', @all_data;
my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
foreach my $i (0..@separated-1) {
$separated[$i] =~ s/SEPERATOR//g;
my @cluster_for_hist = split /:/, $separated[$i];
@cluster_for_hist = grep $_, @cluster_for_hist;
my ($minval,$maxval) = minmax(\@cluster_for_hist);
push @all_minvals, $minval;
push @all_maxvals, $maxval;
push @all_clusters_for_hist, \@cluster_for_hist;
}
push @all_minmaxvals, @all_minvals;
push @all_minmaxvals, @all_maxvals;
my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
my $delta = ($abs_maxval - $abs_minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
foreach my $cindex (0..@all_clusters_for_hist-1) {
my $localfilename = basename($filename);
my $temp_file = "__temp1dhist_" . "$cindex" . "_" . $localfilename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
$histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
$arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
close OUTPUT;
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "posterior_prob_plot.png"');
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "posterior_prob_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "posterior_prob_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
}
}
# The method shown below should be called only AFTER you have called the method
# read_data_from_file(). The visualize_data() is meant for the visualization of the
# original data in its various 2D or 3D subspaces.
sub visualize_data {
my $self = shift;
my $v_mask = shift || die "visualization mask missing";
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
my $data_source = $self->{_data};
while ( my ($record_id, $data) = each %{$data_source} ) {
my @fields = @$data;
die "\nABORTED: Visualization mask size exceeds data record size"
if $#v_mask > $#fields;
my @data_fields;
foreach my $i (0..@fields-1) {
if ($v_mask[$i] eq '0') {
next;
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "Misformed visualization mask. It can only have 1s and 0s";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my $filename = basename($master_datafile);
my $temp_file;
$temp_file = "__temp_data_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!";
foreach my $datapoint (values %visualization_data) {
print OUTPUT "@$datapoint";
print OUTPUT "\n";
}
close OUTPUT;
my $plot = Graphics::GnuplotIF->new( persist => 1 );
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
my $plot_title = '"original data provided for EM"';
my $arg_string ;
if ($visualization_data_field_width > 2) {
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 2) {
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_} @all_data;
@all_data = grep $_, @all_data;
my ($minval,$maxval) = minmax(\@all_data);
my $delta = ($maxval - $minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
my $localfilename = basename($filename);
my $temp_file = "__temp1dhist_" . $localfilename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@all_data-1) {
$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
close OUTPUT;
}
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd( "plot $arg_string" );
}
}
# This method is the same as the one shown above, except that it is meant for
# creating PNG files of the displays.
sub plot_hardcopy_data {
my $self = shift;
my $v_mask = shift || die "visualization mask missing";
my $master_datafile = $self->{_datafile};
my @v_mask = split //, $v_mask;
my $visualization_mask_width = @v_mask;
my $original_data_mask = $self->{_mask};
my @mask = split //, $original_data_mask;
my $data_field_width = scalar grep {$_ eq '1'} @mask;
die "\n\nABORTED: The width of the visualization mask (including " .
"all its 1s and 0s) must equal the width of the original mask " .
"used for reading the data file (counting only the 1's)"
if $visualization_mask_width != $data_field_width;
my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
my %visualization_data;
my $data_source = $self->{_data};
while ( my ($record_id, $data) = each %{$data_source} ) {
my @fields = @$data;
die "\nABORTED: Visualization mask size exceeds data record size"
if $#v_mask > $#fields;
my @data_fields;
foreach my $i (0..@fields-1) {
if ($v_mask[$i] eq '0') {
next;
} elsif ($v_mask[$i] eq '1') {
push @data_fields, $fields[$i];
} else {
die "Misformed visualization mask. It can only have 1s and 0s";
}
}
$visualization_data{ $record_id } = \@data_fields;
}
my $filename = basename($master_datafile);
my $temp_file;
$temp_file = "__temp_data_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!";
foreach my $datapoint (values %visualization_data) {
print OUTPUT "@$datapoint";
print OUTPUT "\n";
}
close OUTPUT;
my $plot = Graphics::GnuplotIF->new( persist => 1 );
$plot->gnuplot_cmd( "set noclip" );
$plot->gnuplot_cmd( "set pointsize 2" );
my $plot_title = '"original data provided for EM"';
my $arg_string ;
if ($visualization_data_field_width > 2) {
$arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 2) {
$arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
} elsif ($visualization_data_field_width == 1 ) {
open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
my @all_data = <INPUT>;
close INPUT;
@all_data = map {chomp $_; $_} @all_data;
@all_data = grep $_, @all_data;
my ($minval,$maxval) = minmax(\@all_data);
my $delta = ($maxval - $minval) / 100.0;
$plot->gnuplot_cmd("set boxwidth 3");
$plot->gnuplot_cmd("set style fill solid border -1");
$plot->gnuplot_cmd("set ytics out nomirror");
$plot->gnuplot_cmd("set style data histograms");
$plot->gnuplot_cmd("set style histogram clustered");
$plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
$plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
my $localfilename = basename($filename);
my $temp_file = "__temp1dhist_" . $localfilename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
print OUTPUT "Xstep histval\n";
my @histogram = (0) x 100;
foreach my $i (0..@all_data-1) {
$histogram[int( ($all_data[$i] - $minval) / $delta )]++;
}
foreach my $i (0..@histogram-1) {
print OUTPUT "$i $histogram[$i]\n";
}
$arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
close OUTPUT;
}
if ($visualization_data_field_width > 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "data_scatter_plot.png"');
$plot->gnuplot_cmd( "splot $arg_string" );
} elsif ($visualization_data_field_width == 2) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "data_scatter_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
} elsif ($visualization_data_field_width == 1) {
$plot->gnuplot_cmd( 'set terminal png',
'set output "data_scatter_plot.png"');
$plot->gnuplot_cmd( "plot $arg_string" );
}
}
################### Generating Synthetic Data for Clustering ###################
# The data generated corresponds to a multivariate distribution. The mean and the
# covariance of each Gaussian in the distribution are specified individually in a
# parameter file. The parameter file must also state the prior probabilities to be
# associated with each Gaussian. See the example parameter file param1.txt in the
# examples directory. Just edit this file for your own needs.
#
# The multivariate random numbers are generated by calling the Math::Random module.
# As you would expect, that module will insist that the covariance matrix you
# specify be symmetric and positive definite.
sub cluster_data_generator {
my $class = shift;
die "illegal call of a class method"
unless $class eq 'Algorithm::ExpectationMaximization';
my %args = @_;
my $input_parameter_file = $args{input_parameter_file};
my $output_file = $args{output_datafile};
my $N = $args{total_number_of_data_points};
my @all_params;
my $param_string;
if (defined $input_parameter_file) {
open INPUT, $input_parameter_file || "unable to open parameter file: $!";
@all_params = <INPUT>;
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
chomp @all_params;
$param_string = join ' ', @all_params;
( run in 4.420 seconds using v1.01-cache-2.11-cpan-5b529ec07f3 )