PUT results from the CPAN

Algorithm-ExpectationMaximization

view release on metacpan or search on metacpan

lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN

        _mdl_quality_vs_iterations      => [],
    }, $class;
}


sub read_data_from_file {
    my $self = shift;
    my $filename = $self->{_datafile};
    $self->read_data_from_file_csv() if $filename =~ /.csv$/;
    $self->read_data_from_file_dat() if $filename =~ /.dat$/;
}

sub read_data_from_file_csv {
    my $self = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_datafile} || die "you did not specify a file with the data to be clustered";
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    open FILEIN, $filename or die "Unable to open $filename: $!";
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    local $/ = undef;
    my @all_data = split /\s+/, <FILEIN>;
    my %data_hash = ();
    my @data_tags = ();
    foreach my $record (@all_data) {    
        my @splits = split /,/, $record;
        die "\nYour mask size (including `N' and 1's and 0's) does not match\n" .
            "the size of at least one of the data records in the file.\n"
            unless scalar(@mask) == scalar(@splits);
        my $record_name = shift @splits;
        $data_hash{$record_name} = \@splits;
        push @data_tags, $record_name;
    }
    $self->{_data} = \%data_hash;
    $self->{_data_id_tags} = \@data_tags;
    $self->{_N} = scalar @data_tags;
    # Need to make the following call to set the global mean and covariance:
    # my $covariance =  $self->estimate_mean_and_covariance(\@data_tags);
    # Need to make the following call to set the global eigenvec eigenval sets:
    # $self->eigen_analysis_of_covariance($covariance);
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!" 
                         if ( $self->{_N} < (2 * $self->{_K} ** 2) );
        print "\n\n\n";
    }
}

sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"
	if $self->{_terminal_output};
    open INPUT, $datafile
        or die "unable to open file $datafile: $!";
    chomp( my @raw_data = <INPUT> );
    close INPUT;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;
        next if $record =~ /^#/;
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size" 
            if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {
                next;
            } elsif ($mask[$i] eq 'N') {
                $record_id = $fields[$i];
            } elsif ($mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "misformed mask for reading the data file";
            }
        }
        my @nums = map {/$_num_regex/;$_} @data_fields;
        $self->{_data}->{ $record_id } = \@nums;
    }
    my @all_data_ids = keys %{$self->{_data}};
    $self->{_data_id_tags} = \@all_data_ids;
    $self->{_N} = scalar @all_data_ids;
    if ( defined($self->{_K}) && ($self->{_K} > 0) ) {
        carp "\n\nWARNING: YOUR K VALUE IS TOO LARGE.\n The number of data " .
             "points must satisfy the relation N > 2xK**2 where K is " .
             "the number of clusters requested for the clusters to be " .
             "meaningful $!" 
                         if ( $self->{_N} < (2 * $self->{_K} ** 2) );
    }
}


# This is the heart of the module --- in the sense that this method implements the EM
# algorithm for the estimating the parameters of a Gaussian mixture model for the
# data. In the implementation shown below, we declare convergence for the EM
# algorithm when the change in the class priors over three iterations falls below a
# threshold.  The current value of this threshold, as can be seen in the function
# compare_array_floats(), is 0.00001.
sub EM {
    my $self = shift;
    $self->initialize_class_priors();
    for (my $em_iteration=0; $em_iteration < $self->{_max_em_iterations}; 
                                                             $em_iteration++) {
        if ($em_iteration == 0) {
            print "\nSeeding the EM algorithm with:\n";                     
            $self->display_seeding_stats();
            print "\nFinished displaying the seeding information\n";
            print "\nWill print out a dot for each iteration of EM:\n\n";
        }
        my $progress_indicator = $em_iteration % 5 == 0 ? $em_iteration : ".";
        print $progress_indicator;
        foreach my $data_id (@{$self->{_data_id_tags}}) {
            $self->{_class_probs_at_each_data_point}->{$data_id} = [];
            $self->{_expected_class_probs}->{$data_id} = [];
        }
        # Calculate prob(x | C_i) --- this is the prob of data point x as

lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN

    # ellipses that are shown only for 2D plots:
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd("set noclip");
        $plot->gnuplot_cmd("set pointsize 2");
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd("set noclip");
        $plot->gnuplot_cmd("set pointsize 2");
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    }
}

# This subroutine is the same as above except that it makes PNG plots (for hardcopy
# printing) of the clusters.
sub plot_hardcopy_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
        }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
#            $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png color',
                            'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd('set terminal png',
                           'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd('set terminal png',
                           'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

# This method is for the visualization of the posterior class distributions.  In
# other words, this method allows us to see the soft clustering produced by the EM
# algorithm.  While much of the gnuplot logic here is the same as in the
# visualize_clusters() method, there are significant differences in how the data is
# pooled for the purpose of display.
sub visualize_distributions {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_distributions() called with wrong args";
    }
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour2_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
         }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
        }
    }
    foreach my $distribution (@class_distributions) {
        foreach my $item (@$distribution) {
            print OUTPUT "@$item";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour2_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
#            $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    }
}

# This method is basically the same as the previous method, except that it is
# intended for making PNG files from the distributions.
sub plot_hardcopy_distributions {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || die "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || die "visualization mask missing";    
        $pause_time = shift;
    } else {
        die "visualize_distributions() called with wrong args";
    }
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    if ($visualization_data_field_width == 2) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            my $contour_filename = "__contour2_" . $cluster_index . ".dat";
            my $mean = $self->{_cluster_means}->[$cluster_index];
            my $covariance = $self->{_cluster_covariances}->[$cluster_index];
            my ($mux,$muy) = $mean->as_list();
            my ($varx,$sigmaxy) = $covariance->row(0)->as_list();
            my ($sigmayx,$vary) = $covariance->row(1)->as_list();
            die "Your covariance matrix does not look right" 
                unless $sigmaxy == $sigmayx;
            my ($sigmax,$sigmay) = (sqrt($varx),sqrt($vary));
my $argstring = <<"END";
set contour
mux = $mux
muy = $muy
sigmax = $sigmax
sigmay = $sigmay
sigmaxy = $sigmaxy
determinant = (sigmax**2)*(sigmay**2) - sigmaxy**2 
exponent(x,y)  = -0.5 * (1.0 / determinant) * ( ((x-mux)**2)*sigmay**2 + ((y-muy)**2)*sigmax**2 - 2*sigmaxy*(x-mux)*(y-muy) )
f(x,y) = exp( exponent(x,y) ) - 0.2
xmax = mux + 2 * sigmax
xmin = mux - 2 * sigmax
ymax = muy + 2 * sigmay
ymin = muy - 2 * sigmay
set xrange [ xmin : xmax ]
set yrange [ ymin : ymax ]
set isosamples 200
unset surface
set cntrparam levels discrete 0
set table \"$contour_filename\"
splot f(x,y)
unset table
END
            my $plot = Graphics::GnuplotIF->new();
            $plot->gnuplot_cmd( $argstring );
         }
    }
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_data}} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
        }
    }
    foreach my $distribution (@class_distributions) {
        foreach my $item (@$distribution) {
            print OUTPUT "@$item";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
        }
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour2_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;
            push @all_maxvals, $maxval;
            push @all_clusters_for_hist, \@cluster_for_hist;
        }
        push @all_minmaxvals, @all_minvals;
        push @all_minmaxvals, @all_maxvals;
        my ($abs_minval,$abs_maxval) = minmax(\@all_minmaxvals);
        my $delta = ($abs_maxval - $abs_minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;

    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

#  The method shown below should be called only AFTER you have called the method
#  read_data_from_file().  The visualize_data() is meant for the visualization of the
#  original data in its various 2D or 3D subspaces.
sub visualize_data {
    my $self = shift;
    my $v_mask = shift || die "visualization mask missing";

    my $master_datafile = $self->{_datafile};

    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    my $data_source = $self->{_data};
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    $temp_file = "__temp_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  '"original data provided for EM"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_} @all_data;
        @all_data = grep $_, @all_data;
        my ($minval,$maxval) = minmax(\@all_data);
        my $delta = ($maxval - $minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        my $localfilename = basename($filename);
        my $temp_file = "__temp1dhist_" .  $localfilename;
        unlink $temp_file if -e $temp_file;
        open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
        print OUTPUT "Xstep histval\n";
        my @histogram = (0) x 100;
        foreach my $i (0..@all_data-1) {
            $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
        }
        foreach my $i (0..@histogram-1) {
            print OUTPUT "$i $histogram[$i]\n";        
        }
        $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
        close OUTPUT;
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

# This method is the same as the one shown above, except that it is meant for
# creating PNG files of the displays.
sub plot_hardcopy_data {
    my $self = shift;
    my $v_mask = shift || die "visualization mask missing";
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    die "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    my $data_source = $self->{_data};
    while ( my ($record_id, $data) = each %{$data_source} ) {
        my @fields = @$data;
        die "\nABORTED: Visualization mask size exceeds data record size" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    $temp_file = "__temp_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;

    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  '"original data provided for EM"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_} @all_data;
        @all_data = grep $_, @all_data;
        my ($minval,$maxval) = minmax(\@all_data);
        my $delta = ($maxval - $minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        my $localfilename = basename($filename);
        my $temp_file = "__temp1dhist_" .  $localfilename;
        unlink $temp_file if -e $temp_file;
        open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
        print OUTPUT "Xstep histval\n";
        my @histogram = (0) x 100;
        foreach my $i (0..@all_data-1) {
            $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
        }
        foreach my $i (0..@histogram-1) {
            print OUTPUT "$i $histogram[$i]\n";        
        }
        $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
        close OUTPUT;
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}


###################  Generating Synthetic Data for Clustering  ###################

#  The data generated corresponds to a multivariate distribution.  The mean and the
#  covariance of each Gaussian in the distribution are specified individually in a
#  parameter file. The parameter file must also state the prior probabilities to be
#  associated with each Gaussian.  See the example parameter file param1.txt in the
#  examples directory.  Just edit this file for your own needs.
#
#  The multivariate random numbers are generated by calling the Math::Random module.
#  As you would expect, that module will insist that the covariance matrix you
#  specify be symmetric and positive definite.
sub cluster_data_generator {
    my $class = shift;
    die "illegal call of a class method" 
        unless $class eq 'Algorithm::ExpectationMaximization';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{total_number_of_data_points};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open INPUT, $input_parameter_file || "unable to open parameter file: $!";
        @all_params = <INPUT>;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "priors 0.34 0.33 0.33 " .
                        "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }
    my ($priors_string) = $param_string =~ /priors(.+?)cluster/;
    croak "You did not specify the prior probabilities in the parameter file"
        unless $priors_string;
    my @priors = split /\s+/, $priors_string;
    @priors = grep {/$_num_regex/; $_} @priors;
    my $sum = 0;
    foreach my $prior (@priors) {
        $sum += $prior;
    }
    croak "Your priors in the parameter file do not add up to 1"  unless $sum == 1;
    my ($rest_of_string) = $param_string =~ /priors\s*$priors_string(.*)$/;
    my @cluster_strings = split /[ ]*cluster[ ]*/, $rest_of_string;
    @cluster_strings = grep  $_, @cluster_strings;

    my $K = @cluster_strings;
    croak "Too many clusters requested" if $K > 12;
    croak "Mismatch between the number of values for priors and the number " . 
        "of clusters" unless $K == @priors;
    my @point_labels = ('a'..'z');
    print "Prior probabilities recorded from param file: @priors\n";
    print "Number of Gaussians used for the synthetic data: $K\n";
    my @means;
    my @covariances;
    my $data_dimension;
    foreach my $i (0..$K-1) {
        my @num_strings = split /  /, $cluster_strings[$i];
        my @cluster_mean = map {/$_num_regex/;$_} split / /, $num_strings[0];
        $data_dimension = @cluster_mean;
        push @means, \@cluster_mean;
        my @covariance_nums = map {/$_num_regex/;$_} split / /, $num_strings[1];
        croak "dimensionality error" if @covariance_nums != 
                                      ($data_dimension ** 2);
        my $cluster_covariance;
        foreach my $j (0..$data_dimension-1) {
            foreach my $k (0..$data_dimension-1) {        
                $cluster_covariance->[$j]->[$k] = 
                         $covariance_nums[$j*$data_dimension + $k];
            }
        }
        push @covariances, $cluster_covariance;
    }
    random_seed_from_phrase( 'hellomellojello' );
    my @data_dump;
    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( 
                           int($N * $priors[$i]), @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
        push @data_dump, @new_data;     
    }
    fisher_yates_shuffle( \@data_dump );
    open OUTPUT, ">$output_file";
    print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
        if $input_parameter_file;
    print OUTPUT "\#Total number of data points in this file: $N\n";
    print OUTPUT "\#Prior class probabilities for this data: @priors\n";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print OUTPUT "$coord ";
        }
        print OUTPUT "\n";
    }
    print "Data written out to file $output_file\n";
    close OUTPUT;
}

sub add_point_coords {
    my $self = shift;
    my @arr_of_ids = @{shift @_};      # array of data element names
    my @result;
    my $data_dimensionality = $self->{_data_dimensions};
    foreach my $i (0..$data_dimensionality-1) {
        $result[$i] = 0.0;
    }
    foreach my $id (@arr_of_ids) {
        my $ele = $self->{_data}->{$id};
        my $i = 0;
        foreach my $component (@$ele) {
            $result[$i] += $component;
            $i++;
        }
    }
    return \@result;
}

######################   Support Routines  ########################

# computer the outer product of two column vectors
sub outer_product {
    my $vec1 = shift;
    my $vec2 = shift;
    my ($nrows1, $ncols1) = ($vec1->rows(), $vec1->cols());
    my ($nrows2, $ncols2) = ($vec2->rows(), $vec2->cols());
    die "Outer product operation called with non-matching vectors"
        unless $ncols1 == 1 && $ncols2 == 1 && $nrows1 == $nrows2;
    my @vec_arr1 = $vec1->as_list();
    my @vec_arr2 = $vec2->as_list();
    my $outer_product = Math::GSL::Matrix->new($nrows1, $nrows2);
    foreach my $index (0..$nrows1-1) {
        my @new_row = map $vec_arr1[$index] * $_, @vec_arr2;
        $outer_product->set_row($index, \@new_row);
    }
    return $outer_product;
}

sub get_index_at_value {
    my $value = shift;
    my @array = @{shift @_};
    foreach my $i (0..@array-1) {
        return $i if $value == $array[$i];
    }
}

# This routine is really not necessary in light of the new `~~' operator in Perl.
# Will use the new operator in the next version.
sub vector_equal {
    my $vec1 = shift;
    my $vec2 = shift;
    die "wrong data types for distance calculation" if @$vec1 != @$vec2;
    foreach my $i (0..@$vec1-1){
        return 0 if $vec1->[$i] != $vec2->[$i];
    }
    return 1;
}

lib/Algorithm/ExpectationMaximization.pm view on Meta::CPAN

    $clusterer->display_mdl_quality_vs_iterations();

At the end of each iteration, this method measures the
quality of clustering my calculating its MDL (Minimum
Description Length).  As stated earlier in Description, the
MDL measure is a difference of a log-likelihood term for all
of the observed data and a model-complexity penalty term.
The smaller the value returned by this method, the better
the clustering.

=item B<return_estimated_priors():>

    my $estimated_priors = $clusterer->return_estimated_priors();
    print "Estimated class priors: @$estimated_priors\n";

This method can be used to access the final values of the
class priors as estimated by the EM algorithm.

=item  B<cluster_data_generator()>

    Algorithm::ExpectationMaximization->cluster_data_generator(
                            input_parameter_file => $parameter_file,
                            output_datafile => $out_datafile,
                            total_number_of_data_points => 300 
    );

for generating multivariate data for clustering if you wish to play with synthetic
data for experimenting with the EM algorithm.  The input parameter file must specify
the priors to be used for the Gaussians, their means, and their covariance matrices.
The format of the information contained in the parameter file must be as shown in the
file C<param1.txt> provided in the C<examples> directory.  It will be easiest for you
to just edit a copy of this file for your data generation needs.  In addition to the
format of the parameter file, the main constraint you need to observe in specifying
the parameters is that the dimensionality of the covariance matrices must correspond
to the dimensionality of the mean vectors.  The multivariate random numbers are
generated by calling the C<Math::Random> module.  As you would expect, this module
requires that the covariance matrices you specify in your parameter file be symmetric
and positive definite.  Should the covariances in your parameter file not obey this
condition, the C<Math::Random> module will let you know.

=item B<visualize_data($data_visualization_mask):>

    $clusterer->visualize_data($data_visualization_mask);                          

This is the method to call if you want to visualize the data
you plan to cluster with the EM algorithm.  You'd need to
specify argument mask in a manner similar to the
visualization of the clusters, as explained earlier.

=item B<plot_hardcopy_data($data_visualization_mask):>

    $clusterer->plot_hardcopy_data($data_visualization_mask); 

This method creates a PNG file that can be used to print out
a hardcopy of the data in different 2D and 3D subspaces of
the data space. The visualization mask is used to select the
subspace for the PNG image.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

This module produces two different types of clusters: the "hard" clusters and the
"soft" clusters.  The hard clusters correspond to the naive Bayes' classification of
the data points on the basis of the Gaussian distributions and the class priors
estimated by the EM algorithm. Such clusters partition the data into disjoint
subsets.  On the other hand, the soft clusters correspond to the posterior class
probabilities calculated by the EM algorithm.  A data element belongs to a cluster if
its posterior probability for that Gaussian exceeds a threshold.

After the EM algorithm has finished running, the hard clusters are created by
invoking the method C<run_bayes_classifier()> on an instance of the module and then
made user-accessible by calling C<return_disjoint_clusters()>.  These clusters may
then be displayed in a terminal window by dereferencing each element of the array
whose reference is returned b C<return_disjoint_clusters()>.  The hard clusters can
be written out to disk files by invoking C<write_naive_bayes_clusters_to_files()>.
This method writes out the clusters to files, one cluster per file.  What is written
out to each file consists of the symbolic names of the data records that belong to
the cluster corresponding to that file.  The clusters are placed in files with names
like

    naive_bayes_cluster1.txt
    naive_bayes_cluster2.txt
    ...

The soft clusters on the other hand are created by calling
C<return_clusters_with_posterior_probs_above_threshold($theta1)>
on an instance of the module, where the argument C<$theta1>
is the threshold for deciding whether a data element belongs
in a soft cluster.  The posterior class probability at a
data element must exceed the threshold for the element to
belong to the corresponding cluster.  The soft cluster can
be written out to disk files by calling
C<write_posterior_prob_clusters_above_threshold_to_files($theta1)>.
As with the hard clusters, each cluster is placed in a separate
file. The filenames for such clusters look like:

    posterior_prob_cluster1.txt
    posterior_prob_cluster2.txt
    ...

=head1 WHAT IF THE NUMBER OF CLUSTERS IS UNKNOWN?

The module constructor requires that you supply a value for the parameter C<K>, which
is the number of clusters you expect to see in the data.  But what if you do not have
a good value for C<K>?  Note that it is possible to search for the best C<K> by using
the two clustering quality criteria included in the module.  However, I have
intentionally not yet incorporated that feature in the module because it slows down
the execution of the code --- especially when the dimensionality of the data
increases.  However, nothing prevents you from writing a script --- along the lines
of the five "canned_example" scripts in the C<examples> directory --- that would use
the two clustering quality metrics for finding the best choice for C<K> for a given
dataset.  Obviously, you will now have to incorporate the call to the constructor in
a loop and check the value of the quality measures for each value of C<K>.

=head1 SOME RESULTS OBTAINED WITH THIS MODULE

If you would like to see some results that have been obtained with this module, check
out Section 7 of the report
L<https://engineering.purdue.edu/kak/Tutorials/ExpectationMaximization.pdf>.

( run in 0.757 second using v1.01-cache-2.11-cpan-5b529ec07f3 )