Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

}

sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"
	if $self->{_terminal_output};
    open INPUT, $datafile
        or die "unable to open file $datafile: $!";
    chomp( my @raw_data = <INPUT> );
    close INPUT;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;
        next if $record =~ /^#/;
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size" 
            if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd("set noclip");
        $plot->gnuplot_cmd("set pointsize 2");

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd("set noclip");
        $plot->gnuplot_cmd("set pointsize 2");
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$K-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster (naive Bayes) $i\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Clusters shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $filename = basename($master_datafile);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $filename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";
            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
#            $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png color',
                            'set output "cluster_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd('set terminal png',

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
        }
    }
    foreach my $distribution (@class_distributions) {
        foreach my $item (@$distribution) {
            print OUTPUT "@$item";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour2_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
#            $arg_string .= "\"$temp_file\" using 1:2 ti col smooth frequency with boxes lc $cindex, ";
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
        $plot->gnuplot_pause( $pause_time ) if defined $pause_time;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

                push @data_fields, $fields[$i];
            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($self->{_datafile});
    my $temp_file = "__temp2_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    my @class_distributions;
    foreach my $cluster_index (0..$self->{_K}-1) {
        push @class_distributions, [];
    }
    foreach my $data_tag (@{$self->{_data_id_tags}}) {
        foreach my $cluster_index (0..$self->{_K}-1) {
            push @{$class_distributions[$cluster_index]}, $self->{_data}->{$data_tag}
                if $self->{_expected_class_probs}->{$data_tag}->[$cluster_index] > 0.2;
        }
    }
    foreach my $distribution (@class_distributions) {
        foreach my $item (@$distribution) {
            print OUTPUT "@$item";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    my $arg_string = "";
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "set noclip" );
        $plot->gnuplot_cmd( "set pointsize 2" );
        foreach my $i (0..$self->{_K}-1) {
            my $j = $i + 1;
            $arg_string .= "\"$temp_file\" index $i using 1:2 title \"Cluster $i (based on posterior probs)\" with points lt $j pt $j, ";
            my $ellipse_filename = "__contour2_" . $i . ".dat";
            $arg_string .= "\"$ellipse_filename\" with line lt $j title \"\", ";
        }
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_ =~ /\d/ ? $_ : "SEPERATOR" } @all_data;
        my $all_joined_data = join ':', @all_data;
        my @separated = split /:SEPERATOR:SEPERATOR/, $all_joined_data;
        my (@all_clusters_for_hist, @all_minvals, @all_maxvals, @all_minmaxvals);
        foreach my $i (0..@separated-1) {
            $separated[$i] =~ s/SEPERATOR//g;
            my @cluster_for_hist = split /:/, $separated[$i];
            @cluster_for_hist = grep $_, @cluster_for_hist;
            my ($minval,$maxval) = minmax(\@cluster_for_hist);
            push @all_minvals, $minval;

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Individual distributions shown through histograms'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        foreach my $cindex (0..@all_clusters_for_hist-1) {
            my $localfilename = basename($filename);
            my $temp_file = "__temp1dhist_" . "$cindex" . "_" .  $localfilename;
            unlink $temp_file if -e $temp_file;
            open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
            print OUTPUT "Xstep histval\n";

            my @histogram = (0) x 100;
            foreach my $i (0..@{$all_clusters_for_hist[$cindex]}-1) {
                $histogram[int( ($all_clusters_for_hist[$cindex][$i] - $abs_minval) / $delta )]++;
            }
            foreach my $i (0..@histogram-1) {
                print OUTPUT "$i $histogram[$i]\n";        
            }
            $arg_string .= "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc $cindex, ";
            close OUTPUT;
        }
    }
    $arg_string = $arg_string =~ /^(.*),[ ]+$/;
    $arg_string = $1;

    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "posterior_prob_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    $temp_file = "__temp_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  '"original data provided for EM"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_} @all_data;
        @all_data = grep $_, @all_data;
        my ($minval,$maxval) = minmax(\@all_data);
        my $delta = ($maxval - $minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        my $localfilename = basename($filename);
        my $temp_file = "__temp1dhist_" .  $localfilename;
        unlink $temp_file if -e $temp_file;
        open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
        print OUTPUT "Xstep histval\n";
        my @histogram = (0) x 100;
        foreach my $i (0..@all_data-1) {
            $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
        }
        foreach my $i (0..@histogram-1) {
            print OUTPUT "$i $histogram[$i]\n";        
        }
        $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
        close OUTPUT;
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {
        $plot->gnuplot_cmd( "plot $arg_string" );
    }
}

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

            } else {
                die "Misformed visualization mask. It can only have 1s and 0s";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my $filename = basename($master_datafile);
    my $temp_file;
    $temp_file = "__temp_data_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;

    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  '"original data provided for EM"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {
        open INPUT, "$temp_file" or die "Unable to open a temp file in this directory: $!";
        my @all_data = <INPUT>;
        close INPUT;
        @all_data = map {chomp $_; $_} @all_data;
        @all_data = grep $_, @all_data;
        my ($minval,$maxval) = minmax(\@all_data);
        my $delta = ($maxval - $minval) / 100.0;
        $plot->gnuplot_cmd("set boxwidth 3");
        $plot->gnuplot_cmd("set style fill solid border -1");
        $plot->gnuplot_cmd("set ytics out nomirror");
        $plot->gnuplot_cmd("set style data histograms");
        $plot->gnuplot_cmd("set style histogram clustered");
        $plot->gnuplot_cmd("set title 'Overall distribution of 1D data'");
        $plot->gnuplot_cmd("set xtics rotate by 90 offset 0,-5 out nomirror");
        my $localfilename = basename($filename);
        my $temp_file = "__temp1dhist_" .  $localfilename;
        unlink $temp_file if -e $temp_file;
        open OUTPUT, ">$temp_file" or die "Unable to open a temp file in this directory: $!";
        print OUTPUT "Xstep histval\n";
        my @histogram = (0) x 100;
        foreach my $i (0..@all_data-1) {
            $histogram[int( ($all_data[$i] - $minval) / $delta )]++;
        }
        foreach my $i (0..@histogram-1) {
            print OUTPUT "$i $histogram[$i]\n";        
        }
        $arg_string = "\"$temp_file\" using 2:xtic(1) ti col smooth frequency with boxes lc rgb 'green'";
        close OUTPUT;
    }
    if ($visualization_data_field_width > 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "splot $arg_string" );
    } elsif ($visualization_data_field_width == 2) {
        $plot->gnuplot_cmd( 'set terminal png',
                            'set output "data_scatter_plot.png"');
        $plot->gnuplot_cmd( "plot $arg_string" );
    } elsif ($visualization_data_field_width == 1) {

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    my $class = shift;
    die "illegal call of a class method" 
        unless $class eq 'Algorithm::ExpectationMaximization';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{total_number_of_data_points};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open INPUT, $input_parameter_file || "unable to open parameter file: $!";
        @all_params = <INPUT>;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "priors 0.34 0.33 0.33 " .
                        "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( 
                           int($N * $priors[$i]), @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
        push @data_dump, @new_data;     
    }
    fisher_yates_shuffle( \@data_dump );
    open OUTPUT, ">$output_file";
    print OUTPUT "\#Data generated from the parameter file: $input_parameter_file\n"
        if $input_parameter_file;
    print OUTPUT "\#Total number of data points in this file: $N\n";
    print OUTPUT "\#Prior class probabilities for this data: @priors\n";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print OUTPUT "$coord ";
        }
        print OUTPUT "\n";
    }
    print "Data written out to file $output_file\n";
    close OUTPUT;
}

sub add_point_coords {
    my $self = shift;
    my @arr_of_ids = @{shift @_};      # array of data element names
    my @result;
    my $data_dimensionality = $self->{_data_dimensions};
    foreach my $i (0..$data_dimensionality-1) {
        $result[$i] = 0.0;
    }

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN


    $clusterer->plot_hardcopy_data($data_visualization_mask); 

This method creates a PNG file that can be used to print out
a hardcopy of the data in different 2D and 3D subspaces of
the data space. The visualization mask is used to select the
subspace for the PNG image.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

This module produces two different types of clusters: the "hard" clusters and the
"soft" clusters.  The hard clusters correspond to the naive Bayes' classification of
the data points on the basis of the Gaussian distributions and the class priors
estimated by the EM algorithm. Such clusters partition the data into disjoint
subsets.  On the other hand, the soft clusters correspond to the posterior class
probabilities calculated by the EM algorithm.  A data element belongs to a cluster if
its posterior probability for that Gaussian exceeds a threshold.

After the EM algorithm has finished running, the hard clusters are created by



( run in 0.253 second using v1.01-cache-2.11-cpan-4e96b696675 )