Algorithm-KMeans

 view release on metacpan or  search on metacpan

lib/Algorithm/KMeans.pm  view on Meta::CPAN

sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    unlink glob "cluster*.dat";
    foreach my $i (0..@clusters-1) {
        my $filename = "cluster" . $i . ".txt";
        print "\nWriting cluster $i to file $filename\n" if $self->{_terminal_output};
        open FILEHANDLE, "| sort > $filename"  or die "Unable to open file: $!";
        foreach my $ele (@{$clusters[$i]}) {        
            print FILEHANDLE "$ele ";
        }
        close FILEHANDLE;
    }
}

sub get_K_best {
    my $self = shift;
    croak "You need to run the clusterer with K=0 option " .
          "before you can call this method" if $self->{_K_best} eq 'unknown';
    print "\nThe best value of K: $self->{_K_best}\n" if $self->{_terminal_output};
    return $self->{_K_best};
}

sub show_QoC_values {
    my $self = shift;
    croak "\nYou need to run the clusterer with K=0 option before you can call this method" 
                            if $self->{_K_best} eq 'unknown';
    print "\nShown below are K on the left and the QoC values on the right (the smaller " .
          "the QoC, the better it is)\n";
    foreach my $key (sort keys %{$self->{_QoC_values}} ) {
        print " $key  =>  $self->{_QoC_values}->{$key}\n" if defined $self->{_QoC_values}->{$key};
    }
}

sub DESTROY {
    unlink "__temp_" . basename($_[0]->{_datafile});
    unlink "__temp_data_" . basename($_[0]->{_datafile});
    unlink "__temp_normed_data_" . basename($_[0]->{_datafile});
}

##################################  Visualization Code ###################################

#  It makes sense to call visualize_clusters() only AFTER you have called kmeans().
#
#  The visualize_clusters() implementation automatically figures out whether it
#  should do a 2D plot or a 3D plot.  If the number of on bits in the mask that is
#  supplied as one of the arguments is greater than 2, it does a 3D plot for the
#  first three data coordinates.  That is, the clusters will be displayed in the 3D
#  space formed by the first three data coordinates. On the other hand, if the number
#  of on bits in the mask is exactly 2, it does a 2D plot.  Should it happen that
#  only one on bit is specified for the mask, visualize_clusters() aborts.
#
#  The visualization code consists of first accessing each of clusters created by the
#  kmeans() subroutine.  Note that the clusters contain only the symbolic names for
#  the individual records in the source data file.  We therefore next reach into the
#  $self->{_original_data} hash and get the data coordinates associated with each
#  symbolic label in a cluster.  The numerical data thus generated is then written
#  out to a temp file.  When doing so we must remember to insert TWO BLANK LINES
#  between the data blocks corresponding to the different clusters.  This constraint
#  is imposed on us by Gnuplot when plotting data from the same file since we want to
#  use different point styles for the data points in different cluster files.
#
#  Subsequently, we call upon the Perl interface provided by the Graphics::GnuplotIF
#  module to plot the data clusters.
sub visualize_clusters {
    my $self = shift;
    my $v_mask;
    my $pause_time;
    if (@_ == 1) {
        $v_mask = shift || croak "visualization mask missing";
    } elsif (@_ == 2) {
        $v_mask = shift || croak "visualization mask missing";    
        $pause_time = shift;
    } else {
        croak "visualize_clusters() called with wrong args";
    }
    my $master_datafile = $self->{_datafile};
    my @v_mask = split //, $v_mask;
    my $visualization_mask_width = @v_mask;
    my $original_data_mask = $self->{_mask};
    my @mask = split //, $original_data_mask;
    my $data_field_width = scalar grep {$_ eq '1'} @mask;    
    croak "\n\nABORTED: The width of the visualization mask (including " .
          "all its 1s and 0s) must equal the width of the original mask " .
          "used for reading the data file (counting only the 1's)"
          if $visualization_mask_width != $data_field_width;
    my $visualization_data_field_width = scalar grep {$_ eq '1'} @v_mask;
    my %visualization_data;
    while ( my ($record_id, $data) = each %{$self->{_original_data}} ) {
        my @fields = @$data;
        croak "\nABORTED: Visualization mask size exceeds data record size\n" 
            if $#v_mask > $#fields;
        my @data_fields;
        foreach my $i (0..@fields-1) {
            if ($v_mask[$i] eq '0') {
                next;
            } elsif ($v_mask[$i] eq '1') {
                push @data_fields, $fields[$i];
            } else {
                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my @all_data_ids = @{$self->{_data_id_tags}};
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    my $hardcopy_plot;



( run in 1.685 second using v1.01-cache-2.11-cpan-140bd7fdf52 )