Algorithm-KMeans

 view release on metacpan or  search on metacpan

lib/Algorithm/KMeans.pm  view on Meta::CPAN

    }
}

sub read_data_from_file_dat {
    my $self = shift;
    my $datafile = $self->{_datafile};
    my $mask = $self->{_mask};
    my @mask = split //, $mask;
    $self->{_data_dimensions} = scalar grep {$_ eq '1'} @mask;
    print "data dimensionality:  $self->{_data_dimensions} \n"if $self->{_terminal_output};
    open INPUT, $datafile or die "unable to open file $datafile: $!\n";
    chomp( my @raw_data = <INPUT> );
    close INPUT;
    # Transform strings into number data
    foreach my $record (@raw_data) {
        next unless $record;
        next if $record =~ /^#/;
        my @data_fields;
        my @fields = split /\s+/, $record;
        die "\nABORTED: Mask size does not correspond to row record size\n" if $#fields != $#mask;
        my $record_id;
        foreach my $i (0..@fields-1) {
            if ($mask[$i] eq '0') {

lib/Algorithm/KMeans.pm  view on Meta::CPAN

                croak "Misformed visualization mask. It can only have 1s and 0s\n";
            }
        }
        $visualization_data{ $record_id } = \@data_fields;
    }
    my @all_data_ids = @{$self->{_data_id_tags}};
    my $K = scalar @{$self->{_clusters}};
    my $filename = basename($master_datafile);
    my $temp_file = "__temp_" . $filename;
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!\n";
    foreach my $cluster (@{$self->{_clusters}}) {
        foreach my $item (@$cluster) {
            print OUTPUT "@{$visualization_data{$item}}";
            print OUTPUT "\n";
        }
        print OUTPUT "\n\n";
    }
    close OUTPUT;
    my $plot;
    my $hardcopy_plot;
    if (!defined $pause_time) {
        $plot = Graphics::GnuplotIF->new( persist => 1 );
        $hardcopy_plot = Graphics::GnuplotIF->new();
        $hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"clustering_results.png\"");
    } else {
        $plot = Graphics::GnuplotIF->new();
    }
    $plot->gnuplot_cmd( "set noclip" );

lib/Algorithm/KMeans.pm  view on Meta::CPAN

    my $filename = basename($master_datafile);
    my $temp_file;
    if ($datatype eq 'original') {
        $temp_file = "__temp_data_" . $filename;
    } elsif ($datatype eq 'normed') {
        $temp_file = "__temp_normed_data_" . $filename;
    } else {
        croak "ABORTED: Improper call to visualize_data()";
    }
    unlink $temp_file if -e $temp_file;
    open OUTPUT, ">$temp_file"
           or die "Unable to open a temp file in this directory: $!\n";
    foreach my $datapoint (values %visualization_data) {
        print OUTPUT "@$datapoint";
        print OUTPUT "\n";
    }
    close OUTPUT;
    my $plot = Graphics::GnuplotIF->new( persist => 1 );
    $plot->gnuplot_cmd( "set noclip" );
    $plot->gnuplot_cmd( "set pointsize 2" );
    my $plot_title =  $datatype eq 'original' ? '"data"' : '"normed data"';
    my $arg_string ;
    if ($visualization_data_field_width > 2) {
        $arg_string = "\"$temp_file\" using 1:2:3 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 2) {
        $arg_string = "\"$temp_file\" using 1:2 title $plot_title with points lt -1 pt 1";
    } elsif ($visualization_data_field_width == 1 ) {

lib/Algorithm/KMeans.pm  view on Meta::CPAN

sub cluster_data_generator {
    my $class = shift;
    croak "illegal call of a class method" unless $class eq 'Algorithm::KMeans';
    my %args = @_;
    my $input_parameter_file = $args{input_parameter_file};
    my $output_file = $args{output_datafile};
    my $N = $args{number_data_points_per_cluster};
    my @all_params;
    my $param_string;
    if (defined $input_parameter_file) {
        open INPUT, $input_parameter_file || "unable to open parameter file: $!";
        @all_params = <INPUT>;
        @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
        chomp @all_params;
        $param_string = join ' ', @all_params;
    } else {
        # Just for testing. Used in t/test.t
        $param_string = "cluster 5 0 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 5 0  1 0 0 0 1 0 0 0 1 " .
                        "cluster 0 0 5  1 0 0 0 1 0 0 0 1";
    }
    my @cluster_strings = split /[ ]*cluster[ ]*/, $param_string;

lib/Algorithm/KMeans.pm  view on Meta::CPAN

    foreach my $i (0..$K-1) {
        my @m = @{shift @means};
        my @covar = @{shift @covariances};
        my @new_data = Math::Random::random_multivariate_normal( $N, @m, @covar );
        my $p = 0;
        my $label = $point_labels[$i];
        @new_data = map {unshift @$_, $label.$i; $i++; $_} @new_data;
        push @data_dump, @new_data;     
    }
    fisher_yates_shuffle( \@data_dump );
    open OUTPUT, ">$output_file";
    foreach my $ele (@data_dump) {
        foreach my $coord ( @$ele ) {
            print OUTPUT "$coord ";
        }
        print OUTPUT "\n";
    }
    print "Data written out to file $output_file\n";
    close OUTPUT;
}

sub add_point_coords {
    my $self = shift;
    my @arr_of_ids = @{shift @_};      # array of data element names
    my @result;
    my $data_dimensionality = $self->{_data_dimensions};
    foreach my $i (0..$data_dimensionality-1) {
        $result[$i] = 0.0;
    }

lib/Algorithm/KMeans.pm  view on Meta::CPAN

parameter file, the main constraint you need to observe in specifying the parameters
is that the dimensionality of the covariance matrix must correspond to the
dimensionality of the mean vectors.  The multivariate random numbers are generated by
calling the C<Math::Random> module.  As you would expect, this module requires that
the covariance matrices you specify in your parameter file be symmetric and positive
definite.  Should the covariances in your parameter file not obey this condition, the
C<Math::Random> module will let you know.

=back

=head1 HOW THE CLUSTERS ARE OUTPUT

When the option C<terminal_output> is set in the call to the constructor, the
clusters are displayed on the terminal screen.

When the option C<write_clusters_to_files> is set in the call to the constructor, the
module dumps the clusters in files named

    cluster0.txt
    cluster1.txt
    cluster2.txt



( run in 0.417 second using v1.01-cache-2.11-cpan-4e96b696675 )