Algorithm-KMeans

 view release on metacpan or  search on metacpan

examples/which_cluster_for_new_data.pl  view on Meta::CPAN

#my $mask = "N11";         # for mydatafile3.dat


my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
                                        mask     => $mask,
                                        K        => 3,
                                        cluster_seeding => 'random',   # also try 'smart'
                                        use_mahalanobis_metric => 1,   # also try '0'
                                        terminal_output => 1,
                                        write_clusters_to_files => 1,
                                        debug => 0,
                );

$clusterer->read_data_from_file();
my ($clusters_hash, $cluster_centers_hash) = $clusterer->kmeans();


# ACCESSING THE CLUSTERS AND CLUSTER CENTERS IN YOUR SCRIPT:

print "\nDisplaying clusters in the terminal window:\n";
foreach my $cluster_id (sort keys %{$clusters_hash}) {

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        _mask                     =>   $args{mask}     || croak("mask required"),
        _K                        =>   $args{K}        || 0,
        _K_min                    =>   $args{Kmin} || 'unknown',
        _K_max                    =>   $args{Kmax} || 'unknown',
        _cluster_seeding          =>   $args{cluster_seeding} || croak("must choose smart or random ".
                                                                       "for cluster seeding"),
        _var_normalize            =>   $args{do_variance_normalization} || 0,
        _use_mahalanobis_metric   =>   $args{use_mahalanobis_metric} || 0,  
        _clusters_2_files         =>   $args{write_clusters_to_files} || 0,
        _terminal_output          =>   $args{terminal_output} || 0,
        _debug                    =>   $args{debug} || 0,
        _N                        =>   0,
        _K_best                   =>   'unknown',
        _original_data            =>   {},
        _data                     =>   {},
        _data_id_tags             =>   [],
        _QoC_values               =>   {},
        _clusters                 =>   [],
        _cluster_centers          =>   [],
        _clusters_hash            =>   {},
        _cluster_centers_hash     =>   {},

lib/Algorithm/KMeans.pm  view on Meta::CPAN

sub get_initial_cluster_centers_smart {
    my $self = shift;
    my $K = shift;
    if ($self->{_data_dimensions} == 1) {
        my @one_d_data;
        foreach my $j (0..$self->{_N}-1) {
            my $tag = $self->{_data_id_tags}[$j];     
            push @one_d_data, $self->{_data}->{$tag}->[0];
        }
        my @peak_points = find_peak_points_in_given_direction(\@one_d_data,$K);
        print "highest points at data values: @peak_points\n" if $self->{_debug};
        my @cluster_centers;
        foreach my $peakpoint (@peak_points) {
            push @cluster_centers, [$peakpoint];
        }
        return \@cluster_centers;
    }
    my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_id_tags}.  The
    # actual data for clustering is stored in a hash at $self->{_data} whose keys are
    # the record labels; the value associated with each key is the array holding the
    # corresponding numerical multidimensional data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        $matrix->set_col($j, $data);
    }
    if ($self->{_debug}) {
        print "\nDisplaying the original data as a matrix:";
        display_matrix( $matrix ); 
    }
    foreach my $j (0..$num_cols-1) {
        $mean_vec += $matrix->col($j);
    }
    $mean_vec *=  1.0 / $num_cols;
    if ($self->{_debug}) {
        print "Displaying the mean vector for the data:";
        display_matrix( $mean_vec );
    }
    foreach my $j (0..$num_cols-1) {
        my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
        $matrix->set_col($j, \@new_col);
    }
    if ($self->{_debug}) {
        print "Displaying mean subtracted data as a matrix:";
        display_matrix( $matrix ); 
    }
    my $transposed = transpose( $matrix );
    if ($self->{_debug}) {
        print "Displaying transposed data matrix:";
        display_matrix( $transposed );
    }
    my $covariance = matrix_multiply( $matrix, $transposed );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for your data:";
        display_matrix( $covariance );
    }
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" if $self->{_debug};
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        print "Eigenvector $i:   @vec\n" if $self->{_debug};
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @max_var_direction;
    # Each element of the array @largest_eigen_vec is a Math::Complex object
    foreach my $k (0..@largest_eigen_vec-1) {
        my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
        if ($theta eq '0') {
            $max_var_direction[$k] = $mag;
        } elsif ($theta eq 'pi') {
            $max_var_direction[$k] = -1.0 * $mag;
        } else {
            die "eigendecomposition of covariance matrix produced a complex " .
                "eigenvector --- something is wrong";
        }
    }
    print "\nMaximum Variance Direction: @max_var_direction\n\n" if $self->{_debug};
    # We now project all data points on the largest eigenvector.
    # Each projection will yield a single point on the eigenvector.
    my @projections;
    foreach my $j (0..$self->{_N}-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        die "Dimensionality of the largest eigenvector does not "
            . "match the dimensionality of the data" 
          unless @max_var_direction == $self->{_data_dimensions};
        my $projection = vector_multiply($data, \@max_var_direction);
        push @projections, $projection;
    }
    print "All projection points: @projections\n" if $self->{_debug};
    my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
    print "highest points at points along largest eigenvec: @peak_points\n" if $self->{_debug};
    my @cluster_centers;
    foreach my $peakpoint (@peak_points) {
        my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
        push @cluster_centers, \@actual_peak_coords;
    }
    return \@cluster_centers;
}

# This method is invoked when you choose the 'smart' option for the "cluster_seeding"
# option in the constructor.  It is called by the previous method to locate K peaks

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        # corresponding numerical multidimensional data.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];            
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}

# After each new assignment of the data points to the clusters on the basis of the
# current values for the cluster centers, we call the routine shown here for updating

lib/Algorithm/KMeans.pm  view on Meta::CPAN

    # corresponding numerical multidimensional data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $cluster->[$j];            
        my $data = $self->{_data}->{$tag};
        my @diff_from_mean = vector_subtract($data, \@cluster_center);
        $matrix->set_col($j, \@diff_from_mean);
    }
    my $transposed = transpose( $matrix );
    my $covariance = $matrix * $transposed;
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for cluster:";
        display_matrix( $covariance );
    }
    return $covariance;
}

sub write_clusters_to_files {
    my $self = shift;
    my @clusters = @{$self->{_clusters}};
    unlink glob "cluster*.dat";

lib/Algorithm/KMeans.pm  view on Meta::CPAN

    my @legal_params = qw / datafile
                            mask
                            K
                            Kmin
                            Kmax
                            terminal_output
                            write_clusters_to_files
                            do_variance_normalization
                            cluster_seeding
                            use_mahalanobis_metric
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }



( run in 1.043 second using v1.01-cache-2.11-cpan-49f99fa48dc )