Algorithm-Cluster
view release on metacpan or search on metacpan
src/cluster.c view on Meta::CPAN
}
}
return distance;
}
/* ********************************************************************* */
static int
svd(int m, int n, double** u, double w[], double** vt)
/*
* This subroutine is a translation of the Algol procedure svd, described in
* G.H. Golub and C. Reinsch:
* "Singular Value Decomposition and Least Squares Solutions",
* published in
* Numerische Mathematik 14(5): page 403-420 (1970)
* and
* Handbook for Automatic Computation, Volume II: Linear Algebra,
* (John H. Wilkinson and C. Reinsch; edited by Friedrich L. Bauer and
* Alston S. Householder): page 134-151 (1971).
* t
* This subroutine determines the singular value decomposition A=usv of a
* real m by n rectangular matrix, where m is greater * than or equal to n.
* Householder bidiagonalization and a variant of the QR algorithm are used.
*
* On input:
*
* m is the number of rows of A (and u).
*
* n is the number of columns of A (and u) and the order of v.
*
* u contains the rectangular input matrix A to be decomposed.
*
* On output:
*
* the routine returns an integer ierr equal to
* 0: to indicate a normal return,
* k: if the k-th singular value has not been determined after 30 iterations,
* -1: if memory allocation fails.
*
* w contains the n (non-negative) singular values of a (the
* diagonal elements of s). they are unordered.
* If an error exit is made, the singular values should be correct for
* indices ierr+1, ierr+2, ..., n.
*
* u contains the matrix u (orthogonal column vectors) of the decomposition.
* If an error exit is made, the columns of u corresponding to indices of
* correct singular values should be correct.
* t
* vt contains the matrix v (orthogonal) of the decomposition.
* If an error exit is made, the columns of v corresponding to indices of
* correct singular values should be correct.
*
*
* Questions and comments should be directed to B. S. Garbow,
* Applied Mathematics division, Argonne National Laboratory
*
* Modified to eliminate machep
*
* Translated to C by Michiel de Hoon, Human Genome Center,
* University of Tokyo, for inclusion in the C Clustering Library.
* This routine is less general than the original svd routine, as
* it focuses on the singular value decomposition as needed for
* clustering. In particular,
* - We calculate both u and v in all cases
* - We pass the input array A via u; this array is subsequently overwritten.
* - We allocate for the array rv1, used as a working space, internally in
* this routine, instead of passing it as an argument.
* If the allocation fails, svd returns -1.
* 2003.06.05
*/
{
int i, j, k, i1, k1, l1, its;
double c, f, h, s, x, y, z;
int l = 0;
int ierr = 0;
double g = 0.0;
double scale = 0.0;
double anorm = 0.0;
double* rv1;
rv1 = malloc(n*sizeof(double));
if (!rv1) return -1;
if (m >= n) {
/* Householder reduction to bidiagonal form */
for (i = 0; i < n; i++) {
l = i + 1;
rv1[i] = scale * g;
g = 0.0;
s = 0.0;
scale = 0.0;
for (k = i; k < m; k++) scale += fabs(u[k][i]);
if (scale != 0.0) {
for (k = i; k < m; k++) {
u[k][i] /= scale;
s += u[k][i]*u[k][i];
}
f = u[i][i];
g = (f >= 0) ? -sqrt(s) : sqrt(s);
h = f * g - s;
u[i][i] = f - g;
if (i < n-1) {
for (j = l; j < n; j++) {
s = 0.0;
for (k = i; k < m; k++) s += u[k][i] * u[k][j];
f = s / h;
for (k = i; k < m; k++) u[k][j] += f * u[k][i];
}
}
for (k = i; k < m; k++) u[k][i] *= scale;
}
w[i] = scale * g;
g = 0.0;
s = 0.0;
scale = 0.0;
if (i<n-1) {
for (k = l; k < n; k++) scale += fabs(u[i][k]);
if (scale != 0.0) {
for (k = l; k < n; k++) {
u[i][k] /= scale;
s += u[i][k] * u[i][k];
}
src/cluster.c view on Meta::CPAN
The function returns a pointer to a newly allocated array containing the
calculated weights for the rows (if transpose == 0) or columns (if
transpose != 0). If not enough memory could be allocated to store the
weights array, the function returns NULL.
========================================================================
*/
{
int i, j;
const int ndata = (transpose == 0) ? ncolumns : nrows;
const int nelements = (transpose == 0) ? nrows : ncolumns;
/* Set the metric function as indicated by dist */
double (*metric) (int, double**, double**, int**, int**,
const double[], int, int, int) = setmetric(dist);
double* result;
result = malloc(nelements*sizeof(double));
if (!result) return NULL;
memset(result, 0, nelements*sizeof(double));
for (i = 0; i < nelements; i++) {
result[i] += 1.0;
for (j = 0; j < i; j++) {
const double distance = metric(ndata, data, data, mask, mask,
weights, i, j, transpose);
if (distance < cutoff) {
const double dweight = exp(exponent*log(1-distance/cutoff));
/* pow() causes a crash on AIX */
result[i] += dweight;
result[j] += dweight;
}
}
}
for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
return result;
}
/* ******************************************************************** */
int
cuttree(int nelements, const Node* tree, int nclusters, int clusterid[])
/*
Purpose
=======
The cuttree routine takes the output of a hierarchical clustering routine, and
divides the elements in the tree structure into clusters based on the
hierarchical clustering result. The number of clusters is specified by the
user.
Arguments
=========
nelements (input) int
The number of elements that were clustered.
tree (input) Node[nelements-1]
The clustering solution. Each node in the array describes one linking event,
with tree[i].left and tree[i].right representing the elements that were joined.
The original elements are numbered 0..nelements-1, nodes are numbered
-1..-(nelements-1).
nclusters (input) int
The number of clusters to be formed.
clusterid (output) int[nelements]
The number of the cluster to which each element was assigned. Clusters are
numbered 0..nclusters-1 in the left-to-right order in which they appear in the
hierarchical clustering tree. Space for the clusterid array should be allocated
before calling the cuttree routine.
Return value
============
If no errors occur, cuttree returns 1.
If a memory error occurs, cuttree returns 0.
========================================================================
*/
{
int i = -nelements+1; /* top node */
int j;
int k = -1;
int previous = nelements;
const int n = nelements-nclusters; /* number of nodes to join */
int* parents;
if (nclusters == 1) {
for (i = 0; i < nelements; i++) clusterid[i] = 0;
return 1;
}
parents = malloc((nelements-1)*sizeof(int));
if (!parents) return 0;
while (1) {
if (i >= 0) {
clusterid[i] = k;
j = i;
i = previous;
previous = j;
}
else {
j = -i-1;
if (previous == tree[j].left) {
previous = i;
i = tree[j].right;
if (j >= n && (i >= 0 || -i-1 < n)) k++;
}
else if (previous == tree[j].right) {
previous = i;
i = parents[j];
if (i == nelements) break;
}
else {
parents[j] = previous;
previous = i;
i = tree[j].left;
if (j >= n && (i >= 0 || -i-1 < n)) k++;
}
}
}
src/cluster.c view on Meta::CPAN
=======
The pslcluster routine performs single-linkage hierarchical clustering, using
either the distance matrix directly, if available, or by calculating the
distances from the data array. This implementation is based on the SLINK
algorithm, described in:
Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
cluster method. The Computer Journal, 16(1): 30-34.
The output of this algorithm is identical to conventional single-linkage
hierarchical clustering, but is much more memory-efficient and faster. Hence,
it can be applied to large data sets, for which the conventional single-
linkage algorithm fails due to lack of memory.
Arguments
=========
nrows (input) int
The number of rows in the gene expression data matrix, equal to the number of
genes.
ncolumns (input) int
The number of columns in the gene expression data matrix, equal to the number
of samples.
data (input) double[nrows][ncolumns]
The array containing the gene expression data.
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If
mask[i][j] == 0, then data[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
transpose (input) int
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
of the matrix are clustered.
dist (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
distmatrix (input) double**
The distance matrix. If the distance matrix is passed by the calling routine
treecluster, it is used by pslcluster to speed up the clustering calculation.
The pslcluster routine does not modify the contents of distmatrix, and does
not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
by the pslcluster routine from the gene expression data (the data and mask
arrays) and stored in temporary arrays. If distmatrix is passed, the original
gene expression data (specified by the data and mask arguments) are not needed
and are therefore ignored.
Return value
============
A pointer to a newly allocated array of Node structs, describing the
hierarchical clustering solution consisting of nelements-1 nodes. Depending
on whether genes (rows) or samples (columns) were clustered, nelements is
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
structure.
If a memory error occurs, pslcluster returns NULL.
========================================================================
*/
{
int i, j, k;
const int nelements = transpose ? ncolumns : nrows;
const int nnodes = nelements - 1;
int* vector;
double* temp;
int* index;
Node* result;
temp = malloc(nnodes*sizeof(double));
if (!temp) return NULL;
index = malloc(nelements*sizeof(int));
if (!index) {
free(temp);
return NULL;
}
vector = malloc(nnodes*sizeof(int));
if (!vector) {
free(index);
free(temp);
return NULL;
}
result = malloc(nelements*sizeof(Node));
if (!result) {
free(vector);
free(index);
free(temp);
return NULL;
}
for (i = 0; i < nnodes; i++) vector[i] = i;
if (distmatrix) {
for (i = 0; i < nrows; i++) {
result[i].distance = DBL_MAX;
for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
for (j = 0; j < i; j++) {
k = vector[j];
if (result[j].distance >= temp[j]) {
if (result[j].distance < temp[k])
temp[k] = result[j].distance;
result[j].distance = temp[j];
vector[j] = i;
}
( run in 2.210 seconds using v1.01-cache-2.11-cpan-cdf2f3d4e48 )