Algorithm-Cluster
view release on metacpan or search on metacpan
src/cluster.c view on Meta::CPAN
if (i == j && i == nr) return result;
}
}
while (lo<hi-1);
if (even) return (0.5*(x[nl]+x[nr]));
if (x[lo]>x[hi]) {
double temp = x[lo];
x[lo] = x[hi];
x[hi] = temp;
}
return x[nr];
}
/* ********************************************************************** */
static const double* sortdata = NULL; /* used in the quicksort algorithm */
/* ---------------------------------------------------------------------- */
static int
compare(const void* a, const void* b)
/* Helper function for sort. Previously, this was a nested function under
* sort, which is not allowed under ANSI C.
*/
{
const int i1 = *(const int*)a;
const int i2 = *(const int*)b;
const double term1 = sortdata[i1];
const double term2 = sortdata[i2];
if (term1 < term2) return -1;
if (term1 > term2) return +1;
return 0;
}
/* ---------------------------------------------------------------------- */
void
sort(int n, const double data[], int index[])
/* Sets up an index table given the data, such that data[index[]] is in
* increasing order. Sorting is done on the indices; the array data
* is unchanged.
*/
{
int i;
sortdata = data;
for (i = 0; i < n; i++) index[i] = i;
qsort(index, n, sizeof(int), compare);
}
/* ********************************************************************** */
static double*
getrank(int n, const double data[], const double weight[])
/* Calculates the ranks of the elements in the array data. Two elements with
* the same value get the same rank, equal to the average of the ranks had the
* elements different values. The ranks are returned as a newly allocated
* array that should be freed by the calling routine. If getrank fails due to
* a memory allocation error, it returns NULL.
*/
{
int i, j, k, l;
double* rank;
int* index;
double total = 0.0;
double subtotal;
double current;
double value;
rank = malloc(n*sizeof(double));
if (!rank) return NULL;
index = malloc(n*sizeof(int));
if (!index) {
free(rank);
return NULL;
}
/* Call sort to get an index table */
sort(n, data, index);
/* Build a rank table */
k = 0;
j = index[0];
current = data[j];
subtotal = weight[j];
for (i = 1; i < n; i++) {
j = index[i];
value = data[j];
if (value != current) {
current = value;
value = total + (subtotal + 1.0) / 2.0;
for (l = k; l < i; l++) rank[index[l]] = value;
k = i;
total += subtotal;
subtotal = 0.0;
}
subtotal += weight[j];
}
value = total + (subtotal + 1.0) / 2.0;
for (l = k; l < i; l++) rank[index[l]] = value;
free(index);
return rank;
}
/* ---------------------------------------------------------------------- */
static int
makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
{
int i;
double** data;
int** mask;
data = malloc(nrows*sizeof(double*));
if (!data) return 0;
mask = malloc(nrows*sizeof(int*));
if (!mask) {
free(data);
return 0;
}
for (i = 0; i < nrows; i++) {
src/cluster.c view on Meta::CPAN
jp (output) int*
A pointer to the integer that is to receive the second index of the pair with
the shortest distance.
*/
{
int i, j;
double temp;
double distance = distmatrix[1][0];
*ip = 1;
*jp = 0;
for (i = 1; i < n; i++) {
for (j = 0; j < i; j++) {
temp = distmatrix[i][j];
if (temp<distance) {
distance = temp;
*ip = i;
*jp = j;
}
}
}
return distance;
}
/* ********************************************************************* */
static int
svd(int m, int n, double** u, double w[], double** vt)
/*
* This subroutine is a translation of the Algol procedure svd, described in
* G.H. Golub and C. Reinsch:
* "Singular Value Decomposition and Least Squares Solutions",
* published in
* Numerische Mathematik 14(5): page 403-420 (1970)
* and
* Handbook for Automatic Computation, Volume II: Linear Algebra,
* (John H. Wilkinson and C. Reinsch; edited by Friedrich L. Bauer and
* Alston S. Householder): page 134-151 (1971).
* t
* This subroutine determines the singular value decomposition A=usv of a
* real m by n rectangular matrix, where m is greater * than or equal to n.
* Householder bidiagonalization and a variant of the QR algorithm are used.
*
* On input:
*
* m is the number of rows of A (and u).
*
* n is the number of columns of A (and u) and the order of v.
*
* u contains the rectangular input matrix A to be decomposed.
*
* On output:
*
* the routine returns an integer ierr equal to
* 0: to indicate a normal return,
* k: if the k-th singular value has not been determined after 30 iterations,
* -1: if memory allocation fails.
*
* w contains the n (non-negative) singular values of a (the
* diagonal elements of s). they are unordered.
* If an error exit is made, the singular values should be correct for
* indices ierr+1, ierr+2, ..., n.
*
* u contains the matrix u (orthogonal column vectors) of the decomposition.
* If an error exit is made, the columns of u corresponding to indices of
* correct singular values should be correct.
* t
* vt contains the matrix v (orthogonal) of the decomposition.
* If an error exit is made, the columns of v corresponding to indices of
* correct singular values should be correct.
*
*
* Questions and comments should be directed to B. S. Garbow,
* Applied Mathematics division, Argonne National Laboratory
*
* Modified to eliminate machep
*
* Translated to C by Michiel de Hoon, Human Genome Center,
* University of Tokyo, for inclusion in the C Clustering Library.
* This routine is less general than the original svd routine, as
* it focuses on the singular value decomposition as needed for
* clustering. In particular,
* - We calculate both u and v in all cases
* - We pass the input array A via u; this array is subsequently overwritten.
* - We allocate for the array rv1, used as a working space, internally in
* this routine, instead of passing it as an argument.
* If the allocation fails, svd returns -1.
* 2003.06.05
*/
{
int i, j, k, i1, k1, l1, its;
double c, f, h, s, x, y, z;
int l = 0;
int ierr = 0;
double g = 0.0;
double scale = 0.0;
double anorm = 0.0;
double* rv1;
rv1 = malloc(n*sizeof(double));
if (!rv1) return -1;
if (m >= n) {
/* Householder reduction to bidiagonal form */
for (i = 0; i < n; i++) {
l = i + 1;
rv1[i] = scale * g;
g = 0.0;
s = 0.0;
scale = 0.0;
for (k = i; k < m; k++) scale += fabs(u[k][i]);
if (scale != 0.0) {
for (k = i; k < m; k++) {
u[k][i] /= scale;
s += u[k][i]*u[k][i];
}
f = u[i][i];
g = (f >= 0) ? -sqrt(s) : sqrt(s);
h = f * g - s;
u[i][i] = f - g;
if (i < n-1) {
for (j = l; j < n; j++) {
s = 0.0;
for (k = i; k < m; k++) s += u[k][i] * u[k][j];
f = s / h;
for (k = i; k < m; k++) u[k][j] += f * u[k][i];
}
}
for (k = i; k < m; k++) u[k][i] *= scale;
}
src/cluster.c view on Meta::CPAN
/*
Purpose
=======
This subroutine uses the singular value decomposition to perform principal
components analysis of a real nrows by ncolumns rectangular matrix.
Arguments
=========
nrows (input) int
The number of rows in the matrix u.
ncolumns (input) int
The number of columns in the matrix v.
u (input) double[nrows][ncolumns]
On input, the array containing the data to which the principal component
analysis should be applied. The function assumes that the mean has already been
subtracted of each column, and hence that the mean of each column is zero.
On output, see below.
v (input) double[n][n], where n = min(nrows, ncolumns)
Not used on input.
w (input) double[n], where n = min(nrows, ncolumns)
Not used on input.
Return value
============
On output:
If nrows >= ncolumns, then
u contains the coordinates with respect to the principal components;
v contains the principal component vectors.
The dot product u . v reproduces the data that were passed in u.
If nrows < ncolumns, then
u contains the principal component vectors;
v contains the coordinates with respect to the principal components.
The dot product v . u reproduces the data that were passed in u.
The eigenvalues of the covariance matrix are returned in w.
The arrays u, v, and w are sorted according to eigenvalue, with the largest
eigenvalues appearing first.
The function returns 0 if successful, -1 if memory allocation fails, and a
positive integer if the singular value decomposition fails to converge.
*/
{
int i;
int j;
int error;
int* index = malloc(ncolumns*sizeof(int));
double* temp = malloc(ncolumns*sizeof(double));
if (!index || !temp) {
if (index) free(index);
if (temp) free(temp);
return -1;
}
error = svd(nrows, ncolumns, u, w, v);
if (error == 0) {
if (nrows >= ncolumns) {
for (j = 0; j < ncolumns; j++) {
const double s = w[j];
for (i = 0; i < nrows; i++) u[i][j] *= s;
}
sort(ncolumns, w, index);
for (i = 0; i < ncolumns/2; i++) {
j = index[i];
index[i] = index[ncolumns-1-i];
index[ncolumns-1-i] = j;
}
for (i = 0; i < nrows; i++) {
for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
}
for (i = 0; i < ncolumns; i++) {
for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
}
for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
for (i = 0; i < ncolumns; i++) w[i] = temp[i];
}
else /* nrows < ncolumns */ {
for (j = 0; j < nrows; j++) {
const double s = w[j];
for (i = 0; i < nrows; i++) v[i][j] *= s;
}
sort(nrows, w, index);
for (i = 0; i < nrows/2; i++) {
j = index[i];
index[i] = index[nrows-1-i];
index[nrows-1-i] = j;
}
for (j = 0; j < ncolumns; j++) {
for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
for (i = 0; i < nrows; i++) u[i][j] = temp[i];
}
for (j = 0; j < nrows; j++) {
for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
for (i = 0; i < nrows; i++) v[j][i] = temp[i];
}
for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
for (i = 0; i < nrows; i++) w[i] = temp[i];
}
}
free(index);
free(temp);
return error;
}
/* ********************************************************************* */
static double
euclid(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The euclid routine calculates the weighted Euclidean distance between two
rows or columns in a matrix.
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.0;
double tweight = 0;
int i;
if (transpose == 0) /* Calculate the distance between two rows */ {
src/cluster.c view on Meta::CPAN
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.;
double sum1 = 0.;
double sum2 = 0.;
double denom1 = 0.;
double denom2 = 0.;
double tweight = 0.;
if (transpose == 0) /* Calculate the distance between two rows */ {
int i;
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term1 = data1[index1][i];
double term2 = data2[index2][i];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
else {
int i;
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term1 = data1[i][index1];
double term2 = data2[i][index2];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
if (!tweight) return 0; /* usually due to empty clusters */
result -= sum1 * sum2 / tweight;
denom1 -= sum1 * sum1 / tweight;
denom2 -= sum2 * sum2 / tweight;
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
result = result / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
acorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The acorrelation routine calculates the weighted Pearson distance between two
rows or columns, using the absolute value of the correlation.
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
(e.g., choose b = a + c).
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.;
double sum1 = 0.;
double sum2 = 0.;
double denom1 = 0.;
double denom2 = 0.;
double tweight = 0.;
if (transpose == 0) /* Calculate the distance between two rows */ {
int i;
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term1 = data1[index1][i];
double term2 = data2[index2][i];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
else {
int i;
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term1 = data1[i][index1];
double term2 = data2[i][index2];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
if (!tweight) return 0; /* usually due to empty clusters */
result -= sum1 * sum2 / tweight;
denom1 -= sum1 * sum1 / tweight;
denom2 -= sum2 * sum2 / tweight;
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
result = fabs(result) / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
ucorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The ucorrelation routine calculates the weighted Pearson distance between two
rows or columns, using the uncentered version of the Pearson correlation. In
the uncentered Pearson correlation, a zero mean is used for both vectors even
if the actual mean is nonzero.
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
(e.g., choose b = a + c).
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
src/cluster.c view on Meta::CPAN
Purpose
=======
The spearman routine calculates the Spearman distance between two rows or
columns. The Spearman distance is defined as one minus the Spearman rank
correlation.
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
int i;
int m = 0;
double* rank1;
double* rank2;
double result = 0.;
double denom1 = 0.;
double denom2 = 0.;
double sum1 = 0.;
double sum2 = 0.;
double totalweight = 0.;
double* tdata1;
double* tdata2;
tdata1 = malloc(n*sizeof(double));
if (!tdata1) return 0.0; /* Memory allocation error */
tdata2 = malloc(n*sizeof(double));
if (!tdata2) /* Memory allocation error */ {
free(tdata1);
return 0.0;
}
if (transpose == 0) {
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
tdata1[m] = data1[index1][i];
tdata2[m] = data2[index2][i];
m++;
}
}
}
else {
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
tdata1[m] = data1[i][index1];
tdata2[m] = data2[i][index2];
m++;
}
}
}
if (m == 0) {
free(tdata1);
free(tdata2);
return 0;
}
rank1 = getrank(m, tdata1, weight);
free(tdata1);
if (!rank1) {
free(tdata2);
return 0.0; /* Memory allocation error */
}
rank2 = getrank(m, tdata2, weight);
free(tdata2);
if (!rank2) /* Memory allocation error */ {
free(rank1);
return 0.0;
}
for (i = 0; i < m; i++) {
const double term1 = rank1[i];
const double term2 = rank2[i];
const double w = weight[i];
sum1 += term1 * w;
sum2 += term2 * w;
result += term1 * term2 * w;
denom1 += term1 * term1 * w;
denom2 += term2 * term2 * w;
totalweight += w;
}
/* Note: denom1 and denom2 cannot be calculated directly from the number
* of elements. If two elements have the same rank, the squared sum of
* their ranks will change.
*/
free(rank1);
free(rank2);
if (!totalweight) return 0; /* usually due to empty clusters */
result -= sum1 * sum2 / totalweight;
denom1 -= sum1 * sum1 / totalweight;
denom2 -= sum2 * sum2 / totalweight;
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
result = result / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
kendall(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The kendall routine calculates the Kendall distance between two
rows or columns. The Kendall distance is defined as one minus Kendall's tau.
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double con = 0;
double dis = 0;
double exx = 0;
double exy = 0;
src/cluster.c view on Meta::CPAN
/*
Purpose
=======
The getclustercentroids routine calculates the cluster centroids, given to
which cluster each element belongs. Depending on the argument method, the
centroid is defined as either the mean or the median for each dimension over
all elements belonging to a cluster.
Arguments
=========
nclusters (input) int
The number of clusters.
nrows (input) int
The number of rows in the gene expression data matrix, equal to the number of
genes.
ncolumns (input) int
The number of columns in the gene expression data matrix, equal to the number
of samples.
data (input) double[nrows][ncolumns]
The array containing the gene expression data.
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.
clusterid (output) int[nrows] if transpose == 0
int[ncolumns] otherwise
The cluster number to which each element belongs. If transpose == 0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of samples).
cdata (output) double[nclusters][ncolumns] if transpose == 0
double[nrows][nclusters] otherwise
On exit of getclustercentroids, this array contains the cluster centroids.
cmask (output) int[nclusters][ncolumns] if transpose == 0
int[nrows][nclusters] otherwise
This array shows which data values of are missing for each centroid. If
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.
transpose (input) int
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
of columns (samples) are specified.
method (input) char
For method == 'a', the centroid is defined as the mean over all elements
belonging to a cluster for each dimension.
For method == 'm', the centroid is defined as the median over all elements
belonging to a cluster for each dimension.
Return value
============
The function returns an integer to indicate success or failure. If a
memory error occurs, or if method is not 'm' or 'a', getclustercentroids
returns 0. If successful, getclustercentroids returns 1.
========================================================================
*/
{
switch(method) {
case 'm': {
const int nelements = (transpose == 0) ? nrows : ncolumns;
double* cache = malloc(nelements*sizeof(double));
if (!cache) return 0;
getclustermedians(nclusters, nrows, ncolumns, data, mask,
clusterid, cdata, cmask, transpose, cache);
free(cache);
return 1;
}
case 'a': {
getclustermeans(nclusters, nrows, ncolumns, data, mask,
clusterid, cdata, cmask, transpose);
return 1;
}
}
return 0;
}
/* ********************************************************************* */
void
getclustermedoids(int nclusters, int nelements, double** distance,
int clusterid[], int centroids[], double errors[])
/*
Purpose
=======
The getclustermedoids routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the element with the
smallest sum of distances to the other elements.
Arguments
=========
nclusters (input) int
The number of clusters.
nelements (input) int
The total number of elements.
distmatrix (input) double array, ragged
(number of rows is nelements, number of columns is equal to the row number)
The distance matrix. To save space, the distance matrix is given in the
form of a ragged array. The distance matrix is symmetric and has zeros
on the diagonal. See distancematrix for a description of the content.
clusterid (output) int[nelements]
The cluster number to which each element belongs.
centroid (output) int[nclusters]
The index of the element that functions as the centroid for each cluster.
errors (output) double[nclusters]
The within-cluster sum of distances between the items and the cluster
centroid.
========================================================================
*/
{
int i, j, k;
for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
for (i = 0; i < nelements; i++) {
double d = 0.0;
j = clusterid[i];
for (k = 0; k < nelements; k++) {
if (i == k || clusterid[k]!=j) continue;
d += (i < k ? distance[k][i] : distance[i][k]);
if (d > errors[j]) break;
}
if (d < errors[j]) {
errors[j] = d;
centroids[j] = i;
}
}
}
/* ********************************************************************* */
static int
kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
double weight[], int transpose, int npass, char dist,
double** cdata, int** cmask, int clusterid[], double* error,
int tclusterid[], int counts[], int mapping[])
{
int i, j, k;
const int nelements = (transpose == 0) ? nrows : ncolumns;
const int ndata = (transpose == 0) ? ncolumns : nrows;
int ifound = 1;
int ipass = 0;
/* Set the metric function as indicated by dist */
double (*metric) (int, double**, double**, int**, int**,
const double[], int, int, int) = setmetric(dist);
/* Save the clustering solution periodically and check if it reappears */
int* saved = malloc(nelements*sizeof(int));
if (saved == NULL) return -1;
*error = DBL_MAX;
do {
double total = DBL_MAX;
int counter = 0;
int period = 10;
/* Perform the EM algorithm.
* First, randomly assign elements to clusters. */
if (npass != 0) randomassign(nclusters, nelements, tclusterid);
for (i = 0; i < nclusters; i++) counts[i] = 0;
for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
/* Start the loop */
while (1) {
double previous = total;
total = 0.0;
if (counter % period == 0) {
/* Save the current cluster assignments */
for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
if (period < INT_MAX / 2) period *= 2;
}
counter++;
/* Find the center */
getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
cdata, cmask, transpose);
for (i = 0; i < nelements; i++) {
double distance;
/* Calculate the distances */
k = tclusterid[i];
if (counts[k] == 1) continue;
/* No reassignment if that would lead to an empty cluster */
/* Treat the present cluster as a special case */
distance = metric(ndata, data, cdata, mask, cmask, weight,
i, k, transpose);
for (j = 0; j < nclusters; j++) {
double tdistance;
if (j == k) continue;
tdistance = metric(ndata, data, cdata, mask, cmask, weight,
i, j, transpose);
if (tdistance < distance) {
distance = tdistance;
counts[tclusterid[i]]--;
tclusterid[i] = j;
counts[j]++;
}
}
total += distance;
}
if (total >= previous) break;
/* total >= previous is FALSE on some machines even if total and
* previous are bitwise identical. */
for (i = 0; i < nelements; i++)
if (saved[i]!=tclusterid[i]) break;
if (i == nelements)
break; /* Identical solution found; break out of this loop */
}
if (npass <= 1) {
*error = total;
break;
}
for (i = 0; i < nclusters; i++) mapping[i] = -1;
for (i = 0; i < nelements; i++) {
j = tclusterid[i];
k = clusterid[i];
if (mapping[k] == -1) mapping[k] = j;
else if (mapping[k] != j) {
if (total < *error) {
ifound = 1;
*error = total;
for (j = 0; j < nelements; j++)
clusterid[j] = tclusterid[j];
}
break;
}
}
if (i == nelements) ifound++; /* break statement not encountered */
} while (++ipass < npass);
free(saved);
return ifound;
}
/* ---------------------------------------------------------------------- */
static int
kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
double weight[], int transpose, int npass, char dist,
double** cdata, int** cmask, int clusterid[], double* error,
int tclusterid[], int counts[], int mapping[], double cache[])
{
int i, j, k;
const int nelements = (transpose == 0) ? nrows : ncolumns;
const int ndata = (transpose == 0) ? ncolumns : nrows;
int ifound = 1;
int ipass = 0;
int* saved;
/* Set the metric function as indicated by dist */
double (*metric) (int, double**, double**, int**, int**,
const double[], int, int, int) = setmetric(dist);
/* Save the clustering solution periodically and check if it reappears */
saved = malloc(nelements*sizeof(int));
if (saved == NULL) return -1;
*error = DBL_MAX;
do {
double total = DBL_MAX;
int counter = 0;
int period = 10;
/* Perform the EM algorithm.
* First, randomly assign elements to clusters. */
if (npass != 0) randomassign(nclusters, nelements, tclusterid);
for (i = 0; i < nclusters; i++) counts[i] = 0;
for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
/* Start the loop */
while (1) {
double previous = total;
total = 0.0;
if (counter % period == 0) {
/* Save the current cluster assignments */
for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
if (period < INT_MAX / 2) period *= 2;
}
counter++;
/* Find the center */
getclustermedians(nclusters, nrows, ncolumns, data, mask,
tclusterid, cdata, cmask, transpose, cache);
for (i = 0; i < nelements; i++) {
/* Calculate the distances */
double distance;
k = tclusterid[i];
if (counts[k] == 1) continue;
/* No reassignment if that would lead to an empty cluster */
/* Treat the present cluster as a special case */
distance = metric(ndata, data, cdata, mask, cmask, weight,
i, k, transpose);
for (j = 0; j < nclusters; j++) {
double tdistance;
if (j == k) continue;
tdistance = metric(ndata, data, cdata, mask, cmask, weight,
i, j, transpose);
if (tdistance < distance) {
distance = tdistance;
counts[tclusterid[i]]--;
tclusterid[i] = j;
counts[j]++;
}
}
total += distance;
}
if (total >= previous) break;
/* total >= previous is FALSE on some machines even if total and
* previous are bitwise identical. */
for (i = 0; i < nelements; i++)
if (saved[i]!=tclusterid[i]) break;
if (i == nelements)
break; /* Identical solution found; break out of this loop */
}
if (npass <= 1) {
*error = total;
break;
}
for (i = 0; i < nclusters; i++) mapping[i] = -1;
for (i = 0; i < nelements; i++) {
j = tclusterid[i];
k = clusterid[i];
if (mapping[k] == -1) mapping[k] = j;
else if (mapping[k] != j) {
if (total < *error) {
ifound = 1;
*error = total;
for (j = 0; j < nelements; j++)
clusterid[j] = tclusterid[j];
}
break;
}
}
if (i == nelements) ifound++; /* break statement not encountered */
} while (++ipass < npass);
free(saved);
return ifound;
}
/* ********************************************************************* */
void
kcluster(int nclusters, int nrows, int ncolumns, double** data, int** mask,
double weight[], int transpose, int npass, char method, char dist,
int clusterid[], double* error, int* ifound)
/*
Purpose
=======
The kcluster routine performs k-means or k-median clustering on a given set of
elements, using the specified distance measure. The number of clusters is given
by the user. Multiple passes are being made to find the optimal clustering
solution, each time starting from a different initial clustering.
Arguments
=========
nclusters (input) int
The number of clusters to be found.
data (input) double[nrows][ncolumns]
The array containing the data of the elements to be clustered (i.e., the gene
expression data).
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If
mask[i][j] == 0, then data[i][j] is missing.
nrows (input) int
The number of rows in the data matrix, equal to the number of genes.
ncolumns (input) int
The number of columns in the data matrix, equal to the number of samples.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
transpose (input) int
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
of the matrix are clustered.
npass (input) int
The number of times clustering is performed. Clustering is performed npass
times, each time starting from a different (random) initial assignment of
genes to clusters. The clustering solution with the lowest within-cluster sum
of distances is chosen.
If npass == 0, then the clustering algorithm will be run once, where the
initial assignment of elements to clusters is taken from the clusterid array.
method (input) char
Defines whether the arithmetic mean (method == 'a') or the median
(method == 'm') is used to calculate the cluster center.
dist (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
clusterid (output; input) int[nrows] if transpose == 0
int[ncolumns] otherwise
The cluster number to which a gene or microarray was assigned. If npass == 0,
then on input clusterid contains the initial clustering assignment from which
the clustering algorithm starts. On output, it contains the clustering solution
that was found.
error (output) double*
The sum of distances to the cluster center of each item in the optimal k-means
clustering solution that was found.
ifound (output) int*
The number of times the optimal clustering solution was
found. The value of ifound is at least 1; its maximum value is npass. If the
number of clusters is larger than the number of elements being clustered,
*ifound is set to 0 as an error code. If a memory allocation error occurs,
*ifound is set to -1.
========================================================================
*/
{
const int nelements = (transpose == 0) ? nrows : ncolumns;
const int ndata = (transpose == 0) ? ncolumns : nrows;
int i;
int ok;
int* tclusterid;
int* mapping = NULL;
double** cdata;
int** cmask;
int* counts;
if (nelements < nclusters) {
*ifound = 0;
return;
}
/* More clusters asked for than elements available */
*ifound = -1;
/* This will contain the number of elements in each cluster, which is
* needed to check for empty clusters. */
counts = malloc(nclusters*sizeof(int));
if (!counts) return;
/* Find out if the user specified an initial clustering */
if (npass <= 1) tclusterid = clusterid;
else {
tclusterid = malloc(nelements*sizeof(int));
if (!tclusterid) {
free(counts);
return;
}
mapping = malloc(nclusters*sizeof(int));
if (!mapping) {
free(counts);
free(tclusterid);
return;
}
for (i = 0; i < nelements; i++) clusterid[i] = 0;
}
/* Allocate space to store the centroid data */
if (transpose == 0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
if (!ok) {
free(counts);
if (npass>1) {
free(tclusterid);
free(mapping);
}
return;
}
if (method == 'm') {
double* cache = malloc(nelements*sizeof(double));
if (cache) {
*ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
transpose, npass, dist, cdata, cmask, clusterid,
error, tclusterid, counts, mapping, cache);
free(cache);
}
}
else
*ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
transpose, npass, dist, cdata, cmask, clusterid,
error, tclusterid, counts, mapping);
/* Deallocate temporarily used space */
if (npass > 1) {
free(mapping);
free(tclusterid);
}
if (transpose == 0) freedatamask(nclusters, cdata, cmask);
else freedatamask(ndata, cdata, cmask);
free(counts);
}
/* *********************************************************************** */
void
kmedoids(int nclusters, int nelements, double** distmatrix, int npass,
int clusterid[], double* error, int* ifound)
/*
Purpose
=======
The kmedoids routine performs k-medoids clustering on a given set of elements,
using the distance matrix and the number of clusters passed by the user.
Multiple passes are being made to find the optimal clustering solution, each
time starting from a different initial clustering.
Arguments
=========
nclusters (input) int
The number of clusters to be found.
nelements (input) int
The number of elements to be clustered.
distmatrix (input) double array, ragged
(number of rows is nelements, number of columns is equal to the row number)
The distance matrix. To save space, the distance matrix is given in the
form of a ragged array. The distance matrix is symmetric and has zeros
on the diagonal. See distancematrix for a description of the content.
npass (input) int
The number of times clustering is performed. Clustering is performed npass
times, each time starting from a different (random) initial assignment of genes
to clusters. The clustering solution with the lowest within-cluster sum of
distances is chosen.
If npass == 0, then the clustering algorithm will be run once, where the
initial assignment of elements to clusters is taken from the clusterid array.
clusterid (output; input) int[nelements]
On input, if npass == 0, then clusterid contains the initial clustering
assignment from which the clustering algorithm starts; all numbers in clusterid
should be between zero and nelements-1 inclusive. If npass != 0, clusterid is
ignored on input.
On output, clusterid contains the clustering solution that was found: clusterid
contains the number of the cluster to which each item was assigned. On output,
the number of a cluster is defined as the item number of the centroid of the
cluster.
error (output) double
The sum of distances to the cluster center of each item in the optimal
k-medoids clustering solution that was found.
ifound (output) int
If kmedoids is successful: the number of times the optimal clustering solution
was found. The value of ifound is at least 1; its maximum value is npass.
If the user requested more clusters than elements available, ifound is set
to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
========================================================================
*/
{
int i, j, icluster;
int* tclusterid;
int* saved;
int* centroids;
double* errors;
int ipass = 0;
if (nelements < nclusters) {
*ifound = 0;
return;
} /* More clusters asked for than elements available */
*ifound = -1;
/* Save the clustering solution periodically and check if it reappears */
saved = malloc(nelements*sizeof(int));
if (saved == NULL) return;
centroids = malloc(nclusters*sizeof(int));
if (!centroids) {
free(saved);
return;
}
errors = malloc(nclusters*sizeof(double));
if (!errors) {
free(saved);
free(centroids);
return;
}
/* Find out if the user specified an initial clustering */
if (npass <= 1) tclusterid = clusterid;
else {
tclusterid = malloc(nelements*sizeof(int));
if (!tclusterid) {
free(saved);
free(centroids);
free(errors);
return;
}
for (i = 0; i < nelements; i++) clusterid[i] = -1;
}
*error = DBL_MAX;
do /* Start the loop */ {
double total = DBL_MAX;
int counter = 0;
int period = 10;
if (npass != 0) randomassign(nclusters, nelements, tclusterid);
while (1) {
double previous = total;
total = 0.0;
if (counter % period == 0) {
/* Save the current cluster assignments */
for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
if (period < INT_MAX / 2) period *= 2;
}
counter++;
/* Find the center */
getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
centroids, errors);
for (i = 0; i < nelements; i++) {
/* Find the closest cluster */
double distance = DBL_MAX;
for (icluster = 0; icluster < nclusters; icluster++) {
double tdistance;
j = centroids[icluster];
if (i == j) {
distance = 0.0;
tclusterid[i] = icluster;
break;
}
tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
if (tdistance < distance) {
distance = tdistance;
tclusterid[i] = icluster;
}
}
total += distance;
}
if (total >= previous) break;
/* total >= previous is FALSE on some machines even if total and
* previous are bitwise identical. */
for (i = 0; i < nelements; i++)
if (saved[i] != tclusterid[i]) break;
if (i == nelements)
break; /* Identical solution found; break out of this loop */
}
if (npass <= 1) {
*ifound = 1;
*error = total;
/* Replace by the centroid in each cluster. */
for (j = 0; j < nelements; j++) {
clusterid[j] = centroids[tclusterid[j]];
}
break;
}
for (i = 0; i < nelements; i++) {
if (clusterid[i]!=centroids[tclusterid[i]]) {
if (total < *error) {
*ifound = 1;
*error = total;
/* Replace by the centroid in each cluster. */
for (j = 0; j < nelements; j++) {
clusterid[j] = centroids[tclusterid[j]];
}
}
break;
}
}
if (i == nelements) (*ifound)++; /* break statement not encountered */
} while (++ipass < npass);
/* Deallocate temporarily used space */
if (npass > 1) free(tclusterid);
free(saved);
free(centroids);
free(errors);
}
/* ******************************************************************** */
void
distancematrix(int nrows, int ncolumns, double** data, int** mask,
double weights[], char dist, int transpose, double** matrix)
/*
Purpose
=======
The distancematrix routine calculates the distance matrix between genes or
samples using their measured gene expression data. Several distance measures
can be used. As the distance matrix is symmetric, with zeros on the diagonal,
only the lower triangular half of the distance matrix is stored.
Space for the distance matrix should be allocated before calling this routine.
If the parameter transpose is set to a nonzero value, the distances between
columns of the data matrix are calculated, otherwise distances between the rows
are calculated.
Arguments
=========
nrows (input) int
The number of rows in the gene expression data matrix (i.e., the number of
genes)
ncolumns (input) int
The number of columns in the gene expression data matrix (i.e., the number of
samples)
data (input) double[nrows][ncolumns]
The array containing the gene expression data.
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
dist (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
transpose (input) int
If transpose is equal to zero, the distances between the rows is
calculated. Otherwise, the distances between the columns is calculated.
src/cluster.c view on Meta::CPAN
double* result;
result = malloc(nelements*sizeof(double));
if (!result) return NULL;
memset(result, 0, nelements*sizeof(double));
for (i = 0; i < nelements; i++) {
result[i] += 1.0;
for (j = 0; j < i; j++) {
const double distance = metric(ndata, data, data, mask, mask,
weights, i, j, transpose);
if (distance < cutoff) {
const double dweight = exp(exponent*log(1-distance/cutoff));
/* pow() causes a crash on AIX */
result[i] += dweight;
result[j] += dweight;
}
}
}
for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
return result;
}
/* ******************************************************************** */
int
cuttree(int nelements, const Node* tree, int nclusters, int clusterid[])
/*
Purpose
=======
The cuttree routine takes the output of a hierarchical clustering routine, and
divides the elements in the tree structure into clusters based on the
hierarchical clustering result. The number of clusters is specified by the
user.
Arguments
=========
nelements (input) int
The number of elements that were clustered.
tree (input) Node[nelements-1]
The clustering solution. Each node in the array describes one linking event,
with tree[i].left and tree[i].right representing the elements that were joined.
The original elements are numbered 0..nelements-1, nodes are numbered
-1..-(nelements-1).
nclusters (input) int
The number of clusters to be formed.
clusterid (output) int[nelements]
The number of the cluster to which each element was assigned. Clusters are
numbered 0..nclusters-1 in the left-to-right order in which they appear in the
hierarchical clustering tree. Space for the clusterid array should be allocated
before calling the cuttree routine.
Return value
============
If no errors occur, cuttree returns 1.
If a memory error occurs, cuttree returns 0.
========================================================================
*/
{
int i = -nelements+1; /* top node */
int j;
int k = -1;
int previous = nelements;
const int n = nelements-nclusters; /* number of nodes to join */
int* parents;
if (nclusters == 1) {
for (i = 0; i < nelements; i++) clusterid[i] = 0;
return 1;
}
parents = malloc((nelements-1)*sizeof(int));
if (!parents) return 0;
while (1) {
if (i >= 0) {
clusterid[i] = k;
j = i;
i = previous;
previous = j;
}
else {
j = -i-1;
if (previous == tree[j].left) {
previous = i;
i = tree[j].right;
if (j >= n && (i >= 0 || -i-1 < n)) k++;
}
else if (previous == tree[j].right) {
previous = i;
i = parents[j];
if (i == nelements) break;
}
else {
parents[j] = previous;
previous = i;
i = tree[j].left;
if (j >= n && (i >= 0 || -i-1 < n)) k++;
}
}
}
free(parents);
return 1;
}
/* ******************************************************************** */
static Node*
pclcluster(int nrows, int ncolumns, double** data, int** mask, double weight[],
double** distmatrix, char dist, int transpose)
/*
Purpose
=======
The pclcluster routine performs clustering using pairwise centroid-linking on a
given set of gene expression data, using the distance metric given by dist.
Arguments
=========
nrows (input) int
The number of rows in the gene expression data matrix, equal to the number of
genes.
ncolumns (input) int
The number of columns in the gene expression data matrix, equal to the number
of samples.
data (input) double[nrows][ncolumns]
The array containing the gene expression data.
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If
mask[i][j] == 0, then data[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0;
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
transpose (input) int
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
of the matrix are clustered.
dist (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
distmatrix (input) double**
The distance matrix. This matrix is precalculated by the calling routine
treecluster. The pclcluster routine modifies the contents of distmatrix, but
does not deallocate it.
Return value
============
A pointer to a newly allocated array of Node structs, describing the
hierarchical clustering solution consisting of nelements-1 nodes. Depending
on whether genes (rows) or samples (columns) were clustered, nelements is
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
structure.
If a memory error occurs, pclcluster returns NULL.
========================================================================
*/
{
int i, j;
const int nelements = (transpose == 0) ? nrows : ncolumns;
int inode;
const int ndata = transpose ? nrows : ncolumns;
const int nnodes = nelements - 1;
Node* result;
double** newdata;
int** newmask;
int* distid;
/* Set the metric function as indicated by dist */
double (*metric) (int, double**, double**, int**, int**,
const double[], int, int, int) = setmetric(dist);
distid = malloc(nelements*sizeof(int));
if (!distid) return NULL;
result = malloc(nnodes*sizeof(Node));
if (!result) {
free(distid);
return NULL;
}
if (!makedatamask(nelements, ndata, &newdata, &newmask)) {
free(result);
free(distid);
return NULL;
}
for (i = 0; i < nelements; i++) distid[i] = i;
/* To remember which row/column in the distance matrix contains what */
/* Storage for node data */
if (transpose) {
for (i = 0; i < nelements; i++) {
for (j = 0; j < ndata; j++) {
newdata[i][j] = data[j][i];
newmask[i][j] = mask[j][i];
}
}
data = newdata;
mask = newmask;
}
else {
for (i = 0; i < nelements; i++) {
memcpy(newdata[i], data[i], ndata*sizeof(double));
memcpy(newmask[i], mask[i], ndata*sizeof(int));
}
data = newdata;
mask = newmask;
}
for (inode = 0; inode < nnodes; inode++) {
/* Find the pair with the shortest distance */
int is = 1;
int js = 0;
result[inode].distance = find_closest_pair(nelements-inode, distmatrix,
&is, &js);
result[inode].left = distid[js];
src/cluster.c view on Meta::CPAN
Arguments
=========
nrows (input) int
The number of rows in the gene expression data matrix, equal to the number of
genes.
ncolumns (input) int
The number of columns in the gene expression data matrix, equal to the number
of samples.
data (input) double[nrows][ncolumns]
The array containing the gene expression data.
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If
mask[i][j] == 0, then data[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
transpose (input) int
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
of the matrix are clustered.
dist (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
distmatrix (input) double**
The distance matrix. If the distance matrix is passed by the calling routine
treecluster, it is used by pslcluster to speed up the clustering calculation.
The pslcluster routine does not modify the contents of distmatrix, and does
not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
by the pslcluster routine from the gene expression data (the data and mask
arrays) and stored in temporary arrays. If distmatrix is passed, the original
gene expression data (specified by the data and mask arguments) are not needed
and are therefore ignored.
Return value
============
A pointer to a newly allocated array of Node structs, describing the
hierarchical clustering solution consisting of nelements-1 nodes. Depending
on whether genes (rows) or samples (columns) were clustered, nelements is
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
structure.
If a memory error occurs, pslcluster returns NULL.
========================================================================
*/
{
int i, j, k;
const int nelements = transpose ? ncolumns : nrows;
const int nnodes = nelements - 1;
int* vector;
double* temp;
int* index;
Node* result;
temp = malloc(nnodes*sizeof(double));
if (!temp) return NULL;
index = malloc(nelements*sizeof(int));
if (!index) {
free(temp);
return NULL;
}
vector = malloc(nnodes*sizeof(int));
if (!vector) {
free(index);
free(temp);
return NULL;
}
result = malloc(nelements*sizeof(Node));
if (!result) {
free(vector);
free(index);
free(temp);
return NULL;
}
for (i = 0; i < nnodes; i++) vector[i] = i;
if (distmatrix) {
for (i = 0; i < nrows; i++) {
result[i].distance = DBL_MAX;
for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
for (j = 0; j < i; j++) {
k = vector[j];
if (result[j].distance >= temp[j]) {
if (result[j].distance < temp[k])
temp[k] = result[j].distance;
result[j].distance = temp[j];
vector[j] = i;
}
else if (temp[j] < temp[k]) temp[k] = temp[j];
}
for (j = 0; j < i; j++) {
if (result[j].distance >= result[vector[j]].distance)
vector[j] = i;
}
}
}
else {
const int ndata = transpose ? nrows : ncolumns;
/* Set the metric function as indicated by dist */
double (*metric) (int, double**, double**, int**, int**,
const double[], int, int, int) = setmetric(dist);
src/cluster.c view on Meta::CPAN
vector[j] = i;
}
else if (temp[j] < temp[k]) temp[k] = temp[j];
}
for (j = 0; j < i; j++)
if (result[j].distance >= result[vector[j]].distance)
vector[j] = i;
}
}
free(temp);
for (i = 0; i < nnodes; i++) result[i].left = i;
qsort(result, nnodes, sizeof(Node), nodecompare);
for (i = 0; i < nelements; i++) index[i] = i;
for (i = 0; i < nnodes; i++) {
j = result[i].left;
k = vector[j];
result[i].left = index[j];
result[i].right = index[k];
index[k] = -i-1;
}
free(vector);
free(index);
result = realloc(result, nnodes*sizeof(Node));
return result;
}
/* ******************************************************************** */
static Node*
pmlcluster(int nelements, double** distmatrix)
/*
Purpose
=======
The pmlcluster routine performs clustering using pairwise maximum- (complete-)
linking on the given distance matrix.
Arguments
=========
nelements (input) int
The number of elements to be clustered.
distmatrix (input) double**
The distance matrix, with nelements rows, each row being filled up to the
diagonal. The elements on the diagonal are not used, as they are assumed to be
zero. The distance matrix will be modified by this routine.
Return value
============
A pointer to a newly allocated array of Node structs, describing the
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
whether genes (rows) or samples (columns) were clustered, nelements is equal
to nrows or ncolumns. See src/cluster.h for a description of the Node
structure.
If a memory error occurs, pmlcluster returns NULL.
========================================================================
*/
{
int j;
int n;
int* clusterid;
Node* result;
clusterid = malloc(nelements*sizeof(int));
if (!clusterid) return NULL;
result = malloc((nelements-1)*sizeof(Node));
if (!result) {
free(clusterid);
return NULL;
}
/* Setup a list specifying to which cluster a gene belongs */
for (j = 0; j < nelements; j++) clusterid[j] = j;
for (n = nelements; n > 1; n--) {
int is = 1;
int js = 0;
result[nelements-n].distance = find_closest_pair(n, distmatrix,
&is, &js);
/* Fix the distances */
for (j = 0; j < js; j++)
distmatrix[js][j] = max(distmatrix[is][j], distmatrix[js][j]);
for (j = js+1; j < is; j++)
distmatrix[j][js] = max(distmatrix[is][j], distmatrix[j][js]);
for (j = is+1; j < n; j++)
distmatrix[j][js] = max(distmatrix[j][is], distmatrix[j][js]);
for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
/* Update clusterids */
result[nelements-n].left = clusterid[is];
result[nelements-n].right = clusterid[js];
clusterid[js] = n-nelements-1;
clusterid[is] = clusterid[n-1];
}
free(clusterid);
return result;
}
/* ******************************************************************* */
static Node*
palcluster(int nelements, double** distmatrix)
/*
Purpose
=======
The palcluster routine performs clustering using pairwise average
linking on the given distance matrix.
Arguments
=========
nelements (input) int
The number of elements to be clustered.
distmatrix (input) double**
The distance matrix, with nelements rows, each row being filled up to the
diagonal. The elements on the diagonal are not used, as they are assumed to be
zero. The distance matrix will be modified by this routine.
Return value
============
A pointer to a newly allocated array of Node structs, describing the
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
whether genes (rows) or samples (columns) were clustered, nelements is equal
to nrows or ncolumns. See src/cluster.h for a description of the Node
structure.
If a memory error occurs, palcluster returns NULL.
========================================================================
*/
{
int j;
int n;
int* clusterid;
int* number;
Node* result;
clusterid = malloc(nelements*sizeof(int));
if (!clusterid) return NULL;
number = malloc(nelements*sizeof(int));
if (!number) {
free(clusterid);
return NULL;
}
result = malloc((nelements-1)*sizeof(Node));
if (!result) {
free(clusterid);
free(number);
return NULL;
}
/* Setup a list specifying to which cluster a gene belongs, and keep track
* of the number of elements in each cluster (needed to calculate the
* average). */
for (j = 0; j < nelements; j++) {
number[j] = 1;
clusterid[j] = j;
}
for (n = nelements; n > 1; n--) {
int sum;
int is = 1;
int js = 0;
result[nelements-n].distance = find_closest_pair(n, distmatrix,
&is, &js);
/* Save result */
result[nelements-n].left = clusterid[is];
result[nelements-n].right = clusterid[js];
/* Fix the distances */
sum = number[is] + number[js];
for (j = 0; j < js; j++) {
distmatrix[js][j] = distmatrix[is][j]*number[is]
+ distmatrix[js][j]*number[js];
distmatrix[js][j] /= sum;
}
for (j = js+1; j < is; j++) {
distmatrix[j][js] = distmatrix[is][j]*number[is]
+ distmatrix[j][js]*number[js];
distmatrix[j][js] /= sum;
}
for (j = is+1; j < n; j++) {
distmatrix[j][js] = distmatrix[j][is]*number[is]
+ distmatrix[j][js]*number[js];
distmatrix[j][js] /= sum;
}
for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
/* Update number of elements in the clusters */
number[js] = sum;
number[is] = number[n-1];
/* Update clusterids */
clusterid[js] = n-nelements-1;
clusterid[is] = clusterid[n-1];
}
free(clusterid);
free(number);
return result;
}
/* ******************************************************************* */
Node*
treecluster(int nrows, int ncolumns, double** data, int** mask,
double weight[], int transpose, char dist, char method,
double** distmatrix)
/*
Purpose
=======
The treecluster routine performs hierarchical clustering using pairwise
single-, maximum-, centroid-, or average-linkage, as defined by method, on a
given set of gene expression data, using the distance metric given by dist.
If successful, the function returns a pointer to a newly allocated Tree struct
containing the hierarchical clustering solution, and NULL if a memory error
occurs. The pointer should be freed by the calling routine to prevent memory
leaks.
Arguments
=========
nrows (input) int
The number of rows in the data matrix, equal to the number of genes.
ncolumns (input) int
The number of columns in the data matrix, equal to the number of samples.
data (input) double[nrows][ncolumns]
The array containing the data of the vectors to be clustered.
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
transpose (input) int
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
of the matrix are clustered.
dist (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
method (input) char
Defines which hierarchical clustering method is used:
method == 's': pairwise single-linkage clustering
method == 'm': pairwise maximum- (or complete-) linkage clustering
method == 'a': pairwise average-linkage clustering
method == 'c': pairwise centroid-linkage clustering
For the first three, either the distance matrix or the gene expression data is
sufficient to perform the clustering algorithm. For pairwise centroid-linkage
clustering, however, the gene expression data are always needed, even if the
distance matrix itself is available.
distmatrix (input) double**
The distance matrix. If the distance matrix is zero initially, the distance
matrix will be allocated and calculated from the data by treecluster, and
deallocated before treecluster returns. If the distance matrix is passed by the
calling routine, treecluster will modify the contents of the distance matrix as
part of the clustering algorithm, but will not deallocate it. The calling
routine should deallocate the distance matrix after the return from
treecluster.
Return value
============
A pointer to a newly allocated array of Node structs, describing the
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
whether genes (rows) or samples (columns) were clustered, nelements is equal
to nrows or ncolumns. See src/cluster.h for a description of the Node
structure.
If a memory error occurs, treecluster returns NULL.
========================================================================
*/
{
Node* result = NULL;
const int nelements = (transpose == 0) ? nrows : ncolumns;
const int ldistmatrix = (distmatrix == NULL && method != 's') ? 1 : 0;
if (nelements < 2) return NULL;
/* Calculate the distance matrix if the user didn't give it */
if (ldistmatrix) {
/* Set up the ragged array */
int i;
distmatrix = malloc(nelements*sizeof(double*));
if (distmatrix == NULL) return NULL; /* Not enough memory available */
distmatrix[0] = NULL;
for (i = 1; i < nelements; i++) {
distmatrix[i] = malloc(i*sizeof(double));
if (distmatrix[i] == NULL) /* Not enough memory available */ {
while (--i > 0) free(distmatrix[i]);
free(distmatrix);
return NULL;
}
}
distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose,
distmatrix);
}
switch(method) {
case 's':
result = pslcluster(nrows, ncolumns, data, mask, weight,
distmatrix, dist, transpose);
break;
case 'm':
result = pmlcluster(nelements, distmatrix);
break;
case 'a':
result = palcluster(nelements, distmatrix);
break;
case 'c':
result = pclcluster(nrows, ncolumns, data, mask, weight,
distmatrix, dist, transpose);
break;
}
/* Deallocate space for distance matrix if allocated by treecluster */
if (ldistmatrix) {
int i;
for (i = 1; i < nelements; i++) free(distmatrix[i]);
free(distmatrix);
}
return result;
}
/* ******************************************************************* */
int
sorttree(const int nnodes, Node* tree, const double order[], int indices[])
/*
Purpose
=======
The sorttree routine sorts the items in a hierarchical clustering solution
based on their order values, while remaining consistent with the hierchical
clustering solution.
Arguments
=========
nnodes (input) int
The number of nodes in the hierarchical clustering tree.
tree (input) Node[nnodes]
The hierarchical clustering tree describing the clustering solution.
order (input) double[nnodes+1]
The preferred order of the items.
indices (output) int*
The indices of each item after sorting, with item i appearing at indices[i]
after sorting.
Return value
============
If no errors occur, sorttree returns 1.
If a memory error occurs, sorttree returns 0.
========================================================================
*/
{
int i;
int index;
int i1, i2;
double order1, order2;
int counts1, counts2;
int* nodecounts;
nodecounts = malloc(nnodes*sizeof(int));
if (!nodecounts) return 0;
if (order) {
double* nodeorder = malloc(nnodes*sizeof(double));
if (!nodeorder) {
free(nodecounts);
return 0;
}
for (i = 0; i < nnodes; i++) {
i1 = tree[i].left;
i2 = tree[i].right;
/* i1 and i2 are the elements that are to be joined */
if (i1 < 0) {
index = -i1-1;
order1 = nodeorder[index];
counts1 = nodecounts[index];
}
else {
order1 = order[i1];
counts1 = 1;
}
if (i2 < 0) {
index = -i2-1;
order2 = nodeorder[index];
counts2 = nodecounts[index];
}
else {
order2 = order[i2];
counts2 = 1;
}
if (order1 > order2) {
tree[i].left = i2;
tree[i].right = i1;
}
nodecounts[i] = counts1 + counts2;
nodeorder[i] = (counts1*order1+counts2*order2) / (counts1+counts2);
}
free(nodeorder);
}
else {
for (i = 0; i < nnodes; i++) {
i1 = tree[i].left;
i2 = tree[i].right;
/* i1 and i2 are the elements that are to be joined */
counts1 = (i1 < 0) ? nodecounts[-i1-1] : 1;
counts2 = (i2 < 0) ? nodecounts[-i2-1] : 1;
nodecounts[i] = counts1 + counts2;
}
( run in 1.153 second using v1.01-cache-2.11-cpan-8450f2e95f3 )