Algorithm-Cluster
view release on metacpan or search on metacpan
src/cluster.c view on Meta::CPAN
/*
Find the median of X(1), ... , X(N), using as much of the quicksort
algorithm as is needed to isolate it.
N.B. On exit, the array X is partially ordered.
Based on Alan J. Miller's median.f90 routine.
*/
{
int i, j;
int nr = n / 2;
int nl = nr - 1;
int even = 0;
/* hi & lo are position limits encompassing the median. */
int lo = 0;
int hi = n-1;
if (n == 2*nr) even = 1;
if (n < 3) {
if (n < 1) return 0.;
if (n == 1) return x[0];
return 0.5*(x[0]+x[1]);
}
/* Find median of 1st, middle & last values. */
do {
int loop;
int mid = (lo + hi)/2;
double result = x[mid];
double xlo = x[lo];
double xhi = x[hi];
if (xhi<xlo) {
double temp = xlo;
xlo = xhi;
xhi = temp;
}
if (result>xhi) result = xhi;
else if (result<xlo) result = xlo;
/* The basic quicksort algorithm to move all values <= the sort key
* (XMED) to the left-hand end, and all higher values to the other end.
*/
i = lo;
j = hi;
do {
while (x[i]<result) i++;
while (x[j]>result) j--;
loop = 0;
if (i<j) {
double temp = x[i];
x[i] = x[j];
x[j] = temp;
i++;
j--;
if (i<=j) loop = 1;
}
} while (loop); /* Decide which half the median is in. */
if (even) {
if (j == nl && i == nr) {
/* Special case, n even, j = n/2 & i = j + 1, so the median is
* between the two halves of the series. Find max. of the first
* half & min. of the second half, then average.
*/
int k;
double xmax = x[0];
double xmin = x[n-1];
for (k = lo; k <= j; k++) xmax = max(xmax, x[k]);
for (k = i; k <= hi; k++) xmin = min(xmin, x[k]);
return 0.5*(xmin + xmax);
}
if (j<nl) lo = i;
if (i>nr) hi = j;
if (i == j) {
if (i == nl) lo = nl;
if (j == nr) hi = nr;
}
}
else {
if (j<nr) lo = i;
if (i>nr) hi = j;
/* Test whether median has been isolated. */
if (i == j && i == nr) return result;
}
}
while (lo<hi-1);
if (even) return (0.5*(x[nl]+x[nr]));
if (x[lo]>x[hi]) {
double temp = x[lo];
x[lo] = x[hi];
x[hi] = temp;
}
return x[nr];
}
/* ********************************************************************** */
static const double* sortdata = NULL; /* used in the quicksort algorithm */
/* ---------------------------------------------------------------------- */
static int
compare(const void* a, const void* b)
/* Helper function for sort. Previously, this was a nested function under
* sort, which is not allowed under ANSI C.
*/
{
const int i1 = *(const int*)a;
const int i2 = *(const int*)b;
const double term1 = sortdata[i1];
const double term2 = sortdata[i2];
if (term1 < term2) return -1;
if (term1 > term2) return +1;
return 0;
}
/* ---------------------------------------------------------------------- */
void
sort(int n, const double data[], int index[])
/* Sets up an index table given the data, such that data[index[]] is in
src/cluster.c view on Meta::CPAN
if (!data[i]) break;
mask[i] = malloc(ncols*sizeof(int));
if (!mask[i]) {
free(data[i]);
break;
}
}
if (i == nrows) { /* break not encountered */
*pdata = data;
*pmask = mask;
return 1;
}
*pdata = NULL;
*pmask = NULL;
nrows = i;
for (i = 0; i < nrows; i++) {
free(data[i]);
free(mask[i]);
}
free(data);
free(mask);
return 0;
}
/* ---------------------------------------------------------------------- */
static void
freedatamask(int n, double** data, int** mask)
{
int i;
for (i = 0; i < n; i++) {
free(mask[i]);
free(data[i]);
}
free(mask);
free(data);
}
/* ---------------------------------------------------------------------- */
static double
find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
/*
This function searches the distance matrix to find the pair with the shortest
distance between them. The indices of the pair are returned in ip and jp; the
distance itself is returned by the function.
n (input) int
The number of elements in the distance matrix.
distmatrix (input) double**
A ragged array containing the distance matrix. The number of columns in each
row is one less than the row index.
ip (output) int*
A pointer to the integer that is to receive the first index of the pair with
the shortest distance.
jp (output) int*
A pointer to the integer that is to receive the second index of the pair with
the shortest distance.
*/
{
int i, j;
double temp;
double distance = distmatrix[1][0];
*ip = 1;
*jp = 0;
for (i = 1; i < n; i++) {
for (j = 0; j < i; j++) {
temp = distmatrix[i][j];
if (temp<distance) {
distance = temp;
*ip = i;
*jp = j;
}
}
}
return distance;
}
/* ********************************************************************* */
static int
svd(int m, int n, double** u, double w[], double** vt)
/*
* This subroutine is a translation of the Algol procedure svd, described in
* G.H. Golub and C. Reinsch:
* "Singular Value Decomposition and Least Squares Solutions",
* published in
* Numerische Mathematik 14(5): page 403-420 (1970)
* and
* Handbook for Automatic Computation, Volume II: Linear Algebra,
* (John H. Wilkinson and C. Reinsch; edited by Friedrich L. Bauer and
* Alston S. Householder): page 134-151 (1971).
* t
* This subroutine determines the singular value decomposition A=usv of a
* real m by n rectangular matrix, where m is greater * than or equal to n.
* Householder bidiagonalization and a variant of the QR algorithm are used.
*
* On input:
*
* m is the number of rows of A (and u).
*
* n is the number of columns of A (and u) and the order of v.
*
* u contains the rectangular input matrix A to be decomposed.
*
* On output:
*
* the routine returns an integer ierr equal to
* 0: to indicate a normal return,
* k: if the k-th singular value has not been determined after 30 iterations,
* -1: if memory allocation fails.
*
* w contains the n (non-negative) singular values of a (the
* diagonal elements of s). they are unordered.
* If an error exit is made, the singular values should be correct for
* indices ierr+1, ierr+2, ..., n.
src/cluster.c view on Meta::CPAN
}
for (i = 0; i < ncolumns; i++) {
for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
}
for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
for (i = 0; i < ncolumns; i++) w[i] = temp[i];
}
else /* nrows < ncolumns */ {
for (j = 0; j < nrows; j++) {
const double s = w[j];
for (i = 0; i < nrows; i++) v[i][j] *= s;
}
sort(nrows, w, index);
for (i = 0; i < nrows/2; i++) {
j = index[i];
index[i] = index[nrows-1-i];
index[nrows-1-i] = j;
}
for (j = 0; j < ncolumns; j++) {
for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
for (i = 0; i < nrows; i++) u[i][j] = temp[i];
}
for (j = 0; j < nrows; j++) {
for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
for (i = 0; i < nrows; i++) v[j][i] = temp[i];
}
for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
for (i = 0; i < nrows; i++) w[i] = temp[i];
}
}
free(index);
free(temp);
return error;
}
/* ********************************************************************* */
static double
euclid(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The euclid routine calculates the weighted Euclidean distance between two
rows or columns in a matrix.
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.0;
double tweight = 0;
int i;
if (transpose == 0) /* Calculate the distance between two rows */ {
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term = data1[index1][i] - data2[index2][i];
result += weight[i]*term*term;
tweight += weight[i];
}
}
}
else {
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term = data1[i][index1] - data2[i][index2];
result += weight[i]*term*term;
tweight += weight[i];
}
}
}
if (!tweight) return 0; /* usually due to empty clusters */
result /= tweight;
return result;
}
/* ********************************************************************* */
static double
cityblock(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The cityblock routine calculates the weighted "City Block" distance between
two rows or columns in a matrix. City Block distance is defined as the
absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
equivalent to taking an "up and over" path.
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================ */
{
double result = 0.;
double tweight = 0;
int i;
if (transpose == 0) /* Calculate the distance between two rows */ {
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term = data1[index1][i] - data2[index2][i];
result = result + weight[i]*fabs(term);
tweight += weight[i];
}
}
}
else {
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term = data1[i][index1] - data2[i][index2];
result = result + weight[i]*fabs(term);
tweight += weight[i];
}
}
}
if (!tweight) return 0; /* usually due to empty clusters */
result /= tweight;
return result;
}
/* ********************************************************************* */
static double
correlation(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The correlation routine calculates the weighted Pearson distance between two
rows or columns in a matrix. We define the Pearson distance as one minus the
Pearson correlation.
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
(e.g., choose b = a + c).
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.;
double sum1 = 0.;
double sum2 = 0.;
double denom1 = 0.;
double denom2 = 0.;
double tweight = 0.;
if (transpose == 0) /* Calculate the distance between two rows */ {
int i;
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term1 = data1[index1][i];
double term2 = data2[index2][i];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
else {
int i;
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term1 = data1[i][index1];
double term2 = data2[i][index2];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
if (!tweight) return 0; /* usually due to empty clusters */
result -= sum1 * sum2 / tweight;
denom1 -= sum1 * sum1 / tweight;
denom2 -= sum2 * sum2 / tweight;
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
result = result / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
acorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The acorrelation routine calculates the weighted Pearson distance between two
rows or columns, using the absolute value of the correlation.
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
(e.g., choose b = a + c).
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.;
double sum1 = 0.;
double sum2 = 0.;
double denom1 = 0.;
double denom2 = 0.;
double tweight = 0.;
if (transpose == 0) /* Calculate the distance between two rows */ {
int i;
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term1 = data1[index1][i];
double term2 = data2[index2][i];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
else {
int i;
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term1 = data1[i][index1];
double term2 = data2[i][index2];
double w = weight[i];
sum1 += w*term1;
sum2 += w*term2;
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
tweight += w;
}
}
}
if (!tweight) return 0; /* usually due to empty clusters */
result -= sum1 * sum2 / tweight;
denom1 -= sum1 * sum1 / tweight;
denom2 -= sum2 * sum2 / tweight;
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
result = fabs(result) / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
ucorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The ucorrelation routine calculates the weighted Pearson distance between two
rows or columns, using the uncentered version of the Pearson correlation. In
the uncentered Pearson correlation, a zero mean is used for both vectors even
if the actual mean is nonzero.
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
(e.g., choose b = a + c).
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.;
double denom1 = 0.;
double denom2 = 0.;
int flag = 0;
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
* found.
*/
if (transpose == 0) /* Calculate the distance between two rows */ {
int i;
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term1 = data1[index1][i];
double term2 = data2[index2][i];
double w = weight[i];
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
flag = 1;
}
}
}
else {
int i;
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term1 = data1[i][index1];
double term2 = data2[i][index2];
double w = weight[i];
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
flag = 1;
}
}
}
if (!flag) return 0.;
if (denom1 == 0.) return 1.;
if (denom2 == 0.) return 1.;
result = result / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
uacorrelation(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The uacorrelation routine calculates the weighted Pearson distance between two
rows or columns, using the absolute value of the uncentered version of the
Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
for both vectors even if the actual mean is nonzero.
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
(e.g., choose b = a + c).
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double result = 0.;
double denom1 = 0.;
double denom2 = 0.;
int flag = 0;
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
* found.
*/
if (transpose == 0) /* Calculate the distance between two rows */ {
int i;
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
double term1 = data1[index1][i];
double term2 = data2[index2][i];
double w = weight[i];
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
flag = 1;
}
}
}
else {
int i;
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
double term1 = data1[i][index1];
double term2 = data2[i][index2];
double w = weight[i];
result += w*term1*term2;
denom1 += w*term1*term1;
denom2 += w*term2*term2;
flag = 1;
}
}
}
if (!flag) return 0.;
if (denom1 == 0.) return 1.;
if (denom2 == 0.) return 1.;
result = fabs(result) / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
spearman(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The spearman routine calculates the Spearman distance between two rows or
columns. The Spearman distance is defined as one minus the Spearman rank
correlation.
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
int i;
int m = 0;
double* rank1;
double* rank2;
double result = 0.;
double denom1 = 0.;
double denom2 = 0.;
double sum1 = 0.;
double sum2 = 0.;
double totalweight = 0.;
double* tdata1;
double* tdata2;
tdata1 = malloc(n*sizeof(double));
if (!tdata1) return 0.0; /* Memory allocation error */
tdata2 = malloc(n*sizeof(double));
if (!tdata2) /* Memory allocation error */ {
free(tdata1);
return 0.0;
}
if (transpose == 0) {
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
tdata1[m] = data1[index1][i];
tdata2[m] = data2[index2][i];
m++;
}
}
}
else {
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
tdata1[m] = data1[i][index1];
tdata2[m] = data2[i][index2];
m++;
}
}
}
if (m == 0) {
free(tdata1);
free(tdata2);
return 0;
}
rank1 = getrank(m, tdata1, weight);
free(tdata1);
if (!rank1) {
free(tdata2);
return 0.0; /* Memory allocation error */
}
rank2 = getrank(m, tdata2, weight);
free(tdata2);
if (!rank2) /* Memory allocation error */ {
free(rank1);
return 0.0;
}
for (i = 0; i < m; i++) {
const double term1 = rank1[i];
const double term2 = rank2[i];
const double w = weight[i];
sum1 += term1 * w;
sum2 += term2 * w;
result += term1 * term2 * w;
denom1 += term1 * term1 * w;
denom2 += term2 * term2 * w;
totalweight += w;
}
/* Note: denom1 and denom2 cannot be calculated directly from the number
* of elements. If two elements have the same rank, the squared sum of
* their ranks will change.
*/
free(rank1);
free(rank2);
if (!totalweight) return 0; /* usually due to empty clusters */
result -= sum1 * sum2 / totalweight;
denom1 -= sum1 * sum1 / totalweight;
denom2 -= sum2 * sum2 / totalweight;
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
result = result / sqrt(denom1*denom2);
result = 1. - result;
return result;
}
/* ********************************************************************* */
static double
kendall(int n, double** data1, double** data2, int** mask1, int** mask2,
const double weight[], int index1, int index2, int transpose)
/*
Purpose
=======
The kendall routine calculates the Kendall distance between two
rows or columns. The Kendall distance is defined as one minus Kendall's tau.
Arguments
=========
n (input) int
The number of elements in a row or column. If transpose == 0, then n is the
number of columns; otherwise, n is the number of rows.
data1 (input) double array
The data array containing the first vector.
data2 (input) double array
The data array containing the second vector.
mask1 (input) int array
This array which elements in data1 are missing. If mask1[i][j] == 0, then
data1[i][j] is missing.
mask2 (input) int array
This array which elements in data2 are missing. If mask2[i][j] == 0, then
data2[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0,
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
index1 (input) int
Index of the first row or column.
index2 (input) int
Index of the second row or column.
transpose (input) int
If transpose == 0, the distance between two rows in the matrix is calculated.
Otherwise, the distance between two columns in the matrix is calculated.
============================================================================
*/
{
double con = 0;
double dis = 0;
double exx = 0;
double exy = 0;
int flag = 0;
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
* found.
*/
double denomx;
double denomy;
double tau;
int i, j;
if (transpose == 0) {
for (i = 0; i < n; i++) {
if (mask1[index1][i] && mask2[index2][i]) {
for (j = 0; j < i; j++) {
if (mask1[index1][j] && mask2[index2][j]) {
const double x1 = data1[index1][i];
const double x2 = data1[index1][j];
const double y1 = data2[index2][i];
const double y2 = data2[index2][j];
const double w = weight[i] * weight[j];
if (x1 < x2 && y1 < y2) con += w;
else if (x1 > x2 && y1 > y2) con += w;
else if (x1 < x2 && y1 > y2) dis += w;
else if (x1 > x2 && y1 < y2) dis += w;
else if (x1 == x2 && y1 != y2) exx += w;
else if (x1 != x2 && y1 == y2) exy += w;
flag = 1;
}
}
}
}
}
else {
for (i = 0; i < n; i++) {
if (mask1[i][index1] && mask2[i][index2]) {
for (j = 0; j < i; j++) {
if (mask1[j][index1] && mask2[j][index2]) {
const double x1 = data1[i][index1];
const double x2 = data1[j][index1];
const double y1 = data2[i][index2];
const double y2 = data2[j][index2];
const double w = weight[i] * weight[j];
if (x1 < x2 && y1 < y2) con += w;
else if (x1 > x2 && y1 > y2) con += w;
else if (x1 < x2 && y1 > y2) dis += w;
else if (x1 > x2 && y1 < y2) dis += w;
else if (x1 == x2 && y1 != y2) exx += w;
else if (x1 != x2 && y1 == y2) exy += w;
flag = 1;
}
}
}
}
}
if (!flag) return 0.;
denomx = con + dis + exx;
denomy = con + dis + exy;
if (denomx == 0) return 1;
if (denomy == 0) return 1;
tau = (con-dis)/sqrt(denomx*denomy);
return 1.-tau;
}
/* ********************************************************************* */
static double(*setmetric(char dist))
(int, double**, double**, int**, int**, const double[], int, int, int)
{
switch(dist) {
case 'e': return &euclid;
case 'b': return &cityblock;
case 'c': return &correlation;
case 'a': return &acorrelation;
case 'u': return &ucorrelation;
case 'x': return &uacorrelation;
case 's': return &spearman;
case 'k': return &kendall;
default: return &euclid;
}
}
/* ********************************************************************* */
static double
uniform(void)
/*
Purpose
=======
This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
and 1.0 are excluded. This random number generator is described in:
Pierre l'Ecuyer
Efficient and Portable Combined Random Number Generators
Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749, 774.
The first time this routine is called, it initializes the random number
generator using the current time. First, the current epoch time in seconds is
used as a seed for the random number generator in the C library. The first two
random numbers generated by this generator are used to initialize the random
number generator implemented in this routine.
Arguments
=========
None.
Return value
============
A double-precison number between 0.0 and 1.0.
============================================================================
*/
{
int z;
static const int m1 = 2147483563;
static const int m2 = 2147483399;
const double scale = 1.0/m1;
static int s1 = 0;
static int s2 = 0;
if (s1 == 0 || s2 == 0) {
/* initialize */
unsigned int initseed = (unsigned int) time(0);
srand(initseed);
s1 = rand();
s2 = rand();
}
do {
int k = s1/53668;
s1 = 40014*(s1-k*53668)-k*12211;
if (s1 < 0) s1+=m1;
k = s2/52774;
s2 = 40692*(s2-k*52774)-k*3791;
if (s2 < 0) s2 += m2;
z = s1-s2;
if (z < 1) z += (m1-1);
} while (z == m1); /* To avoid returning 1.0 */
return z*scale;
}
/* ************************************************************************ */
static int
binomial(int n, double p)
/*
Purpose
=======
This routine generates a random number between 0 and n inclusive, following
the binomial distribution with probability p and n trials. The routine is
based on the BTPE algorithm, described in:
src/cluster.c view on Meta::CPAN
somassign(nrows, ncolumns, data, mask, weight, transpose,
nxgrid, nygrid, celldata, dist, clusterid);
if (lcelldata == 0) {
for (i = 0; i < nxgrid; i++)
for (j = 0; j < nygrid; j++)
free(celldata[i][j]);
for (i = 0; i < nxgrid; i++)
free(celldata[i]);
free(celldata);
}
}
/* ******************************************************************** */
double
clusterdistance(int nrows, int ncolumns, double** data, int** mask,
double weight[], int n1, int n2, int index1[], int index2[],
char dist, char method, int transpose)
/*
Purpose
=======
The clusterdistance routine calculates the distance between two clusters
containing genes or samples using the measured gene expression vectors. The
distance between clusters, given the genes/samples in each cluster, can be
defined in several ways. Several distance measures can be used.
The routine returns the distance in double precision.
If the parameter transpose is set to a nonzero value, the clusters are
interpreted as clusters of samples, otherwise as clusters of gene.
Arguments
=========
nrows (input) int
The number of rows (i.e., the number of genes) in the gene expression data
matrix.
ncolumns (input) int
The number of columns (i.e., the number of samples) in the gene expression
data matrix.
data (input) double[nrows][ncolumns]
The array containing the data of the vectors.
mask (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.
weight (input) double[ncolumns] if transpose == 0;
double[nrows] otherwise
The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.
n1 (input) int
The number of elements in the first cluster.
n2 (input) int
The number of elements in the second cluster.
index1 (input) int[n1]
Identifies which genes/samples belong to the first cluster.
index2 (input) int[n2]
Identifies which genes/samples belong to the second cluster.
dist (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
method (input) char
Defines how the distance between two clusters is defined, given which genes
belong to which cluster:
method == 'a': the distance between the arithmetic means of the two clusters
method == 'm': the distance between the medians of the two clusters
method == 's': the smallest pairwise distance between members of the two
clusters
method == 'x': the largest pairwise distance between members of the two
clusters
method == 'v': average of the pairwise distances between members of the two
clusters
transpose (input) int
If transpose is equal to zero, the distances between the rows is
calculated. Otherwise, the distances between the columns is calculated.
The former is needed when genes are being clustered; the latter is used
when samples are being clustered.
========================================================================
*/
{
/* Set the metric function as indicated by dist */
double (*metric) (int, double**, double**, int**, int**,
const double[], int, int, int) = setmetric(dist);
/* if one or both clusters are empty, return */
if (n1 < 1 || n2 < 1) return -1.0;
/* Check the indices */
if (transpose == 0) {
int i;
for (i = 0; i < n1; i++) {
int index = index1[i];
if (index < 0 || index >= nrows) return -1.0;
}
for (i = 0; i < n2; i++) {
int index = index2[i];
if (index < 0 || index >= nrows) return -1.0;
}
}
else {
int i;
for (i = 0; i < n1; i++) {
int index = index1[i];
if (index < 0 || index >= ncolumns) return -1.0;
}
for (i = 0; i < n2; i++) {
int index = index2[i];
( run in 0.566 second using v1.01-cache-2.11-cpan-39bf76dae61 )