Algorithm-Cluster

 view release on metacpan or  search on metacpan

src/cluster.c  view on Meta::CPAN

static void
randomassign(int nclusters, int nelements, int clusterid[])
/*
Purpose
=======

The randomassign routine performs an initial random clustering, needed for
k-means or k-median clustering. Elements (genes or samples) are randomly
assigned to clusters. The number of elements in each cluster is chosen
randomly, making sure that each cluster will receive at least one element.


Arguments
=========

nclusters    (input) int
The number of clusters.

nelements    (input) int
The number of elements to be clustered (i.e., the number of genes or samples
to be clustered).

clusterid    (output) int[nelements]
The cluster number to which an element was assigned.

============================================================================
*/
{
    int i, j;
    int k = 0;
    double p;
    int n = nelements-nclusters;

    /* Draw the number of elements in each cluster from a multinomial
     * distribution, reserving ncluster elements to set independently
     * in order to guarantee that none of the clusters are empty.
     */
    for (i = 0; i < nclusters-1; i++) {
        p = 1.0/(nclusters-i);
        j = binomial(n, p);
        n -= j;
        j += k+1; /* Assign at least one element to cluster i */
        for ( ; k < j; k++) clusterid[k] = i;
    }
    /* Assign the remaining elements to the last cluster */
    for ( ; k < nelements; k++) clusterid[k] = i;

    /* Create a random permutation of the cluster assignments */
    for (i = 0; i < nelements; i++) {
        j = (int) (i + (nelements-i)*uniform());
        k = clusterid[j];
        clusterid[j] = clusterid[i];
        clusterid[i] = k;
    }
}

/* ********************************************************************* */

static void
getclustermeans(int nclusters, int nrows, int ncolumns,
    double** data, int** mask, int clusterid[], double** cdata, int** cmask,
    int transpose)
/*
Purpose
=======

The getclustermeans routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the mean over all
elements for each dimension.

Arguments
=========

nclusters (input) int
The number of clusters.

nrows     (input) int
The number of rows in the gene expression data matrix, equal to the number of
genes.

ncolumns  (input) int
The number of columns in the gene expression data matrix, equal to the number
of samples.

data      (input) double[nrows][ncolumns]
The array containing the gene expression data.

mask      (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.

clusterid (output) int[nrows] if transpose == 0
                   int[ncolumns] otherwise
The cluster number to which each element belongs. If transpose == 0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of samples).

cdata     (output) double[nclusters][ncolumns] if transpose == 0
                   double[nrows][nclusters]    otherwise
On exit of getclustermeans, this array contains the cluster centroids.

cmask     (output) int[nclusters][ncolumns] if transpose == 0
                   int[nrows][nclusters]    otherwise
This array shows which data values of are missing for each centroid. If
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for a
centroid if all corresponding data values of the cluster members are missing.

transpose (input) int
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
of columns (samples) are specified.

========================================================================
*/
{
    int i, j, k;

    if (transpose == 0) {
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < ncolumns; j++) {
                cmask[i][j] = 0;
                cdata[i][j] = 0.;
            }
        }
        for (k = 0; k < nrows; k++) {
            i = clusterid[k];
            for (j = 0; j < ncolumns; j++) {
                if (mask[k][j] != 0) {
                    cdata[i][j] += data[k][j];
                    cmask[i][j]++;
                }
            }
        }
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < ncolumns; j++) {
                if (cmask[i][j]>0) {
                    cdata[i][j] /= cmask[i][j];
                    cmask[i][j] = 1;
                }
            }
        }
    }
    else {
        for (i = 0; i < nrows; i++) {
            for (j = 0; j < nclusters; j++) {
                cdata[i][j] = 0.;
                cmask[i][j] = 0;
            }
        }
        for (k = 0; k < ncolumns; k++) {
            i = clusterid[k];
            for (j = 0; j < nrows; j++) {
                if (mask[j][k] != 0) {
                    cdata[j][i] += data[j][k];
                    cmask[j][i]++;
                }
            }
        }
        for (i = 0; i < nrows; i++) {
            for (j = 0; j < nclusters; j++) {
                if (cmask[i][j]>0) {
                    cdata[i][j] /= cmask[i][j];
                    cmask[i][j] = 1;
                }
            }
        }
    }
}

/* ********************************************************************* */

static void
getclustermedians(int nclusters, int nrows, int ncolumns,
    double** data, int** mask, int clusterid[], double** cdata, int** cmask,
    int transpose, double cache[])
/*
Purpose
=======

The getclustermedians routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the median over all
elements for each dimension.

Arguments
=========

nclusters  (input) int
The number of clusters.

nrows      (input) int
The number of rows in the gene expression data matrix, equal to the number of
genes.

ncolumns   (input) int
The number of columns in the gene expression data matrix, equal to the number
of samples.

data       (input) double[nrows][ncolumns]
The array containing the gene expression data.

mask       (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.

clusterid  (output) int[nrows] if transpose == 0
                                        int[ncolumns] otherwise
The cluster number to which each element belongs. If transpose == 0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of samples).

cdata      (output) double[nclusters][ncolumns] if transpose == 0
                    double[nrows][nclusters] otherwise
On exit of getclustermedians, this array contains the cluster centroids.

cmask      (output) int[nclusters][ncolumns] if transpose == 0
                    int[nrows][nclusters] otherwise
This array shows which data values of are missing for each centroid. If
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.

transpose  (input) int
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
of columns (samples) are specified.

cache      (input) double[nrows] if transpose == 0
                   double[ncolumns] otherwise
This array should be allocated before calling getclustermedians; its contents
on input is not relevant. This array is used as a temporary storage space when
calculating the medians.

========================================================================
*/
{
    int i, j, k;

    if (transpose == 0) {
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < ncolumns; j++) {
                int count = 0;
                for (k = 0; k < nrows; k++) {
                    if (i == clusterid[k] && mask[k][j]) {
                        cache[count] = data[k][j];
                        count++;
                    }
                }
                if (count>0) {
                    cdata[i][j] = median(count, cache);
                    cmask[i][j] = 1;
                }
                else {
                    cdata[i][j] = 0.;
                    cmask[i][j] = 0;
                }
            }
        }
    }
    else {
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < nrows; j++) {
                int count = 0;
                for (k = 0; k < ncolumns; k++) {
                    if (i == clusterid[k] && mask[j][k]) {
                        cache[count] = data[j][k];
                        count++;
                    }
                }
                if (count>0) {
                    cdata[j][i] = median(count, cache);
                    cmask[j][i] = 1;
                }
                else {
                    cdata[j][i] = 0.;
                    cmask[j][i] = 0;
                }
            }
        }
    }
}

/* ********************************************************************* */

int
getclustercentroids(int nclusters, int nrows, int ncolumns,
    double** data, int** mask, int clusterid[], double** cdata, int** cmask,
    int transpose, char method)
/*
Purpose
=======

The getclustercentroids routine calculates the cluster centroids, given to
which cluster each element belongs. Depending on the argument method, the
centroid is defined as either the mean or the median for each dimension over
all elements belonging to a cluster.

Arguments
=========

nclusters  (input) int
The number of clusters.

nrows      (input) int
The number of rows in the gene expression data matrix, equal to the number of
genes.

ncolumns   (input) int
The number of columns in the gene expression data matrix, equal to the number
of samples.

data       (input) double[nrows][ncolumns]
The array containing the gene expression data.

mask       (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.

clusterid  (output) int[nrows] if transpose == 0
                                        int[ncolumns] otherwise
The cluster number to which each element belongs. If transpose == 0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of samples).

cdata      (output) double[nclusters][ncolumns] if transpose == 0
                    double[nrows][nclusters] otherwise
On exit of getclustercentroids, this array contains the cluster centroids.

cmask      (output) int[nclusters][ncolumns] if transpose == 0
                    int[nrows][nclusters] otherwise
This array shows which data values of are missing for each centroid. If
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.

transpose  (input) int
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
of columns (samples) are specified.

method     (input) char
For method == 'a', the centroid is defined as the mean over all elements
belonging to a cluster for each dimension.
For method == 'm', the centroid is defined as the median over all elements
belonging to a cluster for each dimension.

Return value
============

The function returns an integer to indicate success or failure. If a
memory error occurs, or if method is not 'm' or 'a', getclustercentroids
returns 0. If successful, getclustercentroids returns 1.
========================================================================
*/
{
    switch(method) {
        case 'm': {
            const int nelements = (transpose == 0) ? nrows : ncolumns;
            double* cache = malloc(nelements*sizeof(double));
            if (!cache) return 0;
            getclustermedians(nclusters, nrows, ncolumns, data, mask,
                              clusterid, cdata, cmask, transpose, cache);
            free(cache);
            return 1;
        }
        case 'a': {
            getclustermeans(nclusters, nrows, ncolumns, data, mask,
                            clusterid, cdata, cmask, transpose);
            return 1;
        }
    }
    return 0;
}

/* ********************************************************************* */

void
getclustermedoids(int nclusters, int nelements, double** distance,
    int clusterid[], int centroids[], double errors[])
/*
Purpose
=======

The getclustermedoids routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the element with the
smallest sum of distances to the other elements.

Arguments
=========

nclusters    (input) int
The number of clusters.

nelements    (input) int
The total number of elements.

distmatrix   (input) double array, ragged
    (number of rows is nelements, number of columns is equal to the row number)
The distance matrix. To save space, the distance matrix is given in the
form of a ragged array. The distance matrix is symmetric and has zeros
on the diagonal. See distancematrix for a description of the content.

clusterid    (output) int[nelements]
The cluster number to which each element belongs.

centroid     (output) int[nclusters]
The index of the element that functions as the centroid for each cluster.

errors       (output) double[nclusters]
The within-cluster sum of distances between the items and the cluster
centroid.

========================================================================
*/
{
    int i, j, k;

    for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
    for (i = 0; i < nelements; i++) {
        double d = 0.0;
        j = clusterid[i];
        for (k = 0; k < nelements; k++) {
            if (i == k || clusterid[k]!=j) continue;
            d += (i < k ? distance[k][i] : distance[i][k]);
            if (d > errors[j]) break;
        }
        if (d < errors[j]) {
            errors[j] = d;
            centroids[j] = i;
        }
    }
}

/* ********************************************************************* */

static int
kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
    double weight[], int transpose, int npass, char dist,
    double** cdata, int** cmask, int clusterid[], double* error,
    int tclusterid[], int counts[], int mapping[])
{
    int i, j, k;
    const int nelements = (transpose == 0) ? nrows : ncolumns;
    const int ndata = (transpose == 0) ? ncolumns : nrows;
    int ifound = 1;
    int ipass = 0;
    /* Set the metric function as indicated by dist */
    double (*metric) (int, double**, double**, int**, int**,
                      const double[], int, int, int) = setmetric(dist);

    /* Save the clustering solution periodically and check if it reappears */
    int* saved = malloc(nelements*sizeof(int));
    if (saved == NULL) return -1;

    *error = DBL_MAX;

    do {
        double total = DBL_MAX;
        int counter = 0;
        int period = 10;

        /* Perform the EM algorithm.
         * First, randomly assign elements to clusters. */
        if (npass != 0) randomassign(nclusters, nelements, tclusterid);

        for (i = 0; i < nclusters; i++) counts[i] = 0;
        for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;

        /* Start the loop */
        while (1) {
            double previous = total;
            total = 0.0;

            if (counter % period == 0) {
                /* Save the current cluster assignments */
                for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
                if (period < INT_MAX / 2) period *= 2;
            }
            counter++;

            /* Find the center */
            getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
                            cdata, cmask, transpose);

            for (i = 0; i < nelements; i++) {
                double distance;
                /* Calculate the distances */
                k = tclusterid[i];
                if (counts[k] == 1) continue;
                /* No reassignment if that would lead to an empty cluster */
                /* Treat the present cluster as a special case */
                distance = metric(ndata, data, cdata, mask, cmask, weight,
                                  i, k, transpose);
                for (j = 0; j < nclusters; j++) {
                    double tdistance;
                    if (j == k) continue;
                    tdistance = metric(ndata, data, cdata, mask, cmask, weight,
                                       i, j, transpose);
                    if (tdistance < distance) {
                        distance = tdistance;
                        counts[tclusterid[i]]--;
                        tclusterid[i] = j;
                        counts[j]++;
                    }
                }
                total += distance;
            }
            if (total >= previous) break;
            /* total >= previous is FALSE on some machines even if total and
             * previous are bitwise identical. */
            for (i = 0; i < nelements; i++)
                if (saved[i]!=tclusterid[i]) break;
            if (i == nelements)
                break; /* Identical solution found; break out of this loop */
        }

        if (npass <= 1) {
            *error = total;
            break;
        }

        for (i = 0; i < nclusters; i++) mapping[i] = -1;
        for (i = 0; i < nelements; i++) {
            j = tclusterid[i];
            k = clusterid[i];
            if (mapping[k] == -1) mapping[k] = j;
            else if (mapping[k] != j) {
                if (total < *error) {
                    ifound = 1;
                    *error = total;
                    for (j = 0; j < nelements; j++)
                        clusterid[j] = tclusterid[j];
                }
                break;
            }
        }
        if (i == nelements) ifound++; /* break statement not encountered */
    } while (++ipass < npass);

    free(saved);
    return ifound;
}

/* ---------------------------------------------------------------------- */

static int
kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
    double weight[], int transpose, int npass, char dist,
    double** cdata, int** cmask, int clusterid[], double* error,
    int tclusterid[], int counts[], int mapping[], double cache[])
{
    int i, j, k;
    const int nelements = (transpose == 0) ? nrows : ncolumns;
    const int ndata = (transpose == 0) ? ncolumns : nrows;
    int ifound = 1;
    int ipass = 0;
    int* saved;
    /* Set the metric function as indicated by dist */
    double (*metric) (int, double**, double**, int**, int**,
                      const double[], int, int, int) = setmetric(dist);

    /* Save the clustering solution periodically and check if it reappears */
    saved = malloc(nelements*sizeof(int));
    if (saved == NULL) return -1;

    *error = DBL_MAX;

    do {
        double total = DBL_MAX;
        int counter = 0;
        int period = 10;

        /* Perform the EM algorithm.
         * First, randomly assign elements to clusters. */
        if (npass != 0) randomassign(nclusters, nelements, tclusterid);

        for (i = 0; i < nclusters; i++) counts[i] = 0;
        for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;

        /* Start the loop */
        while (1) {
            double previous = total;
            total = 0.0;

            if (counter % period == 0) {
                /* Save the current cluster assignments */
                for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
                if (period < INT_MAX / 2) period *= 2;
            }
            counter++;

            /* Find the center */
            getclustermedians(nclusters, nrows, ncolumns, data, mask,
                              tclusterid, cdata, cmask, transpose, cache);

            for (i = 0; i < nelements; i++) {
                /* Calculate the distances */
                double distance;
                k = tclusterid[i];
                if (counts[k] == 1) continue;
                /* No reassignment if that would lead to an empty cluster */
                /* Treat the present cluster as a special case */
                distance = metric(ndata, data, cdata, mask, cmask, weight,
                                  i, k, transpose);
                for (j = 0; j < nclusters; j++) {
                    double tdistance;
                    if (j == k) continue;
                    tdistance = metric(ndata, data, cdata, mask, cmask, weight,
                                       i, j, transpose);
                    if (tdistance < distance) {
                        distance = tdistance;
                        counts[tclusterid[i]]--;
                        tclusterid[i] = j;
                        counts[j]++;
                    }
                }
                total += distance;
            }
            if (total >= previous) break;
            /* total >= previous is FALSE on some machines even if total and
             * previous are bitwise identical. */
            for (i = 0; i < nelements; i++)
                if (saved[i]!=tclusterid[i]) break;
            if (i == nelements)
                break; /* Identical solution found; break out of this loop */
        }

        if (npass <= 1) {
            *error = total;
            break;
        }

        for (i = 0; i < nclusters; i++) mapping[i] = -1;
        for (i = 0; i < nelements; i++) {
            j = tclusterid[i];
            k = clusterid[i];
            if (mapping[k] == -1) mapping[k] = j;
            else if (mapping[k] != j) {
                if (total < *error) {
                    ifound = 1;
                    *error = total;
                    for (j = 0; j < nelements; j++)
                        clusterid[j] = tclusterid[j];
                }
                break;
            }
        }
        if (i == nelements) ifound++; /* break statement not encountered */
    } while (++ipass < npass);

    free(saved);
    return ifound;
}

/* ********************************************************************* */

void
kcluster(int nclusters, int nrows, int ncolumns, double** data, int** mask,
    double weight[], int transpose, int npass, char method, char dist,
    int clusterid[], double* error, int* ifound)
/*
Purpose
=======

The kcluster routine performs k-means or k-median clustering on a given set of
elements, using the specified distance measure. The number of clusters is given
by the user. Multiple passes are being made to find the optimal clustering
solution, each time starting from a different initial clustering.

src/cluster.c  view on Meta::CPAN

The weights that are used to calculate the distance. This is equivalent
to including the jth data point weight[j] times in the calculation. The
weights can be non-integer.

transpose  (input) int
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns
of the matrix are clustered.

npass      (input) int
The number of times clustering is performed. Clustering is performed npass
times, each time starting from a different (random) initial assignment of
genes to clusters. The clustering solution with the lowest within-cluster sum
of distances is chosen.
If npass == 0, then the clustering algorithm will be run once, where the
initial assignment of elements to clusters is taken from the clusterid array.

method     (input) char
Defines whether the arithmetic mean (method == 'a') or the median
(method == 'm') is used to calculate the cluster center.

dist       (input) char
Defines which distance measure is used, as given by the table:
dist == 'e': Euclidean distance
dist == 'b': City-block distance
dist == 'c': correlation
dist == 'a': absolute value of the correlation
dist == 'u': uncentered correlation
dist == 'x': absolute uncentered correlation
dist == 's': Spearman's rank correlation
dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.

clusterid  (output; input) int[nrows] if transpose == 0
                           int[ncolumns] otherwise
The cluster number to which a gene or microarray was assigned. If npass == 0,
then on input clusterid contains the initial clustering assignment from which
the clustering algorithm starts. On output, it contains the clustering solution
that was found.

error      (output) double*
The sum of distances to the cluster center of each item in the optimal k-means
clustering solution that was found.

ifound     (output) int*
The number of times the optimal clustering solution was
found. The value of ifound is at least 1; its maximum value is npass. If the
number of clusters is larger than the number of elements being clustered,
*ifound is set to 0 as an error code. If a memory allocation error occurs,
*ifound is set to -1.

========================================================================
*/
{
    const int nelements = (transpose == 0) ? nrows : ncolumns;
    const int ndata = (transpose == 0) ? ncolumns : nrows;

    int i;
    int ok;
    int* tclusterid;
    int* mapping = NULL;
    double** cdata;
    int** cmask;
    int* counts;

    if (nelements < nclusters) {
        *ifound = 0;
        return;
    }
    /* More clusters asked for than elements available */

    *ifound = -1;

    /* This will contain the number of elements in each cluster, which is
     * needed to check for empty clusters. */
    counts = malloc(nclusters*sizeof(int));
    if (!counts) return;

    /* Find out if the user specified an initial clustering */
    if (npass <= 1) tclusterid = clusterid;
    else {
        tclusterid = malloc(nelements*sizeof(int));
        if (!tclusterid) {
            free(counts);
            return;
        }
        mapping = malloc(nclusters*sizeof(int));
        if (!mapping) {
            free(counts);
            free(tclusterid);
            return;
        }
        for (i = 0; i < nelements; i++) clusterid[i] = 0;
    }

    /* Allocate space to store the centroid data */
    if (transpose == 0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
    else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
    if (!ok) {
        free(counts);
        if (npass>1) {
            free(tclusterid);
            free(mapping);
        }
        return;
    }

    if (method == 'm') {
        double* cache = malloc(nelements*sizeof(double));
        if (cache) {
            *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
                               transpose, npass, dist, cdata, cmask, clusterid,
                               error, tclusterid, counts, mapping, cache);
            free(cache);
        }
    }
    else
        *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
                         transpose, npass, dist, cdata, cmask, clusterid,
                         error, tclusterid, counts, mapping);

    /* Deallocate temporarily used space */
    if (npass > 1) {
        free(mapping);
        free(tclusterid);
    }

    if (transpose == 0) freedatamask(nclusters, cdata, cmask);
    else freedatamask(ndata, cdata, cmask);

    free(counts);
}

/* *********************************************************************** */

void
kmedoids(int nclusters, int nelements, double** distmatrix, int npass,
    int clusterid[], double* error, int* ifound)
/*
Purpose
=======

The kmedoids routine performs k-medoids clustering on a given set of elements,
using the distance matrix and the number of clusters passed by the user.
Multiple passes are being made to find the optimal clustering solution, each
time starting from a different initial clustering.


Arguments
=========

nclusters  (input) int
The number of clusters to be found.

nelements  (input) int
The number of elements to be clustered.

distmatrix (input) double array, ragged
    (number of rows is nelements, number of columns is equal to the row number)
The distance matrix. To save space, the distance matrix is given in the
form of a ragged array. The distance matrix is symmetric and has zeros
on the diagonal. See distancematrix for a description of the content.

npass      (input) int
The number of times clustering is performed. Clustering is performed npass
times, each time starting from a different (random) initial assignment of genes
to clusters. The clustering solution with the lowest within-cluster sum of
distances is chosen.
If npass == 0, then the clustering algorithm will be run once, where the
initial assignment of elements to clusters is taken from the clusterid array.

clusterid  (output; input) int[nelements]
On input, if npass == 0, then clusterid contains the initial clustering
assignment from which the clustering algorithm starts; all numbers in clusterid
should be between zero and nelements-1 inclusive. If npass != 0, clusterid is
ignored on input.
On output, clusterid contains the clustering solution that was found: clusterid
contains the number of the cluster to which each item was assigned. On output,
the number of a cluster is defined as the item number of the centroid of the
cluster.

error      (output) double
The sum of distances to the cluster center of each item in the optimal
k-medoids clustering solution that was found.

ifound     (output) int
If kmedoids is successful: the number of times the optimal clustering solution
was found. The value of ifound is at least 1; its maximum value is npass.
If the user requested more clusters than elements available, ifound is set

src/cluster.c  view on Meta::CPAN

dist == 'k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.

method     (input) char
Defines how the distance between two clusters is defined, given which genes
belong to which cluster:
method == 'a': the distance between the arithmetic means of the two clusters
method == 'm': the distance between the medians of the two clusters
method == 's': the smallest pairwise distance between members of the two
               clusters
method == 'x': the largest pairwise distance between members of the two
               clusters
method == 'v': average of the pairwise distances between members of the two
               clusters

transpose  (input) int
If transpose is equal to zero, the distances between the rows is
calculated. Otherwise, the distances between the columns is calculated.
The former is needed when genes are being clustered; the latter is used
when samples are being clustered.

========================================================================
*/
{
    /* Set the metric function as indicated by dist */
    double (*metric) (int, double**, double**, int**, int**,
                      const double[], int, int, int) = setmetric(dist);

    /* if one or both clusters are empty, return */
    if (n1 < 1 || n2 < 1) return -1.0;
    /* Check the indices */
    if (transpose == 0) {
        int i;
        for (i = 0; i < n1; i++) {
            int index = index1[i];
            if (index < 0 || index >= nrows) return -1.0;
        }
        for (i = 0; i < n2; i++) {
            int index = index2[i];
            if (index < 0 || index >= nrows) return -1.0;
        }
    }
    else {
        int i;
        for (i = 0; i < n1; i++) {
            int index = index1[i];
            if (index < 0 || index >= ncolumns) return -1.0;
        }
        for (i = 0; i < n2; i++) {
            int index = index2[i];
            if (index < 0 || index >= ncolumns) return -1.0;
        }
    }

    switch (method) {
        case 'a': {
            /* Find the center */
            int i, j, k;
            if (transpose == 0) {
                double distance;
                double* cdata[2];
                int* cmask[2];
                int* count[2];
                count[0] = calloc(ncolumns, sizeof(int));
                count[1] = calloc(ncolumns, sizeof(int));
                cdata[0] = calloc(ncolumns, sizeof(double));
                cdata[1] = calloc(ncolumns, sizeof(double));
                cmask[0] = malloc(ncolumns*sizeof(int));
                cmask[1] = malloc(ncolumns*sizeof(int));
                for (i = 0; i < n1; i++) {
                    k = index1[i];
                    for (j = 0; j < ncolumns; j++)
                        if (mask[k][j] != 0) {
                            cdata[0][j] = cdata[0][j] + data[k][j];
                            count[0][j] = count[0][j] + 1;
                        }
                }
                for (i = 0; i < n2; i++) {
                    k = index2[i];
                    for (j = 0; j < ncolumns; j++)
                        if (mask[k][j] != 0) {
                            cdata[1][j] = cdata[1][j] + data[k][j];
                            count[1][j] = count[1][j] + 1;
                        }
                }
                for (i = 0; i < 2; i++)
                    for (j = 0; j < ncolumns; j++) {
                        if (count[i][j]>0) {
                            cdata[i][j] = cdata[i][j] / count[i][j];
                            cmask[i][j] = 1;
                        }
                        else
                            cmask[i][j] = 0;
                    }
                distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 0);
                for (i = 0; i < 2; i++) {
                    free(cdata[i]);
                    free(cmask[i]);
                    free(count[i]);
                }
                return distance;
            }
            else {
                double distance;
                int** count = malloc(nrows*sizeof(int*));
                double** cdata = malloc(nrows*sizeof(double*));
                int** cmask = malloc(nrows*sizeof(int*));
                for (i = 0; i < nrows; i++) {
                    count[i] = calloc(2, sizeof(int));
                    cdata[i] = calloc(2, sizeof(double));
                    cmask[i] = malloc(2*sizeof(int));
                }
                for (i = 0; i < n1; i++) {
                    k = index1[i];
                    for (j = 0; j < nrows; j++) {
                        if (mask[j][k] != 0) {
                            cdata[j][0] += data[j][k];
                            count[j][0]++;
                        }
                    }
                }
                for (i = 0; i < n2; i++) {
                    k = index2[i];
                    for (j = 0; j < nrows; j++) {
                        if (mask[j][k] != 0) {
                            cdata[j][1] += data[j][k];
                            count[j][1]++;
                        }
                    }
                }
                for (i = 0; i < nrows; i++)
                    for (j = 0; j < 2; j++)
                        if (count[i][j]>0) {
                            cdata[i][j] /= count[i][j];
                            cmask[i][j] = 1;
                        }
                        else
                            cmask[i][j] = 0;
                distance = metric(nrows, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 1);
                for (i = 0; i < nrows; i++) {
                    free(count[i]);
                    free(cdata[i]);
                    free(cmask[i]);
                }
                free(count);
                free(cdata);
                free(cmask);
                return distance;
            }
        }
        case 'm': {
            int i, j, k;
            if (transpose == 0) {
                double distance;
                double* temp = malloc(nrows*sizeof(double));
                double* cdata[2];
                int* cmask[2];
                for (i = 0; i < 2; i++) {
                    cdata[i] = malloc(ncolumns*sizeof(double));
                    cmask[i] = malloc(ncolumns*sizeof(int));
                }
                for (j = 0; j < ncolumns; j++) {
                    int count = 0;
                    for (k = 0; k < n1; k++) {
                        i = index1[k];
                        if (mask[i][j]) {
                            temp[count] = data[i][j];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[0][j] = median(count, temp);
                        cmask[0][j] = 1;
                    }
                    else {
                        cdata[0][j] = 0.;
                        cmask[0][j] = 0;
                    }
                }
                for (j = 0; j < ncolumns; j++) {
                    int count = 0;
                    for (k = 0; k < n2; k++) {
                        i = index2[k];
                        if (mask[i][j]) {
                            temp[count] = data[i][j];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[1][j] = median(count, temp);
                        cmask[1][j] = 1;
                    }
                    else {
                        cdata[1][j] = 0.;
                        cmask[1][j] = 0;
                    }
                }
                distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 0);
                for (i = 0; i < 2; i++) {
                    free(cdata[i]);
                    free(cmask[i]);
                }
                free(temp);
                return distance;
            }
            else {
                double distance;
                double* temp = malloc(ncolumns*sizeof(double));
                double** cdata = malloc(nrows*sizeof(double*));
                int** cmask = malloc(nrows*sizeof(int*));
                for (i = 0; i < nrows; i++) {
                    cdata[i] = malloc(2*sizeof(double));
                    cmask[i] = malloc(2*sizeof(int));
                }
                for (j = 0; j < nrows; j++) {
                    int count = 0;
                    for (k = 0; k < n1; k++) {
                        i = index1[k];
                        if (mask[j][i]) {
                            temp[count] = data[j][i];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[j][0] = median(count, temp);
                        cmask[j][0] = 1;
                    }
                    else {
                        cdata[j][0] = 0.;
                        cmask[j][0] = 0;
                    }
                }
                for (j = 0; j < nrows; j++) {
                    int count = 0;
                    for (k = 0; k < n2; k++) {
                        i = index2[k];
                        if (mask[j][i]) {
                            temp[count] = data[j][i];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[j][1] = median(count, temp);
                        cmask[j][1] = 1;
                    }
                    else {
                        cdata[j][1] = 0.;
                        cmask[j][1] = 0;
                    }
                }
                distance = metric(nrows, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 1);
                for (i = 0; i < nrows; i++) {
                    free(cdata[i]);
                    free(cmask[i]);
                }
                free(cdata);
                free(cmask);
                free(temp);
                return distance;
            }
        }
        case 's': {
            int i1, i2, j1, j2;
            const int n = (transpose == 0) ? ncolumns : nrows;
            double mindistance = DBL_MAX;
            for (i1 = 0; i1 < n1; i1++)
                for (i2 = 0; i2 < n2; i2++) {
                    double distance;
                    j1 = index1[i1];
                    j2 = index2[i2];
                    distance = metric(n, data, data, mask, mask, weight,
                                      j1, j2, transpose);
                    if (distance < mindistance) mindistance = distance;
                }
            return mindistance;
        }
        case 'x': {
            int i1, i2, j1, j2;
            const int n = (transpose == 0) ? ncolumns : nrows;
            double maxdistance = 0;
            for (i1 = 0; i1 < n1; i1++)
                for (i2 = 0; i2 < n2; i2++) {
                    double distance;
                    j1 = index1[i1];
                    j2 = index2[i2];
                    distance = metric(n, data, data, mask, mask, weight,
                                      j1, j2, transpose);
                    if (distance > maxdistance) maxdistance = distance;
                }
            return maxdistance;
        }
        case 'v': {
            int i1, i2, j1, j2;
            const int n = (transpose == 0) ? ncolumns : nrows;
            double distance = 0;
            for (i1 = 0; i1 < n1; i1++)
                for (i2 = 0; i2 < n2; i2++) {
                    j1 = index1[i1];
                    j2 = index2[i2];
                    distance += metric(n, data, data, mask, mask, weight,
                                       j1, j2, transpose);
                }
            distance /= (n1*n2);
            return distance;
        }
    }
    /* Never get here */
    return -2.0;
}



( run in 0.675 second using v1.01-cache-2.11-cpan-6b5c3043376 )