Algorithm-Cluster

 view release on metacpan or  search on metacpan

perl/Cluster.xs  view on Meta::CPAN

    int      nclusters;
    int      nrows;
    int      ncols;
    SV *     data_ref;
    SV *     mask_ref;
    SV *     clusterid_ref;
    int      transpose;
    char *   method;

    PREINIT:
    SV  *    cdata_ref;
    SV  *    cmask_ref;
    int     * clusterid;

    double ** matrix;
    int    ** mask;

    double ** cdata;
    int    ** cmask;
    int       cnrows = 0; /* Initialize to make the compiler shut up */
    int       cncols = 0; /* Initialize to make the compiler shut up */

    int i;
    int ok;

    PPCODE:

    /* ------------------------

perl/Cluster.xs  view on Meta::CPAN

                data_ref,   &matrix,
                mask_ref,   &mask,  
                nrows,      ncols);
    if (!ok) {
        free(clusterid);
            croak("failed to read input data for _clustercentroids\n");
    }


    /* ------------------------
     * Create the output variables cdata and cmask.
     */
    i = 0;
    cdata = malloc(cnrows * sizeof(double*));
    cmask = malloc(cnrows * sizeof(int*));
    if (cdata && cmask) {
        for ( ; i < cnrows; i++) {
            cdata[i] = malloc(cncols*sizeof(double));
            cmask[i] = malloc(cncols*sizeof(int));
            if (!cdata[i] || !cmask[i]) break;
        }
    }
    if (i < cnrows)
    {
        if (cdata[i]) free(cdata[i]);
        if (cmask[i]) free(cmask[i]);
        while (--i >= 0) {
            free(cdata[i]);
            free(cmask[i]);
        }
        if (cdata) free(cdata);
        if (cmask) free(cmask);
        free(clusterid);
        free_matrix_int(mask,     nrows);
        free_matrix_dbl(matrix,   nrows);
        croak("memory allocation failure in _clustercentroids\n");
    }

    /* ------------------------
     * Run the library function
     */
    ok = getclustercentroids(
               nclusters, nrows, ncols,
               matrix, mask, clusterid,
               cdata, cmask, transpose, method[0]);

    if (ok) {
            /* ------------------------
             * Convert generated C matrices to Perl matrices
             */
            cdata_ref = matrix_c2perl_dbl(aTHX_ cdata, cnrows, cncols);
            cmask_ref = matrix_c2perl_int(aTHX_ cmask, cnrows, cncols);

            /* ------------------------
             * Push the new Perl matrices onto the return stack
             */
            XPUSHs(sv_2mortal( cdata_ref   ));
            XPUSHs(sv_2mortal( cmask_ref   ));
    }

    /* ------------------------
     * Free what we've malloc'ed 
     */
    free_matrix_int(mask,     nrows);
    free_matrix_dbl(matrix,   nrows);
    free_matrix_int(cmask,    cnrows);
    free_matrix_dbl(cdata,    cnrows);
    free(clusterid);

    if (!ok) {
        croak("memory allocation failure in _clustercentroids\n");
    }
    /* Finished _clustercentroids() */

void
_distancematrix(nrows,ncols,data_ref,mask_ref,weight_ref,transpose,dist)
    int      nrows;

src/cluster.c  view on Meta::CPAN

        k = clusterid[j];
        clusterid[j] = clusterid[i];
        clusterid[i] = k;
    }
}

/* ********************************************************************* */

static void
getclustermeans(int nclusters, int nrows, int ncolumns,
    double** data, int** mask, int clusterid[], double** cdata, int** cmask,
    int transpose)
/*
Purpose
=======

The getclustermeans routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the mean over all
elements for each dimension.

Arguments

src/cluster.c  view on Meta::CPAN

mask      (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.

clusterid (output) int[nrows] if transpose == 0
                   int[ncolumns] otherwise
The cluster number to which each element belongs. If transpose == 0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of samples).

cdata     (output) double[nclusters][ncolumns] if transpose == 0
                   double[nrows][nclusters]    otherwise
On exit of getclustermeans, this array contains the cluster centroids.

cmask     (output) int[nclusters][ncolumns] if transpose == 0
                   int[nrows][nclusters]    otherwise
This array shows which data values of are missing for each centroid. If
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for a
centroid if all corresponding data values of the cluster members are missing.

transpose (input) int
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
of columns (samples) are specified.

========================================================================
*/
{
    int i, j, k;

    if (transpose == 0) {
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < ncolumns; j++) {
                cmask[i][j] = 0;
                cdata[i][j] = 0.;
            }
        }
        for (k = 0; k < nrows; k++) {
            i = clusterid[k];
            for (j = 0; j < ncolumns; j++) {
                if (mask[k][j] != 0) {
                    cdata[i][j] += data[k][j];
                    cmask[i][j]++;
                }
            }
        }
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < ncolumns; j++) {
                if (cmask[i][j]>0) {
                    cdata[i][j] /= cmask[i][j];
                    cmask[i][j] = 1;
                }
            }
        }
    }
    else {
        for (i = 0; i < nrows; i++) {
            for (j = 0; j < nclusters; j++) {
                cdata[i][j] = 0.;
                cmask[i][j] = 0;
            }
        }
        for (k = 0; k < ncolumns; k++) {
            i = clusterid[k];
            for (j = 0; j < nrows; j++) {
                if (mask[j][k] != 0) {
                    cdata[j][i] += data[j][k];
                    cmask[j][i]++;
                }
            }
        }
        for (i = 0; i < nrows; i++) {
            for (j = 0; j < nclusters; j++) {
                if (cmask[i][j]>0) {
                    cdata[i][j] /= cmask[i][j];
                    cmask[i][j] = 1;
                }
            }
        }
    }
}

/* ********************************************************************* */

static void
getclustermedians(int nclusters, int nrows, int ncolumns,
    double** data, int** mask, int clusterid[], double** cdata, int** cmask,
    int transpose, double cache[])
/*
Purpose
=======

The getclustermedians routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the median over all
elements for each dimension.

Arguments

src/cluster.c  view on Meta::CPAN

mask       (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.

clusterid  (output) int[nrows] if transpose == 0
                                        int[ncolumns] otherwise
The cluster number to which each element belongs. If transpose == 0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of samples).

cdata      (output) double[nclusters][ncolumns] if transpose == 0
                    double[nrows][nclusters] otherwise
On exit of getclustermedians, this array contains the cluster centroids.

cmask      (output) int[nclusters][ncolumns] if transpose == 0
                    int[nrows][nclusters] otherwise
This array shows which data values of are missing for each centroid. If
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.

transpose  (input) int
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
of columns (samples) are specified.

cache      (input) double[nrows] if transpose == 0
                   double[ncolumns] otherwise
This array should be allocated before calling getclustermedians; its contents
on input is not relevant. This array is used as a temporary storage space when

src/cluster.c  view on Meta::CPAN

        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < ncolumns; j++) {
                int count = 0;
                for (k = 0; k < nrows; k++) {
                    if (i == clusterid[k] && mask[k][j]) {
                        cache[count] = data[k][j];
                        count++;
                    }
                }
                if (count>0) {
                    cdata[i][j] = median(count, cache);
                    cmask[i][j] = 1;
                }
                else {
                    cdata[i][j] = 0.;
                    cmask[i][j] = 0;
                }
            }
        }
    }
    else {
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < nrows; j++) {
                int count = 0;
                for (k = 0; k < ncolumns; k++) {
                    if (i == clusterid[k] && mask[j][k]) {
                        cache[count] = data[j][k];
                        count++;
                    }
                }
                if (count>0) {
                    cdata[j][i] = median(count, cache);
                    cmask[j][i] = 1;
                }
                else {
                    cdata[j][i] = 0.;
                    cmask[j][i] = 0;
                }
            }
        }
    }
}

/* ********************************************************************* */

int
getclustercentroids(int nclusters, int nrows, int ncolumns,
    double** data, int** mask, int clusterid[], double** cdata, int** cmask,
    int transpose, char method)
/*
Purpose
=======

The getclustercentroids routine calculates the cluster centroids, given to
which cluster each element belongs. Depending on the argument method, the
centroid is defined as either the mean or the median for each dimension over
all elements belonging to a cluster.

src/cluster.c  view on Meta::CPAN

mask       (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j] == 0, then
data[i][j] is missing.

clusterid  (output) int[nrows] if transpose == 0
                                        int[ncolumns] otherwise
The cluster number to which each element belongs. If transpose == 0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of samples).

cdata      (output) double[nclusters][ncolumns] if transpose == 0
                    double[nrows][nclusters] otherwise
On exit of getclustercentroids, this array contains the cluster centroids.

cmask      (output) int[nclusters][ncolumns] if transpose == 0
                    int[nrows][nclusters] otherwise
This array shows which data values of are missing for each centroid. If
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.

transpose  (input) int
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters
of columns (samples) are specified.

method     (input) char
For method == 'a', the centroid is defined as the mean over all elements
belonging to a cluster for each dimension.
For method == 'm', the centroid is defined as the median over all elements

src/cluster.c  view on Meta::CPAN

returns 0. If successful, getclustercentroids returns 1.
========================================================================
*/
{
    switch(method) {
        case 'm': {
            const int nelements = (transpose == 0) ? nrows : ncolumns;
            double* cache = malloc(nelements*sizeof(double));
            if (!cache) return 0;
            getclustermedians(nclusters, nrows, ncolumns, data, mask,
                              clusterid, cdata, cmask, transpose, cache);
            free(cache);
            return 1;
        }
        case 'a': {
            getclustermeans(nclusters, nrows, ncolumns, data, mask,
                            clusterid, cdata, cmask, transpose);
            return 1;
        }
    }
    return 0;
}

/* ********************************************************************* */

void
getclustermedoids(int nclusters, int nelements, double** distance,

src/cluster.c  view on Meta::CPAN

            centroids[j] = i;
        }
    }
}

/* ********************************************************************* */

static int
kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
    double weight[], int transpose, int npass, char dist,
    double** cdata, int** cmask, int clusterid[], double* error,
    int tclusterid[], int counts[], int mapping[])
{
    int i, j, k;
    const int nelements = (transpose == 0) ? nrows : ncolumns;
    const int ndata = (transpose == 0) ? ncolumns : nrows;
    int ifound = 1;
    int ipass = 0;
    /* Set the metric function as indicated by dist */
    double (*metric) (int, double**, double**, int**, int**,
                      const double[], int, int, int) = setmetric(dist);

src/cluster.c  view on Meta::CPAN


            if (counter % period == 0) {
                /* Save the current cluster assignments */
                for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
                if (period < INT_MAX / 2) period *= 2;
            }
            counter++;

            /* Find the center */
            getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
                            cdata, cmask, transpose);

            for (i = 0; i < nelements; i++) {
                double distance;
                /* Calculate the distances */
                k = tclusterid[i];
                if (counts[k] == 1) continue;
                /* No reassignment if that would lead to an empty cluster */
                /* Treat the present cluster as a special case */
                distance = metric(ndata, data, cdata, mask, cmask, weight,
                                  i, k, transpose);
                for (j = 0; j < nclusters; j++) {
                    double tdistance;
                    if (j == k) continue;
                    tdistance = metric(ndata, data, cdata, mask, cmask, weight,
                                       i, j, transpose);
                    if (tdistance < distance) {
                        distance = tdistance;
                        counts[tclusterid[i]]--;
                        tclusterid[i] = j;
                        counts[j]++;
                    }
                }
                total += distance;
            }

src/cluster.c  view on Meta::CPAN


    free(saved);
    return ifound;
}

/* ---------------------------------------------------------------------- */

static int
kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
    double weight[], int transpose, int npass, char dist,
    double** cdata, int** cmask, int clusterid[], double* error,
    int tclusterid[], int counts[], int mapping[], double cache[])
{
    int i, j, k;
    const int nelements = (transpose == 0) ? nrows : ncolumns;
    const int ndata = (transpose == 0) ? ncolumns : nrows;
    int ifound = 1;
    int ipass = 0;
    int* saved;
    /* Set the metric function as indicated by dist */
    double (*metric) (int, double**, double**, int**, int**,

src/cluster.c  view on Meta::CPAN


            if (counter % period == 0) {
                /* Save the current cluster assignments */
                for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
                if (period < INT_MAX / 2) period *= 2;
            }
            counter++;

            /* Find the center */
            getclustermedians(nclusters, nrows, ncolumns, data, mask,
                              tclusterid, cdata, cmask, transpose, cache);

            for (i = 0; i < nelements; i++) {
                /* Calculate the distances */
                double distance;
                k = tclusterid[i];
                if (counts[k] == 1) continue;
                /* No reassignment if that would lead to an empty cluster */
                /* Treat the present cluster as a special case */
                distance = metric(ndata, data, cdata, mask, cmask, weight,
                                  i, k, transpose);
                for (j = 0; j < nclusters; j++) {
                    double tdistance;
                    if (j == k) continue;
                    tdistance = metric(ndata, data, cdata, mask, cmask, weight,
                                       i, j, transpose);
                    if (tdistance < distance) {
                        distance = tdistance;
                        counts[tclusterid[i]]--;
                        tclusterid[i] = j;
                        counts[j]++;
                    }
                }
                total += distance;
            }

src/cluster.c  view on Meta::CPAN

========================================================================
*/
{
    const int nelements = (transpose == 0) ? nrows : ncolumns;
    const int ndata = (transpose == 0) ? ncolumns : nrows;

    int i;
    int ok;
    int* tclusterid;
    int* mapping = NULL;
    double** cdata;
    int** cmask;
    int* counts;

    if (nelements < nclusters) {
        *ifound = 0;
        return;
    }
    /* More clusters asked for than elements available */

    *ifound = -1;

src/cluster.c  view on Meta::CPAN

        mapping = malloc(nclusters*sizeof(int));
        if (!mapping) {
            free(counts);
            free(tclusterid);
            return;
        }
        for (i = 0; i < nelements; i++) clusterid[i] = 0;
    }

    /* Allocate space to store the centroid data */
    if (transpose == 0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
    else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
    if (!ok) {
        free(counts);
        if (npass>1) {
            free(tclusterid);
            free(mapping);
        }
        return;
    }

    if (method == 'm') {
        double* cache = malloc(nelements*sizeof(double));
        if (cache) {
            *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
                               transpose, npass, dist, cdata, cmask, clusterid,
                               error, tclusterid, counts, mapping, cache);
            free(cache);
        }
    }
    else
        *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
                         transpose, npass, dist, cdata, cmask, clusterid,
                         error, tclusterid, counts, mapping);

    /* Deallocate temporarily used space */
    if (npass > 1) {
        free(mapping);
        free(tclusterid);
    }

    if (transpose == 0) freedatamask(nclusters, cdata, cmask);
    else freedatamask(ndata, cdata, cmask);

    free(counts);
}

/* *********************************************************************** */

void
kmedoids(int nclusters, int nelements, double** distmatrix, int npass,
    int clusterid[], double* error, int* ifound)
/*

src/cluster.c  view on Meta::CPAN

            if (index < 0 || index >= ncolumns) return -1.0;
        }
    }

    switch (method) {
        case 'a': {
            /* Find the center */
            int i, j, k;
            if (transpose == 0) {
                double distance;
                double* cdata[2];
                int* cmask[2];
                int* count[2];
                count[0] = calloc(ncolumns, sizeof(int));
                count[1] = calloc(ncolumns, sizeof(int));
                cdata[0] = calloc(ncolumns, sizeof(double));
                cdata[1] = calloc(ncolumns, sizeof(double));
                cmask[0] = malloc(ncolumns*sizeof(int));
                cmask[1] = malloc(ncolumns*sizeof(int));
                for (i = 0; i < n1; i++) {
                    k = index1[i];
                    for (j = 0; j < ncolumns; j++)
                        if (mask[k][j] != 0) {
                            cdata[0][j] = cdata[0][j] + data[k][j];
                            count[0][j] = count[0][j] + 1;
                        }
                }
                for (i = 0; i < n2; i++) {
                    k = index2[i];
                    for (j = 0; j < ncolumns; j++)
                        if (mask[k][j] != 0) {
                            cdata[1][j] = cdata[1][j] + data[k][j];
                            count[1][j] = count[1][j] + 1;
                        }
                }
                for (i = 0; i < 2; i++)
                    for (j = 0; j < ncolumns; j++) {
                        if (count[i][j]>0) {
                            cdata[i][j] = cdata[i][j] / count[i][j];
                            cmask[i][j] = 1;
                        }
                        else
                            cmask[i][j] = 0;
                    }
                distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 0);
                for (i = 0; i < 2; i++) {
                    free(cdata[i]);
                    free(cmask[i]);
                    free(count[i]);
                }
                return distance;
            }
            else {
                double distance;
                int** count = malloc(nrows*sizeof(int*));
                double** cdata = malloc(nrows*sizeof(double*));
                int** cmask = malloc(nrows*sizeof(int*));
                for (i = 0; i < nrows; i++) {
                    count[i] = calloc(2, sizeof(int));
                    cdata[i] = calloc(2, sizeof(double));
                    cmask[i] = malloc(2*sizeof(int));
                }
                for (i = 0; i < n1; i++) {
                    k = index1[i];
                    for (j = 0; j < nrows; j++) {
                        if (mask[j][k] != 0) {
                            cdata[j][0] += data[j][k];
                            count[j][0]++;
                        }
                    }
                }
                for (i = 0; i < n2; i++) {
                    k = index2[i];
                    for (j = 0; j < nrows; j++) {
                        if (mask[j][k] != 0) {
                            cdata[j][1] += data[j][k];
                            count[j][1]++;
                        }
                    }
                }
                for (i = 0; i < nrows; i++)
                    for (j = 0; j < 2; j++)
                        if (count[i][j]>0) {
                            cdata[i][j] /= count[i][j];
                            cmask[i][j] = 1;
                        }
                        else
                            cmask[i][j] = 0;
                distance = metric(nrows, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 1);
                for (i = 0; i < nrows; i++) {
                    free(count[i]);
                    free(cdata[i]);
                    free(cmask[i]);
                }
                free(count);
                free(cdata);
                free(cmask);
                return distance;
            }
        }
        case 'm': {
            int i, j, k;
            if (transpose == 0) {
                double distance;
                double* temp = malloc(nrows*sizeof(double));
                double* cdata[2];
                int* cmask[2];
                for (i = 0; i < 2; i++) {
                    cdata[i] = malloc(ncolumns*sizeof(double));
                    cmask[i] = malloc(ncolumns*sizeof(int));
                }
                for (j = 0; j < ncolumns; j++) {
                    int count = 0;
                    for (k = 0; k < n1; k++) {
                        i = index1[k];
                        if (mask[i][j]) {
                            temp[count] = data[i][j];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[0][j] = median(count, temp);
                        cmask[0][j] = 1;
                    }
                    else {
                        cdata[0][j] = 0.;
                        cmask[0][j] = 0;
                    }
                }
                for (j = 0; j < ncolumns; j++) {
                    int count = 0;
                    for (k = 0; k < n2; k++) {
                        i = index2[k];
                        if (mask[i][j]) {
                            temp[count] = data[i][j];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[1][j] = median(count, temp);
                        cmask[1][j] = 1;
                    }
                    else {
                        cdata[1][j] = 0.;
                        cmask[1][j] = 0;
                    }
                }
                distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 0);
                for (i = 0; i < 2; i++) {
                    free(cdata[i]);
                    free(cmask[i]);
                }
                free(temp);
                return distance;
            }
            else {
                double distance;
                double* temp = malloc(ncolumns*sizeof(double));
                double** cdata = malloc(nrows*sizeof(double*));
                int** cmask = malloc(nrows*sizeof(int*));
                for (i = 0; i < nrows; i++) {
                    cdata[i] = malloc(2*sizeof(double));
                    cmask[i] = malloc(2*sizeof(int));
                }
                for (j = 0; j < nrows; j++) {
                    int count = 0;
                    for (k = 0; k < n1; k++) {
                        i = index1[k];
                        if (mask[j][i]) {
                            temp[count] = data[j][i];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[j][0] = median(count, temp);
                        cmask[j][0] = 1;
                    }
                    else {
                        cdata[j][0] = 0.;
                        cmask[j][0] = 0;
                    }
                }
                for (j = 0; j < nrows; j++) {
                    int count = 0;
                    for (k = 0; k < n2; k++) {
                        i = index2[k];
                        if (mask[j][i]) {
                            temp[count] = data[j][i];
                            count++;
                        }
                    }
                    if (count>0) {
                        cdata[j][1] = median(count, temp);
                        cmask[j][1] = 1;
                    }
                    else {
                        cdata[j][1] = 0.;
                        cmask[j][1] = 0;
                    }
                }
                distance = metric(nrows, cdata, cdata, cmask, cmask, weight,
                                  0, 1, 1);
                for (i = 0; i < nrows; i++) {
                    free(cdata[i]);
                    free(cmask[i]);
                }
                free(cdata);
                free(cmask);
                free(temp);
                return distance;
            }
        }
        case 's': {
            int i1, i2, j1, j2;
            const int n = (transpose == 0) ? ncolumns : nrows;
            double mindistance = DBL_MAX;
            for (i1 = 0; i1 < n1; i1++)

src/cluster.h  view on Meta::CPAN


/* Chapter 2 */
double clusterdistance(int nrows, int ncolumns, double** data, int** mask,
  double weight[], int n1, int n2, int index1[], int index2[], char dist,
  char method, int transpose);
void distancematrix(int ngenes, int ndata, double** data, int** mask,
  double* weight, char dist, int transpose, double** distances);

/* Chapter 3 */
int getclustercentroids(int nclusters, int nrows, int ncolumns,
  double** data, int** mask, int clusterid[], double** cdata, int** cmask,
  int transpose, char method);
void getclustermedoids(int nclusters, int nelements, double** distance,
  int clusterid[], int centroids[], double errors[]);
void kcluster(int nclusters, int ngenes, int ndata, double** data,
  int** mask, double weight[], int transpose, int npass, char method, char dist,
  int clusterid[], double* error, int* ifound);
void kmedoids(int nclusters, int nelements, double** distance,
  int npass, int clusterid[], double* error, int* ifound);

/* Chapter 4 */



( run in 0.259 second using v1.01-cache-2.11-cpan-454fe037f31 )