PDL-Cluster

 view release on metacpan or  search on metacpan

Cluster.pd  view on Meta::CPAN


##----------------------------------------------------------------------
## Cluster Centroids: Generic
pp_def
  ('getclustercentroids',
   Pars => join("\n   ", '',
		q(double data(d,n);),     ##-- n="rows"|"elts", d="columns"|"features"
		q(int    mask(d,n);),     ##-- n="rows"|"elts", d="columns"|"features"
		#q(int    transpose();),   ##-- probably dangerous
		q(int    clusterids(n);),  ##-- maps elts to cluster-ids
		q(double [o]cdata(d,k);), ##-- centroid data
		q(int    [o]cmask(d,k);), ##-- centroid data
		''
	       ),
    OtherPars => join("\n   ", '', 'char *ctrMethodFlag;', ''),
   Code =>
('
  int    transpose = 0;
  double **datapp  = (double **)pp_alloc($SIZE(n));
  int    **maskpp  = (int    **)pp_alloc($SIZE(n));
  double **cdatapp = (double **)pp_alloc($SIZE(k));
  int    **cmaskpp = (int    **)pp_alloc($SIZE(k));

  threadloop %{
    p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
    p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
    p2pp_dbl($SIZE(k), $SIZE(d), $P(cdata), cdatapp);
    p2pp_int($SIZE(k), $SIZE(d), $P(cmask), cmaskpp);

    getclustercentroids($SIZE(k), $SIZE(n), $SIZE(d), datapp, maskpp,
                        $P(clusterids), cdatapp, cmaskpp, transpose, *$COMP(ctrMethodFlag));
  %}

  /*-- cleanup --*/
  if (datapp) free(datapp);
  if (maskpp) free(maskpp);
  if (cdatapp) free(cdatapp);
  if (cmaskpp) free(cmaskpp);
'),

  Doc => 'Find cluster centroids by arithmetic mean (C<ctrMethodFlag="a">) or median over each dimension (C<ctrMethodFlag="m">).'
);


##----------------------------------------------------------------------
## Cluster Centroids: Mean
##  + now just a wrapper for 'getclustercentroids(...,"a")'

Cluster.pd  view on Meta::CPAN

=pod

=head2 getclustermean

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   int    clusterids(n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   )

Really just a wrapper for getclustercentroids(...,"a").

=cut

sub getclustermean {
  my ($data,$mask,$cids,$cdata,$cmask) = @_;
  return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'a');
}

EOPM

##----------------------------------------------------------------------
## Cluster Centroids: Median
##  + now just a wrapper for 'getclustercentroids(...,"m")'
pp_add_exported('','getclustermedian');
pp_addpm(<<'EOPM');

=pod

=head2 getclustermedian

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   int    clusterids(n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   )

Really just a wrapper for getclustercentroids(...,"m").

=cut

sub getclustermedian {
  my ($data,$mask,$cids,$cdata,$cmask) = @_;
  return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'m');
}

EOPM


##----------------------------------------------------------------------
## Cluster Centroids: Medoids
pp_def
  ('getclustermedoids',
   Pars => join("\n   ", '',

Cluster.pd  view on Meta::CPAN


##----------------------------------------------------------------------
## Cluster Centroids via Weighted Sum [p(datum_n|cluster_k)]
pp_def
('getclusterwsum',
       Pars => join("\n   ", '',
		    q(double data(d,n);),       ##-- n="rows"|"elts", d="columns"|"features"
		    q(int    mask(d,n);),       ##-- n="rows"|"elts", d="columns"|"features"
		    q(double clusterwts(k,n);), ##-- maps (cluster,elt) to weight [p(elt|cluster)]
		                                ##   : should probably sum to 1 over each cluster (k)
		    q(double [o]cdata(d,k);),   ##-- centroid data
		    q(int    [o]cmask(d,k);),   ##-- centroid mask
		    ''
		   ),
       Code =>
('
 int rid, rwt, cmaskdk;
 loop (d) %{
   loop (k) %{
     cmaskdk = 0;
     loop (n) %{
       if ($mask()) {
         cmaskdk = 1;
	 $cdata() += $clusterwts() * $data();
       }
     %}
     $cmask() = cmaskdk;
   %}
 %}
'),

 Doc => '
Find cluster centroids by weighted sum.  This can be considered an
expensive generalization of the getclustermean() and getclustermedian()
functions.  Here, the input PDLs $data() and $mask(), as well as the
output PDL $cdata() are as for getclustermean().  The matrix $clusterwts()
determines the relative weight of each data row in determining the
centroid of each cluster, potentially useful for "fuzzy" clustering.
The equation used to compute cluster means is:

 $cdata(d,k) = sum_{n} $clusterwts(k,n) * $data(d,n) * $mask(d,n)

For centroids in the same range as data elements, $clusterwts()
should sum to 1 over each column (k):

 all($clusterwts->xchg(0,1)->sumover == 1)

getclustermean() can be simulated by instantiating $clusterwts() with
a uniform distribution over cluster elements:

 $clusterwts = zeroes($k,$n);
 $clusterwts->indexND(cat($clusterids, xvals($clusterids))->xchg(0,1)) .= 1;
 $clusterwts /= $clusterwts->xchg(0,1)->sumover;
 getclusterwsum($data,$mask, $clusterwts, $cdata=zeroes($d,$k));

Similarly, getclustermedian() can be simulated by setting $clusterwts() to
1 for cluster medians and otherwise to 0.  More sophisticated centroid
discovery methods can be computed by this function by setting
$clusterwts(k,n) to some estimate of the conditional probability
of the datum at row $n given the cluster with index $k:
p(Elt==n|Cluster==k).  One
way to achieve such an estimate is to use (normalized inverses of) the
singleton-row-to-cluster distances as output by clusterdistancematrix().

Cluster.pd  view on Meta::CPAN


##----------------------------------------------------------------------
## Attach data to nearest centroid
pp_def
('attachtonearest',
       Pars => join("\n   ", '',
		    q(double data(d,n);),         ##-- n="rows"|"elts", d="columns"|"features"
		    q(int    mask(d,n);),         ##-- n="rows"|"elts", d="columns"|"features"
		    q(double weight(d);),         ##-- feature weights
		    q(int    rowids(nr);),        ##-- rows to attach
		    q(double cdata(d,k);),        ##-- centroid data
		    q(int    cmask(d,k);),        ##-- centroid mask
		    q(int    [o]clusterids(nr);), ##-- output cluster ids
		    q(double [o]cdist(nr);),      ##-- distances to best clusters
		    ''
		   ),
 OtherPars => join("\n   ", '', 'char *distFlag;', 'char *methodFlag;', ''),
       Code =>
('
  double **datapp  = (double **)pp_alloc($SIZE(n));
  int    **maskpp  = (int    **)pp_alloc($SIZE(n));
  double **cdatapp = (double **)pp_alloc($SIZE(k));
  int    **cmaskpp = (int    **)pp_alloc($SIZE(k));
  double *tmpdatapp[2];
  int    *tmpmaskpp[2];
  int    transpose=0;
  //
  threadloop %{
    int    tmprowid = 0;
    int    tmpctrid = 1;
    int    ni;
    int    ki, kbest;
    double dist, dbest;
    //
    p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
    p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
    p2pp_dbl($SIZE(k), $SIZE(d), $P(cdata), cdatapp);
    p2pp_int($SIZE(k), $SIZE(d), $P(cmask), cmaskpp);
    //
    /*-- loop over all target rows --*/
    loop (nr) %{
      ni = $rowids();
      tmpdatapp[tmprowid] = datapp[ni];
      tmpmaskpp[tmprowid] = maskpp[ni];
      //
      /*-- initialize --*/
      tmpdatapp[tmpctrid] = cdatapp[0];
      tmpmaskpp[tmpctrid] = cmaskpp[0];
      kbest = 0;
      dbest = clusterdistance(2, $SIZE(d), tmpdatapp, tmpmaskpp, $P(weight),
                              1, 1, &tmprowid, &tmpctrid,
                              *$COMP(distFlag), *$COMP(methodFlag), transpose);
      //
      /*-- loop over all centroids --*/
      for (ki=1; ki < $SIZE(k); ki++) {
        tmpdatapp[tmpctrid] = cdatapp[ki];
        tmpmaskpp[tmpctrid] = cmaskpp[ki];
        //
        dist = clusterdistance(2, $SIZE(d), tmpdatapp, tmpmaskpp, $P(weight),
                               1, 1, &tmprowid, &tmpctrid,
                               *$COMP(distFlag), *$COMP(methodFlag), transpose);
        if (dist < dbest) {
          kbest = ki;
          dbest = dist;
        }
      }
      //
      /*-- save best data --*/
      $clusterids() = kbest;
      $cdist()      = dbest;
    %}
  %}
  //
  /*-- cleanup --*/
  if (datapp) free(datapp);
  if (maskpp) free(maskpp);
  if (cdatapp) free(cdatapp);
  if (cmaskpp) free(cmaskpp);
'),

 Doc => '
Assigns each specified data row to the nearest cluster centroid.
Data elements are given by $data() and $mask(), feature weights are
given by $weight(), as usual.  Cluster centroids are defined by
by $cdata() and $cmask(), and the indices of rows to be attached
are given in the vector $rowids().  The output vector $clusterids()
contains for each specified row index the identifier of the nearest
cluster centroid.  The vector $cdist() contains the distance to
the best clusters.

See also: clusterdistancematrix(), attachtonearestd().
'
);


GENERATED/PDL/Cluster.pm  view on Meta::CPAN



=head2 getclustercentroids

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   int    clusterids(n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   ; char *ctrMethodFlag;
)

=for ref

Find cluster centroids by arithmetic mean (C<ctrMethodFlag="a">) or median over each dimension (C<ctrMethodFlag="m">).

=for bad

GENERATED/PDL/Cluster.pm  view on Meta::CPAN

=pod

=head2 getclustermean

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   int    clusterids(n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   )

Really just a wrapper for getclustercentroids(...,"a").

=cut

sub getclustermean {
  my ($data,$mask,$cids,$cdata,$cmask) = @_;
  return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'a');
}
#line 468 "Cluster.pm"



#line 620 "Cluster.pd"


=pod

=head2 getclustermedian

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   int    clusterids(n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   )

Really just a wrapper for getclustercentroids(...,"m").

=cut

sub getclustermedian {
  my ($data,$mask,$cids,$cdata,$cmask) = @_;
  return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'m');
}
#line 497 "Cluster.pm"



#line 949 "/usr/lib/x86_64-linux-gnu/perl5/5.36/PDL/PP.pm"



=head2 getclustermedoids

GENERATED/PDL/Cluster.pm  view on Meta::CPAN



=head2 getclusterwsum

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   double clusterwts(k,n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   )


Find cluster centroids by weighted sum.  This can be considered an
expensive generalization of the getclustermean() and getclustermedian()
functions.  Here, the input PDLs $data() and $mask(), as well as the
output PDL $cdata() are as for getclustermean().  The matrix $clusterwts()
determines the relative weight of each data row in determining the
centroid of each cluster, potentially useful for "fuzzy" clustering.
The equation used to compute cluster means is:

 $cdata(d,k) = sum_{n} $clusterwts(k,n) * $data(d,n) * $mask(d,n)

For centroids in the same range as data elements, $clusterwts()
should sum to 1 over each column (k):

 all($clusterwts->xchg(0,1)->sumover == 1)

getclustermean() can be simulated by instantiating $clusterwts() with
a uniform distribution over cluster elements:

 $clusterwts = zeroes($k,$n);
 $clusterwts->indexND(cat($clusterids, xvals($clusterids))->xchg(0,1)) .= 1;
 $clusterwts /= $clusterwts->xchg(0,1)->sumover;
 getclusterwsum($data,$mask, $clusterwts, $cdata=zeroes($d,$k));

Similarly, getclustermedian() can be simulated by setting $clusterwts() to
1 for cluster medians and otherwise to 0.  More sophisticated centroid
discovery methods can be computed by this function by setting
$clusterwts(k,n) to some estimate of the conditional probability
of the datum at row $n given the cluster with index $k:
p(Elt==n|Cluster==k).  One
way to achieve such an estimate is to use (normalized inverses of) the
singleton-row-to-cluster distances as output by clusterdistancematrix().

GENERATED/PDL/Cluster.pm  view on Meta::CPAN


=head2 attachtonearest

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   double weight(d);
   int    rowids(nr);
   double cdata(d,k);
   int    cmask(d,k);
   int    [o]clusterids(nr);
   double [o]cdist(nr);
   ; 
   char *distFlag;
   char *methodFlag;
   )


Assigns each specified data row to the nearest cluster centroid.
Data elements are given by $data() and $mask(), feature weights are
given by $weight(), as usual.  Cluster centroids are defined by
by $cdata() and $cmask(), and the indices of rows to be attached
are given in the vector $rowids().  The output vector $clusterids()
contains for each specified row index the identifier of the nearest
cluster centroid.  The vector $cdist() contains the distance to
the best clusters.

See also: clusterdistancematrix(), attachtonearestd().


=for bad

ccluster.c  view on Meta::CPAN

    clusterid[j] = clusterid[i];
    clusterid[i] = k;
  }

  return;
}

/* ********************************************************************* */

static void getclustermeans(int nclusters, int nrows, int ncolumns,
  double** data, int** mask, int clusterid[], double** cdata, int** cmask,
  int transpose)
/*
Purpose
=======

The getclustermeans routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the mean over all
elements for each dimension.

Arguments

ccluster.c  view on Meta::CPAN

mask       (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j]==0, then
data[i][j] is missing.

clusterid  (output) int[nrows] if transpose==0
                    int[ncolumns] if transpose==1
The cluster number to which each element belongs. If transpose==0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of microarrays).

cdata      (output) double[nclusters][ncolumns] if transpose==0
                    double[nrows][nclusters] if transpose==1
On exit of getclustermeans, this array contains the cluster centroids.

cmask      (output) int[nclusters][ncolumns] if transpose==0
                    int[nrows][nclusters] if transpose==1
This array shows which data values of are missing for each centroid. If
cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.

transpose  (input) int
If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
columns (microarrays) are specified.

========================================================================
*/
{ int i, j, k;
  if (transpose==0)
  { for (i = 0; i < nclusters; i++)
    { for (j = 0; j < ncolumns; j++)
      { cmask[i][j] = 0;
        cdata[i][j] = 0.;
      }
    }
    for (k = 0; k < nrows; k++)
    { i = clusterid[k];
      for (j = 0; j < ncolumns; j++)
      { if (mask[k][j] != 0)
        { cdata[i][j]+=data[k][j];
          cmask[i][j]++;
        }
      }
    }
    for (i = 0; i < nclusters; i++)
    { for (j = 0; j < ncolumns; j++)
      { if (cmask[i][j]>0)
        { cdata[i][j] /= cmask[i][j];
          cmask[i][j] = 1;
        }
      }
    }
  }
  else
  { for (i = 0; i < nrows; i++)
    { for (j = 0; j < nclusters; j++)
      { cdata[i][j] = 0.;
        cmask[i][j] = 0;
      }
    }
    for (k = 0; k < ncolumns; k++)
    { i = clusterid[k];
      for (j = 0; j < nrows; j++)
      { if (mask[j][k] != 0)
        { cdata[j][i]+=data[j][k];
          cmask[j][i]++;
        }
      }
    }
    for (i = 0; i < nrows; i++)
    { for (j = 0; j < nclusters; j++)
      { if (cmask[i][j]>0)
        { cdata[i][j] /= cmask[i][j];
          cmask[i][j] = 1;
        }
      }
    }
  }
}

/* ********************************************************************* */

static void
getclustermedians(int nclusters, int nrows, int ncolumns,
  double** data, int** mask, int clusterid[], double** cdata, int** cmask,
  int transpose, double cache[])
/*
Purpose
=======

The getclustermedians routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the median over all
elements for each dimension.

Arguments

ccluster.c  view on Meta::CPAN

mask       (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j]==0, then
data[i][j] is missing.

clusterid  (output) int[nrows] if transpose==0
                    int[ncolumns] if transpose==1
The cluster number to which each element belongs. If transpose==0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of microarrays).

cdata      (output) double[nclusters][ncolumns] if transpose==0
                    double[nrows][nclusters] if transpose==1
On exit of getclustermedians, this array contains the cluster centroids.

cmask      (output) int[nclusters][ncolumns] if transpose==0
                    int[nrows][nclusters] if transpose==1
This array shows which data values of are missing for each centroid. If
cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.

transpose  (input) int
If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
columns (microarrays) are specified.

cache      (input) double[nrows] if transpose==0
                   double[ncolumns] if transpose==1
This array should be allocated before calling getclustermedians; its contents
on input is not relevant. This array is used as a temporary storage space when

ccluster.c  view on Meta::CPAN

  { for (i = 0; i < nclusters; i++)
    { for (j = 0; j < ncolumns; j++)
      { int count = 0;
        for (k = 0; k < nrows; k++)
        { if (i==clusterid[k] && mask[k][j])
          { cache[count] = data[k][j];
            count++;
          }
        }
        if (count>0)
        { cdata[i][j] = median(count,cache);
          cmask[i][j] = 1;
        }
        else
        { cdata[i][j] = 0.;
          cmask[i][j] = 0;
        }
      }
    }
  }
  else
  { for (i = 0; i < nclusters; i++)
    { for (j = 0; j < nrows; j++)
      { int count = 0;
        for (k = 0; k < ncolumns; k++)
        { if (i==clusterid[k] && mask[j][k])
          { cache[count] = data[j][k];
            count++;
          }
        }
        if (count>0)
        { cdata[j][i] = median(count,cache);
          cmask[j][i] = 1;
        }
        else
        { cdata[j][i] = 0.;
          cmask[j][i] = 0;
        }
      }
    }
  }
}

/* ********************************************************************* */

int getclustercentroids(int nclusters, int nrows, int ncolumns,
  double** data, int** mask, int clusterid[], double** cdata, int** cmask,
  int transpose, char method)
/*
Purpose
=======

The getclustercentroids routine calculates the cluster centroids, given to
which cluster each element belongs. Depending on the argument method, the
centroid is defined as either the mean or the median for each dimension over
all elements belonging to a cluster.

ccluster.c  view on Meta::CPAN

mask       (input) int[nrows][ncolumns]
This array shows which data values are missing. If mask[i][j]==0, then
data[i][j] is missing.

clusterid  (output) int[nrows] if transpose==0
                    int[ncolumns] if transpose==1
The cluster number to which each element belongs. If transpose==0, then the
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
is equal to ncolumns (the number of microarrays).

cdata      (output) double[nclusters][ncolumns] if transpose==0
                    double[nrows][nclusters] if transpose==1
On exit of getclustercentroids, this array contains the cluster centroids.

cmask      (output) int[nclusters][ncolumns] if transpose==0
                    int[nrows][nclusters] if transpose==1
This array shows which data values of are missing for each centroid. If
cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
a centroid if all corresponding data values of the cluster members are missing.

transpose  (input) int
If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
columns (microarrays) are specified.

method     (input) char
For method=='a', the centroid is defined as the mean over all elements
belonging to a cluster for each dimension.
For method=='m', the centroid is defined as the median over all elements

ccluster.c  view on Meta::CPAN

memory error occurs, or if method is not 'm' or 'a', getclustercentroids
returns 0. If successful, getclustercentroids returns 1.
========================================================================
*/
{ switch(method)
  { case 'm':
    { const int nelements = (transpose==0) ? nrows : ncolumns;
      double* cache = malloc(nelements*sizeof(double));
      if (!cache) return 0;
      getclustermedians(nclusters, nrows, ncolumns, data, mask, clusterid,
                        cdata, cmask, transpose, cache);
      free(cache);
      return 1;
    }
    case 'a':
    { getclustermeans(nclusters, nrows, ncolumns, data, mask, clusterid,
                      cdata, cmask, transpose);
      return 1;
    }
  }
  return 0;
}

/* ********************************************************************* */

void getclustermedoids(int nclusters, int nelements, double** distance,
  int clusterid[], int centroids[], double errors[])

ccluster.c  view on Meta::CPAN

      centroids[j] = i;
    }
  }
}

/* ********************************************************************* */

static int
kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
  double weight[], int transpose, int npass, char dist,
  double** cdata, int** cmask, int clusterid[], double* error,
  int tclusterid[], int counts[], int mapping[])
{ int i, j, k;
  const int nelements = (transpose==0) ? nrows : ncolumns;
  const int ndata = (transpose==0) ? ncolumns : nrows;
  int ifound = 1;
  int ipass = 0;
  /* Set the metric function as indicated by dist */
  double (*metric)
    (int, double**, double**, int**, int**, const double[], int, int, int) =
       setmetric(dist);

ccluster.c  view on Meta::CPAN

      total = 0.0;

      if (counter % period == 0) /* Save the current cluster assignments */
      { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
        if (period < INT_MAX / 2) period *= 2;
      }
      counter++;

      /* Find the center */
      getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
                      cdata, cmask, transpose);

      for (i = 0; i < nelements; i++)
      /* Calculate the distances */
      { double distance;
        k = tclusterid[i];
        if (counts[k]==1) continue;
        /* No reassignment if that would lead to an empty cluster */
        /* Treat the present cluster as a special case */
        distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
        for (j = 0; j < nclusters; j++)
        { double tdistance;
          if (j==k) continue;
          tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
          if (tdistance < distance)
          { distance = tdistance;
            counts[tclusterid[i]]--;
            tclusterid[i] = j;
            counts[j]++;
          }
        }
        total += distance;
      }
      if (total>=previous) break;

ccluster.c  view on Meta::CPAN


  free(saved);
  return ifound;
}

/* ---------------------------------------------------------------------- */

static int
kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
  double weight[], int transpose, int npass, char dist,
  double** cdata, int** cmask, int clusterid[], double* error,
  int tclusterid[], int counts[], int mapping[], double cache[])
{ int i, j, k;
  const int nelements = (transpose==0) ? nrows : ncolumns;
  const int ndata = (transpose==0) ? ncolumns : nrows;
  int ifound = 1;
  int ipass = 0;
  /* Set the metric function as indicated by dist */
  double (*metric)
    (int, double**, double**, int**, int**, const double[], int, int, int) =
       setmetric(dist);

ccluster.c  view on Meta::CPAN

      total = 0.0;

      if (counter % period == 0) /* Save the current cluster assignments */
      { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
        if (period < INT_MAX / 2) period *= 2;
      }
      counter++;

      /* Find the center */
      getclustermedians(nclusters, nrows, ncolumns, data, mask, tclusterid,
                        cdata, cmask, transpose, cache);

      for (i = 0; i < nelements; i++)
      /* Calculate the distances */
      { double distance;
        k = tclusterid[i];
        if (counts[k]==1) continue;
        /* No reassignment if that would lead to an empty cluster */
        /* Treat the present cluster as a special case */
        distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
        for (j = 0; j < nclusters; j++)
        { double tdistance;
          if (j==k) continue;
          tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
          if (tdistance < distance)
          { distance = tdistance;
            counts[tclusterid[i]]--;
            tclusterid[i] = j;
            counts[j]++;
          }
        }
        total += distance;
      }
      if (total>=previous) break;

ccluster.c  view on Meta::CPAN


========================================================================
*/
{ const int nelements = (transpose==0) ? nrows : ncolumns;
  const int ndata = (transpose==0) ? ncolumns : nrows;

  int i;
  int ok;
  int* tclusterid;
  int* mapping = NULL;
  double** cdata;
  int** cmask;
  int* counts;

  if (nelements < nclusters)
  { *ifound = 0;
    return;
  }
  /* More clusters asked for than elements available */

  *ifound = -1;

ccluster.c  view on Meta::CPAN

    mapping = malloc(nclusters*sizeof(int));
    if (!mapping)
    { free(counts);
      free(tclusterid);
      return;
    }
    for (i = 0; i < nelements; i++) clusterid[i] = 0;
  }

  /* Allocate space to store the centroid data */
  if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
  else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
  if(!ok)
  { free(counts);
    if(npass>1)
    { free(tclusterid);
      free(mapping);
      return;
    }
  }

  if (method=='m')
  { double* cache = malloc(nelements*sizeof(double));
    if(cache)
    { *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
                         transpose, npass, dist, cdata, cmask, clusterid, error,
                         tclusterid, counts, mapping, cache);
      free(cache);
    }
  }
  else
    *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
                     transpose, npass, dist, cdata, cmask, clusterid, error,
                     tclusterid, counts, mapping);

  /* Deallocate temporarily used space */
  if (npass > 1)
  { free(mapping);
    free(tclusterid);
  }

  if (transpose==0) freedatamask(nclusters, cdata, cmask);
  else freedatamask(ndata, cdata, cmask);

  free(counts);
}

/* *********************************************************************** */

void kmedoids (int nclusters, int nelements, double** distmatrix,
  int npass, int clusterid[], double* error, int* ifound)
/*
Purpose

ccluster.c  view on Meta::CPAN

      if (index < 0 || index >= ncolumns) return -1.0;
    }
  }

  switch (method)
  { case 'a':
    { /* Find the center */
      int i,j,k;
      if (transpose==0)
      { double distance;
        double* cdata[2];
        int* cmask[2];
        int* count[2];
        count[0] = calloc(ncolumns,sizeof(int));
        count[1] = calloc(ncolumns,sizeof(int));
        cdata[0] = calloc(ncolumns,sizeof(double));
        cdata[1] = calloc(ncolumns,sizeof(double));
        cmask[0] = malloc(ncolumns*sizeof(int));
        cmask[1] = malloc(ncolumns*sizeof(int));
        for (i = 0; i < n1; i++)
        { k = index1[i];
          for (j = 0; j < ncolumns; j++)
            if (mask[k][j] != 0)
            { cdata[0][j] = cdata[0][j] + data[k][j];
              count[0][j] = count[0][j] + 1;
            }
        }
        for (i = 0; i < n2; i++)
        { k = index2[i];
          for (j = 0; j < ncolumns; j++)
            if (mask[k][j] != 0)
            { cdata[1][j] = cdata[1][j] + data[k][j];
              count[1][j] = count[1][j] + 1;
            }
        }
        for (i = 0; i < 2; i++)
          for (j = 0; j < ncolumns; j++)
          { if (count[i][j]>0)
            { cdata[i][j] = cdata[i][j] / count[i][j];
              cmask[i][j] = 1;
            }
            else
              cmask[i][j] = 0;
          }
        distance =
          metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
        for (i = 0; i < 2; i++)
        { free (cdata[i]);
          free (cmask[i]);
          free (count[i]);
        }
        return distance;
      }
      else
      { double distance;
        int** count = malloc(nrows*sizeof(int*));
        double** cdata = malloc(nrows*sizeof(double*));
        int** cmask = malloc(nrows*sizeof(int*));
        for (i = 0; i < nrows; i++)
        { count[i] = calloc(2,sizeof(int));
          cdata[i] = calloc(2,sizeof(double));
          cmask[i] = malloc(2*sizeof(int));
        }
        for (i = 0; i < n1; i++)
        { k = index1[i];
          for (j = 0; j < nrows; j++)
          { if (mask[j][k] != 0)
            { cdata[j][0] = cdata[j][0] + data[j][k];
              count[j][0] = count[j][0] + 1;
            }
          }
        }
        for (i = 0; i < n2; i++)
        { k = index2[i];
          for (j = 0; j < nrows; j++)
          { if (mask[j][k] != 0)
            { cdata[j][1] = cdata[j][1] + data[j][k];
              count[j][1] = count[j][1] + 1;
            }
          }
        }
        for (i = 0; i < nrows; i++)
          for (j = 0; j < 2; j++)
            if (count[i][j]>0)
            { cdata[i][j] = cdata[i][j] / count[i][j];
              cmask[i][j] = 1;
            }
            else
              cmask[i][j] = 0;
        distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
        for (i = 0; i < nrows; i++)
        { free (count[i]);
          free (cdata[i]);
          free (cmask[i]);
        }
        free (count);
        free (cdata);
        free (cmask);
        return distance;
      }
    }
    case 'm':
    { int i, j, k;
      if (transpose==0)
      { double distance;
        double* temp = malloc(nrows*sizeof(double));
        double* cdata[2];
        int* cmask[2];
        for (i = 0; i < 2; i++)
        { cdata[i] = malloc(ncolumns*sizeof(double));
          cmask[i] = malloc(ncolumns*sizeof(int));
        }
        for (j = 0; j < ncolumns; j++)
        { int count = 0;
          for (k = 0; k < n1; k++)
          { i = index1[k];
            if (mask[i][j])
            { temp[count] = data[i][j];
              count++;
            }
          }
          if (count>0)
          { cdata[0][j] = median (count,temp);
            cmask[0][j] = 1;
          }
          else
          { cdata[0][j] = 0.;
            cmask[0][j] = 0;
          }
        }
        for (j = 0; j < ncolumns; j++)
        { int count = 0;
          for (k = 0; k < n2; k++)
          { i = index2[k];
            if (mask[i][j])
            { temp[count] = data[i][j];
              count++;
            }
          }
          if (count>0)
          { cdata[1][j] = median (count,temp);
            cmask[1][j] = 1;
          }
          else
          { cdata[1][j] = 0.;
            cmask[1][j] = 0;
          }
        }
        distance = metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
        for (i = 0; i < 2; i++)
        { free (cdata[i]);
          free (cmask[i]);
        }
        free(temp);
        return distance;
      }
      else
      { double distance;
        double* temp = malloc(ncolumns*sizeof(double));
        double** cdata = malloc(nrows*sizeof(double*));
        int** cmask = malloc(nrows*sizeof(int*));
        for (i = 0; i < nrows; i++)
        { cdata[i] = malloc(2*sizeof(double));
          cmask[i] = malloc(2*sizeof(int));
        }
        for (j = 0; j < nrows; j++)
        { int count = 0;
          for (k = 0; k < n1; k++)
          { i = index1[k];
            if (mask[j][i])
            { temp[count] = data[j][i];
              count++;
            }
          }
          if (count>0)
          { cdata[j][0] = median (count,temp);
            cmask[j][0] = 1;
          }
          else
          { cdata[j][0] = 0.;
            cmask[j][0] = 0;
          }
        }
        for (j = 0; j < nrows; j++)
        { int count = 0;
          for (k = 0; k < n2; k++)
          { i = index2[k];
            if (mask[j][i])
            { temp[count] = data[j][i];
              count++;
            }
          }
          if (count>0)
          { cdata[j][1] = median (count,temp);
            cmask[j][1] = 1;
          }
          else
          { cdata[j][1] = 0.;
            cmask[j][1] = 0;
          }
        }
        distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
        for (i = 0; i < nrows; i++)
        { free (cdata[i]);
          free (cmask[i]);
        }
        free(cdata);
        free(cmask);
        free(temp);
        return distance;
      }
    }
    case 's':
    { int i1, i2, j1, j2;
      const int n = (transpose==0) ? ncolumns : nrows;
      double mindistance = DBL_MAX;
      for (i1 = 0; i1 < n1; i1++)

ccluster.h  view on Meta::CPAN


/* Chapter 2 */
double clusterdistance (int nrows, int ncolumns, double** data, int** mask,
  double weight[], int n1, int n2, int index1[], int index2[], char dist,
  char method, int transpose);
double** distancematrix (int ngenes, int ndata, double** data,
  int** mask, double* weight, char dist, int transpose);

/* Chapter 3 */
int getclustercentroids(int nclusters, int nrows, int ncolumns,
  double** data, int** mask, int clusterid[], double** cdata, int** cmask,
  int transpose, char method);
void getclustermedoids(int nclusters, int nelements, double** distance,
  int clusterid[], int centroids[], double errors[]);
void kcluster (int nclusters, int ngenes, int ndata, double** data,
  int** mask, double weight[], int transpose, int npass, char method, char dist,
  int clusterid[], double* error, int* ifound);
void kmedoids (int nclusters, int nelements, double** distance,
  int npass, int clusterid[], double* error, int* ifound);

/* Chapter 4 */



( run in 0.275 second using v1.01-cache-2.11-cpan-454fe037f31 )