PDL-Cluster

 view release on metacpan or  search on metacpan

Cluster.pd  view on Meta::CPAN

'),
);

##----------------------------------------------------------------------
## Distance Matrix
pp_def
  ('distancematrix',
   Pars => join("\n   ", '',
		q(double data(d,n);),
		q(int    mask(d,n);),
		q(double weight(d);),
		#q(int    transpose();),
		q(double [o]dists(n,n);),
		''
	       ),
   OtherPars => join("\n   ", '', 'char *distFlag;', ''),
   Code =>
('
  int    transpose = 0;
  double **datapp = (double **)pp_alloc($SIZE(n));
  int    **maskpp = (int    **)pp_alloc($SIZE(n));
  double **retval;
  //
  threadloop %{
    p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
    p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
    retval = distancematrix($SIZE(n), $SIZE(d), datapp, maskpp,
                            $P(weight), *$COMP(distFlag), transpose);
    //
    if (!retval) barf("Cluster matrix allocation failed!");
    pp2pdl_ragged_dbl($SIZE(n), $SIZE(n), retval, $P(dists));
    //
    if (retval) free(retval);
  %}
  //
  /*-- cleanup --*/
  if (datapp) free(datapp);
  if (maskpp) free(maskpp);
'),

  Doc => 'Compute triangular distance matrix over all data points.',
);

##======================================================================
## REMOVED: Random number generation
##  + REMOVED: initran()

##======================================================================
## Chapter 3: Partitioning Algorithms
##  + REMOVED: randomassign

##----------------------------------------------------------------------
## Cluster Centroids: Generic
pp_def
  ('getclustercentroids',
   Pars => join("\n   ", '',
		q(double data(d,n);),     ##-- n="rows"|"elts", d="columns"|"features"
		q(int    mask(d,n);),     ##-- n="rows"|"elts", d="columns"|"features"
		#q(int    transpose();),   ##-- probably dangerous
		q(int    clusterids(n);),  ##-- maps elts to cluster-ids
		q(double [o]cdata(d,k);), ##-- centroid data
		q(int    [o]cmask(d,k);), ##-- centroid data
		''
	       ),
    OtherPars => join("\n   ", '', 'char *ctrMethodFlag;', ''),
   Code =>
('
  int    transpose = 0;
  double **datapp  = (double **)pp_alloc($SIZE(n));
  int    **maskpp  = (int    **)pp_alloc($SIZE(n));
  double **cdatapp = (double **)pp_alloc($SIZE(k));
  int    **cmaskpp = (int    **)pp_alloc($SIZE(k));

  threadloop %{
    p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
    p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
    p2pp_dbl($SIZE(k), $SIZE(d), $P(cdata), cdatapp);
    p2pp_int($SIZE(k), $SIZE(d), $P(cmask), cmaskpp);

    getclustercentroids($SIZE(k), $SIZE(n), $SIZE(d), datapp, maskpp,
                        $P(clusterids), cdatapp, cmaskpp, transpose, *$COMP(ctrMethodFlag));
  %}

  /*-- cleanup --*/
  if (datapp) free(datapp);
  if (maskpp) free(maskpp);
  if (cdatapp) free(cdatapp);
  if (cmaskpp) free(cmaskpp);
'),

  Doc => 'Find cluster centroids by arithmetic mean (C<ctrMethodFlag="a">) or median over each dimension (C<ctrMethodFlag="m">).'
);


##----------------------------------------------------------------------
## Cluster Centroids: Mean
##  + now just a wrapper for 'getclustercentroids(...,"a")'
pp_add_exported('','getclustermean');
pp_addpm(<<'EOPM');

=pod

=head2 getclustermean

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   int    clusterids(n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   )

Really just a wrapper for getclustercentroids(...,"a").

=cut

sub getclustermean {
  my ($data,$mask,$cids,$cdata,$cmask) = @_;
  return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'a');
}

EOPM

##----------------------------------------------------------------------
## Cluster Centroids: Median
##  + now just a wrapper for 'getclustercentroids(...,"m")'
pp_add_exported('','getclustermedian');
pp_addpm(<<'EOPM');

=pod

=head2 getclustermedian

=for sig

  Signature: (
   double data(d,n);
   int    mask(d,n);
   int    clusterids(n);
   double [o]cdata(d,k);
   int    [o]cmask(d,k);
   )

Really just a wrapper for getclustercentroids(...,"m").

=cut

sub getclustermedian {
  my ($data,$mask,$cids,$cdata,$cmask) = @_;
  return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'m');
}

EOPM


##----------------------------------------------------------------------
## Cluster Centroids: Medoids
pp_def
  ('getclustermedoids',
   Pars => join("\n   ", '',
		q(double distance(n,n);),     ##-- (in) distance matrix (ragged, lower-left-triangle)
		q(int    clusterids(n);),     ##-- (in) cluster ids indexed by gene-id
		q(int    [o]centroids(k);),   ##-- (out) centroid-(elt-)ids indexed by cluster-id
		q(double [o]errors(k);),      ##-- (out) maps cluster-id c -> sum(dist(x \in c, ctr(c)))
		''
	       ),
   Code =>
('
  double **distpp = (double **)pp_alloc($SIZE(n));
  threadloop %{
    p2pp_dbl_ragged($SIZE(n), $SIZE(n), $P(distance), distpp);
    getclustermedoids($SIZE(k), $SIZE(n), distpp,
                      $P(clusterids), $P(centroids), $P(errors));
  %}
  //
  /*-- cleanup --*/
  if (distpp) free(distpp);
'),

  Doc => 'The getclustermedoid routine calculates the cluster centroids, given to which
cluster each element belongs. The centroid is defined as the element with the
smallest sum of distances to the other elements.
'
);


##----------------------------------------------------------------------
## K-Means
pp_def
  ('kcluster',
   Pars => join("\n   ", '',
		q(int    nclusters();),	##-- number of clusters to find
		q(double data(d,n);), ##-- n="rows"|"elts", d="columns"|"features"
		q(int    mask(d,n);), ##-- n="rows"|"elts", d="columns"|"features"
		q(double weight(d);), ##-- weights
		#q(int    transpose();),   ##-- probably dangerous
		q(int    npass();), ##-- n passes
		q(int    [o]clusterids(n);), ##-- maps elts to cluster-ids
		q(double [o]error();)   , ##-- solution error
		q(int    [o]nfound();),	##-- number of times solution found
		''
	       ),
   OtherPars => join("\n   ", '', 'char *distFlag;', 'char *ctrMethodFlag;', ''),
   Code =>
('
  int    transpose = 0;
  double **datapp  = (double **)pp_alloc($SIZE(n));
  int    **maskpp  = (int    **)pp_alloc($SIZE(n));
  //
  threadloop %{

Cluster.pd  view on Meta::CPAN

	      q(int    cwhich1(ncmps);),     ##-- selected left comparison operands
	      q(int    coffsets2(k2);),      ##-- (encoded): Y (dim=1) cluster offsets (k2+1)
	      q(int    crowids2(nc2);),      ##-- (encoded): Y (dim=1) clustered-element row-ids
	      q(int    cwhich2(ncmps);),     ##-- selected right comparison operands
	      q(double [o]dists(ncmps);),    ##-- output matrix
	      ''
	     ),
 OtherPars => join("\n   ", '', 'char *distFlag;', 'char *methodFlag;', ''),
 Doc => '
Computes cluster-distance between selected pairs of co-indexed clusters in ($cwhich1,$cwhich2).
Cluster contents are passed as pairs ($coffsetsX(),$crowidsX()) as returned
by the clusteroffsets() function.

$distFlag and $methodFlag are interpreted as for clusterdistance().

See also clusterenc(), clusterdistancematrixenc().
',

 Code => '
  double **datapp   = (double **)pp_alloc($SIZE(n));
  int    **maskpp   = (int    **)pp_alloc($SIZE(n));
  int    transpose=0;
  //
  threadloop %{
    p2pp_dbl($SIZE(n),   $SIZE(d),   $P(data), datapp);
    p2pp_int($SIZE(n),   $SIZE(d),   $P(mask), maskpp);
    //
    loop (ncmps) %{
      int c1 = $cwhich1();
      int c2 = $cwhich2();
      int succ_c1=c1+1;
      int succ_c2=c2+1;
      int beg1 = $coffsets1(k1=>c1);
      int beg2 = $coffsets2(k2=>c2);
      int len1 = $coffsets1(k1=>succ_c1) - beg1;
      int len2 = $coffsets2(k2=>succ_c2) - beg2;
      int *crowids1p = $P(crowids1) + beg1;
      int *crowids2p = $P(crowids2) + beg2;

      $dists() = clusterdistance($SIZE(n), $SIZE(d), datapp, maskpp, $P(weight),
                                 len1,      len2,
                                 crowids1p, crowids2p,
                                 *$COMP(distFlag), *$COMP(methodFlag), transpose);
    %}
  %}
  /*-- cleanup --*/
  if (datapp) free(datapp);
  if (maskpp) free(maskpp);
',
);

##----------------------------------------------------------------------
## Cluster Centroids via Weighted Sum [p(datum_n|cluster_k)]
pp_def
('getclusterwsum',
       Pars => join("\n   ", '',
		    q(double data(d,n);),       ##-- n="rows"|"elts", d="columns"|"features"
		    q(int    mask(d,n);),       ##-- n="rows"|"elts", d="columns"|"features"
		    q(double clusterwts(k,n);), ##-- maps (cluster,elt) to weight [p(elt|cluster)]
		                                ##   : should probably sum to 1 over each cluster (k)
		    q(double [o]cdata(d,k);),   ##-- centroid data
		    q(int    [o]cmask(d,k);),   ##-- centroid mask
		    ''
		   ),
       Code =>
('
 int rid, rwt, cmaskdk;
 loop (d) %{
   loop (k) %{
     cmaskdk = 0;
     loop (n) %{
       if ($mask()) {
         cmaskdk = 1;
	 $cdata() += $clusterwts() * $data();
       }
     %}
     $cmask() = cmaskdk;
   %}
 %}
'),

 Doc => '
Find cluster centroids by weighted sum.  This can be considered an
expensive generalization of the getclustermean() and getclustermedian()
functions.  Here, the input PDLs $data() and $mask(), as well as the
output PDL $cdata() are as for getclustermean().  The matrix $clusterwts()
determines the relative weight of each data row in determining the
centroid of each cluster, potentially useful for "fuzzy" clustering.
The equation used to compute cluster means is:

 $cdata(d,k) = sum_{n} $clusterwts(k,n) * $data(d,n) * $mask(d,n)

For centroids in the same range as data elements, $clusterwts()
should sum to 1 over each column (k):

 all($clusterwts->xchg(0,1)->sumover == 1)

getclustermean() can be simulated by instantiating $clusterwts() with
a uniform distribution over cluster elements:

 $clusterwts = zeroes($k,$n);
 $clusterwts->indexND(cat($clusterids, xvals($clusterids))->xchg(0,1)) .= 1;
 $clusterwts /= $clusterwts->xchg(0,1)->sumover;
 getclusterwsum($data,$mask, $clusterwts, $cdata=zeroes($d,$k));

Similarly, getclustermedian() can be simulated by setting $clusterwts() to
1 for cluster medians and otherwise to 0.  More sophisticated centroid
discovery methods can be computed by this function by setting
$clusterwts(k,n) to some estimate of the conditional probability
of the datum at row $n given the cluster with index $k:
p(Elt==n|Cluster==k).  One
way to achieve such an estimate is to use (normalized inverses of) the
singleton-row-to-cluster distances as output by clusterdistancematrix().

'
);

##----------------------------------------------------------------------
## Attach data to nearest centroid
pp_def
('attachtonearest',
       Pars => join("\n   ", '',
		    q(double data(d,n);),         ##-- n="rows"|"elts", d="columns"|"features"
		    q(int    mask(d,n);),         ##-- n="rows"|"elts", d="columns"|"features"
		    q(double weight(d);),         ##-- feature weights
		    q(int    rowids(nr);),        ##-- rows to attach
		    q(double cdata(d,k);),        ##-- centroid data
		    q(int    cmask(d,k);),        ##-- centroid mask
		    q(int    [o]clusterids(nr);), ##-- output cluster ids
		    q(double [o]cdist(nr);),      ##-- distances to best clusters
		    ''
		   ),
 OtherPars => join("\n   ", '', 'char *distFlag;', 'char *methodFlag;', ''),
       Code =>
('
  double **datapp  = (double **)pp_alloc($SIZE(n));
  int    **maskpp  = (int    **)pp_alloc($SIZE(n));
  double **cdatapp = (double **)pp_alloc($SIZE(k));
  int    **cmaskpp = (int    **)pp_alloc($SIZE(k));
  double *tmpdatapp[2];
  int    *tmpmaskpp[2];
  int    transpose=0;
  //
  threadloop %{
    int    tmprowid = 0;
    int    tmpctrid = 1;
    int    ni;
    int    ki, kbest;
    double dist, dbest;
    //
    p2pp_dbl($SIZE(n), $SIZE(d), $P(data), datapp);
    p2pp_int($SIZE(n), $SIZE(d), $P(mask), maskpp);
    p2pp_dbl($SIZE(k), $SIZE(d), $P(cdata), cdatapp);
    p2pp_int($SIZE(k), $SIZE(d), $P(cmask), cmaskpp);
    //
    /*-- loop over all target rows --*/
    loop (nr) %{
      ni = $rowids();
      tmpdatapp[tmprowid] = datapp[ni];
      tmpmaskpp[tmprowid] = maskpp[ni];
      //
      /*-- initialize --*/
      tmpdatapp[tmpctrid] = cdatapp[0];
      tmpmaskpp[tmpctrid] = cmaskpp[0];
      kbest = 0;
      dbest = clusterdistance(2, $SIZE(d), tmpdatapp, tmpmaskpp, $P(weight),
                              1, 1, &tmprowid, &tmpctrid,
                              *$COMP(distFlag), *$COMP(methodFlag), transpose);
      //
      /*-- loop over all centroids --*/
      for (ki=1; ki < $SIZE(k); ki++) {
        tmpdatapp[tmpctrid] = cdatapp[ki];
        tmpmaskpp[tmpctrid] = cmaskpp[ki];
        //
        dist = clusterdistance(2, $SIZE(d), tmpdatapp, tmpmaskpp, $P(weight),
                               1, 1, &tmprowid, &tmpctrid,
                               *$COMP(distFlag), *$COMP(methodFlag), transpose);
        if (dist < dbest) {
          kbest = ki;
          dbest = dist;
        }
      }
      //
      /*-- save best data --*/
      $clusterids() = kbest;
      $cdist()      = dbest;
    %}
  %}
  //
  /*-- cleanup --*/
  if (datapp) free(datapp);
  if (maskpp) free(maskpp);
  if (cdatapp) free(cdatapp);
  if (cmaskpp) free(cmaskpp);
'),

 Doc => '
Assigns each specified data row to the nearest cluster centroid.
Data elements are given by $data() and $mask(), feature weights are
given by $weight(), as usual.  Cluster centroids are defined by
by $cdata() and $cmask(), and the indices of rows to be attached
are given in the vector $rowids().  The output vector $clusterids()
contains for each specified row index the identifier of the nearest
cluster centroid.  The vector $cdist() contains the distance to
the best clusters.

See also: clusterdistancematrix(), attachtonearestd().
'
);


##----------------------------------------------------------------------
## Attach data to nearest centroid, given datum-to-centroid distance matrix
pp_add_exported('','attachtonearestd');

pp_addpm(<<'EOPM');

=pod

=head2 attachtonearestd

=for sig

  Signature: (
   double cdistmat(k,n);
   int rowids(nr);
   int [o]clusterids(nr);
   double [o]dists(nr);
   )

Assigns each specified data row to the nearest cluster centroid,
as for attachtonearest(), given the datum-to-cluster distance
matrix $cdistmat().  Currently just a wrapper for a few PDL calls.
In scalar context returns $clusterids(), in list context returns
the list ($clusterids(),$dists()).

=cut

sub attachtonearestd {
  my ($cdm,$rowids,$cids,$dists)=@_;
  $cids = zeroes(long, $rowids->dim(0))    if (!defined($cids));
  $dists = zeroes(double, $rowids->dim(0)) if (!defined($dists));

  ##-- dice matrix
  my $cdmr   = $cdm->dice_axis(1,$rowids);

  ##-- get best
  $cdmr->minimum_ind($cids);
  $dists .= $cdmr->index($cids);

  return wantarray ? ($cids,$dists) : $cids;
}

EOPM

##======================================================================
## Cluster assignment: checking
##======================================================================

##----------------------------------------------------------------------
## checkprototypes(): ensure no repeats of $k values in the range [0,$n(



( run in 1.095 second using v1.01-cache-2.11-cpan-f56aa216473 )