Back to index

python-biopython  1.60
cluster.c
Go to the documentation of this file.
00001 /* The C clustering library.
00002  * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
00003  *
00004  * This library was written at the Laboratory of DNA Information Analysis,
00005  * Human Genome Center, Institute of Medical Science, University of Tokyo,
00006  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
00007  * Contact: mdehoon 'AT' gsc.riken.jp
00008  * 
00009  * Permission to use, copy, modify, and distribute this software and its
00010  * documentation with or without modifications and for any purpose and
00011  * without fee is hereby granted, provided that any copyright notices
00012  * appear in all copies and that both those copyright notices and this
00013  * permission notice appear in supporting documentation, and that the
00014  * names of the contributors or copyright holders not be used in
00015  * advertising or publicity pertaining to distribution of the software
00016  * without specific prior permission.
00017  * 
00018  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
00019  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
00020  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
00021  * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
00022  * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
00023  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
00024  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
00025  * OR PERFORMANCE OF THIS SOFTWARE.
00026  * 
00027  */
00028 
00029 #include <time.h>
00030 #include <stdlib.h>
00031 #include <math.h>
00032 #include <float.h>
00033 #include <limits.h>
00034 #include <string.h>
00035 #include "cluster.h"
00036 #ifdef WINDOWS
00037 #  include <windows.h>
00038 #endif
00039 
00040 /* ************************************************************************ */
00041 
00042 #ifdef WINDOWS
00043 /* Then we make a Windows DLL */
00044 int WINAPI
00045 clusterdll_init (HANDLE h, DWORD reason, void* foo)
00046 {
00047   return 1;
00048 }
00049 #endif
00050 
00051 /* ************************************************************************ */
00052 
00053 double mean(int n, double x[])
00054 { double result = 0.;
00055   int i;
00056   for (i = 0; i < n; i++) result += x[i];
00057   result /= n;
00058   return result;
00059 }
00060 
00061 /* ************************************************************************ */
00062 
00063 double median (int n, double x[])
00064 /*
00065 Find the median of X(1), ... , X(N), using as much of the quicksort
00066 algorithm as is needed to isolate it.
00067 N.B. On exit, the array X is partially ordered.
00068 Based on Alan J. Miller's median.f90 routine.
00069 */
00070 
00071 { int i, j;
00072   int nr = n / 2;
00073   int nl = nr - 1;
00074   int even = 0;
00075   /* hi & lo are position limits encompassing the median. */
00076   int lo = 0;
00077   int hi = n-1;
00078 
00079   if (n==2*nr) even = 1;
00080   if (n<3)
00081   { if (n<1) return 0.;
00082     if (n == 1) return x[0];
00083     return 0.5*(x[0]+x[1]);
00084   }
00085 
00086   /* Find median of 1st, middle & last values. */
00087   do
00088   { int loop;
00089     int mid = (lo + hi)/2;
00090     double result = x[mid];
00091     double xlo = x[lo];
00092     double xhi = x[hi];
00093     if (xhi<xlo)
00094     { double temp = xlo;
00095       xlo = xhi;
00096       xhi = temp;
00097     }
00098     if (result>xhi) result = xhi;
00099     else if (result<xlo) result = xlo;
00100     /* The basic quicksort algorithm to move all values <= the sort key (XMED)
00101      * to the left-hand end, and all higher values to the other end.
00102      */
00103     i = lo;
00104     j = hi;
00105     do
00106     { while (x[i]<result) i++;
00107       while (x[j]>result) j--;
00108       loop = 0;
00109       if (i<j)
00110       { double temp = x[i];
00111         x[i] = x[j];
00112         x[j] = temp;
00113         i++;
00114         j--;
00115         if (i<=j) loop = 1;
00116       }
00117     } while (loop); /* Decide which half the median is in. */
00118 
00119     if (even)
00120     { if (j==nl && i==nr)
00121         /* Special case, n even, j = n/2 & i = j + 1, so the median is
00122          * between the two halves of the series.   Find max. of the first
00123          * half & min. of the second half, then average.
00124          */
00125         { int k;
00126           double xmax = x[0];
00127           double xmin = x[n-1];
00128           for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
00129           for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
00130           return 0.5*(xmin + xmax);
00131         }
00132       if (j<nl) lo = i;
00133       if (i>nr) hi = j;
00134       if (i==j)
00135       { if (i==nl) lo = nl;
00136         if (j==nr) hi = nr;
00137       }
00138     }
00139     else
00140     { if (j<nr) lo = i;
00141       if (i>nr) hi = j;
00142       /* Test whether median has been isolated. */
00143       if (i==j && i==nr) return result;
00144     }
00145   }
00146   while (lo<hi-1);
00147 
00148   if (even) return (0.5*(x[nl]+x[nr]));
00149   if (x[lo]>x[hi])
00150   { double temp = x[lo];
00151     x[lo] = x[hi];
00152     x[hi] = temp;
00153   }
00154   return x[nr];
00155 }
00156 
00157 /* ********************************************************************** */
00158 
00159 static const double* sortdata = NULL; /* used in the quicksort algorithm */
00160 
00161 /* ---------------------------------------------------------------------- */
00162 
00163 static
00164 int compare(const void* a, const void* b)
00165 /* Helper function for sort. Previously, this was a nested function under
00166  * sort, which is not allowed under ANSI C.
00167  */
00168 { const int i1 = *(const int*)a;
00169   const int i2 = *(const int*)b;
00170   const double term1 = sortdata[i1];
00171   const double term2 = sortdata[i2];
00172   if (term1 < term2) return -1;
00173   if (term1 > term2) return +1;
00174   return 0;
00175 }
00176 
00177 /* ---------------------------------------------------------------------- */
00178 
00179 void sort(int n, const double data[], int index[])
00180 /* Sets up an index table given the data, such that data[index[]] is in
00181  * increasing order. Sorting is done on the indices; the array data
00182  * is unchanged.
00183  */
00184 { int i;
00185   sortdata = data;
00186   for (i = 0; i < n; i++) index[i] = i;
00187   qsort(index, n, sizeof(int), compare);
00188 }
00189 
00190 /* ********************************************************************** */
00191 
00192 static double* getrank (int n, double data[])
00193 /* Calculates the ranks of the elements in the array data. Two elements with
00194  * the same value get the same rank, equal to the average of the ranks had the
00195  * elements different values. The ranks are returned as a newly allocated
00196  * array that should be freed by the calling routine. If getrank fails due to
00197  * a memory allocation error, it returns NULL.
00198  */
00199 { int i;
00200   double* rank;
00201   int* index;
00202   rank = malloc(n*sizeof(double));
00203   if (!rank) return NULL;
00204   index = malloc(n*sizeof(int));
00205   if (!index)
00206   { free(rank);
00207     return NULL;
00208   }
00209   /* Call sort to get an index table */
00210   sort (n, data, index);
00211   /* Build a rank table */
00212   for (i = 0; i < n; i++) rank[index[i]] = i;
00213   /* Fix for equal ranks */
00214   i = 0;
00215   while (i < n)
00216   { int m;
00217     double value = data[index[i]];
00218     int j = i + 1;
00219     while (j < n && data[index[j]] == value) j++;
00220     m = j - i; /* number of equal ranks found */
00221     value = rank[index[i]] + (m-1)/2.;
00222     for (j = i; j < i + m; j++) rank[index[j]] = value;
00223     i += m;
00224   }
00225   free (index);
00226   return rank;
00227 }
00228 
00229 /* ---------------------------------------------------------------------- */
00230 
00231 static int
00232 makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
00233 { int i;
00234   double** data;
00235   int** mask;
00236   data = malloc(nrows*sizeof(double*));
00237   if(!data) return 0;
00238   mask = malloc(nrows*sizeof(int*));
00239   if(!mask)
00240   { free(data);
00241     return 0;
00242   }
00243   for (i = 0; i < nrows; i++)
00244   { data[i] = malloc(ncols*sizeof(double));
00245     if(!data[i]) break;
00246     mask[i] = malloc(ncols*sizeof(int));
00247     if(!mask[i])
00248     { free(data[i]);
00249       break;
00250     }
00251   }
00252   if (i==nrows) /* break not encountered */
00253   { *pdata = data;
00254     *pmask = mask;
00255     return 1;
00256   }
00257   *pdata = NULL;
00258   *pmask = NULL;
00259   nrows = i;
00260   for (i = 0; i < nrows; i++)
00261   { free(data[i]);
00262     free(mask[i]);
00263   }
00264   free(data);
00265   free(mask);
00266   return 0;
00267 }
00268 
00269 /* ---------------------------------------------------------------------- */
00270 
00271 static void
00272 freedatamask(int n, double** data, int** mask)
00273 { int i;
00274   for (i = 0; i < n; i++)
00275   { free(mask[i]);
00276     free(data[i]);
00277   }
00278   free(mask);
00279   free(data);
00280 }
00281 
00282 /* ---------------------------------------------------------------------- */
00283 
00284 static
00285 double find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
00286 /*
00287 This function searches the distance matrix to find the pair with the shortest
00288 distance between them. The indices of the pair are returned in ip and jp; the
00289 distance itself is returned by the function.
00290 
00291 n          (input) int
00292 The number of elements in the distance matrix.
00293 
00294 distmatrix (input) double**
00295 A ragged array containing the distance matrix. The number of columns in each
00296 row is one less than the row index.
00297 
00298 ip         (output) int*
00299 A pointer to the integer that is to receive the first index of the pair with
00300 the shortest distance.
00301 
00302 jp         (output) int*
00303 A pointer to the integer that is to receive the second index of the pair with
00304 the shortest distance.
00305 */
00306 { int i, j;
00307   double temp;
00308   double distance = distmatrix[1][0];
00309   *ip = 1;
00310   *jp = 0;
00311   for (i = 1; i < n; i++)
00312   { for (j = 0; j < i; j++)
00313     { temp = distmatrix[i][j];
00314       if (temp<distance)
00315       { distance = temp;
00316         *ip = i;
00317         *jp = j;
00318       }
00319     }
00320   }
00321   return distance;
00322 }
00323 
00324 /* ********************************************************************* */
00325 
00326 static int svd(int m, int n, double** u, double w[], double** vt)
00327 /*
00328  *   This subroutine is a translation of the Algol procedure svd,
00329  *   Num. Math. 14, 403-420(1970) by Golub and Reinsch.
00330  *   Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
00331  *
00332  *   This subroutine determines the singular value decomposition
00333  *        t
00334  *   A=usv  of a real m by n rectangular matrix, where m is greater
00335  *   than or equal to n.  Householder bidiagonalization and a variant
00336  *   of the QR algorithm are used.
00337  *  
00338  *
00339  *   On input.
00340  *
00341  *      m is the number of rows of A (and u).
00342  *
00343  *      n is the number of columns of A (and u) and the order of v.
00344  *
00345  *      u contains the rectangular input matrix A to be decomposed.
00346  *
00347  *   On output.
00348  *
00349  *      the routine returns an integer ierr equal to
00350  *        0          to indicate a normal return,
00351  *        k          if the k-th singular value has not been
00352  *                   determined after 30 iterations,
00353  *        -1         if memory allocation fails.
00354  *
00355  *
00356  *     w  contains the n (non-negative) singular values of a (the
00357  *        diagonal elements of s).  they are unordered.  if an
00358  *        error exit is made, the singular values should be correct
00359  *        for indices ierr+1,ierr+2,...,n.
00360  *
00361  *
00362  *     u  contains the matrix u (orthogonal column vectors) of the
00363  *        decomposition.
00364  *        if an error exit is made, the columns of u corresponding
00365  *        to indices of correct singular values should be correct.
00366  *
00367  *                             t
00368  *     vt contains the matrix v (orthogonal) of the decomposition.
00369  *        if an error exit is made, the columns of v corresponding
00370  *        to indices of correct singular values should be correct.
00371  *
00372  *
00373  *   Questions and comments should be directed to B. S. Garbow,
00374  *   Applied Mathematics division, Argonne National Laboratory
00375  *
00376  *   Modified to eliminate machep
00377  *
00378  *   Translated to C by Michiel de Hoon, Human Genome Center,
00379  *   University of Tokyo, for inclusion in the C Clustering Library.
00380  *   This routine is less general than the original svd routine, as
00381  *   it focuses on the singular value decomposition as needed for
00382  *   clustering. In particular,
00383  *     - We calculate both u and v in all cases
00384  *     - We pass the input array A via u; this array is subsequently
00385  *       overwritten.
00386  *     - We allocate for the array rv1, used as a working space,
00387  *       internally in this routine, instead of passing it as an
00388  *       argument. If the allocation fails, svd returns -1.
00389  *   2003.06.05
00390  */
00391 { int i, j, k, i1, k1, l1, its;
00392   double c,f,h,s,x,y,z;
00393   int l = 0;
00394   int ierr = 0;
00395   double g = 0.0;
00396   double scale = 0.0;
00397   double anorm = 0.0;
00398   double* rv1 = malloc(n*sizeof(double));
00399   if (!rv1) return -1;
00400   if (m >= n)
00401   { /* Householder reduction to bidiagonal form */
00402     for (i = 0; i < n; i++)
00403     { l = i + 1;
00404       rv1[i] = scale * g;
00405       g = 0.0;
00406       s = 0.0;
00407       scale = 0.0;
00408       for (k = i; k < m; k++) scale += fabs(u[k][i]);
00409       if (scale != 0.0)
00410       { for (k = i; k < m; k++)
00411         { u[k][i] /= scale;
00412           s += u[k][i]*u[k][i];
00413         }
00414         f = u[i][i];
00415         g = (f >= 0) ? -sqrt(s) : sqrt(s);
00416         h = f * g - s;
00417         u[i][i] = f - g;
00418         if (i < n-1)
00419         { for (j = l; j < n; j++)
00420           { s = 0.0;
00421             for (k = i; k < m; k++) s += u[k][i] * u[k][j];
00422             f = s / h;
00423             for (k = i; k < m; k++) u[k][j] += f * u[k][i];
00424           }
00425         }
00426         for (k = i; k < m; k++) u[k][i] *= scale;
00427       }
00428       w[i] = scale * g;
00429       g = 0.0;
00430       s = 0.0;
00431       scale = 0.0;
00432       if (i<n-1)
00433       { for (k = l; k < n; k++) scale += fabs(u[i][k]);
00434         if (scale != 0.0)
00435         { for (k = l; k < n; k++)
00436           { u[i][k] /= scale;
00437             s += u[i][k] * u[i][k];
00438           }
00439           f = u[i][l];
00440           g = (f >= 0) ? -sqrt(s) : sqrt(s);
00441           h = f * g - s;
00442           u[i][l] = f - g;
00443           for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
00444           for (j = l; j < m; j++)
00445           { s = 0.0;
00446             for (k = l; k < n; k++) s += u[j][k] * u[i][k];
00447             for (k = l; k < n; k++) u[j][k] += s * rv1[k];
00448           }
00449           for (k = l; k < n; k++)  u[i][k] *= scale;
00450         }
00451       }
00452       anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
00453     }
00454     /* accumulation of right-hand transformations */
00455     for (i = n-1; i>=0; i--)
00456     { if (i < n-1)
00457       { if (g != 0.0)
00458         { for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
00459           /* double division avoids possible underflow */
00460           for (j = l; j < n; j++)
00461           { s = 0.0;
00462             for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
00463             for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
00464           }
00465         }
00466       }
00467       for (j = l; j < n; j++)
00468       { vt[j][i] = 0.0;
00469         vt[i][j] = 0.0;
00470       }
00471       vt[i][i] = 1.0;
00472       g = rv1[i];
00473       l = i;
00474     }
00475     /* accumulation of left-hand transformations */
00476     for (i = n-1; i >= 0; i--)
00477     { l = i + 1;
00478       g = w[i];
00479       if (i!=n-1)
00480         for (j = l; j < n; j++) u[i][j] = 0.0;
00481       if (g!=0.0)
00482       { if (i!=n-1)
00483         { for (j = l; j < n; j++)
00484           { s = 0.0;
00485             for (k = l; k < m; k++) s += u[k][i] * u[k][j];
00486             /* double division avoids possible underflow */
00487             f = (s / u[i][i]) / g;
00488             for (k = i; k < m; k++) u[k][j] += f * u[k][i];
00489           }
00490         }
00491         for (j = i; j < m; j++) u[j][i] /= g;
00492       }
00493       else
00494         for (j = i; j < m; j++) u[j][i] = 0.0;
00495       u[i][i] += 1.0;
00496     }
00497     /* diagonalization of the bidiagonal form */
00498     for (k = n-1; k >= 0; k--)
00499     { k1 = k-1;
00500       its = 0;
00501       while(1)
00502       /* test for splitting */
00503       { for (l = k; l >= 0; l--)
00504         { l1 = l-1;
00505           if (fabs(rv1[l]) + anorm == anorm) break;
00506           /* rv1[0] is always zero, so there is no exit
00507            * through the bottom of the loop */
00508           if (fabs(w[l1]) + anorm == anorm)
00509           /* cancellation of rv1[l] if l greater than 0 */
00510           { c = 0.0;
00511             s = 1.0;
00512             for (i = l; i <= k; i++)
00513             { f = s * rv1[i];
00514               rv1[i] *= c;
00515               if (fabs(f) + anorm == anorm) break;
00516               g = w[i];
00517               h = sqrt(f*f+g*g);
00518               w[i] = h;
00519               c = g / h;
00520               s = -f / h;
00521               for (j = 0; j < m; j++)
00522               { y = u[j][l1];
00523                 z = u[j][i];
00524                 u[j][l1] = y * c + z * s;
00525                 u[j][i] = -y * s + z * c;
00526               }
00527             }
00528             break;
00529           }
00530         }
00531         /* test for convergence */
00532         z = w[k];
00533         if (l==k) /* convergence */
00534         { if (z < 0.0)
00535           /*  w[k] is made non-negative */
00536           { w[k] = -z;
00537             for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
00538           }
00539           break;
00540         }
00541         else if (its==30)
00542         { ierr = k;
00543           break;
00544         }
00545         else
00546         /* shift from bottom 2 by 2 minor */
00547         { its++;
00548           x = w[l];
00549           y = w[k1];
00550           g = rv1[k1];
00551           h = rv1[k];
00552           f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
00553           g = sqrt(f*f+1.0);
00554           f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
00555           /* next qr transformation */
00556           c = 1.0;
00557           s = 1.0;
00558           for (i1 = l; i1 <= k1; i1++)
00559           { i = i1 + 1;
00560             g = rv1[i];
00561             y = w[i];
00562             h = s * g;
00563             g = c * g;
00564             z = sqrt(f*f+h*h);
00565             rv1[i1] = z;
00566             c = f / z;
00567             s = h / z;
00568             f = x * c + g * s;
00569             g = -x * s + g * c;
00570             h = y * s;
00571             y = y * c;
00572             for (j = 0; j < n; j++)
00573             { x = vt[i1][j];
00574               z = vt[i][j];
00575               vt[i1][j] = x * c + z * s;
00576               vt[i][j] = -x * s + z * c;
00577             }
00578             z = sqrt(f*f+h*h);
00579             w[i1] = z;
00580             /* rotation can be arbitrary if z is zero */
00581             if (z!=0.0)
00582             { c = f / z;
00583               s = h / z;
00584             }
00585             f = c * g + s * y;
00586             x = -s * g + c * y;
00587             for (j = 0; j < m; j++)
00588             { y = u[j][i1];
00589               z = u[j][i];
00590               u[j][i1] = y * c + z * s;
00591               u[j][i] = -y * s + z * c;
00592             }
00593           }
00594           rv1[l] = 0.0;
00595           rv1[k] = f;
00596           w[k] = x;
00597         }
00598       }
00599     }
00600   }
00601   else /* m < n */
00602   { /* Householder reduction to bidiagonal form */
00603     for (i = 0; i < m; i++)
00604     { l = i + 1;
00605       rv1[i] = scale * g;
00606       g = 0.0;
00607       s = 0.0;
00608       scale = 0.0;
00609       for (k = i; k < n; k++) scale += fabs(u[i][k]);
00610       if (scale != 0.0)
00611       { for (k = i; k < n; k++)
00612         { u[i][k] /= scale;
00613           s += u[i][k]*u[i][k];
00614         }
00615         f = u[i][i];
00616         g = (f >= 0) ? -sqrt(s) : sqrt(s);
00617         h = f * g - s;
00618         u[i][i] = f - g;
00619         if (i < m-1)
00620         { for (j = l; j < m; j++)
00621           { s = 0.0;
00622             for (k = i; k < n; k++) s += u[i][k] * u[j][k];
00623             f = s / h;
00624             for (k = i; k < n; k++) u[j][k] += f * u[i][k];
00625           }
00626         }
00627         for (k = i; k < n; k++) u[i][k] *= scale;
00628       }
00629       w[i] = scale * g;
00630       g = 0.0;
00631       s = 0.0;
00632       scale = 0.0;
00633       if (i<m-1)
00634       { for (k = l; k < m; k++) scale += fabs(u[k][i]);
00635         if (scale != 0.0)
00636         { for (k = l; k < m; k++)
00637           { u[k][i] /= scale;
00638             s += u[k][i] * u[k][i];
00639           }
00640           f = u[l][i];
00641           g = (f >= 0) ? -sqrt(s) : sqrt(s);
00642           h = f * g - s;
00643           u[l][i] = f - g;
00644           for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
00645           for (j = l; j < n; j++)
00646           { s = 0.0;
00647             for (k = l; k < m; k++) s += u[k][j] * u[k][i];
00648             for (k = l; k < m; k++) u[k][j] += s * rv1[k];
00649           }
00650           for (k = l; k < m; k++)  u[k][i] *= scale;
00651         }
00652       }
00653       anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
00654     }
00655     /* accumulation of right-hand transformations */
00656     for (i = m-1; i>=0; i--)
00657     { if (i < m-1)
00658       { if (g != 0.0)
00659         { for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
00660           /* double division avoids possible underflow */
00661           for (j = l; j < m; j++)
00662           { s = 0.0;
00663             for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
00664             for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
00665           }
00666         }
00667       }
00668       for (j = l; j < m; j++)
00669       { vt[i][j] = 0.0;
00670         vt[j][i] = 0.0;
00671       }
00672       vt[i][i] = 1.0;
00673       g = rv1[i];
00674       l = i;
00675     }
00676     /* accumulation of left-hand transformations */
00677     for (i = m-1; i >= 0; i--)
00678     { l = i + 1;
00679       g = w[i];
00680       if (i!=m-1)
00681         for (j = l; j < m; j++) u[j][i] = 0.0;
00682       if (g!=0.0)
00683       { if (i!=m-1)
00684         { for (j = l; j < m; j++)
00685           { s = 0.0;
00686             for (k = l; k < n; k++) s += u[i][k] * u[j][k];
00687             /* double division avoids possible underflow */
00688             f = (s / u[i][i]) / g;
00689             for (k = i; k < n; k++) u[j][k] += f * u[i][k];
00690           }
00691         }
00692         for (j = i; j < n; j++) u[i][j] /= g;
00693       }
00694       else
00695         for (j = i; j < n; j++) u[i][j] = 0.0;
00696       u[i][i] += 1.0;
00697     }
00698     /* diagonalization of the bidiagonal form */
00699     for (k = m-1; k >= 0; k--)
00700     { k1 = k-1;
00701       its = 0;
00702       while(1)
00703       /* test for splitting */
00704       { for (l = k; l >= 0; l--)
00705         { l1 = l-1;
00706           if (fabs(rv1[l]) + anorm == anorm) break;
00707           /* rv1[0] is always zero, so there is no exit
00708            * through the bottom of the loop */
00709           if (fabs(w[l1]) + anorm == anorm)
00710           /* cancellation of rv1[l] if l greater than 0 */
00711           { c = 0.0;
00712             s = 1.0;
00713             for (i = l; i <= k; i++)
00714             { f = s * rv1[i];
00715               rv1[i] *= c;
00716               if (fabs(f) + anorm == anorm) break;
00717               g = w[i];
00718               h = sqrt(f*f+g*g);
00719               w[i] = h;
00720               c = g / h;
00721               s = -f / h;
00722               for (j = 0; j < n; j++)
00723               { y = u[l1][j];
00724                 z = u[i][j];
00725                 u[l1][j] = y * c + z * s;
00726                 u[i][j] = -y * s + z * c;
00727               }
00728             }
00729             break;
00730           }
00731         }
00732         /* test for convergence */
00733         z = w[k];
00734         if (l==k) /* convergence */
00735         { if (z < 0.0)
00736           /*  w[k] is made non-negative */
00737           { w[k] = -z;
00738             for (j = 0; j < m; j++) vt[j][k] = -vt[j][k];
00739           }
00740           break;
00741         }
00742         else if (its==30)
00743         { ierr = k;
00744           break;
00745         }
00746         else
00747         /* shift from bottom 2 by 2 minor */
00748         { its++;
00749           x = w[l];
00750           y = w[k1];
00751           g = rv1[k1];
00752           h = rv1[k];
00753           f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
00754           g = sqrt(f*f+1.0);
00755           f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
00756           /* next qr transformation */
00757           c = 1.0;
00758           s = 1.0;
00759           for (i1 = l; i1 <= k1; i1++)
00760           { i = i1 + 1;
00761             g = rv1[i];
00762             y = w[i];
00763             h = s * g;
00764             g = c * g;
00765             z = sqrt(f*f+h*h);
00766             rv1[i1] = z;
00767             c = f / z;
00768             s = h / z;
00769             f = x * c + g * s;
00770             g = -x * s + g * c;
00771             h = y * s;
00772             y = y * c;
00773             for (j = 0; j < m; j++)
00774             { x = vt[j][i1];
00775               z = vt[j][i];
00776               vt[j][i1] = x * c + z * s;
00777               vt[j][i] = -x * s + z * c;
00778             }
00779             z = sqrt(f*f+h*h);
00780             w[i1] = z;
00781             /* rotation can be arbitrary if z is zero */
00782             if (z!=0.0)
00783             { c = f / z;
00784               s = h / z;
00785             }
00786             f = c * g + s * y;
00787             x = -s * g + c * y;
00788             for (j = 0; j < n; j++)
00789             { y = u[i1][j];
00790               z = u[i][j];
00791               u[i1][j] = y * c + z * s;
00792               u[i][j] = -y * s + z * c;
00793             }
00794           }
00795           rv1[l] = 0.0;
00796           rv1[k] = f;
00797           w[k] = x;
00798         }
00799       }
00800     }
00801   }
00802   free(rv1);
00803   return ierr;
00804 }
00805 
00806 /* ********************************************************************* */
00807 
00808 int pca(int nrows, int ncolumns, double** u, double** v, double* w)
00809 /*
00810 Purpose
00811 =======
00812 
00813 This subroutine uses the singular value decomposition to perform principal
00814 components analysis of a real nrows by ncolumns rectangular matrix.
00815 
00816 Arguments
00817 =========
00818 
00819 nrows     (input) int
00820 The number of rows in the matrix u.
00821 
00822 ncolumns  (input) int
00823 The number of columns in the matrix v.
00824 
00825 u          (input) double[nrows][ncolumns]
00826 On input, the array containing the data to which the principal component
00827 analysis should be applied. The function assumes that the mean has already been
00828 subtracted of each column, and hence that the mean of each column is zero.
00829 On output, see below.
00830 
00831 v          (input) double[n][n], where n = min(nrows, ncolumns)
00832 Not used on input.
00833 
00834 w          (input) double[n], where n = min(nrows, ncolumns)
00835 Not used on input.
00836 
00837 
00838 Return value
00839 ============
00840 
00841 On output:
00842 
00843 If nrows >= ncolumns, then
00844 
00845 u contains the coordinates with respect to the principal components;
00846 v contains the principal component vectors.
00847 
00848 The dot product u . v reproduces the data that were passed in u.
00849 
00850 
00851 If nrows < ncolumns, then
00852 
00853 u contains the principal component vectors;
00854 v contains the coordinates with respect to the principal components.
00855 
00856 The dot product v . u reproduces the data that were passed in u.
00857 
00858 The eigenvalues of the covariance matrix are returned in w.
00859 
00860 The arrays u, v, and w are sorted according to eigenvalue, with the largest
00861 eigenvalues appearing first.
00862 
00863 The function returns 0 if successful, -1 if memory allocation fails, and a
00864 positive integer if the singular value decomposition fails to converge.
00865 */
00866 {
00867     int i;
00868     int j;
00869     int error;
00870     int* index = malloc(ncolumns*sizeof(int));
00871     double* temp = malloc(ncolumns*sizeof(double));
00872     if (!index || !temp)
00873     {   if (index) free(index);
00874         if (temp) free(temp);
00875         return -1;
00876     }
00877     error = svd(nrows, ncolumns, u, w, v);
00878     if (error==0)
00879     {
00880         if (nrows >= ncolumns)
00881         {   for (j = 0; j < ncolumns; j++)
00882             {   const double s = w[j];
00883                 for (i = 0; i < nrows; i++) u[i][j] *= s;
00884             }
00885             sort(ncolumns, w, index);
00886             for (i = 0; i < ncolumns/2; i++)
00887             {   j = index[i];
00888                 index[i] = index[ncolumns-1-i];
00889                 index[ncolumns-1-i] = j;
00890             }
00891             for (i = 0; i < nrows; i++)
00892             {   for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
00893                 for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
00894             }
00895             for (i = 0; i < ncolumns; i++)
00896             {   for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
00897                 for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
00898             }
00899             for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
00900             for (i = 0; i < ncolumns; i++) w[i] = temp[i];
00901         }
00902         else /* nrows < ncolumns */
00903         {   for (j = 0; j < nrows; j++)
00904             {   const double s = w[j];
00905                 for (i = 0; i < nrows; i++) v[i][j] *= s;
00906             }
00907             sort(nrows, w, index);
00908             for (i = 0; i < nrows/2; i++)
00909             {   j = index[i];
00910                 index[i] = index[nrows-1-i];
00911                 index[nrows-1-i] = j;
00912             }
00913             for (j = 0; j < ncolumns; j++)
00914             {   for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
00915                 for (i = 0; i < nrows; i++) u[i][j] = temp[i];
00916             }
00917             for (j = 0; j < nrows; j++)
00918             {   for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
00919                 for (i = 0; i < nrows; i++) v[j][i] = temp[i];
00920             }
00921             for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
00922             for (i = 0; i < nrows; i++) w[i] = temp[i];
00923         }
00924     }
00925     free(index);
00926     free(temp);
00927     return error;
00928 }
00929 
00930 /* ********************************************************************* */
00931 
00932 static
00933 double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
00934   const double weight[], int index1, int index2, int transpose)
00935  
00936 /*
00937 Purpose
00938 =======
00939 
00940 The euclid routine calculates the weighted Euclidean distance between two
00941 rows or columns in a matrix.
00942 
00943 Arguments
00944 =========
00945 
00946 n      (input) int
00947 The number of elements in a row or column. If transpose==0, then n is the number
00948 of columns; otherwise, n is the number of rows.
00949 
00950 data1  (input) double array
00951 The data array containing the first vector.
00952 
00953 data2  (input) double array
00954 The data array containing the second vector.
00955 
00956 mask1  (input) int array
00957 This array which elements in data1 are missing. If mask1[i][j]==0, then
00958 data1[i][j] is missing.
00959 
00960 mask2  (input) int array
00961 This array which elements in data2 are missing. If mask2[i][j]==0, then
00962 data2[i][j] is missing.
00963 
00964 weight (input) double[n]
00965 The weights that are used to calculate the distance.
00966 
00967 index1     (input) int
00968 Index of the first row or column.
00969 
00970 index2     (input) int
00971 Index of the second row or column.
00972 
00973 transpose (input) int
00974 If transpose==0, the distance between two rows in the matrix is calculated.
00975 Otherwise, the distance between two columns in the matrix is calculated.
00976 
00977 ============================================================================
00978 */
00979 { double result = 0.;
00980   double tweight = 0;
00981   int i;
00982   if (transpose==0) /* Calculate the distance between two rows */
00983   { for (i = 0; i < n; i++)
00984     { if (mask1[index1][i] && mask2[index2][i])
00985       { double term = data1[index1][i] - data2[index2][i];
00986         result += weight[i]*term*term;
00987         tweight += weight[i];
00988       }
00989     }
00990   }
00991   else
00992   { for (i = 0; i < n; i++)
00993     { if (mask1[i][index1] && mask2[i][index2])
00994       { double term = data1[i][index1] - data2[i][index2];
00995         result += weight[i]*term*term;
00996         tweight += weight[i];
00997       }
00998     }
00999   }
01000   if (!tweight) return 0; /* usually due to empty clusters */
01001   result /= tweight;
01002   return result;
01003 }
01004 
01005 /* ********************************************************************* */
01006 
01007 static
01008 double cityblock (int n, double** data1, double** data2, int** mask1,
01009   int** mask2, const double weight[], int index1, int index2, int transpose)
01010 
01011 /*
01012 Purpose
01013 =======
01014 
01015 The cityblock routine calculates the weighted "City Block" distance between
01016 two rows or columns in a matrix. City Block distance is defined as the
01017 absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
01018 equivalent to taking an "up and over" path.
01019 
01020 Arguments
01021 =========
01022 
01023 n      (input) int
01024 The number of elements in a row or column. If transpose==0, then n is the number
01025 of columns; otherwise, n is the number of rows.
01026 
01027 data1  (input) double array
01028 The data array containing the first vector.
01029 
01030 data2  (input) double array
01031 The data array containing the second vector.
01032 
01033 mask1  (input) int array
01034 This array which elements in data1 are missing. If mask1[i][j]==0, then
01035 data1[i][j] is missing.
01036 
01037 mask2  (input) int array
01038 This array which elements in data2 are missing. If mask2[i][j]==0, then
01039 data2[i][j] is missing.
01040 
01041 weight (input) double[n]
01042 The weights that are used to calculate the distance.
01043 
01044 index1     (input) int
01045 Index of the first row or column.
01046 
01047 index2     (input) int
01048 Index of the second row or column.
01049 
01050 transpose (input) int
01051 If transpose==0, the distance between two rows in the matrix is calculated.
01052 Otherwise, the distance between two columns in the matrix is calculated.
01053 
01054 ============================================================================ */
01055 { double result = 0.;
01056   double tweight = 0;
01057   int i;
01058   if (transpose==0) /* Calculate the distance between two rows */
01059   { for (i = 0; i < n; i++)
01060     { if (mask1[index1][i] && mask2[index2][i])
01061       { double term = data1[index1][i] - data2[index2][i];
01062         result = result + weight[i]*fabs(term);
01063         tweight += weight[i];
01064       }
01065     }
01066   }
01067   else
01068   { for (i = 0; i < n; i++)
01069     { if (mask1[i][index1] && mask2[i][index2])
01070       { double term = data1[i][index1] - data2[i][index2];
01071         result = result + weight[i]*fabs(term);
01072         tweight += weight[i];
01073       }
01074     }
01075   }
01076   if (!tweight) return 0; /* usually due to empty clusters */
01077   result /= tweight;
01078   return result;
01079 }
01080 
01081 /* ********************************************************************* */
01082 
01083 static
01084 double correlation (int n, double** data1, double** data2, int** mask1,
01085   int** mask2, const double weight[], int index1, int index2, int transpose)
01086 /*
01087 Purpose
01088 =======
01089 
01090 The correlation routine calculates the weighted Pearson distance between two
01091 rows or columns in a matrix. We define the Pearson distance as one minus the
01092 Pearson correlation.
01093 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
01094 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
01095 (e.g., choose b = a + c).
01096 
01097 Arguments
01098 =========
01099 
01100 n      (input) int
01101 The number of elements in a row or column. If transpose==0, then n is the number
01102 of columns; otherwise, n is the number of rows.
01103 
01104 data1  (input) double array
01105 The data array containing the first vector.
01106 
01107 data2  (input) double array
01108 The data array containing the second vector.
01109 
01110 mask1  (input) int array
01111 This array which elements in data1 are missing. If mask1[i][j]==0, then
01112 data1[i][j] is missing.
01113 
01114 mask2  (input) int array
01115 This array which elements in data2 are missing. If mask2[i][j]==0, then
01116 data2[i][j] is missing.
01117 
01118 weight (input) double[n]
01119 The weights that are used to calculate the distance.
01120 
01121 index1     (input) int
01122 Index of the first row or column.
01123 
01124 index2     (input) int
01125 Index of the second row or column.
01126 
01127 transpose (input) int
01128 If transpose==0, the distance between two rows in the matrix is calculated.
01129 Otherwise, the distance between two columns in the matrix is calculated.
01130 ============================================================================
01131 */
01132 { double result = 0.;
01133   double sum1 = 0.;
01134   double sum2 = 0.;
01135   double denom1 = 0.;
01136   double denom2 = 0.;
01137   double tweight = 0.;
01138   if (transpose==0) /* Calculate the distance between two rows */
01139   { int i;
01140     for (i = 0; i < n; i++)
01141     { if (mask1[index1][i] && mask2[index2][i])
01142       { double term1 = data1[index1][i];
01143         double term2 = data2[index2][i];
01144         double w = weight[i];
01145         sum1 += w*term1;
01146         sum2 += w*term2;
01147         result += w*term1*term2;
01148         denom1 += w*term1*term1;
01149         denom2 += w*term2*term2;
01150         tweight += w;
01151       }
01152     }
01153   }
01154   else
01155   { int i;
01156     for (i = 0; i < n; i++)
01157     { if (mask1[i][index1] && mask2[i][index2])
01158       { double term1 = data1[i][index1];
01159         double term2 = data2[i][index2];
01160         double w = weight[i];
01161         sum1 += w*term1;
01162         sum2 += w*term2;
01163         result += w*term1*term2;
01164         denom1 += w*term1*term1;
01165         denom2 += w*term2*term2;
01166         tweight += w;
01167       }
01168     }
01169   }
01170   if (!tweight) return 0; /* usually due to empty clusters */
01171   result -= sum1 * sum2 / tweight;
01172   denom1 -= sum1 * sum1 / tweight;
01173   denom2 -= sum2 * sum2 / tweight;
01174   if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
01175   if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
01176   result = result / sqrt(denom1*denom2);
01177   result = 1. - result;
01178   return result;
01179 }
01180 
01181 /* ********************************************************************* */
01182 
01183 static
01184 double acorrelation (int n, double** data1, double** data2, int** mask1,
01185   int** mask2, const double weight[], int index1, int index2, int transpose)
01186 /*
01187 Purpose
01188 =======
01189 
01190 The acorrelation routine calculates the weighted Pearson distance between two
01191 rows or columns, using the absolute value of the correlation.
01192 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
01193 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
01194 (e.g., choose b = a + c).
01195 
01196 Arguments
01197 =========
01198 
01199 n      (input) int
01200 The number of elements in a row or column. If transpose==0, then n is the number
01201 of columns; otherwise, n is the number of rows.
01202 
01203 data1  (input) double array
01204 The data array containing the first vector.
01205 
01206 data2  (input) double array
01207 The data array containing the second vector.
01208 
01209 mask1  (input) int array
01210 This array which elements in data1 are missing. If mask1[i][j]==0, then
01211 data1[i][j] is missing.
01212 
01213 mask2  (input) int array
01214 This array which elements in data2 are missing. If mask2[i][j]==0, then
01215 data2[i][j] is missing.
01216 
01217 weight (input) double[n]
01218 The weights that are used to calculate the distance.
01219 
01220 index1     (input) int
01221 Index of the first row or column.
01222 
01223 index2     (input) int
01224 Index of the second row or column.
01225 
01226 transpose (input) int
01227 If transpose==0, the distance between two rows in the matrix is calculated.
01228 Otherwise, the distance between two columns in the matrix is calculated.
01229 ============================================================================
01230 */
01231 { double result = 0.;
01232   double sum1 = 0.;
01233   double sum2 = 0.;
01234   double denom1 = 0.;
01235   double denom2 = 0.;
01236   double tweight = 0.;
01237   if (transpose==0) /* Calculate the distance between two rows */
01238   { int i;
01239     for (i = 0; i < n; i++)
01240     { if (mask1[index1][i] && mask2[index2][i])
01241       { double term1 = data1[index1][i];
01242         double term2 = data2[index2][i];
01243         double w = weight[i];
01244         sum1 += w*term1;
01245         sum2 += w*term2;
01246         result += w*term1*term2;
01247         denom1 += w*term1*term1;
01248         denom2 += w*term2*term2;
01249         tweight += w;
01250       }
01251     }
01252   }
01253   else
01254   { int i;
01255     for (i = 0; i < n; i++)
01256     { if (mask1[i][index1] && mask2[i][index2])
01257       { double term1 = data1[i][index1];
01258         double term2 = data2[i][index2];
01259         double w = weight[i];
01260         sum1 += w*term1;
01261         sum2 += w*term2;
01262         result += w*term1*term2;
01263         denom1 += w*term1*term1;
01264         denom2 += w*term2*term2;
01265         tweight += w;
01266       }
01267     }
01268   }
01269   if (!tweight) return 0; /* usually due to empty clusters */
01270   result -= sum1 * sum2 / tweight;
01271   denom1 -= sum1 * sum1 / tweight;
01272   denom2 -= sum2 * sum2 / tweight;
01273   if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
01274   if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
01275   result = fabs(result) / sqrt(denom1*denom2);
01276   result = 1. - result;
01277   return result;
01278 }
01279 
01280 /* ********************************************************************* */
01281 
01282 static
01283 double ucorrelation (int n, double** data1, double** data2, int** mask1,
01284   int** mask2, const double weight[], int index1, int index2, int transpose)
01285 /*
01286 Purpose
01287 =======
01288 
01289 The ucorrelation routine calculates the weighted Pearson distance between two
01290 rows or columns, using the uncentered version of the Pearson correlation. In the
01291 uncentered Pearson correlation, a zero mean is used for both vectors even if
01292 the actual mean is nonzero.
01293 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
01294 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
01295 (e.g., choose b = a + c).
01296 
01297 Arguments
01298 =========
01299 
01300 n      (input) int
01301 The number of elements in a row or column. If transpose==0, then n is the number
01302 of columns; otherwise, n is the number of rows.
01303 
01304 data1  (input) double array
01305 The data array containing the first vector.
01306 
01307 data2  (input) double array
01308 The data array containing the second vector.
01309 
01310 mask1  (input) int array
01311 This array which elements in data1 are missing. If mask1[i][j]==0, then
01312 data1[i][j] is missing.
01313 
01314 mask2  (input) int array
01315 This array which elements in data2 are missing. If mask2[i][j]==0, then
01316 data2[i][j] is missing.
01317 
01318 weight (input) double[n]
01319 The weights that are used to calculate the distance.
01320 
01321 index1     (input) int
01322 Index of the first row or column.
01323 
01324 index2     (input) int
01325 Index of the second row or column.
01326 
01327 transpose (input) int
01328 If transpose==0, the distance between two rows in the matrix is calculated.
01329 Otherwise, the distance between two columns in the matrix is calculated.
01330 ============================================================================
01331 */
01332 { double result = 0.;
01333   double denom1 = 0.;
01334   double denom2 = 0.;
01335   int flag = 0;
01336   /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
01337    * found.
01338    */
01339   if (transpose==0) /* Calculate the distance between two rows */
01340   { int i;
01341     for (i = 0; i < n; i++)
01342     { if (mask1[index1][i] && mask2[index2][i])
01343       { double term1 = data1[index1][i];
01344         double term2 = data2[index2][i];
01345         double w = weight[i];
01346         result += w*term1*term2;
01347         denom1 += w*term1*term1;
01348         denom2 += w*term2*term2;
01349         flag = 1;
01350       }
01351     }
01352   }
01353   else
01354   { int i;
01355     for (i = 0; i < n; i++)
01356     { if (mask1[i][index1] && mask2[i][index2])
01357       { double term1 = data1[i][index1];
01358         double term2 = data2[i][index2];
01359         double w = weight[i];
01360         result += w*term1*term2;
01361         denom1 += w*term1*term1;
01362         denom2 += w*term2*term2;
01363         flag = 1;
01364       }
01365     }
01366   }
01367   if (!flag) return 0.;
01368   if (denom1==0.) return 1.;
01369   if (denom2==0.) return 1.;
01370   result = result / sqrt(denom1*denom2);
01371   result = 1. - result;
01372   return result;
01373 }
01374 
01375 /* ********************************************************************* */
01376 
01377 static
01378 double uacorrelation (int n, double** data1, double** data2, int** mask1,
01379   int** mask2, const double weight[], int index1, int index2, int transpose)
01380 /*
01381 Purpose
01382 =======
01383 
01384 The uacorrelation routine calculates the weighted Pearson distance between two
01385 rows or columns, using the absolute value of the uncentered version of the
01386 Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
01387 for both vectors even if the actual mean is nonzero.
01388 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
01389 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
01390 (e.g., choose b = a + c).
01391 
01392 Arguments
01393 =========
01394 
01395 n      (input) int
01396 The number of elements in a row or column. If transpose==0, then n is the number
01397 of columns; otherwise, n is the number of rows.
01398 
01399 data1  (input) double array
01400 The data array containing the first vector.
01401 
01402 data2  (input) double array
01403 The data array containing the second vector.
01404 
01405 mask1  (input) int array
01406 This array which elements in data1 are missing. If mask1[i][j]==0, then
01407 data1[i][j] is missing.
01408 
01409 mask2  (input) int array
01410 This array which elements in data2 are missing. If mask2[i][j]==0, then
01411 data2[i][j] is missing.
01412 
01413 weight (input) double[n]
01414 The weights that are used to calculate the distance.
01415 
01416 index1     (input) int
01417 Index of the first row or column.
01418 
01419 index2     (input) int
01420 Index of the second row or column.
01421 
01422 transpose (input) int
01423 If transpose==0, the distance between two rows in the matrix is calculated.
01424 Otherwise, the distance between two columns in the matrix is calculated.
01425 ============================================================================
01426 */
01427 { double result = 0.;
01428   double denom1 = 0.;
01429   double denom2 = 0.;
01430   int flag = 0;
01431   /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
01432    * found.
01433    */
01434   if (transpose==0) /* Calculate the distance between two rows */
01435   { int i;
01436     for (i = 0; i < n; i++)
01437     { if (mask1[index1][i] && mask2[index2][i])
01438       { double term1 = data1[index1][i];
01439         double term2 = data2[index2][i];
01440         double w = weight[i];
01441         result += w*term1*term2;
01442         denom1 += w*term1*term1;
01443         denom2 += w*term2*term2;
01444         flag = 1;
01445       }
01446     }
01447   }
01448   else
01449   { int i;
01450     for (i = 0; i < n; i++)
01451     { if (mask1[i][index1] && mask2[i][index2])
01452       { double term1 = data1[i][index1];
01453         double term2 = data2[i][index2];
01454         double w = weight[i];
01455         result += w*term1*term2;
01456         denom1 += w*term1*term1;
01457         denom2 += w*term2*term2;
01458         flag = 1;
01459       }
01460     }
01461   }
01462   if (!flag) return 0.;
01463   if (denom1==0.) return 1.;
01464   if (denom2==0.) return 1.;
01465   result = fabs(result) / sqrt(denom1*denom2);
01466   result = 1. - result;
01467   return result;
01468 }
01469 
01470 /* *********************************************************************  */
01471 
01472 static
01473 double spearman (int n, double** data1, double** data2, int** mask1,
01474   int** mask2, const double weight[], int index1, int index2, int transpose)
01475 /*
01476 Purpose
01477 =======
01478 
01479 The spearman routine calculates the Spearman distance between two rows or
01480 columns. The Spearman distance is defined as one minus the Spearman rank
01481 correlation.
01482 
01483 Arguments
01484 =========
01485 
01486 n      (input) int
01487 The number of elements in a row or column. If transpose==0, then n is the number
01488 of columns; otherwise, n is the number of rows.
01489 
01490 data1  (input) double array
01491 The data array containing the first vector.
01492 
01493 data2  (input) double array
01494 The data array containing the second vector.
01495 
01496 mask1  (input) int array
01497 This array which elements in data1 are missing. If mask1[i][j]==0, then
01498 data1[i][j] is missing.
01499 
01500 mask2  (input) int array
01501 This array which elements in data2 are missing. If mask2[i][j]==0, then
01502 data2[i][j] is missing.
01503 
01504 weight (input) double[n]
01505 These weights are ignored, but included for consistency with other distance
01506 measures.
01507 
01508 index1     (input) int
01509 Index of the first row or column.
01510 
01511 index2     (input) int
01512 Index of the second row or column.
01513 
01514 transpose (input) int
01515 If transpose==0, the distance between two rows in the matrix is calculated.
01516 Otherwise, the distance between two columns in the matrix is calculated.
01517 ============================================================================
01518 */
01519 { int i;
01520   int m = 0;
01521   double* rank1;
01522   double* rank2;
01523   double result = 0.;
01524   double denom1 = 0.;
01525   double denom2 = 0.;
01526   double avgrank;
01527   double* tdata1;
01528   double* tdata2;
01529   tdata1 = malloc(n*sizeof(double));
01530   if(!tdata1) return 0.0; /* Memory allocation error */
01531   tdata2 = malloc(n*sizeof(double));
01532   if(!tdata2) /* Memory allocation error */
01533   { free(tdata1);
01534     return 0.0;
01535   }
01536   if (transpose==0)
01537   { for (i = 0; i < n; i++)
01538     { if (mask1[index1][i] && mask2[index2][i])
01539       { tdata1[m] = data1[index1][i];
01540         tdata2[m] = data2[index2][i];
01541         m++;
01542       }
01543     }
01544   }
01545   else
01546   { for (i = 0; i < n; i++)
01547     { if (mask1[i][index1] && mask2[i][index2])
01548       { tdata1[m] = data1[i][index1];
01549         tdata2[m] = data2[i][index2];
01550         m++;
01551       }
01552     }
01553   }
01554   if (m==0)
01555   { free(tdata1);
01556     free(tdata2);
01557     return 0;
01558   }
01559   rank1 = getrank(m, tdata1);
01560   free(tdata1);
01561   if(!rank1)
01562   { free(tdata2);
01563     return 0.0; /* Memory allocation error */
01564   }
01565   rank2 = getrank(m, tdata2);
01566   free(tdata2);
01567   if(!rank2) /* Memory allocation error */
01568   { free(rank1);
01569     return 0.0;
01570   }
01571   avgrank = 0.5*(m-1); /* Average rank */
01572   for (i = 0; i < m; i++)
01573   { const double value1 = rank1[i];
01574     const double value2 = rank2[i];
01575     result += value1 * value2;
01576     denom1 += value1 * value1;
01577     denom2 += value2 * value2;
01578   }
01579   /* Note: denom1 and denom2 cannot be calculated directly from the number
01580    * of elements. If two elements have the same rank, the squared sum of
01581    * their ranks will change.
01582    */
01583   free(rank1);
01584   free(rank2);
01585   result /= m;
01586   denom1 /= m;
01587   denom2 /= m;
01588   result -= avgrank * avgrank;
01589   denom1 -= avgrank * avgrank;
01590   denom2 -= avgrank * avgrank;
01591   if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
01592   if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
01593   result = result / sqrt(denom1*denom2);
01594   result = 1. - result;
01595   return result;
01596 }
01597 
01598 /* *********************************************************************  */
01599 
01600 static
01601 double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
01602   const double weight[], int index1, int index2, int transpose)
01603 /*
01604 Purpose
01605 =======
01606 
01607 The kendall routine calculates the Kendall distance between two
01608 rows or columns. The Kendall distance is defined as one minus Kendall's tau.
01609 
01610 Arguments
01611 =========
01612 
01613 n      (input) int
01614 The number of elements in a row or column. If transpose==0, then n is the number
01615 of columns; otherwise, n is the number of rows.
01616 
01617 data1  (input) double array
01618 The data array containing the first vector.
01619 
01620 data2  (input) double array
01621 The data array containing the second vector.
01622 
01623 mask1  (input) int array
01624 This array which elements in data1 are missing. If mask1[i][j]==0, then
01625 data1[i][j] is missing.
01626 
01627 mask2  (input) int array
01628 This array which elements in data2 are missing. If mask2[i][j]==0, then
01629 data2[i][j] is missing.
01630 
01631 weight (input) double[n]
01632 These weights are ignored, but included for consistency with other distance
01633 measures.
01634 
01635 index1     (input) int
01636 Index of the first row or column.
01637 
01638 index2     (input) int
01639 Index of the second row or column.
01640 
01641 transpose (input) int
01642 If transpose==0, the distance between two rows in the matrix is calculated.
01643 Otherwise, the distance between two columns in the matrix is calculated.
01644 ============================================================================
01645 */
01646 { int con = 0;
01647   int dis = 0;
01648   int exx = 0;
01649   int exy = 0;
01650   int flag = 0;
01651   /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
01652    * found.
01653    */
01654   double denomx;
01655   double denomy;
01656   double tau;
01657   int i, j;
01658   if (transpose==0)
01659   { for (i = 0; i < n; i++)
01660     { if (mask1[index1][i] && mask2[index2][i])
01661       { for (j = 0; j < i; j++)
01662         { if (mask1[index1][j] && mask2[index2][j])
01663           { double x1 = data1[index1][i];
01664             double x2 = data1[index1][j];
01665             double y1 = data2[index2][i];
01666             double y2 = data2[index2][j];
01667             if (x1 < x2 && y1 < y2) con++;
01668             if (x1 > x2 && y1 > y2) con++;
01669             if (x1 < x2 && y1 > y2) dis++;
01670             if (x1 > x2 && y1 < y2) dis++;
01671             if (x1 == x2 && y1 != y2) exx++;
01672             if (x1 != x2 && y1 == y2) exy++;
01673             flag = 1;
01674           }
01675         }
01676       }
01677     }
01678   }
01679   else
01680   { for (i = 0; i < n; i++)
01681     { if (mask1[i][index1] && mask2[i][index2])
01682       { for (j = 0; j < i; j++)
01683         { if (mask1[j][index1] && mask2[j][index2])
01684           { double x1 = data1[i][index1];
01685             double x2 = data1[j][index1];
01686             double y1 = data2[i][index2];
01687             double y2 = data2[j][index2];
01688             if (x1 < x2 && y1 < y2) con++;
01689             if (x1 > x2 && y1 > y2) con++;
01690             if (x1 < x2 && y1 > y2) dis++;
01691             if (x1 > x2 && y1 < y2) dis++;
01692             if (x1 == x2 && y1 != y2) exx++;
01693             if (x1 != x2 && y1 == y2) exy++;
01694             flag = 1;
01695           }
01696         }
01697       }
01698     }
01699   }
01700   if (!flag) return 0.;
01701   denomx = con + dis + exx;
01702   denomy = con + dis + exy;
01703   if (denomx==0) return 1;
01704   if (denomy==0) return 1;
01705   tau = (con-dis)/sqrt(denomx*denomy);
01706   return 1.-tau;
01707 }
01708 
01709 /* *********************************************************************  */
01710 
01711 static double(*setmetric(char dist)) 
01712   (int, double**, double**, int**, int**, const double[], int, int, int)
01713 { switch(dist)
01714   { case 'e': return &euclid;
01715     case 'b': return &cityblock;
01716     case 'c': return &correlation;
01717     case 'a': return &acorrelation;
01718     case 'u': return &ucorrelation;
01719     case 'x': return &uacorrelation;
01720     case 's': return &spearman;
01721     case 'k': return &kendall;
01722     default: return &euclid;
01723   }
01724   return NULL; /* Never get here */
01725 }
01726 
01727 /* *********************************************************************  */
01728 
01729 static double uniform(void)
01730 /*
01731 Purpose
01732 =======
01733 
01734 This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
01735 and 1.0 are excluded. This random number generator is described in:
01736 
01737 Pierre l'Ecuyer
01738 Efficient and Portable Combined Random Number Generators
01739 Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
01740 
01741 The first time this routine is called, it initializes the random number
01742 generator using the current time. First, the current epoch time in seconds is
01743 used as a seed for the random number generator in the C library. The first two
01744 random numbers generated by this generator are used to initialize the random
01745 number generator implemented in this routine.
01746 
01747 
01748 Arguments
01749 =========
01750 
01751 None.
01752 
01753 
01754 Return value
01755 ============
01756 
01757 A double-precison number between 0.0 and 1.0.
01758 ============================================================================
01759 */
01760 { int z;
01761   static const int m1 = 2147483563;
01762   static const int m2 = 2147483399;
01763   const double scale = 1.0/m1;
01764 
01765   static int s1 = 0;
01766   static int s2 = 0;
01767 
01768   if (s1==0 || s2==0) /* initialize */
01769   { unsigned int initseed = (unsigned int) time(0);
01770     srand(initseed);
01771     s1 = rand();
01772     s2 = rand();
01773   }
01774 
01775   do
01776   { int k;
01777     k = s1/53668;
01778     s1 = 40014*(s1-k*53668)-k*12211;
01779     if (s1 < 0) s1+=m1;
01780     k = s2/52774;
01781     s2 = 40692*(s2-k*52774)-k*3791;
01782     if(s2 < 0) s2+=m2;
01783     z = s1-s2;
01784     if(z < 1) z+=(m1-1);
01785   } while (z==m1); /* To avoid returning 1.0 */
01786 
01787   return z*scale;
01788 }
01789 
01790 /* ************************************************************************ */
01791 
01792 static int binomial(int n, double p)
01793 /*
01794 Purpose
01795 =======
01796 
01797 This routine generates a random number between 0 and n inclusive, following
01798 the binomial distribution with probability p and n trials. The routine is
01799 based on the BTPE algorithm, described in:
01800 
01801 Voratas Kachitvichyanukul and Bruce W. Schmeiser:
01802 Binomial Random Variate Generation
01803 Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
01804 
01805 
01806 Arguments
01807 =========
01808 
01809 p          (input) double
01810 The probability of a single event. This probability should be less than or
01811 equal to 0.5.
01812 
01813 n          (input) int
01814 The number of trials.
01815 
01816 
01817 Return value
01818 ============
01819 
01820 An integer drawn from a binomial distribution with parameters (p, n).
01821 
01822 ============================================================================
01823 */
01824 { const double q = 1 - p;
01825   if (n*p < 30.0) /* Algorithm BINV */
01826   { const double s = p/q;
01827     const double a = (n+1)*s;
01828     double r = exp(n*log(q)); /* pow() causes a crash on AIX */
01829     int x = 0;
01830     double u = uniform();
01831     while(1)
01832     { if (u < r) return x;
01833       u-=r;
01834       x++;
01835       r *= (a/x)-s;
01836     }
01837   }
01838   else /* Algorithm BTPE */
01839   { /* Step 0 */
01840     const double fm = n*p + p;
01841     const int m = (int) fm;
01842     const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
01843     const double xm = m + 0.5;
01844     const double xl = xm - p1;
01845     const double xr = xm + p1;
01846     const double c = 0.134 + 20.5/(15.3+m);
01847     const double a = (fm-xl)/(fm-xl*p);
01848     const double b = (xr-fm)/(xr*q);
01849     const double lambdal = a*(1.0+0.5*a);
01850     const double lambdar = b*(1.0+0.5*b);
01851     const double p2 = p1*(1+2*c);
01852     const double p3 = p2 + c/lambdal;
01853     const double p4 = p3 + c/lambdar;
01854     while (1)
01855     { /* Step 1 */
01856       int y;
01857       int k;
01858       double u = uniform();
01859       double v = uniform();
01860       u *= p4;
01861       if (u <= p1) return (int)(xm-p1*v+u);
01862       /* Step 2 */
01863       if (u > p2)
01864       { /* Step 3 */
01865         if (u > p3)
01866         { /* Step 4 */
01867           y = (int)(xr-log(v)/lambdar);
01868           if (y > n) continue;
01869           /* Go to step 5 */
01870           v = v*(u-p3)*lambdar;
01871         }
01872         else
01873         { y = (int)(xl+log(v)/lambdal);
01874           if (y < 0) continue;
01875           /* Go to step 5 */
01876           v = v*(u-p2)*lambdal;
01877         }
01878       }
01879       else
01880       { const double x = xl + (u-p1)/c;
01881         v = v*c + 1.0 - fabs(m-x+0.5)/p1;
01882         if (v > 1) continue;
01883         /* Go to step 5 */
01884         y = (int)x;
01885       }
01886       /* Step 5 */
01887       /* Step 5.0 */
01888       k = abs(y-m);
01889       if (k > 20 && k < 0.5*n*p*q-1.0)
01890       { /* Step 5.2 */
01891         double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + 0.1666666666666)/(n*p*q)+0.5);
01892         double t = -k*k/(2*n*p*q);
01893         double A = log(v);
01894         if (A < t-rho) return y;
01895         else if (A > t+rho) continue;
01896         else
01897         { /* Step 5.3 */
01898           double x1 = y+1;
01899           double f1 = m+1;
01900           double z = n+1-m;
01901           double w = n-y+1;
01902           double x2 = x1*x1;
01903           double f2 = f1*f1;
01904           double z2 = z*z;
01905           double w2 = w*w;
01906           if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
01907                 + (y-m)*log(w*p/(x1*q))
01908                 + (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
01909                 + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
01910                 + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
01911                 + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
01912              continue;
01913           return y;
01914         }
01915       }
01916       else
01917       { /* Step 5.1 */
01918         int i;
01919         const double s = p/q;
01920         const double aa = s*(n+1);
01921         double f = 1.0;
01922         for (i = m; i < y; f *= (aa/(++i)-s));
01923         for (i = y; i < m; f /= (aa/(++i)-s));
01924         if (v > f) continue;
01925         return y;
01926       }
01927     }
01928   }
01929   /* Never get here */
01930   return -1;
01931 }
01932 
01933 /* ************************************************************************ */
01934 
01935 static void randomassign (int nclusters, int nelements, int clusterid[])
01936 /*
01937 Purpose
01938 =======
01939 
01940 The randomassign routine performs an initial random clustering, needed for
01941 k-means or k-median clustering. Elements (genes or microarrays) are randomly
01942 assigned to clusters. The number of elements in each cluster is chosen
01943 randomly, making sure that each cluster will receive at least one element.
01944 
01945 
01946 Arguments
01947 =========
01948 
01949 nclusters  (input) int
01950 The number of clusters.
01951 
01952 nelements  (input) int
01953 The number of elements to be clustered (i.e., the number of genes or microarrays
01954 to be clustered).
01955 
01956 clusterid  (output) int[nelements]
01957 The cluster number to which an element was assigned.
01958 
01959 ============================================================================
01960 */
01961 { int i, j;
01962   int k = 0;
01963   double p;
01964   int n = nelements-nclusters;
01965   /* Draw the number of elements in each cluster from a multinomial
01966    * distribution, reserving ncluster elements to set independently
01967    * in order to guarantee that none of the clusters are empty.
01968    */
01969   for (i = 0; i < nclusters-1; i++)
01970   { p = 1.0/(nclusters-i);
01971     j = binomial(n, p);
01972     n -= j;
01973     j += k+1; /* Assign at least one element to cluster i */
01974     for ( ; k < j; k++) clusterid[k] = i;
01975   }
01976   /* Assign the remaining elements to the last cluster */
01977   for ( ; k < nelements; k++) clusterid[k] = i;
01978 
01979   /* Create a random permutation of the cluster assignments */
01980   for (i = 0; i < nelements; i++)
01981   { j = (int) (i + (nelements-i)*uniform());
01982     k = clusterid[j];
01983     clusterid[j] = clusterid[i];
01984     clusterid[i] = k;
01985   }
01986 
01987   return;
01988 }
01989 
01990 /* ********************************************************************* */
01991 
01992 static void getclustermeans(int nclusters, int nrows, int ncolumns,
01993   double** data, int** mask, int clusterid[], double** cdata, int** cmask,
01994   int transpose)
01995 /*
01996 Purpose
01997 =======
01998 
01999 The getclustermeans routine calculates the cluster centroids, given to which
02000 cluster each element belongs. The centroid is defined as the mean over all
02001 elements for each dimension.
02002 
02003 Arguments
02004 =========
02005 
02006 nclusters  (input) int
02007 The number of clusters.
02008 
02009 nrows     (input) int
02010 The number of rows in the gene expression data matrix, equal to the number of
02011 genes.
02012 
02013 ncolumns  (input) int
02014 The number of columns in the gene expression data matrix, equal to the number of
02015 microarrays.
02016 
02017 data       (input) double[nrows][ncolumns]
02018 The array containing the gene expression data.
02019 
02020 mask       (input) int[nrows][ncolumns]
02021 This array shows which data values are missing. If mask[i][j]==0, then
02022 data[i][j] is missing.
02023 
02024 clusterid  (output) int[nrows] if transpose==0
02025                     int[ncolumns] if transpose==1
02026 The cluster number to which each element belongs. If transpose==0, then the
02027 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
02028 is equal to ncolumns (the number of microarrays).
02029 
02030 cdata      (output) double[nclusters][ncolumns] if transpose==0
02031                     double[nrows][nclusters] if transpose==1
02032 On exit of getclustermeans, this array contains the cluster centroids.
02033 
02034 cmask      (output) int[nclusters][ncolumns] if transpose==0
02035                     int[nrows][nclusters] if transpose==1
02036 This array shows which data values of are missing for each centroid. If
02037 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
02038 a centroid if all corresponding data values of the cluster members are missing.
02039 
02040 transpose  (input) int
02041 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
02042 columns (microarrays) are specified.
02043 
02044 ========================================================================
02045 */
02046 { int i, j, k;
02047   if (transpose==0)
02048   { for (i = 0; i < nclusters; i++)
02049     { for (j = 0; j < ncolumns; j++)
02050       { cmask[i][j] = 0;
02051         cdata[i][j] = 0.;
02052       }
02053     }
02054     for (k = 0; k < nrows; k++)
02055     { i = clusterid[k];
02056       for (j = 0; j < ncolumns; j++)
02057       { if (mask[k][j] != 0)
02058         { cdata[i][j]+=data[k][j];
02059           cmask[i][j]++;
02060         }
02061       }
02062     }
02063     for (i = 0; i < nclusters; i++)
02064     { for (j = 0; j < ncolumns; j++)
02065       { if (cmask[i][j]>0)
02066         { cdata[i][j] /= cmask[i][j];
02067           cmask[i][j] = 1;
02068         }
02069       }
02070     }
02071   }
02072   else
02073   { for (i = 0; i < nrows; i++)
02074     { for (j = 0; j < nclusters; j++)
02075       { cdata[i][j] = 0.;
02076         cmask[i][j] = 0;
02077       }
02078     }
02079     for (k = 0; k < ncolumns; k++)
02080     { i = clusterid[k];
02081       for (j = 0; j < nrows; j++)
02082       { if (mask[j][k] != 0)
02083         { cdata[j][i]+=data[j][k];
02084           cmask[j][i]++;
02085         }
02086       }
02087     }
02088     for (i = 0; i < nrows; i++)
02089     { for (j = 0; j < nclusters; j++)
02090       { if (cmask[i][j]>0)
02091         { cdata[i][j] /= cmask[i][j];
02092           cmask[i][j] = 1;
02093         }
02094       }
02095     }
02096   }
02097 }
02098 
02099 /* ********************************************************************* */
02100 
02101 static void
02102 getclustermedians(int nclusters, int nrows, int ncolumns,
02103   double** data, int** mask, int clusterid[], double** cdata, int** cmask,
02104   int transpose, double cache[])
02105 /*
02106 Purpose
02107 =======
02108 
02109 The getclustermedians routine calculates the cluster centroids, given to which
02110 cluster each element belongs. The centroid is defined as the median over all
02111 elements for each dimension.
02112 
02113 Arguments
02114 =========
02115 
02116 nclusters  (input) int
02117 The number of clusters.
02118 
02119 nrows     (input) int
02120 The number of rows in the gene expression data matrix, equal to the number of
02121 genes.
02122 
02123 ncolumns  (input) int
02124 The number of columns in the gene expression data matrix, equal to the number of
02125 microarrays.
02126 
02127 data       (input) double[nrows][ncolumns]
02128 The array containing the gene expression data.
02129 
02130 mask       (input) int[nrows][ncolumns]
02131 This array shows which data values are missing. If mask[i][j]==0, then
02132 data[i][j] is missing.
02133 
02134 clusterid  (output) int[nrows] if transpose==0
02135                     int[ncolumns] if transpose==1
02136 The cluster number to which each element belongs. If transpose==0, then the
02137 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
02138 is equal to ncolumns (the number of microarrays).
02139 
02140 cdata      (output) double[nclusters][ncolumns] if transpose==0
02141                     double[nrows][nclusters] if transpose==1
02142 On exit of getclustermedians, this array contains the cluster centroids.
02143 
02144 cmask      (output) int[nclusters][ncolumns] if transpose==0
02145                     int[nrows][nclusters] if transpose==1
02146 This array shows which data values of are missing for each centroid. If
02147 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
02148 a centroid if all corresponding data values of the cluster members are missing.
02149 
02150 transpose  (input) int
02151 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
02152 columns (microarrays) are specified.
02153 
02154 cache      (input) double[nrows] if transpose==0
02155                    double[ncolumns] if transpose==1
02156 This array should be allocated before calling getclustermedians; its contents
02157 on input is not relevant. This array is used as a temporary storage space when
02158 calculating the medians.
02159 
02160 ========================================================================
02161 */
02162 { int i, j, k;
02163   if (transpose==0)
02164   { for (i = 0; i < nclusters; i++)
02165     { for (j = 0; j < ncolumns; j++)
02166       { int count = 0;
02167         for (k = 0; k < nrows; k++)
02168         { if (i==clusterid[k] && mask[k][j])
02169           { cache[count] = data[k][j];
02170             count++;
02171           }
02172         }
02173         if (count>0)
02174         { cdata[i][j] = median(count,cache);
02175           cmask[i][j] = 1;
02176         }
02177         else
02178         { cdata[i][j] = 0.;
02179           cmask[i][j] = 0;
02180         }
02181       }
02182     }
02183   }
02184   else
02185   { for (i = 0; i < nclusters; i++)
02186     { for (j = 0; j < nrows; j++)
02187       { int count = 0;
02188         for (k = 0; k < ncolumns; k++)
02189         { if (i==clusterid[k] && mask[j][k])
02190           { cache[count] = data[j][k];
02191             count++;
02192           }
02193         }
02194         if (count>0)
02195         { cdata[j][i] = median(count,cache);
02196           cmask[j][i] = 1;
02197         }
02198         else
02199         { cdata[j][i] = 0.;
02200           cmask[j][i] = 0;
02201         }
02202       }
02203     }
02204   }
02205 }
02206  
02207 /* ********************************************************************* */
02208 
02209 int getclustercentroids(int nclusters, int nrows, int ncolumns,
02210   double** data, int** mask, int clusterid[], double** cdata, int** cmask,
02211   int transpose, char method)
02212 /*
02213 Purpose
02214 =======
02215 
02216 The getclustercentroids routine calculates the cluster centroids, given to
02217 which cluster each element belongs. Depending on the argument method, the
02218 centroid is defined as either the mean or the median for each dimension over
02219 all elements belonging to a cluster.
02220 
02221 Arguments
02222 =========
02223 
02224 nclusters  (input) int
02225 The number of clusters.
02226 
02227 nrows     (input) int
02228 The number of rows in the gene expression data matrix, equal to the number of
02229 genes.
02230 
02231 ncolumns  (input) int
02232 The number of columns in the gene expression data matrix, equal to the number of
02233 microarrays.
02234 
02235 data       (input) double[nrows][ncolumns]
02236 The array containing the gene expression data.
02237 
02238 mask       (input) int[nrows][ncolumns]
02239 This array shows which data values are missing. If mask[i][j]==0, then
02240 data[i][j] is missing.
02241 
02242 clusterid  (output) int[nrows] if transpose==0
02243                     int[ncolumns] if transpose==1
02244 The cluster number to which each element belongs. If transpose==0, then the
02245 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
02246 is equal to ncolumns (the number of microarrays).
02247 
02248 cdata      (output) double[nclusters][ncolumns] if transpose==0
02249                     double[nrows][nclusters] if transpose==1
02250 On exit of getclustercentroids, this array contains the cluster centroids.
02251 
02252 cmask      (output) int[nclusters][ncolumns] if transpose==0
02253                     int[nrows][nclusters] if transpose==1
02254 This array shows which data values of are missing for each centroid. If
02255 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
02256 a centroid if all corresponding data values of the cluster members are missing.
02257 
02258 transpose  (input) int
02259 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
02260 columns (microarrays) are specified.
02261 
02262 method     (input) char
02263 For method=='a', the centroid is defined as the mean over all elements
02264 belonging to a cluster for each dimension.
02265 For method=='m', the centroid is defined as the median over all elements
02266 belonging to a cluster for each dimension.
02267 
02268 Return value
02269 ============
02270 
02271 The function returns an integer to indicate success or failure. If a
02272 memory error occurs, or if method is not 'm' or 'a', getclustercentroids
02273 returns 0. If successful, getclustercentroids returns 1.
02274 ========================================================================
02275 */
02276 { switch(method)
02277   { case 'm':
02278     { const int nelements = (transpose==0) ? nrows : ncolumns;
02279       double* cache = malloc(nelements*sizeof(double));
02280       if (!cache) return 0;
02281       getclustermedians(nclusters, nrows, ncolumns, data, mask, clusterid,
02282                         cdata, cmask, transpose, cache);
02283       free(cache);
02284       return 1;
02285     }
02286     case 'a':
02287     { getclustermeans(nclusters, nrows, ncolumns, data, mask, clusterid,
02288                       cdata, cmask, transpose);
02289       return 1;
02290     }
02291   }
02292   return 0;
02293 }
02294 
02295 /* ********************************************************************* */
02296 
02297 void getclustermedoids(int nclusters, int nelements, double** distance,
02298   int clusterid[], int centroids[], double errors[])
02299 /*
02300 Purpose
02301 =======
02302 
02303 The getclustermedoids routine calculates the cluster centroids, given to which
02304 cluster each element belongs. The centroid is defined as the element with the
02305 smallest sum of distances to the other elements.
02306 
02307 Arguments
02308 =========
02309 
02310 nclusters  (input) int
02311 The number of clusters.
02312 
02313 nelements  (input) int
02314 The total number of elements.
02315 
02316 distmatrix (input) double array, ragged
02317   (number of rows is nelements, number of columns is equal to the row number)
02318 The distance matrix. To save space, the distance matrix is given in the
02319 form of a ragged array. The distance matrix is symmetric and has zeros
02320 on the diagonal. See distancematrix for a description of the content.
02321 
02322 clusterid  (output) int[nelements]
02323 The cluster number to which each element belongs.
02324 
02325 centroid   (output) int[nclusters]
02326 The index of the element that functions as the centroid for each cluster.
02327 
02328 errors     (output) double[nclusters]
02329 The within-cluster sum of distances between the items and the cluster
02330 centroid.
02331 
02332 ========================================================================
02333 */
02334 { int i, j, k;
02335   for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
02336   for (i = 0; i < nelements; i++)
02337   { double d = 0.0;
02338     j = clusterid[i];
02339     for (k = 0; k < nelements; k++)
02340     { if (i==k || clusterid[k]!=j) continue;
02341       d += (i < k ? distance[k][i] : distance[i][k]);
02342       if (d > errors[j]) break;
02343     }
02344     if (d < errors[j])
02345     { errors[j] = d;
02346       centroids[j] = i;
02347     }
02348   }
02349 }
02350 
02351 /* ********************************************************************* */
02352 
02353 static int
02354 kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
02355   double weight[], int transpose, int npass, char dist,
02356   double** cdata, int** cmask, int clusterid[], double* error,
02357   int tclusterid[], int counts[], int mapping[])
02358 { int i, j, k;
02359   const int nelements = (transpose==0) ? nrows : ncolumns;
02360   const int ndata = (transpose==0) ? ncolumns : nrows;
02361   int ifound = 1;
02362   int ipass = 0;
02363   /* Set the metric function as indicated by dist */
02364   double (*metric)
02365     (int, double**, double**, int**, int**, const double[], int, int, int) =
02366        setmetric(dist);
02367 
02368   /* We save the clustering solution periodically and check if it reappears */
02369   int* saved = malloc(nelements*sizeof(int));
02370   if (saved==NULL) return -1;
02371 
02372   *error = DBL_MAX;
02373 
02374   do
02375   { double total = DBL_MAX;
02376     int counter = 0;
02377     int period = 10;
02378 
02379     /* Perform the EM algorithm. First, randomly assign elements to clusters. */
02380     if (npass!=0) randomassign (nclusters, nelements, tclusterid);
02381 
02382     for (i = 0; i < nclusters; i++) counts[i] = 0;
02383     for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
02384 
02385     /* Start the loop */
02386     while(1)
02387     { double previous = total;
02388       total = 0.0;
02389 
02390       if (counter % period == 0) /* Save the current cluster assignments */
02391       { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
02392         if (period < INT_MAX / 2) period *= 2;
02393       }
02394       counter++;
02395 
02396       /* Find the center */
02397       getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
02398                       cdata, cmask, transpose);
02399 
02400       for (i = 0; i < nelements; i++)
02401       /* Calculate the distances */
02402       { double distance;
02403         k = tclusterid[i];
02404         if (counts[k]==1) continue;
02405         /* No reassignment if that would lead to an empty cluster */
02406         /* Treat the present cluster as a special case */
02407         distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
02408         for (j = 0; j < nclusters; j++)
02409         { double tdistance;
02410           if (j==k) continue;
02411           tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
02412           if (tdistance < distance)
02413           { distance = tdistance;
02414             counts[tclusterid[i]]--;
02415             tclusterid[i] = j;
02416             counts[j]++;
02417           }
02418         }
02419         total += distance;
02420       }
02421       if (total>=previous) break;
02422       /* total>=previous is FALSE on some machines even if total and previous
02423        * are bitwise identical. */
02424       for (i = 0; i < nelements; i++)
02425         if (saved[i]!=tclusterid[i]) break;
02426       if (i==nelements)
02427         break; /* Identical solution found; break out of this loop */
02428     }
02429 
02430     if (npass<=1) 
02431     { *error = total;
02432       break;
02433     }
02434 
02435     for (i = 0; i < nclusters; i++) mapping[i] = -1;
02436     for (i = 0; i < nelements; i++)
02437     { j = tclusterid[i];
02438       k = clusterid[i];
02439       if (mapping[k] == -1) mapping[k] = j;
02440       else if (mapping[k] != j)
02441       { if (total < *error)
02442         { ifound = 1;
02443           *error = total;
02444           for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
02445         }
02446         break;
02447       }
02448     }
02449     if (i==nelements) ifound++; /* break statement not encountered */
02450   } while (++ipass < npass);
02451 
02452   free(saved);
02453   return ifound;
02454 }
02455 
02456 /* ---------------------------------------------------------------------- */
02457 
02458 static int
02459 kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
02460   double weight[], int transpose, int npass, char dist,
02461   double** cdata, int** cmask, int clusterid[], double* error,
02462   int tclusterid[], int counts[], int mapping[], double cache[])
02463 { int i, j, k;
02464   const int nelements = (transpose==0) ? nrows : ncolumns;
02465   const int ndata = (transpose==0) ? ncolumns : nrows;
02466   int ifound = 1;
02467   int ipass = 0;
02468   /* Set the metric function as indicated by dist */
02469   double (*metric)
02470     (int, double**, double**, int**, int**, const double[], int, int, int) =
02471        setmetric(dist);
02472 
02473   /* We save the clustering solution periodically and check if it reappears */
02474   int* saved = malloc(nelements*sizeof(int));
02475   if (saved==NULL) return -1;
02476 
02477   *error = DBL_MAX;
02478 
02479   do
02480   { double total = DBL_MAX;
02481     int counter = 0;
02482     int period = 10;
02483 
02484     /* Perform the EM algorithm. First, randomly assign elements to clusters. */
02485     if (npass!=0) randomassign (nclusters, nelements, tclusterid);
02486 
02487     for (i = 0; i < nclusters; i++) counts[i]=0;
02488     for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
02489 
02490     /* Start the loop */
02491     while(1)
02492     { double previous = total;
02493       total = 0.0;
02494 
02495       if (counter % period == 0) /* Save the current cluster assignments */
02496       { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
02497         if (period < INT_MAX / 2) period *= 2;
02498       }
02499       counter++;
02500 
02501       /* Find the center */
02502       getclustermedians(nclusters, nrows, ncolumns, data, mask, tclusterid,
02503                         cdata, cmask, transpose, cache);
02504 
02505       for (i = 0; i < nelements; i++)
02506       /* Calculate the distances */
02507       { double distance;
02508         k = tclusterid[i];
02509         if (counts[k]==1) continue;
02510         /* No reassignment if that would lead to an empty cluster */
02511         /* Treat the present cluster as a special case */
02512         distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
02513         for (j = 0; j < nclusters; j++)
02514         { double tdistance;
02515           if (j==k) continue;
02516           tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
02517           if (tdistance < distance)
02518           { distance = tdistance;
02519             counts[tclusterid[i]]--;
02520             tclusterid[i] = j;
02521             counts[j]++;
02522           }
02523         }
02524         total += distance;
02525       }
02526       if (total>=previous) break;
02527       /* total>=previous is FALSE on some machines even if total and previous
02528        * are bitwise identical. */
02529       for (i = 0; i < nelements; i++)
02530         if (saved[i]!=tclusterid[i]) break;
02531       if (i==nelements)
02532         break; /* Identical solution found; break out of this loop */
02533     }
02534 
02535     if (npass<=1) 
02536     { *error = total;
02537       break;
02538     }
02539 
02540     for (i = 0; i < nclusters; i++) mapping[i] = -1;
02541     for (i = 0; i < nelements; i++)
02542     { j = tclusterid[i];
02543       k = clusterid[i];
02544       if (mapping[k] == -1) mapping[k] = j;
02545       else if (mapping[k] != j)
02546       { if (total < *error)
02547         { ifound = 1;
02548           *error = total;
02549           for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
02550         }
02551         break;
02552       }
02553     }
02554     if (i==nelements) ifound++; /* break statement not encountered */
02555   } while (++ipass < npass);
02556 
02557   free(saved);
02558   return ifound;
02559 }
02560 
02561 /* ********************************************************************* */
02562 
02563 void kcluster (int nclusters, int nrows, int ncolumns,
02564   double** data, int** mask, double weight[], int transpose,
02565   int npass, char method, char dist,
02566   int clusterid[], double* error, int* ifound)
02567 /*
02568 Purpose
02569 =======
02570 
02571 The kcluster routine performs k-means or k-median clustering on a given set of
02572 elements, using the specified distance measure. The number of clusters is given
02573 by the user. Multiple passes are being made to find the optimal clustering
02574 solution, each time starting from a different initial clustering.
02575 
02576 
02577 Arguments
02578 =========
02579 
02580 nclusters  (input) int
02581 The number of clusters to be found.
02582 
02583 data       (input) double[nrows][ncolumns]
02584 The array containing the data of the elements to be clustered (i.e., the gene
02585 expression data).
02586 
02587 mask       (input) int[nrows][ncolumns]
02588 This array shows which data values are missing. If
02589 mask[i][j] == 0, then data[i][j] is missing.
02590 
02591 nrows     (input) int
02592 The number of rows in the data matrix, equal to the number of genes.
02593 
02594 ncolumns  (input) int
02595 The number of columns in the data matrix, equal to the number of microarrays.
02596 
02597 weight (input) double[n]
02598 The weights that are used to calculate the distance.
02599 
02600 transpose  (input) int
02601 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
02602 of the matrix are clustered.
02603 
02604 npass      (input) int
02605 The number of times clustering is performed. Clustering is performed npass
02606 times, each time starting from a different (random) initial assignment of 
02607 genes to clusters. The clustering solution with the lowest within-cluster sum
02608 of distances is chosen.
02609 If npass==0, then the clustering algorithm will be run once, where the initial
02610 assignment of elements to clusters is taken from the clusterid array.
02611 
02612 method     (input) char
02613 Defines whether the arithmetic mean (method=='a') or the median
02614 (method=='m') is used to calculate the cluster center.
02615 
02616 dist       (input) char
02617 Defines which distance measure is used, as given by the table:
02618 dist=='e': Euclidean distance
02619 dist=='b': City-block distance
02620 dist=='c': correlation
02621 dist=='a': absolute value of the correlation
02622 dist=='u': uncentered correlation
02623 dist=='x': absolute uncentered correlation
02624 dist=='s': Spearman's rank correlation
02625 dist=='k': Kendall's tau
02626 For other values of dist, the default (Euclidean distance) is used.
02627 
02628 clusterid  (output; input) int[nrows] if transpose==0
02629                            int[ncolumns] if transpose==1
02630 The cluster number to which a gene or microarray was assigned. If npass==0,
02631 then on input clusterid contains the initial clustering assignment from which
02632 the clustering algorithm starts. On output, it contains the clustering solution
02633 that was found.
02634 
02635 error      (output) double*
02636 The sum of distances to the cluster center of each item in the optimal k-means
02637 clustering solution that was found.
02638 
02639 ifound     (output) int*
02640 The number of times the optimal clustering solution was
02641 found. The value of ifound is at least 1; its maximum value is npass. If the
02642 number of clusters is larger than the number of elements being clustered,
02643 *ifound is set to 0 as an error code. If a memory allocation error occurs,
02644 *ifound is set to -1.
02645 
02646 ========================================================================
02647 */
02648 { const int nelements = (transpose==0) ? nrows : ncolumns;
02649   const int ndata = (transpose==0) ? ncolumns : nrows;
02650 
02651   int i;
02652   int ok;
02653   int* tclusterid;
02654   int* mapping = NULL;
02655   double** cdata;
02656   int** cmask;
02657   int* counts;
02658 
02659   if (nelements < nclusters)
02660   { *ifound = 0;
02661     return;
02662   }
02663   /* More clusters asked for than elements available */
02664 
02665   *ifound = -1;
02666 
02667   /* This will contain the number of elements in each cluster, which is
02668    * needed to check for empty clusters. */
02669   counts = malloc(nclusters*sizeof(int));
02670   if(!counts) return;
02671 
02672   /* Find out if the user specified an initial clustering */
02673   if (npass<=1) tclusterid = clusterid;
02674   else
02675   { tclusterid = malloc(nelements*sizeof(int));
02676     if (!tclusterid)
02677     { free(counts);
02678       return;
02679     }
02680     mapping = malloc(nclusters*sizeof(int));
02681     if (!mapping)
02682     { free(counts);
02683       free(tclusterid);
02684       return;
02685     }
02686     for (i = 0; i < nelements; i++) clusterid[i] = 0;
02687   }
02688 
02689   /* Allocate space to store the centroid data */
02690   if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
02691   else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
02692   if(!ok)
02693   { free(counts);
02694     if(npass>1)
02695     { free(tclusterid);
02696       free(mapping);
02697       return;
02698     }
02699   }
02700   
02701   if (method=='m')
02702   { double* cache = malloc(nelements*sizeof(double));
02703     if(cache)
02704     { *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
02705                          transpose, npass, dist, cdata, cmask, clusterid, error,
02706                          tclusterid, counts, mapping, cache);
02707       free(cache);
02708     }
02709   }
02710   else
02711     *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
02712                      transpose, npass, dist, cdata, cmask, clusterid, error,
02713                      tclusterid, counts, mapping);
02714 
02715   /* Deallocate temporarily used space */
02716   if (npass > 1)
02717   { free(mapping);
02718     free(tclusterid);
02719   }
02720 
02721   if (transpose==0) freedatamask(nclusters, cdata, cmask);
02722   else freedatamask(ndata, cdata, cmask);
02723 
02724   free(counts);
02725 }
02726 
02727 /* *********************************************************************** */
02728 
02729 void kmedoids (int nclusters, int nelements, double** distmatrix,
02730   int npass, int clusterid[], double* error, int* ifound)
02731 /*
02732 Purpose
02733 =======
02734 
02735 The kmedoids routine performs k-medoids clustering on a given set of elements,
02736 using the distance matrix and the number of clusters passed by the user.
02737 Multiple passes are being made to find the optimal clustering solution, each
02738 time starting from a different initial clustering.
02739 
02740 
02741 Arguments
02742 =========
02743 
02744 nclusters  (input) int
02745 The number of clusters to be found.
02746 
02747 nelements  (input) int
02748 The number of elements to be clustered.
02749 
02750 distmatrix (input) double array, ragged
02751   (number of rows is nelements, number of columns is equal to the row number)
02752 The distance matrix. To save space, the distance matrix is given in the
02753 form of a ragged array. The distance matrix is symmetric and has zeros
02754 on the diagonal. See distancematrix for a description of the content.
02755 
02756 npass      (input) int
02757 The number of times clustering is performed. Clustering is performed npass
02758 times, each time starting from a different (random) initial assignment of genes
02759 to clusters. The clustering solution with the lowest within-cluster sum of
02760 distances is chosen.
02761 If npass==0, then the clustering algorithm will be run once, where the initial
02762 assignment of elements to clusters is taken from the clusterid array.
02763 
02764 clusterid  (output; input) int[nelements]
02765 On input, if npass==0, then clusterid contains the initial clustering assignment
02766 from which the clustering algorithm starts; all numbers in clusterid should be
02767 between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
02768 input.
02769 On output, clusterid contains the clustering solution that was found: clusterid
02770 contains the number of the cluster to which each item was assigned. On output,
02771 the number of a cluster is defined as the item number of the centroid of the
02772 cluster.
02773 
02774 error      (output) double
02775 The sum of distances to the cluster center of each item in the optimal k-medoids
02776 clustering solution that was found.
02777 
02778 ifound     (output) int
02779 If kmedoids is successful: the number of times the optimal clustering solution
02780 was found. The value of ifound is at least 1; its maximum value is npass.
02781 If the user requested more clusters than elements available, ifound is set
02782 to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
02783 
02784 ========================================================================
02785 */
02786 { int i, j, icluster;
02787   int* tclusterid;
02788   int* saved;
02789   int* centroids;
02790   double* errors;
02791   int ipass = 0;
02792 
02793   if (nelements < nclusters)
02794   { *ifound = 0;
02795     return;
02796   } /* More clusters asked for than elements available */
02797 
02798   *ifound = -1;
02799 
02800   /* We save the clustering solution periodically and check if it reappears */
02801   saved = malloc(nelements*sizeof(int));
02802   if (saved==NULL) return;
02803 
02804   centroids = malloc(nclusters*sizeof(int));
02805   if(!centroids)
02806   { free(saved);
02807     return;
02808   }
02809 
02810   errors = malloc(nclusters*sizeof(double));
02811   if(!errors)
02812   { free(saved);
02813     free(centroids);
02814     return;
02815   }
02816 
02817   /* Find out if the user specified an initial clustering */
02818   if (npass<=1) tclusterid = clusterid;
02819   else
02820   { tclusterid = malloc(nelements*sizeof(int));
02821     if(!tclusterid)
02822     { free(saved);
02823       free(centroids);
02824       free(errors);
02825       return;
02826     }
02827   }
02828 
02829   *error = DBL_MAX;
02830   do /* Start the loop */
02831   { double total = DBL_MAX;
02832     int counter = 0;
02833     int period = 10;
02834 
02835     if (npass!=0) randomassign (nclusters, nelements, tclusterid);
02836     while(1)
02837     { double previous = total;
02838       total = 0.0;
02839 
02840       if (counter % period == 0) /* Save the current cluster assignments */
02841       { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
02842         if (period < INT_MAX / 2) period *= 2;
02843       }
02844       counter++;
02845 
02846       /* Find the center */
02847       getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
02848                         centroids, errors);
02849 
02850       for (i = 0; i < nelements; i++)
02851       /* Find the closest cluster */
02852       { double distance = DBL_MAX;
02853         for (icluster = 0; icluster < nclusters; icluster++)
02854         { double tdistance;
02855           j = centroids[icluster];
02856           if (i==j)
02857           { distance = 0.0;
02858             tclusterid[i] = icluster;
02859             break;
02860           }
02861           tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
02862           if (tdistance < distance)
02863           { distance = tdistance;
02864             tclusterid[i] = icluster;
02865           }
02866         }
02867         total += distance;
02868       }
02869       if (total>=previous) break;
02870       /* total>=previous is FALSE on some machines even if total and previous
02871        * are bitwise identical. */
02872       for (i = 0; i < nelements; i++)
02873         if (saved[i]!=tclusterid[i]) break;
02874       if (i==nelements)
02875         break; /* Identical solution found; break out of this loop */
02876     }
02877 
02878     for (i = 0; i < nelements; i++)
02879     { if (clusterid[i]!=centroids[tclusterid[i]])
02880       { if (total < *error)
02881         { *ifound = 1;
02882           *error = total;
02883           /* Replace by the centroid in each cluster. */
02884           for (j = 0; j < nelements; j++)
02885             clusterid[j] = centroids[tclusterid[j]];
02886         }
02887         break;
02888       }
02889     }
02890     if (i==nelements) (*ifound)++; /* break statement not encountered */
02891   } while (++ipass < npass);
02892 
02893   /* Deallocate temporarily used space */
02894   if (npass > 1) free(tclusterid);
02895 
02896   free(saved);
02897   free(centroids);
02898   free(errors);
02899 
02900   return;
02901 }
02902 
02903 /* ******************************************************************** */
02904 
02905 double** distancematrix (int nrows, int ncolumns, double** data,
02906   int** mask, double weights[], char dist, int transpose)
02907 /*
02908 Purpose
02909 =======
02910 
02911 The distancematrix routine calculates the distance matrix between genes or
02912 microarrays using their measured gene expression data. Several distance measures
02913 can be used. The routine returns a pointer to a ragged array containing the
02914 distances between the genes. As the distance matrix is symmetric, with zeros on
02915 the diagonal, only the lower triangular half of the distance matrix is saved.
02916 The distancematrix routine allocates space for the distance matrix. If the
02917 parameter transpose is set to a nonzero value, the distances between the columns
02918 (microarrays) are calculated, otherwise distances between the rows (genes) are
02919 calculated.
02920 If sufficient space in memory cannot be allocated to store the distance matrix,
02921 the routine returns a NULL pointer, and all memory allocated so far for the
02922 distance matrix is freed.
02923 
02924 
02925 Arguments
02926 =========
02927 
02928 nrows      (input) int
02929 The number of rows in the gene expression data matrix (i.e., the number of
02930 genes)
02931 
02932 ncolumns   (input) int
02933 The number of columns in the gene expression data matrix (i.e., the number of
02934 microarrays)
02935 
02936 data       (input) double[nrows][ncolumns]
02937 The array containing the gene expression data.
02938 
02939 mask       (input) int[nrows][ncolumns]
02940 This array shows which data values are missing. If mask[i][j]==0, then
02941 data[i][j] is missing.
02942 
02943 weight (input) double[n]
02944 The weights that are used to calculate the distance. The length of this vector
02945 is equal to the number of columns if the distances between genes are calculated,
02946 or the number of rows if the distances between microarrays are calculated.
02947 
02948 dist       (input) char
02949 Defines which distance measure is used, as given by the table:
02950 dist=='e': Euclidean distance
02951 dist=='b': City-block distance
02952 dist=='c': correlation
02953 dist=='a': absolute value of the correlation
02954 dist=='u': uncentered correlation
02955 dist=='x': absolute uncentered correlation
02956 dist=='s': Spearman's rank correlation
02957 dist=='k': Kendall's tau
02958 For other values of dist, the default (Euclidean distance) is used.
02959 
02960 transpose  (input) int
02961 If transpose is equal to zero, the distances between the rows is
02962 calculated. Otherwise, the distances between the columns is calculated.
02963 The former is needed when genes are being clustered; the latter is used
02964 when microarrays are being clustered.
02965 
02966 ========================================================================
02967 */
02968 { /* First determine the size of the distance matrix */
02969   const int n = (transpose==0) ? nrows : ncolumns;
02970   const int ndata = (transpose==0) ? ncolumns : nrows;
02971   int i,j;
02972   double** matrix;
02973 
02974   /* Set the metric function as indicated by dist */
02975   double (*metric)
02976     (int, double**, double**, int**, int**, const double[], int, int, int) =
02977        setmetric(dist);
02978 
02979   if (n < 2) return NULL;
02980 
02981   /* Set up the ragged array */
02982   matrix = malloc(n*sizeof(double*));
02983   if(matrix==NULL) return NULL; /* Not enough memory available */
02984   matrix[0] = NULL;
02985   /* The zeroth row has zero columns. We allocate it anyway for convenience.*/
02986   for (i = 1; i < n; i++)
02987   { matrix[i] = malloc(i*sizeof(double));
02988     if (matrix[i]==NULL) break; /* Not enough memory available */
02989   }
02990   if (i < n) /* break condition encountered */
02991   { j = i;
02992     for (i = 1; i < j; i++) free(matrix[i]);
02993     return NULL;
02994   }
02995 
02996   /* Calculate the distances and save them in the ragged array */
02997   for (i = 1; i < n; i++)
02998     for (j = 0; j < i; j++)
02999       matrix[i][j]=metric(ndata,data,data,mask,mask,weights,i,j,transpose);
03000 
03001   return matrix;
03002 }
03003 
03004 /* ******************************************************************** */
03005 
03006 double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
03007   double weights[], int transpose, char dist, double cutoff, double exponent)
03008 
03009 /*
03010 Purpose
03011 =======
03012 
03013 This function calculates the weights using the weighting scheme proposed by
03014 Michael Eisen:
03015 w[i] = 1.0 / sum_{j where d[i][j]<cutoff} (1 - d[i][j]/cutoff)^exponent
03016 where the cutoff and the exponent are specified by the user.
03017 
03018 
03019 Arguments
03020 =========
03021 
03022 nrows      (input) int
03023 The number of rows in the gene expression data matrix, equal to the number of
03024 genes.
03025 
03026 ncolumns   (input) int
03027 The number of columns in the gene expression data matrix, equal to the number of
03028 microarrays.
03029 
03030 data       (input) double[nrows][ncolumns]
03031 The array containing the gene expression data.
03032 
03033 mask       (input) int[nrows][ncolumns]
03034 This array shows which data values are missing. If mask[i][j]==0, then
03035 data[i][j] is missing.
03036 
03037 weight     (input) int[ncolumns] if transpose==0,
03038                    int[nrows]    if transpose==1
03039 The weights that are used to calculate the distance. The length of this vector
03040 is ncolumns if gene weights are being clustered, and nrows if microarrays
03041 weights are being clustered.
03042 
03043 transpose (input) int
03044 If transpose==0, the weights of the rows of the data matrix are calculated.
03045 Otherwise, the weights of the columns of the data matrix are calculated.
03046 
03047 dist      (input) char
03048 Defines which distance measure is used, as given by the table:
03049 dist=='e': Euclidean distance
03050 dist=='b': City-block distance
03051 dist=='c': correlation
03052 dist=='a': absolute value of the correlation
03053 dist=='u': uncentered correlation
03054 dist=='x': absolute uncentered correlation
03055 dist=='s': Spearman's rank correlation
03056 dist=='k': Kendall's tau
03057 For other values of dist, the default (Euclidean distance) is used.
03058 
03059 cutoff    (input) double
03060 The cutoff to be used to calculate the weights.
03061 
03062 exponent  (input) double
03063 The exponent to be used to calculate the weights.
03064 
03065 
03066 Return value
03067 ============
03068 
03069 The function returns a pointer to a newly allocated array containing the
03070 calculated weights for the rows (if transpose==0) or columns (if
03071 transpose==1). If not enough memory could be allocated to store the
03072 weights array, the function returns NULL.
03073 
03074 ========================================================================
03075 */
03076 { int i,j;
03077   const int ndata = (transpose==0) ? ncolumns : nrows;
03078   const int nelements = (transpose==0) ? nrows : ncolumns;
03079 
03080   /* Set the metric function as indicated by dist */
03081   double (*metric)
03082     (int, double**, double**, int**, int**, const double[], int, int, int) =
03083        setmetric(dist);
03084 
03085   double* result = malloc(nelements*sizeof(double));
03086   if (!result) return NULL;
03087   memset(result, 0, nelements*sizeof(double));
03088 
03089   for (i = 0; i < nelements; i++)
03090   { result[i] += 1.0;
03091     for (j = 0; j < i; j++)
03092     { const double distance = metric(ndata, data, data, mask, mask, weights,
03093                                      i, j, transpose);
03094       if (distance < cutoff)
03095       { const double dweight = exp(exponent*log(1-distance/cutoff));
03096         /* pow() causes a crash on AIX */
03097         result[i] += dweight;
03098         result[j] += dweight;
03099       }
03100     }
03101   }
03102   for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
03103   return result;
03104 }
03105 
03106 /* ******************************************************************** */
03107 
03108 void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]) 
03109 
03110 /*
03111 Purpose
03112 =======
03113 
03114 The cuttree routine takes the output of a hierarchical clustering routine, and
03115 divides the elements in the tree structure into clusters based on the
03116 hierarchical clustering result. The number of clusters is specified by the user.
03117 
03118 Arguments
03119 =========
03120 
03121 nelements      (input) int
03122 The number of elements that were clustered.
03123 
03124 tree           (input) Node[nelements-1]
03125 The clustering solution. Each node in the array describes one linking event,
03126 with tree[i].left and tree[i].right representig the elements that were joined.
03127 The original elements are numbered 0..nelements-1, nodes are numbered
03128 -1..-(nelements-1).
03129 
03130 nclusters      (input) int
03131 The number of clusters to be formed.
03132 
03133 clusterid      (output) int[nelements]
03134 The number of the cluster to which each element was assigned. Space for this
03135 array should be allocated before calling the cuttree routine. If a memory
03136 error occured, all elements in clusterid are set to -1.
03137 
03138 ========================================================================
03139 */
03140 { int i, j, k;
03141   int icluster = 0;
03142   const int n = nelements-nclusters; /* number of nodes to join */
03143   int* nodeid;
03144   for (i = nelements-2; i >= n; i--)
03145   { k = tree[i].left;
03146     if (k>=0)
03147     { clusterid[k] = icluster;
03148       icluster++;
03149     }
03150     k = tree[i].right;
03151     if (k>=0)
03152     { clusterid[k] = icluster;
03153       icluster++;
03154     }
03155   }
03156   nodeid = malloc(n*sizeof(int));
03157   if(!nodeid)
03158   { for (i = 0; i < nelements; i++) clusterid[i] = -1;
03159     return;
03160   }
03161   for (i = 0; i < n; i++) nodeid[i] = -1;
03162   for (i = n-1; i >= 0; i--)
03163   { if(nodeid[i]<0) 
03164     { j = icluster;
03165       nodeid[i] = j;
03166       icluster++;
03167     }
03168     else j = nodeid[i];
03169     k = tree[i].left;
03170     if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
03171     k = tree[i].right;
03172     if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
03173   }
03174   free(nodeid);
03175   return;
03176 }
03177 
03178 /* ******************************************************************** */
03179 
03180 static
03181 Node* pclcluster (int nrows, int ncolumns, double** data, int** mask,
03182   double weight[], double** distmatrix, char dist, int transpose)
03183 
03184 /*
03185 
03186 Purpose
03187 =======
03188 
03189 The pclcluster routine performs clustering using pairwise centroid-linking
03190 on a given set of gene expression data, using the distance metric given by dist.
03191 
03192 Arguments
03193 =========
03194 
03195 nrows     (input) int
03196 The number of rows in the gene expression data matrix, equal to the number of
03197 genes.
03198 
03199 ncolumns  (input) int
03200 The number of columns in the gene expression data matrix, equal to the number of
03201 microarrays.
03202 
03203 data       (input) double[nrows][ncolumns]
03204 The array containing the gene expression data.
03205 
03206 mask       (input) int[nrows][ncolumns]
03207 This array shows which data values are missing. If
03208 mask[i][j] == 0, then data[i][j] is missing.
03209 
03210 weight     (input) double[ncolumns] if transpose==0;
03211                    double[nrows]    if transpose==1
03212 The weights that are used to calculate the distance. The length of this vector
03213 is ncolumns if genes are being clustered, and nrows if microarrays are being
03214 clustered.
03215 
03216 transpose  (input) int
03217 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
03218 of the matrix are clustered.
03219 
03220 dist       (input) char
03221 Defines which distance measure is used, as given by the table:
03222 dist=='e': Euclidean distance
03223 dist=='b': City-block distance
03224 dist=='c': correlation
03225 dist=='a': absolute value of the correlation
03226 dist=='u': uncentered correlation
03227 dist=='x': absolute uncentered correlation
03228 dist=='s': Spearman's rank correlation
03229 dist=='k': Kendall's tau
03230 For other values of dist, the default (Euclidean distance) is used.
03231 
03232 distmatrix (input) double**
03233 The distance matrix. This matrix is precalculated by the calling routine
03234 treecluster. The pclcluster routine modifies the contents of distmatrix, but
03235 does not deallocate it.
03236 
03237 Return value
03238 ============
03239 
03240 A pointer to a newly allocated array of Node structs, describing the
03241 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
03242 whether genes (rows) or microarrays (columns) were clustered, nelements is
03243 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
03244 structure.
03245 If a memory error occurs, pclcluster returns NULL.
03246 ========================================================================
03247 */
03248 { int i, j;
03249   const int nelements = (transpose==0) ? nrows : ncolumns;
03250   int inode;
03251   const int ndata = transpose ? nrows : ncolumns;
03252   const int nnodes = nelements - 1;
03253 
03254   /* Set the metric function as indicated by dist */
03255   double (*metric)
03256     (int, double**, double**, int**, int**, const double[], int, int, int) =
03257        setmetric(dist);
03258 
03259   Node* result;
03260   double** newdata;
03261   int** newmask;
03262   int* distid = malloc(nelements*sizeof(int));
03263   if(!distid) return NULL;
03264   result = malloc(nnodes*sizeof(Node));
03265   if(!result)
03266   { free(distid);
03267     return NULL;
03268   }
03269   if(!makedatamask(nelements, ndata, &newdata, &newmask))
03270   { free(result);
03271     free(distid);
03272     return NULL; 
03273   }
03274 
03275   for (i = 0; i < nelements; i++) distid[i] = i;
03276   /* To remember which row/column in the distance matrix contains what */
03277 
03278   /* Storage for node data */
03279   if (transpose)
03280   { for (i = 0; i < nelements; i++)
03281     { for (j = 0; j < ndata; j++)
03282       { newdata[i][j] = data[j][i];
03283         newmask[i][j] = mask[j][i];
03284       }
03285     }
03286     data = newdata;
03287     mask = newmask;
03288   }
03289   else
03290   { for (i = 0; i < nelements; i++)
03291     { memcpy(newdata[i], data[i], ndata*sizeof(double));
03292       memcpy(newmask[i], mask[i], ndata*sizeof(int));
03293     }
03294     data = newdata;
03295     mask = newmask;
03296   }
03297 
03298   for (inode = 0; inode < nnodes; inode++)
03299   { /* Find the pair with the shortest distance */
03300     int is = 1;
03301     int js = 0;
03302     result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
03303     result[inode].left = distid[js];
03304     result[inode].right = distid[is];
03305 
03306     /* Make node js the new node */
03307     for (i = 0; i < ndata; i++)
03308     { data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i];
03309       mask[js][i] += mask[is][i];
03310       if (mask[js][i]) data[js][i] /= mask[js][i];
03311     }
03312     free(data[is]);
03313     free(mask[is]);
03314     data[is] = data[nnodes-inode];
03315     mask[is] = mask[nnodes-inode];
03316   
03317     /* Fix the distances */
03318     distid[is] = distid[nnodes-inode];
03319     for (i = 0; i < is; i++)
03320       distmatrix[is][i] = distmatrix[nnodes-inode][i];
03321     for (i = is + 1; i < nnodes-inode; i++)
03322       distmatrix[i][is] = distmatrix[nnodes-inode][i];
03323 
03324     distid[js] = -inode-1;
03325     for (i = 0; i < js; i++)
03326       distmatrix[js][i] = metric(ndata,data,data,mask,mask,weight,js,i,0);
03327     for (i = js + 1; i < nnodes-inode; i++)
03328       distmatrix[i][js] = metric(ndata,data,data,mask,mask,weight,js,i,0);
03329   }
03330 
03331   /* Free temporarily allocated space */
03332   free(data[0]);
03333   free(mask[0]);
03334   free(data);
03335   free(mask);
03336   free(distid);
03337  
03338   return result;
03339 }
03340 
03341 /* ******************************************************************** */
03342 
03343 static
03344 int nodecompare(const void* a, const void* b)
03345 /* Helper function for qsort. */
03346 { const Node* node1 = (const Node*)a;
03347   const Node* node2 = (const Node*)b;
03348   const double term1 = node1->distance;
03349   const double term2 = node2->distance;
03350   if (term1 < term2) return -1;
03351   if (term1 > term2) return +1;
03352   return 0;
03353 }
03354 
03355 /* ---------------------------------------------------------------------- */
03356 
03357 static
03358 Node* pslcluster (int nrows, int ncolumns, double** data, int** mask,
03359   double weight[], double** distmatrix, char dist, int transpose)
03360 
03361 /*
03362 
03363 Purpose
03364 =======
03365 
03366 The pslcluster routine performs single-linkage hierarchical clustering, using
03367 either the distance matrix directly, if available, or by calculating the
03368 distances from the data array. This implementation is based on the SLINK
03369 algorithm, described in:
03370 Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
03371 cluster method. The Computer Journal, 16(1): 30-34.
03372 The output of this algorithm is identical to conventional single-linkage
03373 hierarchical clustering, but is much more memory-efficient and faster. Hence,
03374 it can be applied to large data sets, for which the conventional single-
03375 linkage algorithm fails due to lack of memory.
03376 
03377 
03378 Arguments
03379 =========
03380 
03381 nrows     (input) int
03382 The number of rows in the gene expression data matrix, equal to the number of
03383 genes.
03384 
03385 ncolumns  (input) int
03386 The number of columns in the gene expression data matrix, equal to the number of
03387 microarrays.
03388 
03389 data       (input) double[nrows][ncolumns]
03390 The array containing the gene expression data.
03391 
03392 mask       (input) int[nrows][ncolumns]
03393 This array shows which data values are missing. If
03394 mask[i][j] == 0, then data[i][j] is missing.
03395 
03396 weight (input) double[n]
03397 The weights that are used to calculate the distance. The length of this vector
03398 is ncolumns if genes are being clustered, and nrows if microarrays are being
03399 clustered.
03400 
03401 transpose  (input) int
03402 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
03403 of the matrix are clustered.
03404 
03405 dist       (input) char
03406 Defines which distance measure is used, as given by the table:
03407 dist=='e': Euclidean distance
03408 dist=='b': City-block distance
03409 dist=='c': correlation
03410 dist=='a': absolute value of the correlation
03411 dist=='u': uncentered correlation
03412 dist=='x': absolute uncentered correlation
03413 dist=='s': Spearman's rank correlation
03414 dist=='k': Kendall's tau
03415 For other values of dist, the default (Euclidean distance) is used.
03416 
03417 distmatrix (input) double**
03418 The distance matrix. If the distance matrix is passed by the calling routine
03419 treecluster, it is used by pslcluster to speed up the clustering calculation.
03420 The pslcluster routine does not modify the contents of distmatrix, and does
03421 not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
03422 by the pslcluster routine from the gene expression data (the data and mask
03423 arrays) and stored in temporary arrays. If distmatrix is passed, the original
03424 gene expression data (specified by the data and mask arguments) are not needed
03425 and are therefore ignored.
03426 
03427 
03428 Return value
03429 ============
03430 
03431 A pointer to a newly allocated array of Node structs, describing the
03432 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
03433 whether genes (rows) or microarrays (columns) were clustered, nelements is
03434 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
03435 structure.
03436 If a memory error occurs, pslcluster returns NULL.
03437 
03438 ========================================================================
03439 */
03440 { int i, j, k;
03441   const int nelements = transpose ? ncolumns : nrows;
03442   const int nnodes = nelements - 1;
03443   int* vector;
03444   double* temp;
03445   int* index;
03446   Node* result;
03447   temp = malloc(nnodes*sizeof(double));
03448   if(!temp) return NULL;
03449   index = malloc(nelements*sizeof(int));
03450   if(!index)
03451   { free(temp);
03452     return NULL;
03453   }
03454   vector = malloc(nnodes*sizeof(int));
03455   if(!vector)
03456   { free(index);
03457     free(temp);
03458     return NULL;
03459   }
03460   result = malloc(nelements*sizeof(Node));
03461   if(!result)
03462   { free(vector);
03463     free(index);
03464     free(temp);
03465     return NULL;
03466   }
03467 
03468   for (i = 0; i < nnodes; i++) vector[i] = i;
03469 
03470   if(distmatrix)
03471   { for (i = 0; i < nrows; i++)
03472     { result[i].distance = DBL_MAX;
03473       for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
03474       for (j = 0; j < i; j++)
03475       { k = vector[j];
03476         if (result[j].distance >= temp[j])
03477         { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
03478           result[j].distance = temp[j];
03479           vector[j] = i;
03480         }
03481         else if (temp[j] < temp[k]) temp[k] = temp[j];
03482       }
03483       for (j = 0; j < i; j++)
03484       {
03485         if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
03486       }
03487     }
03488   }
03489   else
03490   { const int ndata = transpose ? nrows : ncolumns;
03491     /* Set the metric function as indicated by dist */
03492     double (*metric)
03493       (int, double**, double**, int**, int**, const double[], int, int, int) =
03494          setmetric(dist);
03495 
03496     for (i = 0; i < nelements; i++)
03497     { result[i].distance = DBL_MAX;
03498       for (j = 0; j < i; j++) temp[j] =
03499         metric(ndata, data, data, mask, mask, weight, i, j, transpose);
03500       for (j = 0; j < i; j++)
03501       { k = vector[j];
03502         if (result[j].distance >= temp[j])
03503         { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
03504           result[j].distance = temp[j];
03505           vector[j] = i;
03506         }
03507         else if (temp[j] < temp[k]) temp[k] = temp[j];
03508       }
03509       for (j = 0; j < i; j++)
03510         if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
03511     }
03512   }
03513   free(temp);
03514 
03515   for (i = 0; i < nnodes; i++) result[i].left = i;
03516   qsort(result, nnodes, sizeof(Node), nodecompare);
03517 
03518   for (i = 0; i < nelements; i++) index[i] = i;
03519   for (i = 0; i < nnodes; i++)
03520   { j = result[i].left;
03521     k = vector[j];
03522     result[i].left = index[j];
03523     result[i].right = index[k];
03524     index[k] = -i-1;
03525   }
03526   free(vector);
03527   free(index);
03528 
03529   result = realloc(result, nnodes*sizeof(Node));
03530 
03531   return result;
03532 }
03533 /* ******************************************************************** */
03534 
03535 static Node* pmlcluster (int nelements, double** distmatrix)
03536 /*
03537 
03538 Purpose
03539 =======
03540 
03541 The pmlcluster routine performs clustering using pairwise maximum- (complete-)
03542 linking on the given distance matrix.
03543 
03544 Arguments
03545 =========
03546 
03547 nelements     (input) int
03548 The number of elements to be clustered.
03549 
03550 distmatrix (input) double**
03551 The distance matrix, with nelements rows, each row being filled up to the
03552 diagonal. The elements on the diagonal are not used, as they are assumed to be
03553 zero. The distance matrix will be modified by this routine.
03554 
03555 Return value
03556 ============
03557 
03558 A pointer to a newly allocated array of Node structs, describing the
03559 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
03560 whether genes (rows) or microarrays (columns) were clustered, nelements is
03561 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
03562 structure.
03563 If a memory error occurs, pmlcluster returns NULL.
03564 ========================================================================
03565 */
03566 { int j;
03567   int n;
03568   int* clusterid;
03569   Node* result;
03570 
03571   clusterid = malloc(nelements*sizeof(int));
03572   if(!clusterid) return NULL;
03573   result = malloc((nelements-1)*sizeof(Node));
03574   if (!result)
03575   { free(clusterid);
03576     return NULL;
03577   }
03578 
03579   /* Setup a list specifying to which cluster a gene belongs */
03580   for (j = 0; j < nelements; j++) clusterid[j] = j;
03581 
03582   for (n = nelements; n > 1; n--)
03583   { int is = 1;
03584     int js = 0;
03585     result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
03586 
03587     /* Fix the distances */
03588     for (j = 0; j < js; j++)
03589       distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
03590     for (j = js+1; j < is; j++)
03591       distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
03592     for (j = is+1; j < n; j++)
03593       distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
03594 
03595     for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
03596     for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
03597 
03598     /* Update clusterids */
03599     result[nelements-n].left = clusterid[is];
03600     result[nelements-n].right = clusterid[js];
03601     clusterid[js] = n-nelements-1;
03602     clusterid[is] = clusterid[n-1];
03603   }
03604   free(clusterid);
03605 
03606   return result;
03607 }
03608 
03609 /* ******************************************************************* */
03610 
03611 static Node* palcluster (int nelements, double** distmatrix)
03612 /*
03613 Purpose
03614 =======
03615 
03616 The palcluster routine performs clustering using pairwise average
03617 linking on the given distance matrix.
03618 
03619 Arguments
03620 =========
03621 
03622 nelements     (input) int
03623 The number of elements to be clustered.
03624 
03625 distmatrix (input) double**
03626 The distance matrix, with nelements rows, each row being filled up to the
03627 diagonal. The elements on the diagonal are not used, as they are assumed to be
03628 zero. The distance matrix will be modified by this routine.
03629 
03630 Return value
03631 ============
03632 
03633 A pointer to a newly allocated array of Node structs, describing the
03634 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
03635 whether genes (rows) or microarrays (columns) were clustered, nelements is
03636 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
03637 structure.
03638 If a memory error occurs, palcluster returns NULL.
03639 ========================================================================
03640 */
03641 { int j;
03642   int n;
03643   int* clusterid;
03644   int* number;
03645   Node* result;
03646 
03647   clusterid = malloc(nelements*sizeof(int));
03648   if(!clusterid) return NULL;
03649   number = malloc(nelements*sizeof(int));
03650   if(!number)
03651   { free(clusterid);
03652     return NULL;
03653   }
03654   result = malloc((nelements-1)*sizeof(Node));
03655   if (!result)
03656   { free(clusterid);
03657     free(number);
03658     return NULL;
03659   }
03660 
03661   /* Setup a list specifying to which cluster a gene belongs, and keep track
03662    * of the number of elements in each cluster (needed to calculate the
03663    * average). */
03664   for (j = 0; j < nelements; j++)
03665   { number[j] = 1;
03666     clusterid[j] = j;
03667   }
03668 
03669   for (n = nelements; n > 1; n--)
03670   { int sum;
03671     int is = 1;
03672     int js = 0;
03673     result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
03674 
03675     /* Save result */
03676     result[nelements-n].left = clusterid[is];
03677     result[nelements-n].right = clusterid[js];
03678 
03679     /* Fix the distances */
03680     sum = number[is] + number[js];
03681     for (j = 0; j < js; j++)
03682     { distmatrix[js][j] = distmatrix[is][j]*number[is]
03683                         + distmatrix[js][j]*number[js];
03684       distmatrix[js][j] /= sum;
03685     }
03686     for (j = js+1; j < is; j++)
03687     { distmatrix[j][js] = distmatrix[is][j]*number[is]
03688                         + distmatrix[j][js]*number[js];
03689       distmatrix[j][js] /= sum;
03690     }
03691     for (j = is+1; j < n; j++)
03692     { distmatrix[j][js] = distmatrix[j][is]*number[is]
03693                         + distmatrix[j][js]*number[js];
03694       distmatrix[j][js] /= sum;
03695     }
03696 
03697     for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
03698     for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
03699 
03700     /* Update number of elements in the clusters */
03701     number[js] = sum;
03702     number[is] = number[n-1];
03703 
03704     /* Update clusterids */
03705     clusterid[js] = n-nelements-1;
03706     clusterid[is] = clusterid[n-1];
03707   }
03708   free(clusterid);
03709   free(number);
03710 
03711   return result;
03712 }
03713 
03714 /* ******************************************************************* */
03715 
03716 Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
03717   double weight[], int transpose, char dist, char method, double** distmatrix)
03718 /*
03719 Purpose
03720 =======
03721 
03722 The treecluster routine performs hierarchical clustering using pairwise
03723 single-, maximum-, centroid-, or average-linkage, as defined by method, on a
03724 given set of gene expression data, using the distance metric given by dist.
03725 If successful, the function returns a pointer to a newly allocated Tree struct
03726 containing the hierarchical clustering solution, and NULL if a memory error
03727 occurs. The pointer should be freed by the calling routine to prevent memory
03728 leaks.
03729 
03730 Arguments
03731 =========
03732 
03733 nrows     (input) int
03734 The number of rows in the data matrix, equal to the number of genes.
03735 
03736 ncolumns  (input) int
03737 The number of columns in the data matrix, equal to the number of microarrays.
03738 
03739 data       (input) double[nrows][ncolumns]
03740 The array containing the data of the vectors to be clustered.
03741 
03742 mask       (input) int[nrows][ncolumns]
03743 This array shows which data values are missing. If mask[i][j]==0, then
03744 data[i][j] is missing.
03745 
03746 weight (input) double array[n]
03747 The weights that are used to calculate the distance.
03748 
03749 transpose  (input) int
03750 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
03751 of the matrix are clustered.
03752 
03753 dist       (input) char
03754 Defines which distance measure is used, as given by the table:
03755 dist=='e': Euclidean distance
03756 dist=='b': City-block distance
03757 dist=='c': correlation
03758 dist=='a': absolute value of the correlation
03759 dist=='u': uncentered correlation
03760 dist=='x': absolute uncentered correlation
03761 dist=='s': Spearman's rank correlation
03762 dist=='k': Kendall's tau
03763 For other values of dist, the default (Euclidean distance) is used.
03764 
03765 method     (input) char
03766 Defines which hierarchical clustering method is used:
03767 method=='s': pairwise single-linkage clustering
03768 method=='m': pairwise maximum- (or complete-) linkage clustering
03769 method=='a': pairwise average-linkage clustering
03770 method=='c': pairwise centroid-linkage clustering
03771 For the first three, either the distance matrix or the gene expression data is
03772 sufficient to perform the clustering algorithm. For pairwise centroid-linkage
03773 clustering, however, the gene expression data are always needed, even if the
03774 distance matrix itself is available.
03775 
03776 distmatrix (input) double**
03777 The distance matrix. If the distance matrix is zero initially, the distance
03778 matrix will be allocated and calculated from the data by treecluster, and
03779 deallocated before treecluster returns. If the distance matrix is passed by the
03780 calling routine, treecluster will modify the contents of the distance matrix as
03781 part of the clustering algorithm, but will not deallocate it. The calling
03782 routine should deallocate the distance matrix after the return from treecluster.
03783 
03784 Return value
03785 ============
03786 
03787 A pointer to a newly allocated array of Node structs, describing the
03788 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
03789 whether genes (rows) or microarrays (columns) were clustered, nelements is
03790 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
03791 structure.
03792 If a memory error occurs, treecluster returns NULL.
03793 
03794 ========================================================================
03795 */
03796 { Node* result = NULL;
03797   const int nelements = (transpose==0) ? nrows : ncolumns;
03798   const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
03799 
03800   if (nelements < 2) return NULL;
03801 
03802   /* Calculate the distance matrix if the user didn't give it */
03803   if(ldistmatrix)
03804   { distmatrix =
03805       distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose);
03806     if (!distmatrix) return NULL; /* Insufficient memory */
03807   }
03808 
03809   switch(method)
03810   { case 's':
03811       result = pslcluster(nrows, ncolumns, data, mask, weight, distmatrix,
03812                           dist, transpose);
03813       break;
03814     case 'm':
03815       result = pmlcluster(nelements, distmatrix);
03816       break;
03817     case 'a':
03818       result = palcluster(nelements, distmatrix);
03819       break;
03820     case 'c':
03821       result = pclcluster(nrows, ncolumns, data, mask, weight, distmatrix,
03822                           dist, transpose);
03823       break;
03824   }
03825 
03826   /* Deallocate space for distance matrix, if it was allocated by treecluster */
03827   if(ldistmatrix)
03828   { int i;
03829     for (i = 1; i < nelements; i++) free(distmatrix[i]);
03830     free (distmatrix);
03831   }
03832  
03833   return result;
03834 }
03835 
03836 /* ******************************************************************* */
03837 
03838 static
03839 void somworker (int nrows, int ncolumns, double** data, int** mask,
03840   const double weights[], int transpose, int nxgrid, int nygrid,
03841   double inittau, double*** celldata, int niter, char dist)
03842 
03843 { const int nelements = (transpose==0) ? nrows : ncolumns;
03844   const int ndata = (transpose==0) ? ncolumns : nrows;
03845   int i, j;
03846   double* stddata = calloc(nelements,sizeof(double));
03847   int** dummymask;
03848   int ix, iy;
03849   int* index;
03850   int iter;
03851   /* Maximum radius in which nodes are adjusted */
03852   double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid);
03853 
03854   /* Set the metric function as indicated by dist */
03855   double (*metric)
03856     (int, double**, double**, int**, int**, const double[], int, int, int) =
03857        setmetric(dist);
03858 
03859   /* Calculate the standard deviation for each row or column */
03860   if (transpose==0)
03861   { for (i = 0; i < nelements; i++)
03862     { int n = 0;
03863       for (j = 0; j < ndata; j++)
03864       { if (mask[i][j])
03865         { double term = data[i][j];
03866           term = term * term;
03867           stddata[i] += term;
03868           n++;
03869         }
03870       }
03871       if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
03872       else stddata[i] = 1;
03873     }
03874   }
03875   else
03876   { for (i = 0; i < nelements; i++)
03877     { int n = 0;
03878       for (j = 0; j < ndata; j++)
03879       { if (mask[j][i])
03880         { double term = data[j][i];
03881           term = term * term;
03882           stddata[i] += term;
03883           n++;
03884         }
03885       }
03886       if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
03887       else stddata[i] = 1;
03888     }
03889   }
03890 
03891   if (transpose==0)
03892   { dummymask = malloc(nygrid*sizeof(int*));
03893     for (i = 0; i < nygrid; i++)
03894     { dummymask[i] = malloc(ndata*sizeof(int));
03895       for (j = 0; j < ndata; j++) dummymask[i][j] = 1;
03896     }
03897   }
03898   else
03899   { dummymask = malloc(ndata*sizeof(int*));
03900     for (i = 0; i < ndata; i++)
03901     { dummymask[i] = malloc(sizeof(int));
03902       dummymask[i][0] = 1;
03903     }
03904   }
03905 
03906   /* Randomly initialize the nodes */
03907   for (ix = 0; ix < nxgrid; ix++)
03908   { for (iy = 0; iy < nygrid; iy++)
03909     { double sum = 0.;
03910       for (i = 0; i < ndata; i++)
03911       { double term = -1.0 + 2.0*uniform();
03912         celldata[ix][iy][i] = term;
03913         sum += term * term;
03914       }
03915       sum = sqrt(sum/ndata);
03916       for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
03917     }
03918   }
03919 
03920   /* Randomize the order in which genes or arrays will be used */
03921   index = malloc(nelements*sizeof(int));
03922   for (i = 0; i < nelements; i++) index[i] = i;
03923   for (i = 0; i < nelements; i++)
03924   { j = (int) (i + (nelements-i)*uniform());
03925     ix = index[j];
03926     index[j] = index[i];
03927     index[i] = ix;
03928   }
03929 
03930   /* Start the iteration */
03931   for (iter = 0; iter < niter; iter++)
03932   { int ixbest = 0;
03933     int iybest = 0;
03934     int iobject = iter % nelements;
03935     iobject = index[iobject];
03936     if (transpose==0)
03937     { double closest = metric(ndata,data,celldata[ixbest],
03938         mask,dummymask,weights,iobject,iybest,transpose);
03939       double radius = maxradius * (1. - ((double)iter)/((double)niter));
03940       double tau = inittau * (1. - ((double)iter)/((double)niter));
03941 
03942       for (ix = 0; ix < nxgrid; ix++)
03943       { for (iy = 0; iy < nygrid; iy++)
03944         { double distance =
03945             metric (ndata,data,celldata[ix],
03946               mask,dummymask,weights,iobject,iy,transpose);
03947           if (distance < closest)
03948           { ixbest = ix;
03949             iybest = iy;
03950             closest = distance;
03951           }
03952         }
03953       }
03954       for (ix = 0; ix < nxgrid; ix++)
03955       { for (iy = 0; iy < nygrid; iy++)
03956         { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
03957           { double sum = 0.;
03958             for (i = 0; i < ndata; i++)
03959             { if (mask[iobject][i]==0) continue;
03960               celldata[ix][iy][i] +=
03961                 tau * (data[iobject][i]/stddata[iobject]-celldata[ix][iy][i]);
03962             }
03963             for (i = 0; i < ndata; i++)
03964             { double term = celldata[ix][iy][i];
03965               term = term * term;
03966               sum += term;
03967             }
03968             if (sum>0)
03969             { sum = sqrt(sum/ndata);
03970               for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
03971             }
03972           }
03973         }
03974       }
03975     }
03976     else
03977     { double closest;
03978       double** celldatavector = malloc(ndata*sizeof(double*));
03979       double radius = maxradius * (1. - ((double)iter)/((double)niter));
03980       double tau = inittau * (1. - ((double)iter)/((double)niter));
03981 
03982       for (i = 0; i < ndata; i++)
03983         celldatavector[i] = &(celldata[ixbest][iybest][i]);
03984       closest = metric(ndata,data,celldatavector,
03985         mask,dummymask,weights,iobject,0,transpose);
03986       for (ix = 0; ix < nxgrid; ix++)
03987       { for (iy = 0; iy < nygrid; iy++)
03988         { double distance;
03989           for (i = 0; i < ndata; i++)
03990             celldatavector[i] = &(celldata[ixbest][iybest][i]);
03991           distance =
03992             metric (ndata,data,celldatavector,
03993               mask,dummymask,weights,iobject,0,transpose);
03994           if (distance < closest)
03995           { ixbest = ix;
03996             iybest = iy;
03997             closest = distance;
03998           }
03999         }
04000       }
04001       free(celldatavector);
04002       for (ix = 0; ix < nxgrid; ix++)
04003       { for (iy = 0; iy < nygrid; iy++)
04004         { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
04005           { double sum = 0.;
04006             for (i = 0; i < ndata; i++)
04007             { if (mask[i][iobject]==0) continue;
04008               celldata[ix][iy][i] +=
04009                 tau * (data[i][iobject]/stddata[iobject]-celldata[ix][iy][i]);
04010             }
04011             for (i = 0; i < ndata; i++)
04012             { double term = celldata[ix][iy][i];
04013               term = term * term;
04014               sum += term;
04015             }
04016             if (sum>0)
04017             { sum = sqrt(sum/ndata);
04018               for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
04019             }
04020           }
04021         }
04022       }
04023     }
04024   }
04025   if (transpose==0)
04026     for (i = 0; i < nygrid; i++) free(dummymask[i]);
04027   else
04028     for (i = 0; i < ndata; i++) free(dummymask[i]);
04029   free(dummymask);
04030   free(stddata);
04031   free(index);
04032   return;
04033 }
04034 
04035 /* ******************************************************************* */
04036 
04037 static
04038 void somassign (int nrows, int ncolumns, double** data, int** mask,
04039   const double weights[], int transpose, int nxgrid, int nygrid,
04040   double*** celldata, char dist, int clusterid[][2])
04041 /* Collect clusterids */
04042 { const int ndata = (transpose==0) ? ncolumns : nrows;
04043   int i,j;
04044 
04045   /* Set the metric function as indicated by dist */
04046   double (*metric)
04047     (int, double**, double**, int**, int**, const double[], int, int, int) =
04048        setmetric(dist);
04049 
04050   if (transpose==0)
04051   { int** dummymask = malloc(nygrid*sizeof(int*));
04052     for (i = 0; i < nygrid; i++)
04053     { dummymask[i] = malloc(ncolumns*sizeof(int));
04054       for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1;
04055     }
04056     for (i = 0; i < nrows; i++)
04057     { int ixbest = 0;
04058       int iybest = 0;
04059       double closest = metric(ndata,data,celldata[ixbest],
04060         mask,dummymask,weights,i,iybest,transpose);
04061       int ix, iy;
04062       for (ix = 0; ix < nxgrid; ix++)
04063       { for (iy = 0; iy < nygrid; iy++)
04064         { double distance =
04065             metric (ndata,data,celldata[ix],
04066               mask,dummymask,weights,i,iy,transpose);
04067           if (distance < closest)
04068           { ixbest = ix;
04069             iybest = iy;
04070             closest = distance;
04071           }
04072         }
04073       }
04074       clusterid[i][0] = ixbest;
04075       clusterid[i][1] = iybest;
04076     }
04077     for (i = 0; i < nygrid; i++) free(dummymask[i]);
04078     free(dummymask);
04079   }
04080   else
04081   { double** celldatavector = malloc(ndata*sizeof(double*));
04082     int** dummymask = malloc(nrows*sizeof(int*));
04083     int ixbest = 0;
04084     int iybest = 0;
04085     for (i = 0; i < nrows; i++)
04086     { dummymask[i] = malloc(sizeof(int));
04087       dummymask[i][0] = 1;
04088     }
04089     for (i = 0; i < ncolumns; i++)
04090     { double closest;
04091       int ix, iy;
04092       for (j = 0; j < ndata; j++)
04093         celldatavector[j] = &(celldata[ixbest][iybest][j]);
04094       closest = metric(ndata,data,celldatavector,
04095         mask,dummymask,weights,i,0,transpose);
04096       for (ix = 0; ix < nxgrid; ix++)
04097       { for (iy = 0; iy < nygrid; iy++)
04098         { double distance;
04099           for(j = 0; j < ndata; j++)
04100             celldatavector[j] = &(celldata[ix][iy][j]);
04101           distance = metric(ndata,data,celldatavector,
04102             mask,dummymask,weights,i,0,transpose);
04103           if (distance < closest)
04104           { ixbest = ix;
04105             iybest = iy;
04106             closest = distance;
04107           }
04108         }
04109       }
04110       clusterid[i][0] = ixbest;
04111       clusterid[i][1] = iybest;
04112     }
04113     free(celldatavector);
04114     for (i = 0; i < nrows; i++) free(dummymask[i]);
04115     free(dummymask);
04116   }
04117   return;
04118 }
04119 
04120 /* ******************************************************************* */
04121 
04122 void somcluster (int nrows, int ncolumns, double** data, int** mask,
04123   const double weight[], int transpose, int nxgrid, int nygrid,
04124   double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
04125 /*
04126 
04127 Purpose
04128 =======
04129 
04130 The somcluster routine implements a self-organizing map (Kohonen) on a
04131 rectangular grid, using a given set of vectors. The distance measure to be
04132 used to find the similarity between genes and nodes is given by dist.
04133 
04134 Arguments
04135 =========
04136 
04137 nrows     (input) int
04138 The number of rows in the data matrix, equal to the number of genes.
04139 
04140 ncolumns  (input) int
04141 The number of columns in the data matrix, equal to the number of microarrays.
04142 
04143 data       (input) double[nrows][ncolumns]
04144 The array containing the gene expression data.
04145 
04146 mask       (input) int[nrows][ncolumns]
04147 This array shows which data values are missing. If
04148 mask[i][j] == 0, then data[i][j] is missing.
04149 
04150 weights    (input) double[ncolumns] if transpose==0;
04151                    double[nrows]    if transpose==1
04152 The weights that are used to calculate the distance. The length of this vector
04153 is ncolumns if genes are being clustered, or nrows if microarrays are being
04154 clustered.
04155 
04156 transpose  (input) int
04157 If transpose==0, the rows (genes) of the matrix are clustered. Otherwise,
04158 columns (microarrays) of the matrix are clustered.
04159 
04160 nxgrid    (input) int
04161 The number of grid cells horizontally in the rectangular topology of clusters.
04162 
04163 nygrid    (input) int
04164 The number of grid cells horizontally in the rectangular topology of clusters.
04165 
04166 inittau    (input) double
04167 The initial value of tau, representing the neighborhood function.
04168 
04169 niter      (input) int
04170 The number of iterations to be performed.
04171 
04172 dist       (input) char
04173 Defines which distance measure is used, as given by the table:
04174 dist=='e': Euclidean distance
04175 dist=='b': City-block distance
04176 dist=='c': correlation
04177 dist=='a': absolute value of the correlation
04178 dist=='u': uncentered correlation
04179 dist=='x': absolute uncentered correlation
04180 dist=='s': Spearman's rank correlation
04181 dist=='k': Kendall's tau
04182 For other values of dist, the default (Euclidean distance) is used.
04183 
04184 celldata (output) double[nxgrid][nygrid][ncolumns] if transpose==0;
04185                   double[nxgrid][nygrid][nrows]    if tranpose==1
04186 The gene expression data for each node (cell) in the 2D grid. This can be
04187 interpreted as the centroid for the cluster corresponding to that cell. If
04188 celldata is NULL, then the centroids are not returned. If celldata is not
04189 NULL, enough space should be allocated to store the centroid data before callingsomcluster.
04190 
04191 clusterid (output), int[nrows][2]    if transpose==0;
04192                     int[ncolumns][2] if transpose==1
04193 For each item (gene or microarray) that is clustered, the coordinates of the
04194 cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
04195 cluster assignments are not returned. If clusterid is not NULL, enough memory
04196 should be allocated to store the clustering information before calling
04197 somcluster.
04198 
04199 ========================================================================
04200 */
04201 { const int nobjects = (transpose==0) ? nrows : ncolumns;
04202   const int ndata = (transpose==0) ? ncolumns : nrows;
04203   int i,j;
04204   const int lcelldata = (celldata==NULL) ? 0 : 1;
04205 
04206   if (nobjects < 2) return;
04207 
04208   if (lcelldata==0)
04209   { celldata = malloc(nxgrid*nygrid*ndata*sizeof(double**));
04210     for (i = 0; i < nxgrid; i++)
04211     { celldata[i] = malloc(nygrid*ndata*sizeof(double*));
04212       for (j = 0; j < nygrid; j++)
04213         celldata[i][j] = malloc(ndata*sizeof(double));
04214     }
04215   }
04216 
04217   somworker (nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
04218     inittau, celldata, niter, dist);
04219   if (clusterid)
04220     somassign (nrows, ncolumns, data, mask, weight, transpose,
04221       nxgrid, nygrid, celldata, dist, clusterid);
04222   if(lcelldata==0)
04223   { for (i = 0; i < nxgrid; i++)
04224       for (j = 0; j < nygrid; j++)
04225         free(celldata[i][j]);
04226     for (i = 0; i < nxgrid; i++)
04227       free(celldata[i]);
04228     free(celldata);
04229   }
04230   return;
04231 }
04232 
04233 /* ******************************************************************** */
04234 
04235 double clusterdistance (int nrows, int ncolumns, double** data,
04236   int** mask, double weight[], int n1, int n2, int index1[], int index2[],
04237   char dist, char method, int transpose)
04238               
04239 /*
04240 Purpose
04241 =======
04242 
04243 The clusterdistance routine calculates the distance between two clusters
04244 containing genes or microarrays using the measured gene expression vectors. The
04245 distance between clusters, given the genes/microarrays in each cluster, can be
04246 defined in several ways. Several distance measures can be used.
04247 
04248 The routine returns the distance in double precision.
04249 If the parameter transpose is set to a nonzero value, the clusters are
04250 interpreted as clusters of microarrays, otherwise as clusters of gene.
04251 
04252 Arguments
04253 =========
04254 
04255 nrows     (input) int
04256 The number of rows (i.e., the number of genes) in the gene expression data
04257 matrix.
04258 
04259 ncolumns      (input) int
04260 The number of columns (i.e., the number of microarrays) in the gene expression
04261 data matrix.
04262 
04263 data       (input) double[nrows][ncolumns]
04264 The array containing the data of the vectors.
04265 
04266 mask       (input) int[nrows][ncolumns]
04267 This array shows which data values are missing. If mask[i][j]==0, then
04268 data[i][j] is missing.
04269 
04270 weight     (input) double[ncolumns] if transpose==0;
04271                    double[nrows]    if transpose==1
04272 The weights that are used to calculate the distance.
04273 
04274 n1         (input) int
04275 The number of elements in the first cluster.
04276 
04277 n2         (input) int
04278 The number of elements in the second cluster.
04279 
04280 index1     (input) int[n1]
04281 Identifies which genes/microarrays belong to the first cluster.
04282 
04283 index2     (input) int[n2]
04284 Identifies which genes/microarrays belong to the second cluster.
04285 
04286 dist       (input) char
04287 Defines which distance measure is used, as given by the table:
04288 dist=='e': Euclidean distance
04289 dist=='b': City-block distance
04290 dist=='c': correlation
04291 dist=='a': absolute value of the correlation
04292 dist=='u': uncentered correlation
04293 dist=='x': absolute uncentered correlation
04294 dist=='s': Spearman's rank correlation
04295 dist=='k': Kendall's tau
04296 For other values of dist, the default (Euclidean distance) is used.
04297 
04298 method     (input) char
04299 Defines how the distance between two clusters is defined, given which genes
04300 belong to which cluster:
04301 method=='a': the distance between the arithmetic means of the two clusters
04302 method=='m': the distance between the medians of the two clusters
04303 method=='s': the smallest pairwise distance between members of the two clusters
04304 method=='x': the largest pairwise distance between members of the two clusters
04305 method=='v': average of the pairwise distances between members of the clusters
04306 
04307 transpose  (input) int
04308 If transpose is equal to zero, the distances between the rows is
04309 calculated. Otherwise, the distances between the columns is calculated.
04310 The former is needed when genes are being clustered; the latter is used
04311 when microarrays are being clustered.
04312 
04313 ========================================================================
04314 */
04315 { /* Set the metric function as indicated by dist */
04316   double (*metric)
04317     (int, double**, double**, int**, int**, const double[], int, int, int) =
04318        setmetric(dist);
04319 
04320   /* if one or both clusters are empty, return */
04321   if (n1 < 1 || n2 < 1) return -1.0;
04322   /* Check the indices */
04323   if (transpose==0)
04324   { int i;
04325     for (i = 0; i < n1; i++)
04326     { int index = index1[i];
04327       if (index < 0 || index >= nrows) return -1.0;
04328     }
04329     for (i = 0; i < n2; i++)
04330     { int index = index2[i];
04331       if (index < 0 || index >= nrows) return -1.0;
04332     }
04333   }
04334   else
04335   { int i;
04336     for (i = 0; i < n1; i++)
04337     { int index = index1[i];
04338       if (index < 0 || index >= ncolumns) return -1.0;
04339     }
04340     for (i = 0; i < n2; i++)
04341     { int index = index2[i];
04342       if (index < 0 || index >= ncolumns) return -1.0;
04343     }
04344   }
04345 
04346   switch (method)
04347   { case 'a':
04348     { /* Find the center */
04349       int i,j,k;
04350       if (transpose==0)
04351       { double distance;
04352         double* cdata[2];
04353         int* cmask[2];
04354         int* count[2];
04355         count[0] = calloc(ncolumns,sizeof(int));
04356         count[1] = calloc(ncolumns,sizeof(int));
04357         cdata[0] = calloc(ncolumns,sizeof(double));
04358         cdata[1] = calloc(ncolumns,sizeof(double));
04359         cmask[0] = malloc(ncolumns*sizeof(int));
04360         cmask[1] = malloc(ncolumns*sizeof(int));
04361         for (i = 0; i < n1; i++)
04362         { k = index1[i];
04363           for (j = 0; j < ncolumns; j++)
04364             if (mask[k][j] != 0)
04365             { cdata[0][j] = cdata[0][j] + data[k][j];
04366               count[0][j] = count[0][j] + 1;
04367             }
04368         }
04369         for (i = 0; i < n2; i++)
04370         { k = index2[i];
04371           for (j = 0; j < ncolumns; j++)
04372             if (mask[k][j] != 0)
04373             { cdata[1][j] = cdata[1][j] + data[k][j];
04374               count[1][j] = count[1][j] + 1;
04375             }
04376         }
04377         for (i = 0; i < 2; i++)
04378           for (j = 0; j < ncolumns; j++)
04379           { if (count[i][j]>0)
04380             { cdata[i][j] = cdata[i][j] / count[i][j];
04381               cmask[i][j] = 1;
04382             }
04383             else
04384               cmask[i][j] = 0;
04385           }
04386         distance =
04387           metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
04388         for (i = 0; i < 2; i++)
04389         { free (cdata[i]);
04390           free (cmask[i]);
04391           free (count[i]);
04392         }
04393         return distance;
04394       }
04395       else
04396       { double distance;
04397         int** count = malloc(nrows*sizeof(int*));
04398         double** cdata = malloc(nrows*sizeof(double*));
04399         int** cmask = malloc(nrows*sizeof(int*));
04400         for (i = 0; i < nrows; i++)
04401         { count[i] = calloc(2,sizeof(int));
04402           cdata[i] = calloc(2,sizeof(double));
04403           cmask[i] = malloc(2*sizeof(int));
04404         }
04405         for (i = 0; i < n1; i++)
04406         { k = index1[i];
04407           for (j = 0; j < nrows; j++)
04408           { if (mask[j][k] != 0)
04409             { cdata[j][0] = cdata[j][0] + data[j][k];
04410               count[j][0] = count[j][0] + 1;
04411             }
04412           }
04413         }
04414         for (i = 0; i < n2; i++)
04415         { k = index2[i];
04416           for (j = 0; j < nrows; j++)
04417           { if (mask[j][k] != 0)
04418             { cdata[j][1] = cdata[j][1] + data[j][k];
04419               count[j][1] = count[j][1] + 1;
04420             }
04421           }
04422         }
04423         for (i = 0; i < nrows; i++)
04424           for (j = 0; j < 2; j++)
04425             if (count[i][j]>0)
04426             { cdata[i][j] = cdata[i][j] / count[i][j];
04427               cmask[i][j] = 1;
04428             }
04429             else
04430               cmask[i][j] = 0;
04431         distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
04432         for (i = 0; i < nrows; i++)
04433         { free (count[i]);
04434           free (cdata[i]);
04435           free (cmask[i]);
04436         }
04437         free (count);
04438         free (cdata);
04439         free (cmask);
04440         return distance;
04441       }
04442     }
04443     case 'm':
04444     { int i, j, k;
04445       if (transpose==0)
04446       { double distance;
04447         double* temp = malloc(nrows*sizeof(double));
04448         double* cdata[2];
04449         int* cmask[2];
04450         for (i = 0; i < 2; i++)
04451         { cdata[i] = malloc(ncolumns*sizeof(double));
04452           cmask[i] = malloc(ncolumns*sizeof(int));
04453         }
04454         for (j = 0; j < ncolumns; j++)
04455         { int count = 0;
04456           for (k = 0; k < n1; k++)
04457           { i = index1[k];
04458             if (mask[i][j])
04459             { temp[count] = data[i][j];
04460               count++;
04461             }
04462           }
04463           if (count>0)
04464           { cdata[0][j] = median (count,temp);
04465             cmask[0][j] = 1;
04466           }
04467           else
04468           { cdata[0][j] = 0.;
04469             cmask[0][j] = 0;
04470           }
04471         }
04472         for (j = 0; j < ncolumns; j++)
04473         { int count = 0;
04474           for (k = 0; k < n2; k++)
04475           { i = index2[k];
04476             if (mask[i][j])
04477             { temp[count] = data[i][j];
04478               count++;
04479             }
04480           }
04481           if (count>0)
04482           { cdata[1][j] = median (count,temp);
04483             cmask[1][j] = 1;
04484           }
04485           else
04486           { cdata[1][j] = 0.;
04487             cmask[1][j] = 0;
04488           }
04489         }
04490         distance = metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
04491         for (i = 0; i < 2; i++)
04492         { free (cdata[i]);
04493           free (cmask[i]);
04494         }
04495         free(temp);
04496         return distance;
04497       }
04498       else
04499       { double distance;
04500         double* temp = malloc(ncolumns*sizeof(double));
04501         double** cdata = malloc(nrows*sizeof(double*));
04502         int** cmask = malloc(nrows*sizeof(int*));
04503         for (i = 0; i < nrows; i++)
04504         { cdata[i] = malloc(2*sizeof(double));
04505           cmask[i] = malloc(2*sizeof(int));
04506         }
04507         for (j = 0; j < nrows; j++)
04508         { int count = 0;
04509           for (k = 0; k < n1; k++)
04510           { i = index1[k];
04511             if (mask[j][i])
04512             { temp[count] = data[j][i];
04513               count++;
04514             }
04515           }
04516           if (count>0)
04517           { cdata[j][0] = median (count,temp);
04518             cmask[j][0] = 1;
04519           }
04520           else
04521           { cdata[j][0] = 0.;
04522             cmask[j][0] = 0;
04523           }
04524         }
04525         for (j = 0; j < nrows; j++)
04526         { int count = 0;
04527           for (k = 0; k < n2; k++)
04528           { i = index2[k];
04529             if (mask[j][i])
04530             { temp[count] = data[j][i];
04531               count++;
04532             }
04533           }
04534           if (count>0)
04535           { cdata[j][1] = median (count,temp);
04536             cmask[j][1] = 1;
04537           }
04538           else
04539           { cdata[j][1] = 0.;
04540             cmask[j][1] = 0;
04541           }
04542         }
04543         distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
04544         for (i = 0; i < nrows; i++)
04545         { free (cdata[i]);
04546           free (cmask[i]);
04547         }
04548         free(cdata);
04549         free(cmask);
04550         free(temp);
04551         return distance;
04552       }
04553     }
04554     case 's':
04555     { int i1, i2, j1, j2;
04556       const int n = (transpose==0) ? ncolumns : nrows;
04557       double mindistance = DBL_MAX;
04558       for (i1 = 0; i1 < n1; i1++)
04559         for (i2 = 0; i2 < n2; i2++)
04560         { double distance;
04561           j1 = index1[i1];
04562           j2 = index2[i2];
04563           distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
04564           if (distance < mindistance) mindistance = distance;
04565         }
04566       return mindistance;
04567     }
04568     case 'x':
04569     { int i1, i2, j1, j2;
04570       const int n = (transpose==0) ? ncolumns : nrows;
04571       double maxdistance = 0;
04572       for (i1 = 0; i1 < n1; i1++)
04573         for (i2 = 0; i2 < n2; i2++)
04574         { double distance;
04575           j1 = index1[i1];
04576           j2 = index2[i2];
04577           distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
04578           if (distance > maxdistance) maxdistance = distance;
04579         }
04580       return maxdistance;
04581     }
04582     case 'v':
04583     { int i1, i2, j1, j2;
04584       const int n = (transpose==0) ? ncolumns : nrows;
04585       double distance = 0;
04586       for (i1 = 0; i1 < n1; i1++)
04587         for (i2 = 0; i2 < n2; i2++)
04588         { j1 = index1[i1];
04589           j2 = index2[i2];
04590           distance += metric (n,data,data,mask,mask,weight,j1,j2,transpose);
04591         }
04592       distance /= (n1*n2);
04593       return distance;
04594     }
04595   }
04596   /* Never get here */
04597   return -2.0;
04598 }