Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 import numpy
00002 
00003 from Bio.Cluster.cluster import *
00004 
00005 def _treesort(order, nodeorder, nodecounts, tree):
00006     # Find the order of the nodes consistent with the hierarchical clustering
00007     # tree, taking into account the preferred order of nodes.
00008     nNodes = len(tree)
00009     nElements = nNodes + 1
00010     neworder = numpy.zeros(nElements)
00011     clusterids = numpy.arange(nElements)
00012     for i in range(nNodes):
00013         i1 = tree[i].left
00014         i2 = tree[i].right
00015         if i1 < 0:
00016             order1 = nodeorder[-i1-1]
00017             count1 = nodecounts[-i1-1]
00018         else:
00019             order1 = order[i1]
00020             count1 = 1
00021         if i2 < 0:
00022             order2 = nodeorder[-i2-1]
00023             count2 = nodecounts[-i2-1]
00024         else:
00025             order2 = order[i2]
00026             count2 = 1
00027         # If order1 and order2 are equal, their order is determined
00028         # by the order in which they were clustered
00029         if i1 < i2:
00030             if order1 < order2:
00031                 increase = count1
00032             else:
00033                 increase = count2
00034             for j in range(nElements):
00035                 clusterid = clusterids[j]
00036                 if clusterid == i1 and order1 >= order2:
00037                     neworder[j] += increase
00038                 if clusterid == i2 and order1 < order2:
00039                     neworder[j] += increase
00040                 if clusterid == i1 or clusterid == i2:
00041                     clusterids[j] = -i-1
00042         else:
00043             if order1 <= order2:
00044                 increase = count1
00045             else:
00046                 increase = count2
00047             for j in range(nElements):
00048                 clusterid = clusterids[j]
00049                 if clusterid == i1 and order1 > order2:
00050                     neworder[j] += increase
00051                 if clusterid == i2 and order1 <= order2:
00052                     neworder[j] += increase
00053                 if clusterid == i1 or clusterid == i2:
00054                     clusterids[j] = -i-1
00055     return numpy.argsort(neworder)
00056 
00057 
00058 def _savetree(jobname, tree, order, transpose):
00059     # Save the hierarchical clustering solution given by the tree, following
00060     # the specified order, in a file whose name is based on jobname.
00061     if transpose == 0:
00062         extension = ".gtr"
00063         keyword = "GENE"
00064     else:
00065         extension = ".atr"
00066         keyword = "ARRY"
00067     nnodes = len(tree)
00068     outputfile = open(jobname+extension, "w")
00069     nodeindex = 0
00070     nodeID = [''] * nnodes
00071     nodecounts = numpy.zeros(nnodes, int)
00072     nodeorder = numpy.zeros(nnodes)
00073     nodedist = numpy.array([node.distance for node in tree])
00074     for nodeindex in range(nnodes):
00075         min1 = tree[nodeindex].left
00076         min2 = tree[nodeindex].right
00077         nodeID[nodeindex] = "NODE%dX" % (nodeindex+1)
00078         outputfile.write(nodeID[nodeindex])
00079         outputfile.write("\t")
00080         if min1 < 0:
00081             index1 = -min1-1
00082             order1 = nodeorder[index1]
00083             counts1 = nodecounts[index1]
00084             outputfile.write(nodeID[index1]+"\t")
00085             nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1])
00086         else:
00087             order1 = order[min1]
00088             counts1 = 1
00089             outputfile.write("%s%dX\t" % (keyword, min1))
00090         if min2 < 0:
00091             index2 = -min2-1
00092             order2 = nodeorder[index2]
00093             counts2 = nodecounts[index2]
00094             outputfile.write(nodeID[index2]+"\t")
00095             nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2])
00096         else:
00097             order2 = order[min2]
00098             counts2 = 1
00099             outputfile.write("%s%dX\t" % (keyword, min2))
00100         outputfile.write(str(1.0-nodedist[nodeindex]))
00101         outputfile.write("\n")
00102         counts = counts1 + counts2
00103         nodecounts[nodeindex] = counts
00104         nodeorder[nodeindex] = (counts1*order1+counts2*order2) / counts
00105     outputfile.close()
00106     # Now set up order based on the tree structure
00107     index = _treesort(order, nodeorder, nodecounts, tree)
00108     return index
00109 
00110 
00111 class Record(object):
00112     """Store gene expression data.
00113 
00114 A Record stores the gene expression data and related information contained
00115 in a data file following the file format defined for Michael Eisen's
00116 Cluster/TreeView program. A Record has the following members:
00117 
00118 data:     a matrix containing the gene expression data
00119 mask:     a matrix containing only 1's and 0's, denoting which values
00120           are present (1) or missing (0). If all elements of mask are
00121           one (no missing data), then mask is set to None.
00122 geneid:   a list containing a unique identifier for each gene
00123           (e.g., ORF name)
00124 genename: a list containing an additional description for each gene
00125           (e.g., gene name)
00126 gweight:  the weight to be used for each gene when calculating the
00127           distance
00128 gorder:   an array of real numbers indicating the preferred order of the
00129           genes in the output file
00130 expid:    a list containing a unique identifier for each experimental
00131           condition
00132 eweight:  the weight to be used for each experimental condition when
00133           calculating the distance
00134 eorder:   an array of real numbers indication the preferred order in the
00135           output file of the experimental conditions
00136 uniqid:   the string that was used instead of UNIQID in the input file.
00137 
00138 """
00139 
00140     def __init__(self, handle=None):
00141         """Read gene expression data from the file handle and return a Record.
00142 
00143 The file should be in the format defined for Michael Eisen's
00144 Cluster/TreeView program.
00145 
00146 """
00147         self.data = None
00148         self.mask = None
00149         self.geneid = None
00150         self.genename = None
00151         self.gweight = None
00152         self.gorder = None
00153         self.expid = None
00154         self.eweight = None
00155         self.eorder = None
00156         self.uniqid = None
00157         if not handle:
00158             return
00159         line = handle.readline().strip("\r\n").split("\t")
00160         n = len(line)
00161         self.uniqid = line[0]
00162         self.expid = []
00163         cols = {0: "GENEID"}
00164         for word in line[1:]:
00165             if word == "NAME":
00166                 cols[line.index(word)] = word
00167                 self.genename = []
00168             elif word == "GWEIGHT":
00169                 cols[line.index(word)] = word
00170                 self.gweight = []
00171             elif word=="GORDER":
00172                 cols[line.index(word)] = word
00173                 self.gorder = []
00174             else:
00175                 self.expid.append(word)
00176         self.geneid = []
00177         self.data = []
00178         self.mask = []
00179         needmask = 0
00180         for line in handle:
00181             line = line.strip("\r\n").split("\t")
00182             if len(line) != n:
00183                 raise ValueError("Line with %d columns found (expected %d)" %
00184                                  (len(line), n))
00185             if line[0] == "EWEIGHT":
00186                 i = max(cols) + 1
00187                 self.eweight = numpy.array(line[i:], float)
00188                 continue
00189             if line[0] == "EORDER":
00190                 i = max(cols) + 1
00191                 self.eorder = numpy.array(line[i:], float)
00192                 continue
00193             rowdata = []
00194             rowmask = []
00195             n = len(line)
00196             for i in range(n):
00197                 word = line[i]
00198                 if i in cols:
00199                     if cols[i] == "GENEID":
00200                         self.geneid.append(word)
00201                     if cols[i] == "NAME":
00202                         self.genename.append(word)
00203                     if cols[i] == "GWEIGHT":
00204                         self.gweight.append(float(word))
00205                     if cols[i] == "GORDER":
00206                         self.gorder.append(float(word))
00207                     continue
00208                 if not word:
00209                     rowdata.append(0.0)
00210                     rowmask.append(0)
00211                     needmask = 1
00212                 else:
00213                     rowdata.append(float(word))
00214                     rowmask.append(1)
00215             self.data.append(rowdata)
00216             self.mask.append(rowmask)
00217         self.data = numpy.array(self.data)
00218         if needmask:
00219             self.mask = numpy.array(self.mask, int)
00220         else:
00221             self.mask = None
00222         if self.gweight:
00223             self.gweight = numpy.array(self.gweight)
00224         if self.gorder:
00225             self.gorder = numpy.array(self.gorder)
00226 
00227     def treecluster(self, transpose=0, method='m', dist='e'):
00228         """Apply hierarchical clustering and return a Tree object.
00229 
00230 The pairwise single, complete, centroid, and average linkage hierarchical
00231 clustering methods are available.
00232 
00233 transpose: if equal to 0, genes (rows) are clustered;
00234            if equal to 1, microarrays (columns) are clustered.
00235 dist     : specifies the distance function to be used:
00236            dist=='e': Euclidean distance
00237            dist=='b': City Block distance
00238            dist=='c': Pearson correlation
00239            dist=='a': absolute value of the correlation
00240            dist=='u': uncentered correlation
00241            dist=='x': absolute uncentered correlation
00242            dist=='s': Spearman's rank correlation
00243            dist=='k': Kendall's tau
00244 method   : specifies which linkage method is used:
00245            method=='s': Single pairwise linkage
00246            method=='m': Complete (maximum) pairwise linkage (default)
00247            method=='c': Centroid linkage
00248            method=='a': Average pairwise linkage
00249 
00250 See the description of the Tree class for more information about the Tree
00251 object returned by this method.
00252 
00253 """
00254         if transpose == 0:
00255             weight = self.eweight
00256         else:
00257             weight = self.gweight
00258         return treecluster(self.data, self.mask, weight, transpose, method,
00259                            dist)
00260 
00261     def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e',
00262                  initialid=None):
00263         """Apply k-means or k-median clustering.
00264 
00265 This method returns a tuple (clusterid, error, nfound).
00266 
00267 nclusters: number of clusters (the 'k' in k-means)
00268 transpose: if equal to 0, genes (rows) are clustered;
00269            if equal to 1, microarrays (columns) are clustered.
00270 npass    : number of times the k-means clustering algorithm is
00271            performed, each time with a different (random) initial
00272            condition.
00273 method   : specifies how the center of a cluster is found:
00274            method=='a': arithmetic mean
00275            method=='m': median
00276 dist     : specifies the distance function to be used:
00277            dist=='e': Euclidean distance
00278            dist=='b': City Block distance
00279            dist=='c': Pearson correlation
00280            dist=='a': absolute value of the correlation
00281            dist=='u': uncentered correlation
00282            dist=='x': absolute uncentered correlation
00283            dist=='s': Spearman's rank correlation
00284            dist=='k': Kendall's tau
00285 initialid: the initial clustering from which the algorithm should start.
00286            If initialid is None, the routine carries out npass
00287            repetitions of the EM algorithm, each time starting from a
00288            different random initial clustering. If initialid is given,
00289            the routine carries out the EM algorithm only once, starting
00290            from the given initial clustering and without randomizing the
00291            order in which items are assigned to clusters (i.e., using
00292            the same order as in the data matrix). In that case, the
00293            k-means algorithm is fully deterministic.
00294 
00295 Return values:
00296 clusterid: array containing the number of the cluster to which each
00297            gene/microarray was assigned in the best k-means clustering
00298            solution that was found in the npass runs;
00299 error:     the within-cluster sum of distances for the returned k-means
00300            clustering solution;
00301 nfound:    the number of times this solution was found.
00302 
00303 """
00304 
00305         if transpose == 0:
00306             weight = self.eweight
00307         else:
00308             weight = self.gweight
00309         return kcluster(self.data, nclusters, self.mask, weight, transpose,
00310                         npass, method, dist, initialid)
00311 
00312     def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02,
00313                    niter=1, dist='e'):
00314         """Calculate a self-organizing map on a rectangular grid.
00315 
00316 The somcluster method returns a tuple (clusterid, celldata).
00317 
00318 transpose: if equal to 0, genes (rows) are clustered;
00319            if equal to 1, microarrays (columns) are clustered.
00320 nxgrid   : the horizontal dimension of the rectangular SOM map
00321 nygrid   : the vertical dimension of the rectangular SOM map
00322 inittau  : the initial value of tau (the neighborbood function)
00323 niter    : the number of iterations
00324 dist     : specifies the distance function to be used:
00325            dist=='e': Euclidean distance
00326            dist=='b': City Block distance
00327            dist=='c': Pearson correlation
00328            dist=='a': absolute value of the correlation
00329            dist=='u': uncentered correlation
00330            dist=='x': absolute uncentered correlation
00331            dist=='s': Spearman's rank correlation
00332            dist=='k': Kendall's tau
00333 
00334 Return values:
00335 clusterid: array with two columns, while the number of rows is equal to
00336            the number of genes or the number of microarrays depending on
00337            whether genes or microarrays are being clustered. Each row in
00338            the array contains the x and y coordinates of the cell in the
00339            rectangular SOM grid to which the gene or microarray was
00340            assigned.
00341 celldata:  an array with dimensions (nxgrid, nygrid, number of
00342            microarrays) if genes are being clustered, or (nxgrid,
00343            nygrid, number of genes) if microarrays are being clustered.
00344            Each element [ix][iy] of this array is a 1D vector containing
00345            the gene expression data for the centroid of the cluster in
00346            the SOM grid cell with coordinates (ix, iy).
00347 
00348 """
00349 
00350         if transpose == 0:
00351             weight = self.eweight
00352         else:
00353             weight = self.gweight
00354         return somcluster(self.data, self.mask, weight, transpose,
00355                           nxgrid, nygrid, inittau, niter, dist)
00356 
00357     def clustercentroids(self, clusterid=None, method='a', transpose=0):
00358         """Calculate the cluster centroids and return a tuple (cdata, cmask).
00359 
00360 The centroid is defined as either the mean or the median over all elements
00361 for each dimension.
00362 
00363 data     : nrows x ncolumns array containing the expression data
00364 mask     : nrows x ncolumns array of integers, showing which data are
00365            missing. If mask[i][j]==0, then data[i][j] is missing.
00366 transpose: if equal to 0, gene (row) clusters are considered;
00367            if equal to 1, microarray (column) clusters are considered.
00368 clusterid: array containing the cluster number for each gene or
00369            microarray. The cluster number should be non-negative.
00370 method   : specifies how the centroid is calculated:
00371            method=='a': arithmetic mean over each dimension. (default)
00372            method=='m': median over each dimension.
00373 
00374 Return values:
00375 cdata    : 2D array containing the cluster centroids. If transpose==0,
00376            then the dimensions of cdata are nclusters x ncolumns. If
00377            transpose==1, then the dimensions of cdata are
00378            nrows x nclusters.
00379 cmask    : 2D array of integers describing which elements in cdata,
00380            if any, are missing.
00381 
00382 """
00383         return clustercentroids(self.data, self.mask, clusterid, method,
00384                                 transpose)
00385 
00386     def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e',
00387                         transpose=0):
00388         """Calculate the distance between two clusters.
00389 
00390 index1   : 1D array identifying which genes/microarrays belong to the
00391            first cluster. If the cluster contains only one gene, then
00392            index1 can also be written as a single integer.
00393 index2   : 1D array identifying which genes/microarrays belong to the
00394            second cluster. If the cluster contains only one gene, then
00395            index2 can also be written as a single integer.
00396 transpose: if equal to 0, genes (rows) are clustered;
00397            if equal to 1, microarrays (columns) are clustered.
00398 dist     : specifies the distance function to be used:
00399            dist=='e': Euclidean distance
00400            dist=='b': City Block distance
00401            dist=='c': Pearson correlation
00402            dist=='a': absolute value of the correlation
00403            dist=='u': uncentered correlation
00404            dist=='x': absolute uncentered correlation
00405            dist=='s': Spearman's rank correlation
00406            dist=='k': Kendall's tau
00407 method   : specifies how the distance between two clusters is defined:
00408            method=='a': the distance between the arithmetic means of the
00409                         two clusters
00410            method=='m': the distance between the medians of the two
00411                         clusters
00412            method=='s': the smallest pairwise distance between members
00413                         of the two clusters
00414            method=='x': the largest pairwise distance between members of
00415                         the two clusters
00416            method=='v': average of the pairwise distances between
00417                         members of the clusters
00418 transpose: if equal to 0: clusters of genes (rows) are considered;
00419            if equal to 1: clusters of microarrays (columns) are
00420                           considered.
00421 
00422 """
00423 
00424         if transpose == 0:
00425             weight = self.eweight
00426         else:
00427             weight = self.gweight
00428         return clusterdistance(self.data, self.mask, weight,
00429                                index1, index2, method, dist, transpose)
00430 
00431     def distancematrix(self, transpose=0, dist='e'):
00432         """Calculate the distance matrix and return it as a list of arrays
00433 
00434 transpose: if equal to 0: calculate the distances between genes (rows);
00435            if equal to 1: calculate the distances beteeen microarrays
00436                           (columns).
00437 dist     : specifies the distance function to be used:
00438            dist=='e': Euclidean distance
00439            dist=='b': City Block distance
00440            dist=='c': Pearson correlation
00441            dist=='a': absolute value of the correlation
00442            dist=='u': uncentered correlation
00443            dist=='x': absolute uncentered correlation
00444            dist=='s': Spearman's rank correlation
00445            dist=='k': Kendall's tau
00446 
00447 Return value:
00448 The distance matrix is returned as a list of 1D arrays containing the
00449 distance matrix between the gene expression data. The number of columns
00450 in each row is equal to the row number. Hence, the first row has zero
00451 elements. An example of the return value is
00452 matrix = [[],
00453           array([1.]),
00454           array([7., 3.]),
00455           array([4., 2., 6.])]
00456 This corresponds to the distance matrix
00457  [0., 1., 7., 4.]
00458  [1., 0., 3., 2.]
00459  [7., 3., 0., 6.]
00460  [4., 2., 6., 0.]
00461 
00462 """
00463         if transpose == 0:
00464             weight = self.eweight
00465         else:
00466             weight = self.gweight
00467         return distancematrix(self.data, self.mask, weight, transpose, dist)
00468 
00469     def save(self, jobname, geneclusters=None, expclusters=None):
00470         """Save the clustering results.
00471 
00472 The saved files follow the convention for the Java TreeView program,
00473 which can therefore be used to view the clustering result.
00474 
00475 Arguments:
00476 jobname:   The base name of the files to be saved. The filenames are
00477            jobname.cdt, jobname.gtr, and jobname.atr for
00478            hierarchical clustering, and jobname-K*.cdt,
00479            jobname-K*.kgg, jobname-K*.kag for k-means clustering
00480            results.
00481 geneclusters=None:  For hierarchical clustering results, geneclusters
00482            is a Tree object as returned by the treecluster method.
00483            For k-means clustering results, geneclusters is a vector
00484            containing ngenes integers, describing to which cluster a
00485            given gene belongs. This vector can be calculated by
00486            kcluster.
00487 expclusters=None:  For hierarchical clustering results, expclusters
00488            is a Tree object as returned by the treecluster method.
00489            For k-means clustering results, expclusters is a vector
00490            containing nexps integers, describing to which cluster a
00491            given experimental condition belongs. This vector can be
00492            calculated by kcluster.
00493 
00494 """
00495         (ngenes,nexps) = numpy.shape(self.data)
00496         if self.gorder == None:
00497             gorder = numpy.arange(ngenes)
00498         else:
00499             gorder = self.gorder
00500         if self.eorder == None:
00501             eorder = numpy.arange(nexps)
00502         else:
00503             eorder = self.eorder
00504         if geneclusters!=None and expclusters!=None and \
00505            type(geneclusters) != type(expclusters):
00506             raise ValueError("found one k-means and one hierarchical "
00507                            + "clustering solution in geneclusters and "
00508                            + "expclusters")
00509         gid = 0
00510         aid = 0
00511         filename = jobname
00512         postfix = ""
00513         if type(geneclusters) == Tree:
00514             # This is a hierarchical clustering result.
00515             geneindex = _savetree(jobname, geneclusters, gorder, 0)
00516             gid = 1
00517         elif geneclusters!=None:
00518             # This is a k-means clustering result.
00519             filename = jobname + "_K"
00520             k = max(geneclusters+1)
00521             kggfilename = "%s_K_G%d.kgg" % (jobname, k)
00522             geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
00523             postfix = "_G%d" % k
00524         else:
00525             geneindex = numpy.argsort(gorder)
00526         if type(expclusters) == Tree:
00527             # This is a hierarchical clustering result.
00528             expindex = _savetree(jobname, expclusters, eorder, 1)
00529             aid = 1
00530         elif expclusters!=None:
00531             # This is a k-means clustering result.
00532             filename = jobname + "_K"
00533             k = max(expclusters+1)
00534             kagfilename = "%s_K_A%d.kag" % (jobname, k)
00535             expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
00536             postfix += "_A%d" % k
00537         else:
00538             expindex = numpy.argsort(eorder)
00539         filename = filename + postfix
00540         self._savedata(filename,gid,aid,geneindex,expindex)
00541 
00542     def _savekmeans(self, filename, clusterids, order, transpose):
00543         # Save a k-means clustering solution
00544         if transpose == 0:
00545             label = self.uniqid
00546             names = self.geneid
00547         else:
00548             label = "ARRAY"
00549             names = self.expid
00550         try:
00551             outputfile = open(filename, "w")
00552         except IOError:
00553             raise IOError("Unable to open output file")
00554         outputfile.write(label + "\tGROUP\n")
00555         index = numpy.argsort(order)
00556         n = len(names)
00557         sortedindex = numpy.zeros(n, int)
00558         counter = 0
00559         cluster = 0
00560         while counter < n:
00561             for j in index:
00562                 if clusterids[j] == cluster:
00563                     outputfile.write("%s\t%s\n" % (names[j], cluster))
00564                     sortedindex[counter] = j
00565                     counter += 1
00566             cluster += 1
00567         outputfile.close()
00568         return sortedindex
00569 
00570     def _savedata(self, jobname, gid, aid, geneindex, expindex):
00571         # Save the clustered data.
00572         if self.genename == None:
00573             genename = self.geneid
00574         else:
00575             genename = self.genename
00576         (ngenes, nexps) = numpy.shape(self.data)
00577         try:
00578             outputfile = open(jobname+'.cdt', 'w')
00579         except IOError:
00580             raise IOError("Unable to open output file")
00581         if self.mask!=None:
00582             mask = self.mask
00583         else:
00584             mask = numpy.ones((ngenes,nexps), int)
00585         if self.gweight!=None:
00586             gweight = self.gweight
00587         else:
00588             gweight = numpy.ones(ngenes)
00589         if self.eweight!=None:
00590             eweight = self.eweight
00591         else:
00592             eweight = numpy.ones(nexps)
00593         if gid:
00594             outputfile.write('GID\t')
00595         outputfile.write(self.uniqid)
00596         outputfile.write('\tNAME\tGWEIGHT')
00597         # Now add headers for data columns.
00598         for j in expindex:
00599             outputfile.write('\t%s' % self.expid[j])
00600         outputfile.write('\n')
00601         if aid:
00602             outputfile.write("AID")
00603             if gid:
00604                 outputfile.write('\t')
00605             outputfile.write("\t\t")
00606             for j in expindex:
00607                 outputfile.write('\tARRY%dX' % j)
00608             outputfile.write('\n')
00609         outputfile.write('EWEIGHT')
00610         if gid:
00611             outputfile.write('\t')
00612         outputfile.write('\t\t')
00613         for j in expindex:
00614             outputfile.write('\t%f' % eweight[j])
00615         outputfile.write('\n')
00616         for i in geneindex:
00617             if gid:
00618                 outputfile.write('GENE%dX\t' % i)
00619             outputfile.write("%s\t%s\t%f" %
00620                              (self.geneid[i], genename[i], gweight[i]))
00621             for j in expindex:
00622                 outputfile.write('\t')
00623                 if mask[i,j]:
00624                     outputfile.write(str(self.data[i,j]))
00625             outputfile.write('\n')
00626         outputfile.close()
00627 
00628 
00629 def read(handle):
00630     """Read gene expression data from the file handle and return a Record.
00631 
00632 The file should be in the file format defined for Michael Eisen's
00633 Cluster/TreeView program.
00634 
00635 """
00636     return Record(handle)