Back to index

python-biopython  1.60
Trees.py
Go to the documentation of this file.
00001 # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 #
00006 # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla.
00007 """Tree class to handle phylogenetic trees.
00008 
00009 Provides a set of methods to read and write newick-format tree descriptions,
00010 get information about trees (monphyly of taxon sets, congruence between trees,
00011 common ancestors,...) and to manipulate trees (reroot trees, split terminal
00012 nodes).
00013 """
00014 
00015 import sys, random, copy
00016 import Nodes
00017 
00018 PRECISION_BRANCHLENGTH=6
00019 PRECISION_SUPPORT=6
00020 NODECOMMENT_START='[&'
00021 NODECOMMENT_END=']'
00022 
00023 class TreeError(Exception): pass
00024 
00025 class NodeData(object):
00026     """Stores tree-relevant data associated with nodes (e.g. branches or otus)."""
00027     def __init__(self,taxon=None,branchlength=0.0,support=None,comment=None):
00028         self.taxon=taxon
00029         self.branchlength=branchlength
00030         self.support=support
00031         self.comment=comment
00032 
00033 class Tree(Nodes.Chain):
00034     """Represents a tree using a chain of nodes with on predecessor (=ancestor)
00035     and multiple successors (=subclades).
00036     """ 
00037     # A newick tree is parsed into nested list and then converted to a node list in two stages
00038     # mostly due to historical reasons. This could be done in one swoop). Note: parentheses ( ) and
00039     # colon : are not allowed in taxon names. This is against NEXUS standard, but makes life much
00040     # easier when parsing trees.
00041     
00042     ## NOTE: Tree should store its data class in something like self.dataclass=data,
00043     ## so that nodes that are generated have easy access to the data class
00044     ## Some routines use automatically NodeData, this needs to be more concise
00045 
00046     def __init__(self,tree=None,weight=1.0,rooted=False,name='',data=NodeData,values_are_support=False,max_support=1.0):
00047         """Ntree(self,tree)."""
00048         Nodes.Chain.__init__(self)
00049         self.dataclass=data
00050         self.__values_are_support=values_are_support
00051         self.max_support=max_support
00052         self.weight=weight
00053         self.rooted=rooted
00054         self.name=name
00055         root=Nodes.Node(data())
00056         self.root = self.add(root)
00057         if tree:    # use the tree we have
00058             # if Tree is called from outside Nexus parser, we need to get rid of linebreaks, etc
00059             tree=tree.strip().replace('\n','').replace('\r','')
00060             # there's discrepancy whether newick allows semicolons et the end
00061             tree=tree.rstrip(';')
00062             subtree_info, base_info = self._parse(tree)
00063             root.data = self._add_nodedata(root.data, [[], base_info])
00064             self._add_subtree(parent_id=root.id,tree=subtree_info)
00065         
00066     def _parse(self,tree):
00067         """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively."""
00068         #Remove any leading/trailing white space - want any string starting
00069         #with " (..." should be recognised as a leaf, "(..."
00070         tree = tree.strip()
00071         if tree.count('(')!=tree.count(')'):
00072             raise TreeError('Parentheses do not match in (sub)tree: '+tree)
00073         if tree.count('(')==0: # a leaf
00074             #check if there's a colon, or a special comment, or both  after the taxon name
00075             nodecomment=tree.find(NODECOMMENT_START)
00076             colon=tree.find(':')
00077             if colon==-1 and nodecomment==-1: # none
00078                 return [tree,[None]]
00079             elif colon==-1 and nodecomment>-1: # only special comment
00080                 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
00081             elif colon>-1 and nodecomment==-1: # only numerical values
00082                 return [tree[:colon],self._get_values(tree[colon+1:])]
00083             elif colon < nodecomment: # taxon name ends at first colon or with special comment
00084                 return [tree[:colon],self._get_values(tree[colon+1:])]
00085             else:
00086                 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
00087         else:
00088             closing=tree.rfind(')')
00089             val=self._get_values(tree[closing+1:])
00090             if not val:
00091                 val=[None]
00092             subtrees=[]
00093             plevel=0
00094             prev=1
00095             for p in range(1,closing):
00096                 if tree[p]=='(':
00097                     plevel+=1
00098                 elif tree[p]==')':
00099                     plevel-=1
00100                 elif tree[p]==',' and plevel==0:
00101                     subtrees.append(tree[prev:p])
00102                     prev=p+1
00103             subtrees.append(tree[prev:closing])
00104             subclades=[self._parse(subtree) for subtree in subtrees]
00105             return [subclades,val]
00106     
00107     def _add_subtree(self,parent_id=None,tree=None):
00108         """Adds leaf or tree (in newick format) to a parent_id. (self,parent_id,tree)."""
00109         if parent_id is None:
00110             raise TreeError('Need node_id to connect to.')
00111         for st in tree:
00112             nd=self.dataclass()
00113             nd = self._add_nodedata(nd, st)
00114             if type(st[0])==list: # it's a subtree
00115                 sn=Nodes.Node(nd)
00116                 self.add(sn,parent_id)
00117                 self._add_subtree(sn.id,st[0])
00118             else: # it's a leaf
00119                 nd.taxon=st[0]
00120                 leaf=Nodes.Node(nd)
00121                 self.add(leaf,parent_id)
00122 
00123     def _add_nodedata(self, nd, st):
00124         """Add data to the node parsed from the comments, taxon and support.
00125         """
00126         if isinstance(st[1][-1],str) and st[1][-1].startswith(NODECOMMENT_START):
00127             nd.comment=st[1].pop(-1)
00128         # if the first element is a string, it's the subtree node taxon
00129         elif isinstance(st[1][0], str):
00130             nd.taxon = st[1][0]
00131             st[1] = st[1][1:]
00132         if len(st)>1:
00133             if len(st[1])>=2: # if there's two values, support comes first. Is that always so?
00134                 nd.support=st[1][0]
00135                 if st[1][1] is not None:
00136                     nd.branchlength=st[1][1]
00137             elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths
00138                 if not self.__values_are_support: # default
00139                     if st[1][0] is not None:
00140                         nd.branchlength=st[1][0]
00141                 else:
00142                     nd.support=st[1][0]
00143         return nd
00144     
00145     def _get_values(self, text):
00146         """Extracts values (support/branchlength) from xx[:yyy], xx."""
00147        
00148         if text=='':
00149             return None
00150         nodecomment = None
00151         if NODECOMMENT_START in text: # if there's a [&....] comment, cut it out
00152             nc_start=text.find(NODECOMMENT_START)
00153             nc_end=text.find(NODECOMMENT_END)
00154             if nc_end==-1:
00155                 raise TreeError('Error in tree description: Found %s without matching %s' \
00156                                 % (NODECOMMENT_START, NODECOMMENT_END))
00157             nodecomment=text[nc_start:nc_end+1]
00158             text=text[:nc_start]+text[nc_end+1:]
00159 
00160         # pase out supports and branchlengths, with internal node taxa info
00161         values = []
00162         taxonomy = None
00163         for part in [t.strip() for t in text.split(":")]:
00164             if part:
00165                 try:
00166                     values.append(float(part))
00167                 except ValueError:
00168                     assert taxonomy is None, "Two string taxonomies?"
00169                     taxonomy = part
00170         if taxonomy:
00171             values.insert(0, taxonomy)
00172         if nodecomment:
00173             values.append(nodecomment)
00174         return values
00175    
00176     def _walk(self,node=None):
00177         """Return all node_ids downwards from a node."""
00178         
00179         if node is None:
00180             node=self.root
00181         for n in self.node(node).succ:
00182             yield n
00183             for sn in self._walk(n):
00184                 yield sn
00185 
00186     def node(self,node_id):
00187         """Return the instance of node_id.
00188         
00189         node = node(self,node_id)
00190         """
00191         if node_id not in self.chain:
00192             raise TreeError('Unknown node_id: %d' % node_id)
00193         return self.chain[node_id]
00194 
00195     def split(self,parent_id=None,n=2,branchlength=1.0):
00196         """Speciation: generates n (default two) descendants of a node.
00197         
00198         [new ids] = split(self,parent_id=None,n=2,branchlength=1.0):
00199         """ 
00200         if parent_id is None:
00201             raise TreeError('Missing node_id.')
00202         ids=[]
00203         parent_data=self.chain[parent_id].data
00204         for i in range(n):
00205             node=Nodes.Node()
00206             if parent_data:
00207                 node.data=self.dataclass()
00208                 # each node has taxon and branchlength attribute
00209                 if parent_data.taxon:
00210                     node.data.taxon=parent_data.taxon+str(i)
00211                 node.data.branchlength=branchlength
00212             ids.append(self.add(node,parent_id))
00213         return ids
00214 
00215     def search_taxon(self,taxon):
00216         """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes.
00217         
00218         node_id = search_taxon(self,taxon)
00219         """
00220         for id,node in self.chain.iteritems():
00221             if node.data.taxon==taxon:
00222                 return id
00223         return None
00224    
00225     def prune(self,taxon):
00226         """Prunes a terminal taxon from the tree.
00227         
00228         id_of_previous_node = prune(self,taxon)
00229         If taxon is from a bifurcation, the connectiong node will be collapsed
00230         and its branchlength added to remaining terminal node. This might be no
00231         longer a meaningful value'
00232         """
00233         
00234         id=self.search_taxon(taxon)
00235         if id is None:
00236             raise TreeError('Taxon not found: %s' % taxon)
00237         elif id not in self.get_terminals():
00238             raise TreeError('Not a terminal taxon: %s' % taxon)
00239         else:
00240             prev=self.unlink(id)
00241             self.kill(id)
00242             if len(self.node(prev).succ)==1:
00243                 if prev==self.root: # we deleted one branch of a bifurcating root, then we have to move the root upwards
00244                     self.root=self.node(self.root).succ[0]    
00245                     self.node(self.root).branchlength=0.0
00246                     self.kill(prev)
00247                 else: 
00248                     succ=self.node(prev).succ[0]
00249                     new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength
00250                     self.collapse(prev)
00251                     self.node(succ).data.branchlength=new_bl
00252             return prev
00253         
00254     def get_taxa(self,node_id=None):
00255         """Return a list of all otus downwards from a node (self, node_id).
00256 
00257         nodes = get_taxa(self,node_id=None)
00258         """
00259 
00260         if node_id is None:
00261             node_id=self.root
00262         if node_id not in self.chain:
00263             raise TreeError('Unknown node_id: %d.' % node_id)    
00264         if self.chain[node_id].succ==[]:
00265             if self.chain[node_id].data:
00266                 return [self.chain[node_id].data.taxon]
00267             else:
00268                 return None
00269         else:
00270             list=[]
00271             for succ in self.chain[node_id].succ:
00272                 list.extend(self.get_taxa(succ))
00273             return list
00274 
00275     def get_terminals(self):
00276         """Return a list of all terminal nodes."""
00277         return [i for i in self.all_ids() if self.node(i).succ==[]]
00278     
00279     def is_terminal(self,node):
00280         """Returns True if node is a terminal node."""
00281         return self.node(node).succ==[]
00282 
00283     def is_internal(self,node):
00284         """Returns True if node is an internal node."""
00285         return len(self.node(node).succ)>0
00286 
00287     def is_preterminal(self,node):
00288         """Returns True if all successors of a node are terminal ones."""
00289         if self.is_terminal(node):
00290             return False not in [self.is_terminal(n) for n in self.node(node).succ]
00291         else:
00292             return False
00293     def count_terminals(self,node=None):
00294         """Counts the number of terminal nodes that are attached to a node."""
00295         if node is None:
00296             node=self.root
00297         return len([n for n in self._walk(node) if self.is_terminal(n)])
00298 
00299     def collapse_genera(self,space_equals_underscore=True):
00300         """Collapses all subtrees which belong to the same genus (i.e share the same first word in their taxon name."""
00301 
00302         while True:
00303             for n in self._walk():
00304                 if self.is_terminal(n):
00305                     continue
00306                 taxa=self.get_taxa(n)
00307                 genera=[]
00308                 for t in taxa:
00309                     if space_equals_underscore:
00310                         t=t.replace(' ','_')
00311                     try:
00312                         genus=t.split('_',1)[0]
00313                     except:
00314                         genus='None'
00315                     if genus not in genera:
00316                         genera.append(genus)
00317                 if len(genera)==1:
00318                     self.node(n).data.taxon=genera[0]+' <collapsed>'
00319                     #now we kill all nodes downstream
00320                     nodes2kill=[kn for kn in self._walk(node=n)]
00321                     for kn in nodes2kill:
00322                         self.kill(kn)
00323                     self.node(n).succ=[]
00324                     break # break out of for loop because node list from _walk will be inconsistent
00325             else: # for loop exhausted: no genera to collapse left
00326                 break # while
00327 
00328 
00329     def sum_branchlength(self,root=None,node=None):
00330         """Adds up the branchlengths from root (default self.root) to node.
00331         
00332         sum = sum_branchlength(self,root=None,node=None)
00333         """
00334 
00335         if root is None:
00336             root=self.root
00337         if node is None:
00338             raise TreeError('Missing node id.')
00339         blen=0.0
00340         while node is not None and node is not root: 
00341             blen+=self.node(node).data.branchlength
00342             node=self.node(node).prev
00343         return blen
00344 
00345     def set_subtree(self,node):
00346         """Return subtree as a set of nested sets.
00347 
00348         sets = set_subtree(self,node)
00349         """
00350         
00351         if self.node(node).succ==[]:
00352             return self.node(node).data.taxon
00353         else:
00354             try:
00355                 return frozenset([self.set_subtree(n) for n in self.node(node).succ])
00356             except:
00357                 print node
00358                 print self.node(node).succ
00359                 for n in self.node(node).succ:
00360                     print n, self.set_subtree(n)
00361                 print [self.set_subtree(n) for n in self.node(node).succ]
00362                 raise
00363             
00364     def is_identical(self,tree2):
00365         """Compare tree and tree2 for identity.
00366 
00367         result = is_identical(self,tree2)
00368         """
00369         return self.set_subtree(self.root)==tree2.set_subtree(tree2.root)
00370 
00371     def is_compatible(self,tree2,threshold,strict=True):
00372         """Compares branches with support>threshold for compatibility.
00373         
00374         result = is_compatible(self,tree2,threshold)
00375         """
00376 
00377         # check if both trees have the same set of taxa. strict=True enforces this.
00378         missing2=set(self.get_taxa())-set(tree2.get_taxa())
00379         missing1=set(tree2.get_taxa())-set(self.get_taxa())
00380         if strict and (missing1 or missing2):
00381             if missing1: 
00382                 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing1) , self.name)
00383             if missing2:
00384                 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing2) , tree2.name)
00385             raise TreeError('Can\'t compare trees with different taxon compositions.')
00386         t1=[(set(self.get_taxa(n)),self.node(n).data.support) for n in self.all_ids() if \
00387             self.node(n).succ and\
00388             (self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)]
00389         t2=[(set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.all_ids() if \
00390             tree2.node(n).succ and\
00391             (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)]
00392         conflict=[]
00393         for (st1,sup1) in t1:
00394             for (st2,sup2) in t2:
00395                 if not st1.issubset(st2) and not st2.issubset(st1):                     # don't hiccup on upstream nodes
00396                     intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2                 # all three are non-empty sets
00397                     # if notin1==missing1 or notin2==missing2  <==> st1.issubset(st2) or st2.issubset(st1) ???
00398                     if intersect and not (notin1.issubset(missing1) or notin2.issubset(missing2)):         # omit conflicts due to missing taxa
00399                         conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2))
00400         return conflict
00401         
00402     def common_ancestor(self,node1,node2):
00403         """Return the common ancestor that connects two nodes.
00404         
00405         node_id = common_ancestor(self,node1,node2)
00406         """
00407         
00408         l1=[self.root]+self.trace(self.root,node1)
00409         l2=[self.root]+self.trace(self.root,node2)
00410         return [n for n in l1 if n in l2][-1]
00411 
00412 
00413     def distance(self,node1,node2):
00414         """Add and return the sum of the branchlengths between two nodes.
00415         dist = distance(self,node1,node2)
00416         """
00417         
00418         ca=self.common_ancestor(node1,node2)
00419         return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2)
00420 
00421     def is_monophyletic(self,taxon_list):
00422         """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise.
00423         
00424         result = is_monophyletic(self,taxon_list)
00425         """
00426         if isinstance(taxon_list,str):
00427             taxon_set=set([taxon_list])
00428         else:
00429             taxon_set=set(taxon_list)
00430         node_id=self.root
00431         while 1:
00432             subclade_taxa=set(self.get_taxa(node_id))
00433             if subclade_taxa==taxon_set:                                        # are we there?
00434                 return node_id
00435             else:                                                               # check subnodes
00436                 for subnode in self.chain[node_id].succ:
00437                     if set(self.get_taxa(subnode)).issuperset(taxon_set):  # taxon_set is downstream
00438                         node_id=subnode
00439                         break   # out of for loop
00440                 else:
00441                     return -1   # taxon set was not with successors, for loop exhausted
00442 
00443     def is_bifurcating(self,node=None):
00444         """Return True if tree downstream of node is strictly bifurcating."""
00445         if node is None:
00446             node=self.root
00447         if node==self.root and len(self.node(node).succ)==3: #root can be trifurcating, because it has no ancestor
00448             return self.is_bifurcating(self.node(node).succ[0]) and \
00449                     self.is_bifurcating(self.node(node).succ[1]) and \
00450                     self.is_bifurcating(self.node(node).succ[2])
00451         if len(self.node(node).succ)==2:
00452             return self.is_bifurcating(self.node(node).succ[0]) and self.is_bifurcating(self.node(node).succ[1])
00453         elif len(self.node(node).succ)==0:
00454             return True
00455         else:
00456             return False
00457 
00458     def branchlength2support(self):
00459         """Move values stored in data.branchlength to data.support, and set branchlength to 0.0
00460 
00461         This is necessary when support has been stored as branchlength (e.g. paup), and has thus
00462         been read in as branchlength. 
00463         """
00464 
00465         for n in self.chain:
00466             self.node(n).data.support=self.node(n).data.branchlength
00467             self.node(n).data.branchlength=0.0
00468 
00469     def convert_absolute_support(self,nrep):
00470         """Convert absolute support (clade-count) to rel. frequencies.
00471         
00472         Some software (e.g. PHYLIP consense) just calculate how often clades appear, instead of
00473         calculating relative frequencies."""
00474 
00475         for n in self._walk():
00476             if self.node(n).data.support:
00477                 self.node(n).data.support/=float(nrep)
00478 
00479     def has_support(self,node=None):
00480         """Returns True if any of the nodes has data.support != None."""
00481         for n in self._walk(node):
00482             if self.node(n).data.support:
00483                 return True
00484         else:
00485             return False
00486 
00487     def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True):
00488         """Generates a random tree with ntax taxa and/or taxa from taxlabels.
00489     
00490         new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True)
00491         Trees are bifurcating by default. (Polytomies not yet supported).
00492         """
00493 
00494         if not ntax and taxon_list:
00495             ntax=len(taxon_list)
00496         elif not taxon_list and ntax:
00497             taxon_list=['taxon'+str(i+1) for i in range(ntax)]
00498         elif not ntax and not taxon_list:
00499             raise TreeError('Either numer of taxa or list of taxa must be specified.')
00500         elif ntax != len(taxon_list):
00501             raise TreeError('Length of taxon list must correspond to ntax.')
00502         # initiate self with empty root
00503         self.__init__()
00504         terminals=self.get_terminals()
00505         # bifurcate randomly at terminal nodes until ntax is reached
00506         while len(terminals)<ntax:
00507             newsplit=random.choice(terminals)
00508             new_terminals=self.split(parent_id=newsplit,branchlength=branchlength)
00509             # if desired, give some variation to the branch length
00510             if branchlength_sd:
00511                 for nt in new_terminals:
00512                     bl=random.gauss(branchlength,branchlength_sd)
00513                     if bl<0:
00514                         bl=0
00515                     self.node(nt).data.branchlength=bl
00516             terminals.extend(new_terminals)
00517             terminals.remove(newsplit)
00518         # distribute taxon labels randomly
00519         random.shuffle(taxon_list)
00520         for (node,name) in zip(terminals,taxon_list):
00521             self.node(node).data.taxon=name
00522 
00523     def display(self):
00524         """Quick and dirty lists of all nodes."""
00525         table=[('#','taxon','prev','succ','brlen','blen (sum)','support','comment')]
00526         #Sort this to be consistent accross CPython, Jython, etc
00527         for i in sorted(self.all_ids()):
00528             n=self.node(i)
00529             if not n.data:
00530                 table.append((str(i),'-',str(n.prev),str(n.succ),'-','-','-','-'))
00531             else:
00532                 tx=n.data.taxon
00533                 if not tx:
00534                     tx='-'
00535                 blength="%0.2f" % n.data.branchlength
00536                 if blength is None:
00537                     blength='-'
00538                     sum_blength='-'
00539                 else:
00540                     sum_blength="%0.2f" % self.sum_branchlength(node=i)
00541                 support=n.data.support
00542                 if support is None:
00543                     support='-'
00544                 else:
00545                     support="%0.2f" % support
00546                 comment=n.data.comment
00547                 if comment is None:
00548                     comment='-'
00549                 table.append((str(i),tx,str(n.prev),str(n.succ),
00550                              blength, sum_blength, support, comment))
00551         print '\n'.join(['%3s %32s %15s %15s %8s %10s %8s %20s' % l for l in table])
00552         print '\nRoot: ',self.root
00553 
00554     def to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True,plain_newick=False,ladderize=None,ignore_comments=True):
00555         """Return a paup compatible tree line.
00556        
00557         to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True)
00558         """
00559         # if there's a conflict in the arguments, we override plain=True
00560         if support_as_branchlengths or branchlengths_only:
00561             plain=False
00562         self.support_as_branchlengths=support_as_branchlengths
00563         self.branchlengths_only=branchlengths_only
00564         self.ignore_comments=ignore_comments
00565         self.plain=plain
00566 
00567         def make_info_string(data,terminal=False):
00568             """Creates nicely formatted support/branchlengths."""
00569             # CHECK FORMATTING
00570             if self.plain: # plain tree only. That's easy.
00571                 info_string= ''
00572             elif self.support_as_branchlengths: # support as branchlengths (eg. PAUP), ignore actual branchlengths
00573                 if terminal:    # terminal branches have 100% support
00574                     info_string= ':%1.2f' % self.max_support
00575                 elif data.support:
00576                     info_string= ':%1.2f' % (data.support)
00577                 else:
00578                     info_string=':0.00'
00579             elif self.branchlengths_only: # write only branchlengths, ignore support
00580                 info_string= ':%1.5f' % (data.branchlength)
00581             else:   # write suport and branchlengths (e.g. .con tree of mrbayes)
00582                 if terminal:
00583                     info_string= ':%1.5f' % (data.branchlength)
00584                 else:
00585                     if data.branchlength is not None and data.support is not None:  # we have blen and suppport
00586                         info_string= '%1.2f:%1.5f' % (data.support,data.branchlength)
00587                     elif data.branchlength is not None:                             # we have only blen
00588                         info_string= '0.00000:%1.5f' % (data.branchlength)
00589                     elif data.support is not None:                                  # we have only support
00590                         info_string= '%1.2f:0.00000' % (data.support)
00591                     else:
00592                         info_string= '0.00:0.00000'
00593             if not ignore_comments and hasattr(data,'nodecomment'):
00594                 info_string=str(data.nodecomment)+info_string
00595             return info_string
00596             
00597         def ladderize_nodes(nodes,ladderize=None):
00598             """Sorts node numbers according to the number of terminal nodes."""
00599             if ladderize in ['left','LEFT','right','RIGHT']:
00600                 succnode_terminals=[(self.count_terminals(node=n),n) for n in nodes]
00601                 succnode_terminals.sort()
00602                 if (ladderize=='right' or ladderize=='RIGHT'):
00603                     succnode_terminals.reverse()
00604                 if succnode_terminals:
00605                     succnodes=zip(*succnode_terminals)[1]
00606                 else:
00607                     succnodes=[]
00608             else:
00609                 succnodes=nodes
00610             return succnodes
00611 
00612         def newickize(node,ladderize=None):
00613             """Convert a node tree to a newick tree recursively."""
00614 
00615             if not self.node(node).succ:    #terminal
00616                 return self.node(node).data.taxon+make_info_string(self.node(node).data,terminal=True)
00617             else:
00618                 succnodes=ladderize_nodes(self.node(node).succ,ladderize=ladderize)
00619                 subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
00620                 return '(%s)%s' % (','.join(subtrees),make_info_string(self.node(node).data))
00621                     
00622         treeline=['tree']
00623         if self.name:
00624             treeline.append(self.name)
00625         else:
00626             treeline.append('a_tree')
00627         treeline.append('=')
00628         if self.weight != 1:
00629             treeline.append('[&W%s]' % str(round(float(self.weight),3)))
00630         if self.rooted:
00631             treeline.append('[&R]')
00632         succnodes=ladderize_nodes(self.node(self.root).succ)
00633         subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
00634         treeline.append('(%s)' % ','.join(subtrees))
00635         if plain_newick:
00636             return treeline[-1]
00637         else:
00638             return ' '.join(treeline)+';'
00639         
00640     def __str__(self):
00641         """Short version of to_string(), gives plain tree"""
00642         return self.to_string(plain=True)
00643         
00644     def unroot(self):
00645         """Defines a unrooted Tree structure, using data of a rooted Tree."""
00646 
00647         # travel down the rooted tree structure and save all branches and the nodes they connect
00648 
00649         def _get_branches(node):
00650             branches=[]
00651             for b in self.node(node).succ:
00652                 branches.append([node,b,self.node(b).data.branchlength,self.node(b).data.support])
00653                 branches.extend(_get_branches(b))
00654             return branches
00655     
00656         self.unrooted=_get_branches(self.root)
00657         # if root is bifurcating, then it is eliminated
00658         if len(self.node(self.root).succ)==2:
00659             # find the two branches that connect to root
00660             rootbranches=[b for b in self.unrooted if self.root in b[:2]]
00661             b1=self.unrooted.pop(self.unrooted.index(rootbranches[0]))
00662             b2=self.unrooted.pop(self.unrooted.index(rootbranches[1]))
00663             # Connect them two each other. If both have support, it should be identical (or one set to None?).
00664             # If both have branchlengths, they will be added
00665             newbranch=[b1[1],b2[1],b1[2]+b2[2]]
00666             if b1[3] is None:
00667                 newbranch.append(b2[3]) # either None (both rootbranches are unsupported) or some support
00668             elif b2[3] is None:
00669                 newbranch.append(b1[3]) # dito
00670             elif b1[3]==b2[3]:          
00671                 newbranch.append(b1[3]) # identical support
00672             elif b1[3]==0 or b2[3]==0:
00673                 newbranch.append(b1[3]+b2[3]) # one is 0, take the other
00674             else:
00675                 raise TreeError('Support mismatch in bifurcating root: %f, %f' \
00676                                 % (float(b1[3]),float(b2[3])))
00677             self.unrooted.append(newbranch)
00678 
00679     def root_with_outgroup(self,outgroup=None):
00680         
00681         def _connect_subtree(parent,child):
00682             """Hook subtree starting with node child to parent."""
00683             for i,branch in enumerate(self.unrooted):
00684                 if parent in branch[:2] and child in branch[:2]:
00685                     branch=self.unrooted.pop(i)
00686                     break 
00687             else:
00688                 raise TreeError('Unable to connect nodes for rooting: nodes %d and %d are not connected' \
00689                                 % (parent,child))
00690             self.link(parent,child)
00691             self.node(child).data.branchlength=branch[2]
00692             self.node(child).data.support=branch[3]
00693             #now check if there are more branches connected to the child, and if so, connect them
00694             child_branches=[b for b in self.unrooted if child in b[:2]]
00695             for b in child_branches:
00696                 if child==b[0]:
00697                     succ=b[1]
00698                 else:
00699                     succ=b[0]
00700                 _connect_subtree(child,succ) 
00701             
00702         # check the outgroup we're supposed to root with
00703         if outgroup is None:
00704             return self.root
00705         outgroup_node=self.is_monophyletic(outgroup)
00706         if outgroup_node==-1:
00707             return -1
00708         # if tree is already rooted with outgroup on a bifurcating root,
00709         # or the outgroup includes all taxa on the tree, then we're fine
00710         if (len(self.node(self.root).succ)==2 and outgroup_node in self.node(self.root).succ) or outgroup_node==self.root:
00711             return self.root
00712         
00713         self.unroot()
00714         # now we find the branch that connects outgroup and ingroup
00715         #print self.node(outgroup_node).prev
00716         for i,b in enumerate(self.unrooted):
00717             if outgroup_node in b[:2] and self.node(outgroup_node).prev in b[:2]:
00718                 root_branch=self.unrooted.pop(i)
00719                 break
00720         else:
00721             raise TreeError('Unrooted and rooted Tree do not match')
00722         if outgroup_node==root_branch[1]:
00723             ingroup_node=root_branch[0]
00724         else:
00725             ingroup_node=root_branch[1]
00726         # now we destroy the old tree structure, but keep node data. Nodes will be reconnected according to new outgroup
00727         for n in self.all_ids():
00728             self.node(n).prev=None
00729             self.node(n).succ=[]
00730         # now we just add both subtrees (outgroup and ingroup) branch for branch
00731         root=Nodes.Node(data=NodeData())            # new root    
00732         self.add(root)                              # add to tree description
00733         self.root=root.id                           # set as root
00734         self.unrooted.append([root.id,ingroup_node,root_branch[2],root_branch[3]])  # add branch to ingroup to unrooted tree
00735         self.unrooted.append([root.id,outgroup_node,0.0,0.0])   # add branch to outgroup to unrooted tree
00736         _connect_subtree(root.id,ingroup_node)      # add ingroup
00737         _connect_subtree(root.id,outgroup_node)     # add outgroup
00738         # if theres still a lonely node in self.chain, then it's the old root, and we delete it
00739         oldroot=[i for i in self.all_ids() if self.node(i).prev is None and i!=self.root]
00740         if len(oldroot)>1:
00741             raise TreeError('Isolated nodes in tree description: %s' \
00742                             % ','.join(oldroot))
00743         elif len(oldroot)==1:
00744             self.kill(oldroot[0])
00745         return self.root
00746         
00747     def merge_with_support(self,bstrees=None,constree=None,threshold=0.5,outgroup=None):
00748         """Merges clade support (from consensus or list of bootstrap-trees) with phylogeny.
00749 
00750         tree=merge_bootstrap(phylo,bs_tree=<list_of_trees>)
00751         or
00752         tree=merge_bootstrap(phylo,consree=consensus_tree with clade support)
00753         """
00754 
00755         if bstrees and constree:
00756             raise TreeError('Specify either list of boostrap trees or consensus tree, not both')
00757         if not (bstrees or constree):
00758             raise TreeError('Specify either list of boostrap trees or consensus tree.')
00759         # no outgroup specified: use the smallest clade of the root
00760         if outgroup is None:
00761             try:
00762                 succnodes=self.node(self.root).succ
00763                 smallest=min([(len(self.get_taxa(n)),n) for n in succnodes]) 
00764                 outgroup=self.get_taxa(smallest[1])
00765             except:
00766                 raise TreeError("Error determining outgroup.")
00767         else: # root with user specified outgroup
00768             self.root_with_outgroup(outgroup)
00769 
00770         if bstrees: # calculate consensus 
00771             constree=consensus(bstrees,threshold=threshold,outgroup=outgroup)
00772         else:
00773             if not constree.has_support():
00774                 constree.branchlength2support()
00775             constree.root_with_outgroup(outgroup)
00776         # now we travel all nodes, and add support from consensus, if the clade is present in both
00777         for pnode in self._walk():
00778             cnode=constree.is_monophyletic(self.get_taxa(pnode))
00779             if cnode>-1:
00780                 self.node(pnode).data.support=constree.node(cnode).data.support
00781 
00782          
00783 def consensus(trees, threshold=0.5,outgroup=None):
00784     """Compute a majority rule consensus tree of all clades with relative frequency>=threshold from a list of trees."""
00785 
00786     total=len(trees)
00787     if total==0:
00788         return None
00789     # shouldn't we make sure that it's NodeData or subclass??
00790     dataclass=trees[0].dataclass
00791     max_support=trees[0].max_support
00792     clades={}
00793     #countclades={}
00794     alltaxa=set(trees[0].get_taxa())
00795     # calculate calde frequencies
00796     c=0
00797     for t in trees:
00798         c+=1
00799         #if c%100==0:
00800         #    print c
00801         if alltaxa!=set(t.get_taxa()):
00802             raise TreeError('Trees for consensus must contain the same taxa')
00803         t.root_with_outgroup(outgroup=outgroup)
00804         for st_node in t._walk(t.root):
00805             subclade_taxa=t.get_taxa(st_node)
00806             subclade_taxa.sort()
00807             subclade_taxa=str(subclade_taxa) # lists are not hashable
00808             if subclade_taxa in clades:
00809                 clades[subclade_taxa]+=float(t.weight)/total
00810             else:
00811                 clades[subclade_taxa]=float(t.weight)/total
00812             #if subclade_taxa in countclades:
00813             #    countclades[subclade_taxa]+=t.weight
00814             #else:
00815             #    countclades[subclade_taxa]=t.weight
00816     # weed out clades below threshold
00817     delclades=[c for c,p in clades.iteritems() if round(p,3)<threshold] # round can be necessary 
00818     for c in delclades:
00819         del clades[c]
00820     # create a tree with a root node
00821     consensus=Tree(name='consensus_%2.1f' % float(threshold),data=dataclass)
00822     # each clade needs a node in the new tree, add them as isolated nodes
00823     for c, s in clades.iteritems():
00824         node=Nodes.Node(data=dataclass())
00825         node.data.support=s
00826         node.data.taxon=set(eval(c))
00827         consensus.add(node)
00828     # set root node data
00829     consensus.node(consensus.root).data.support=None
00830     consensus.node(consensus.root).data.taxon=alltaxa
00831     # we sort the nodes by no. of taxa in the clade, so root will be the last
00832     consensus_ids=consensus.all_ids()
00833     consensus_ids.sort(lambda x,y:len(consensus.node(x).data.taxon)-len(consensus.node(y).data.taxon))
00834     # now we just have to hook each node to the next smallest node that includes all taxa of the current 
00835     for i,current in enumerate(consensus_ids[:-1]): # skip the last one which is the root
00836         #print '----'
00837         #print 'current: ',consensus.node(current).data.taxon
00838         # search remaining nodes
00839         for parent in consensus_ids[i+1:]:
00840             #print 'parent: ',consensus.node(parent).data.taxon
00841             if consensus.node(parent).data.taxon.issuperset(consensus.node(current).data.taxon):
00842                 break
00843         else:
00844             sys.exit('corrupt tree structure?')
00845         # internal nodes don't have taxa
00846         if len(consensus.node(current).data.taxon)==1:
00847             consensus.node(current).data.taxon=consensus.node(current).data.taxon.pop()
00848             # reset the support for terminal nodes to maximum
00849             #consensus.node(current).data.support=max_support
00850         else:
00851             consensus.node(current).data.taxon=None
00852         consensus.link(parent,current)
00853     # eliminate root taxon name
00854     consensus.node(consensus_ids[-1]).data.taxon=None 
00855     if alltaxa != set(consensus.get_taxa()):
00856         raise TreeError('FATAL ERROR: consensus tree is corrupt') 
00857     return consensus
00858