Back to index

python-biopython  1.60
BaseTree.py
Go to the documentation of this file.
00001 # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Base classes for Bio.Phylo objects.
00007 
00008 All object representations for phylogenetic trees should derive from these base
00009 classes in order to use the common methods defined on them.
00010 """
00011 __docformat__ = "restructuredtext en"
00012 
00013 import collections
00014 import copy
00015 import itertools
00016 import random
00017 import re
00018 
00019 from Bio.Phylo import _sugar
00020 
00021 # General tree-traversal algorithms
00022 
00023 def _level_traverse(root, get_children):
00024     """Traverse a tree in breadth-first (level) order."""
00025     Q = collections.deque([root])
00026     while Q:
00027         v = Q.popleft()
00028         yield v
00029         Q.extend(get_children(v))
00030 
00031 def _preorder_traverse(root, get_children):
00032     """Traverse a tree in depth-first pre-order (parent before children)."""
00033     def dfs(elem):
00034         yield elem
00035         for v in get_children(elem):
00036             for u in dfs(v):
00037                 yield u
00038     for elem in dfs(root):
00039         yield elem
00040 
00041 def _postorder_traverse(root, get_children):
00042     """Traverse a tree in depth-first post-order (children before parent)."""
00043     def dfs(elem):
00044         for v in get_children(elem):
00045             for u in dfs(v):
00046                 yield u
00047         yield elem
00048     for elem in dfs(root):
00049         yield elem
00050 
00051 
00052 def _sorted_attrs(elem):
00053     """Get a flat list of elem's attributes, sorted for consistency."""
00054     singles = []
00055     lists = []
00056     # Sort attributes for consistent results
00057     for attrname, child in sorted(elem.__dict__.iteritems(),
00058                                   key=lambda kv: kv[0]):
00059         if child is None:
00060             continue
00061         if isinstance(child, list):
00062             lists.extend(child)
00063         else:
00064             singles.append(child)
00065     return (x for x in singles + lists
00066             if isinstance(x, TreeElement))
00067 
00068 
00069 # Factory functions to generalize searching for clades/nodes
00070 
00071 def _identity_matcher(target):
00072     """Match a node to the target object by identity."""
00073     def match(node):
00074         return (node is target)
00075     return match
00076 
00077 def _class_matcher(target_cls):
00078     """Match a node if it's an instance of the given class."""
00079     def match(node):
00080         return isinstance(node, target_cls)
00081     return match
00082 
00083 def _string_matcher(target):
00084     def match(node):
00085         return unicode(node) == target
00086     return match
00087 
00088 def _attribute_matcher(kwargs):
00089     """Match a node by specified attribute values.
00090 
00091     ``terminal`` is a special case: True restricts the search to external (leaf)
00092     nodes, False restricts to internal nodes, and None allows all tree elements
00093     to be searched, including phyloXML annotations.
00094 
00095     Otherwise, for a tree element to match the specification (i.e. for the
00096     function produced by `_attribute_matcher` to return True when given a tree
00097     element), it must have each of the attributes specified by the keys and
00098     match each of the corresponding values -- think 'and', not 'or', for
00099     multiple keys.
00100     """
00101     def match(node):
00102         if 'terminal' in kwargs:
00103             # Special case: restrict to internal/external/any nodes
00104             kwa_copy = kwargs.copy()
00105             pattern = kwa_copy.pop('terminal')
00106             if (pattern is not None and
00107                 (not hasattr(node, 'is_terminal') or
00108                     node.is_terminal() != pattern)):
00109                 return False
00110         else:
00111             kwa_copy = kwargs
00112         for key, pattern in kwa_copy.iteritems():
00113             # Nodes must match all other specified attributes
00114             if not hasattr(node, key):
00115                 return False
00116             target = getattr(node, key)
00117             if isinstance(pattern, basestring):
00118                 return (isinstance(target, basestring) and
00119                         re.match(pattern+'$', target))
00120             if isinstance(pattern, bool):
00121                 return (pattern == bool(target))
00122             if isinstance(pattern, int):
00123                 return (pattern == target)
00124             if pattern is None:
00125                 return (target is None)
00126             raise TypeError('invalid query type: %s' % type(pattern))
00127         return True
00128     return match
00129 
00130 def _function_matcher(matcher_func):
00131     """Safer attribute lookup -- returns False instead of raising an error."""
00132     def match(node):
00133         try:
00134             return matcher_func(node)
00135         except (LookupError, AttributeError, ValueError, TypeError):
00136             return False
00137     return match
00138 
00139 def _object_matcher(obj):
00140     """Retrieve a matcher function by passing an arbitrary object.
00141 
00142     i.e. passing a `TreeElement` such as a `Clade` or `Tree` instance returns an
00143     identity matcher, passing a type such as the `PhyloXML.Taxonomy` class
00144     returns a class matcher, and passing a dictionary returns an attribute
00145     matcher.
00146 
00147     The resulting 'match' function returns True when given an object matching
00148     the specification (identity, type or attribute values), otherwise False.
00149     This is useful for writing functions that search the tree, and probably
00150     shouldn't be used directly by the end user.
00151     """
00152     if isinstance(obj, TreeElement):
00153         return _identity_matcher(obj)
00154     if isinstance(obj, type):
00155         return _class_matcher(obj)
00156     if isinstance(obj, basestring):
00157         return _string_matcher(obj)
00158     if isinstance(obj, dict):
00159         return _attribute_matcher(obj)
00160     if callable(obj):
00161         return _function_matcher(obj)
00162     raise ValueError("%s (type %s) is not a valid type for comparison."
00163                      % (obj, type(obj)))
00164 
00165 
00166 def _combine_matchers(target, kwargs, require_spec):
00167     """Merge target specifications with keyword arguments.
00168 
00169     Dispatch the components to the various matcher functions, then merge into a
00170     single boolean function.
00171     """
00172     if not target:
00173         if not kwargs:
00174             if require_spec:
00175                 raise ValueError("you must specify a target object or keyword "
00176                                 "arguments.")
00177             return lambda x: True
00178         return _attribute_matcher(kwargs)
00179     match_obj = _object_matcher(target)
00180     if not kwargs:
00181         return match_obj
00182     match_kwargs = _attribute_matcher(kwargs)
00183     return (lambda x: match_obj(x) and match_kwargs(x))
00184 
00185 
00186 def _combine_args(first, *rest):
00187     """Convert ``[targets]`` or ``*targets`` arguments to a single iterable.
00188 
00189     This helps other functions work like the built-in functions `max` and
00190     `min`.
00191     """
00192     # Background: is_monophyletic takes a single list or iterable (like the
00193     # same method in Bio.Nexus.Trees); root_with_outgroup and common_ancestor
00194     # take separate arguments. This mismatch was in the initial release and I
00195     # didn't notice the inconsistency until after Biopython 1.55. I can think
00196     # of cases where either style is more convenient, so let's support both
00197     # (for backward compatibility and consistency between methods).
00198     if hasattr(first, '__iter__') and not (isinstance(first, TreeElement) or
00199             isinstance(first, type) or isinstance(first, basestring) or
00200             isinstance(first, dict)):
00201         # `terminals` is an iterable of targets
00202         if rest:
00203             raise ValueError("Arguments must be either a single list of "
00204                     "targets, or separately specified targets "
00205                     "(e.g. foo(t1, t2, t3)), but not both.")
00206         return first
00207     # `terminals` is a single target -- wrap in a container
00208     return itertools.chain([first], rest)
00209 
00210 
00211 # Class definitions
00212 
00213 class TreeElement(object):
00214     """Base class for all Bio.Phylo classes."""
00215 
00216     def __repr__(self):
00217         """Show this object's constructor with its primitive arguments."""
00218         def pair_as_kwarg_string(key, val):
00219             if isinstance(val, basestring):
00220                 return "%s='%s'" % (key, _sugar.trim_str(unicode(val)))
00221             return "%s=%s" % (key, val)
00222         return u'%s(%s)' % (self.__class__.__name__,
00223                             ', '.join(pair_as_kwarg_string(key, val)
00224                                   for key, val in self.__dict__.iteritems()
00225                                   if val is not None and
00226                                   type(val) in (str, int, float, bool, unicode)
00227                                   ))
00228 
00229     __str__ = __repr__
00230 
00231 
00232 class TreeMixin(object):
00233     """Methods for Tree- and Clade-based classes.
00234 
00235     This lets `Tree` and `Clade` support the same traversal and searching
00236     operations without requiring Clade to inherit from Tree, so Clade isn't
00237     required to have all of Tree's attributes -- just ``root`` (a Clade
00238     instance) and ``is_terminal``.
00239     """
00240     # Traversal methods
00241 
00242     def _filter_search(self, filter_func, order, follow_attrs):
00243         """Perform a BFS or DFS traversal through all elements in this tree.
00244 
00245         :returns: generator of all elements for which `filter_func` is True.
00246         """
00247         order_opts = {'preorder': _preorder_traverse,
00248                       'postorder': _postorder_traverse,
00249                       'level': _level_traverse}
00250         try:
00251             order_func = order_opts[order]
00252         except KeyError:
00253             raise ValueError("Invalid order '%s'; must be one of: %s"
00254                              % (order, tuple(order_opts.keys())))
00255         if follow_attrs:
00256             get_children = _sorted_attrs
00257             root = self
00258         else:
00259             get_children = lambda elem: elem.clades
00260             root = self.root
00261         return itertools.ifilter(filter_func, order_func(root, get_children))
00262 
00263     def find_any(self, *args, **kwargs):
00264         """Return the first element found by find_elements(), or None.
00265 
00266         This is also useful for checking whether any matching element exists in
00267         the tree, and can be used in a conditional expression.
00268         """
00269         hits = self.find_elements(*args, **kwargs)
00270         try:
00271             return hits.next()
00272         except StopIteration:
00273             return None
00274 
00275     def find_elements(self, target=None, terminal=None, order='preorder',
00276             **kwargs):
00277         """Find all tree elements matching the given attributes.
00278 
00279         The arbitrary keyword arguments indicate the attribute name of the
00280         sub-element and the value to match: string, integer or boolean. Strings
00281         are evaluated as regular expression matches; integers are compared
00282         directly for equality, and booleans evaluate the attribute's truth value
00283         (True or False) before comparing. To handle nonzero floats, search with
00284         a boolean argument, then filter the result manually.
00285 
00286         If no keyword arguments are given, then just the class type is used for
00287         matching.
00288 
00289         The result is an iterable through all matching objects, by depth-first
00290         search. (Not necessarily the same order as the elements appear in the
00291         source file!)
00292 
00293         :Parameters:
00294             target : TreeElement instance, type, dict, or callable
00295                 Specifies the characteristics to search for. (The default,
00296                 TreeElement, matches any standard Bio.Phylo type.)
00297             terminal : bool
00298                 A boolean value to select for or against terminal nodes (a.k.a.
00299                 leaf nodes). True searches for only terminal nodes, False
00300                 excludes terminal nodes, and the default, None, searches both
00301                 terminal and non-terminal nodes, as well as any tree elements
00302                 lacking the ``is_terminal`` method.
00303             order : {'preorder', 'postorder', 'level'}
00304                 Tree traversal order: 'preorder' (default) is depth-first
00305                 search, 'postorder' is DFS with child nodes preceding parents,
00306                 and 'level' is breadth-first search.
00307 
00308         Example
00309         -------
00310 
00311         >>> from Bio.Phylo.IO import PhyloXMIO
00312         >>> phx = PhyloXMLIO.read('phyloxml_examples.xml')
00313         >>> matches = phx.phylogenies[5].find_elements(code='OCTVU')
00314         >>> matches.next()
00315         Taxonomy(code='OCTVU', scientific_name='Octopus vulgaris')
00316 
00317         """
00318         if terminal is not None:
00319             kwargs['terminal'] = terminal
00320         is_matching_elem = _combine_matchers(target, kwargs, False)
00321         return self._filter_search(is_matching_elem, order, True)
00322 
00323     def find_clades(self, target=None, terminal=None, order='preorder',
00324             **kwargs):
00325         """Find each clade containing a matching element.
00326 
00327         That is, find each element as with find_elements(), but return the
00328         corresponding clade object. (This is usually what you want.)
00329 
00330         :returns: an iterable through all matching objects, searching
00331             depth-first (preorder) by default.
00332         """
00333         def match_attrs(elem):
00334             orig_clades = elem.__dict__.pop('clades')
00335             found = elem.find_any(target, **kwargs)
00336             elem.clades = orig_clades
00337             return (found is not None)
00338         if terminal is None:
00339             is_matching_elem = match_attrs
00340         else:
00341             def is_matching_elem(elem):
00342                 return ((elem.is_terminal() == terminal) and
00343                         match_attrs(elem))
00344         return self._filter_search(is_matching_elem, order, False)
00345 
00346     def get_path(self, target=None, **kwargs):
00347         """List the clades directly between this root and the given target.
00348 
00349         :returns: list of all clade objects along this path, ending with the
00350             given target, but excluding the root clade.
00351         """
00352         # Only one path will work -- ignore weights and visits
00353         path = []
00354         match = _combine_matchers(target, kwargs, True)
00355         def check_in_path(v):
00356             if match(v):
00357                 path.append(v)
00358                 return True
00359             elif v.is_terminal():
00360                 return False
00361             for child in v:
00362                 if check_in_path(child):
00363                     path.append(v)
00364                     return True
00365             return False
00366         if not check_in_path(self.root):
00367             return None
00368         return path[-2::-1]
00369 
00370     def get_nonterminals(self, order='preorder'):
00371         """Get a list of all of this tree's nonterminal (internal) nodes."""
00372         return list(self.find_clades(terminal=False, order=order))
00373 
00374     def get_terminals(self, order='preorder'):
00375         """Get a list of all of this tree's terminal (leaf) nodes."""
00376         return list(self.find_clades(terminal=True, order=order))
00377 
00378     def trace(self, start, finish):
00379         """List of all clade object between two targets in this tree.
00380 
00381         Excluding `start`, including `finish`.
00382         """
00383         mrca = self.common_ancestor(start, finish)
00384         fromstart = mrca.get_path(start)[-2::-1]
00385         to = mrca.get_path(finish)
00386         return fromstart + [mrca] + to
00387 
00388     # Information methods
00389 
00390     def common_ancestor(self, targets, *more_targets):
00391         """Most recent common ancestor (clade) of all the given targets.
00392 
00393         Edge cases:
00394         - If no target is given, returns self.root
00395         - If 1 target is given, returns the target
00396         - If any target is not found in this tree, raises a ValueError
00397         """
00398         paths = [self.get_path(t)
00399                  for t in _combine_args(targets, *more_targets)]
00400         # Validation -- otherwise izip throws a spooky error below
00401         for p, t in zip(paths, targets):
00402             if p is None:
00403                 raise ValueError("target %s is not in this tree" % repr(t))
00404         mrca = self.root
00405         for level in itertools.izip(*paths):
00406             ref = level[0]
00407             for other in level[1:]:
00408                 if ref is not other:
00409                     break
00410             else:
00411                 mrca = ref
00412             if ref is not mrca:
00413                 break
00414         return mrca
00415 
00416     def count_terminals(self):
00417         """Counts the number of terminal (leaf) nodes within this tree."""
00418         return _sugar.iterlen(self.find_clades(terminal=True))
00419 
00420     def depths(self, unit_branch_lengths=False):
00421         """Create a mapping of tree clades to depths (by branch length).
00422 
00423         :Parameters:
00424             unit_branch_lengths : bool
00425                 If True, count only the number of branches (levels in the tree).
00426                 By default the distance is the cumulative branch length leading
00427                 to the clade.
00428 
00429         :returns: dict of {clade: depth}, where keys are all of the Clade
00430             instances in the tree, and values are the distance from the root to
00431             each clade (including terminals).
00432         """
00433         if unit_branch_lengths:
00434             depth_of = lambda c: 1
00435         else:
00436             depth_of = lambda c: c.branch_length or 0
00437         depths = {}
00438         def update_depths(node, curr_depth):
00439             depths[node] = curr_depth
00440             for child in node.clades:
00441                 new_depth = curr_depth + depth_of(child)
00442                 update_depths(child, new_depth)
00443         update_depths(self.root, self.root.branch_length or 0)
00444         return depths
00445 
00446     def distance(self, target1, target2=None):
00447         """Calculate the sum of the branch lengths between two targets.
00448 
00449         If only one target is specified, the other is the root of this tree.
00450         """
00451         if target2 is None:
00452             return sum(n.branch_length for n in self.get_path(target1)
00453                        if n.branch_length is not None)
00454         mrca = self.common_ancestor(target1, target2)
00455         return mrca.distance(target1) + mrca.distance(target2)
00456 
00457     def is_bifurcating(self):
00458         """Return True if tree downstream of node is strictly bifurcating.
00459 
00460         I.e., all nodes have either 2 or 0 children (internal or external,
00461         respectively). The root may have 3 descendents and still be considered
00462         part of a bifurcating tree, because it has no ancestor.
00463         """
00464         # Root can be trifurcating
00465         if isinstance(self, Tree) and len(self.root) == 3:
00466             return (self.root.clades[0].is_bifurcating() and
00467                     self.root.clades[1].is_bifurcating() and
00468                     self.root.clades[2].is_bifurcating())
00469         if len(self.root) == 2:
00470             return (self.root.clades[0].is_bifurcating() and
00471                     self.root.clades[1].is_bifurcating())
00472         if len(self.root) == 0:
00473             return True
00474         return False
00475 
00476     def is_monophyletic(self, terminals, *more_terminals):
00477         """MRCA of terminals if they comprise a complete subclade, or False.
00478 
00479         I.e., there exists a clade such that its terminals are the same set as
00480         the given targets.
00481 
00482         The given targets must be terminals of the tree.
00483 
00484         To match both `Bio.Nexus.Trees` and the other multi-target methods in
00485         Bio.Phylo, arguments to this method can be specified either of two ways:
00486         (i) as a single list of targets, or (ii) separately specified targets,
00487         e.g. is_monophyletic(t1, t2, t3) -- but not both.
00488 
00489         For convenience, this method returns the common ancestor (MCRA) of the
00490         targets if they are monophyletic (instead of the value True), and False
00491         otherwise.
00492 
00493         :returns: common ancestor if terminals are monophyletic, otherwise False.
00494         """
00495         target_set = set(_combine_args(terminals, *more_terminals))
00496         current = self.root
00497         while True:
00498             if set(current.get_terminals()) == target_set:
00499                 return current
00500             # Try a narrower subclade
00501             for subclade in current.clades:
00502                 if set(subclade.get_terminals()).issuperset(target_set):
00503                     current = subclade
00504                     break
00505             else:
00506                 return False
00507 
00508     def is_parent_of(self, target=None, **kwargs):
00509         """True if target is a descendent of this tree.
00510 
00511         Not required to be a direct descendent.
00512 
00513         To check only direct descendents of a clade, simply use list membership
00514         testing: ``if subclade in clade: ...``
00515         """
00516         return self.get_path(target, **kwargs) is not None
00517 
00518     def is_preterminal(self):
00519         """True if all direct descendents are terminal."""
00520         if self.root.is_terminal():
00521             return False
00522         for clade in self.root.clades:
00523             if not clade.is_terminal():
00524                 return False
00525         return True
00526 
00527     def total_branch_length(self):
00528         """Calculate the sum of all the branch lengths in this tree."""
00529         return sum(node.branch_length
00530                    for node in self.find_clades(branch_length=True))
00531 
00532     # Tree manipulation methods
00533 
00534     def collapse(self, target=None, **kwargs):
00535         """Deletes target from the tree, relinking its children to its parent.
00536 
00537         :returns: the parent clade.
00538         """
00539         path = self.get_path(target, **kwargs)
00540         if not path:
00541             raise ValueError("couldn't collapse %s in this tree"
00542                              % (target or kwargs))
00543         if len(path) == 1:
00544             parent = self.root
00545         else:
00546             parent = path[-2]
00547         popped = parent.clades.pop(parent.clades.index(path[-1]))
00548         extra_length = popped.branch_length or 0
00549         for child in popped:
00550             child.branch_length += extra_length
00551         parent.clades.extend(popped.clades)
00552         return parent
00553 
00554     def collapse_all(self, target=None, **kwargs):
00555         """Collapse all the descendents of this tree, leaving only terminals.
00556 
00557         Total branch lengths are preserved, i.e. the distance to each terminal
00558         stays the same.
00559 
00560         For example, this will safely collapse nodes with poor bootstrap
00561         support:
00562 
00563             >>> tree.collapse_all(lambda c: c.confidence is not None and
00564             ...                   c.confidence < 70)
00565 
00566         This implementation avoids strange side-effects by using level-order
00567         traversal and testing all clade properties (versus the target
00568         specification) up front. In particular, if a clade meets the target
00569         specification in the original tree, it will be collapsed.  For example,
00570         if the condition is:
00571 
00572             >>> tree.collapse_all(lambda c: c.branch_length < 0.1)
00573 
00574         Collapsing a clade's parent node adds the parent's branch length to the
00575         child, so during the execution of collapse_all, a clade's branch_length
00576         may increase. In this implementation, clades are collapsed according to
00577         their properties in the original tree, not the properties when tree
00578         traversal reaches the clade. (It's easier to debug.) If you want the
00579         other behavior (incremental testing), modifying the source code of this
00580         function is straightforward.
00581         """
00582         # Read the iterable into a list to protect against in-place changes
00583         matches = list(self.find_clades(target, False, 'level', **kwargs))
00584         if not matches:
00585             # No matching nodes to collapse
00586             return
00587         # Skip the root node -- it can't be collapsed
00588         if matches[0] == self.root:
00589             matches.pop(0)
00590         for clade in matches:
00591             self.collapse(clade)
00592 
00593     def ladderize(self, reverse=False):
00594         """Sort clades in-place according to the number of terminal nodes.
00595 
00596         Deepest clades are last by default. Use ``reverse=True`` to sort clades
00597         deepest-to-shallowest.
00598         """
00599         self.root.clades.sort(key=lambda c: c.count_terminals(),
00600                               reverse=reverse)
00601         for subclade in self.root.clades:
00602             subclade.ladderize(reverse=reverse)
00603 
00604     def prune(self, target=None, **kwargs):
00605         """Prunes a terminal clade from the tree.
00606 
00607         If taxon is from a bifurcation, the connecting node will be collapsed
00608         and its branch length added to remaining terminal node. This might be no
00609         longer be a meaningful value.
00610 
00611         :returns: parent clade of the pruned target
00612         """
00613         if 'terminal' in kwargs and kwargs['terminal']:
00614             raise ValueError("target must be terminal")
00615         path = self.get_path(target, terminal=True, **kwargs)
00616         if not path:
00617             raise ValueError("can't find a matching target below this root")
00618         if len(path) == 1:
00619             parent = self.root
00620         else:
00621             parent = path[-2]
00622         parent.clades.remove(path[-1])
00623         if len(parent) == 1:
00624             # We deleted a branch from a bifurcation
00625             if parent == self.root:
00626                 # If we're at the root, move the root upwards
00627                 # NB: This loses the length of the original branch
00628                 newroot = parent.clades[0]
00629                 newroot.branch_length = None
00630                 parent = self.root = newroot
00631             else:
00632                 # If we're not at the root, collapse this parent
00633                 child = parent.clades[0]
00634                 if child.branch_length is not None:
00635                     child.branch_length += (parent.branch_length or 0.0)
00636                 if len(path) < 3:
00637                     grandparent = self.root
00638                 else:
00639                     grandparent = path[-3]
00640                 # Replace parent with child at the same place in grandparent
00641                 index = grandparent.clades.index(parent)
00642                 grandparent.clades.pop(index)
00643                 grandparent.clades.insert(index, child)
00644                 parent = grandparent
00645         return parent
00646 
00647     def split(self, n=2, branch_length=1.0):
00648         """Generate n (default 2) new descendants.
00649 
00650         In a species tree, this is a speciation event.
00651 
00652         New clades have the given branch_length and the same name as this
00653         clade's root plus an integer suffix (counting from 0). For example,
00654         splitting a clade named "A" produces sub-clades named "A0" and "A1".
00655         """
00656         clade_cls = type(self.root)
00657         base_name = self.root.name or ''
00658         for i in range(n):
00659             clade = clade_cls(name=base_name+str(i),
00660                                 branch_length=branch_length)
00661             self.root.clades.append(clade)
00662 
00663 
00664 class Tree(TreeElement, TreeMixin):
00665     """A phylogenetic tree, containing global info for the phylogeny.
00666 
00667     The structure and node-specific data is accessible through the 'root'
00668     clade attached to the Tree instance.
00669 
00670     :Parameters:
00671         root : Clade
00672             The starting node of the tree. If the tree is rooted, this will
00673             usually be the root node.
00674         rooted : bool
00675             Whether or not the tree is rooted. By default, a tree is assumed to
00676             be rooted.
00677         id : str
00678             The identifier of the tree, if there is one.
00679         name : str
00680             The name of the tree, in essence a label.
00681     """
00682     def __init__(self, root=None, rooted=True, id=None, name=None):
00683         self.root = root or Clade()
00684         self.rooted = rooted
00685         self.id = id
00686         self.name = name
00687 
00688     @classmethod
00689     def from_clade(cls, clade, **kwargs):
00690         """Create a new Tree object given a clade.
00691 
00692         Keyword arguments are the usual `Tree` constructor parameters.
00693         """
00694         root = copy.deepcopy(clade)
00695         return cls(root, **kwargs)
00696 
00697     @classmethod
00698     def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
00699         """Create a randomized bifurcating tree given a list of taxa.
00700 
00701         :param taxa: Either an integer specifying the number of taxa to create
00702             (automatically named taxon#), or an iterable of taxon names, as
00703             strings.
00704 
00705         :returns: a tree of the same type as this class.
00706         """
00707         if isinstance(taxa, int):
00708             taxa = ['taxon%s' % (i+1) for i in range(taxa)]
00709         elif hasattr(taxa, '__iter__'):
00710             taxa = list(taxa)
00711         else:
00712             raise TypeError("taxa argument must be integer (# taxa) or "
00713                             "iterable of taxon names.")
00714         rtree = cls()
00715         terminals = [rtree.root]
00716         while len(terminals) < len(taxa):
00717             newsplit = random.choice(terminals)
00718             newterms = newsplit.split(branch_length=branch_length)
00719             if branch_stdev:
00720                 # Add some noise to the branch lengths
00721                 for nt in newterms:
00722                     nt.branch_length = max(0,
00723                             random.gauss(branch_length, branch_stdev))
00724             terminals.remove(newsplit)
00725             terminals.extend(newterms)
00726         # Distribute taxon labels randomly
00727         random.shuffle(taxa)
00728         for node, name in zip(terminals, taxa):
00729             node.name = name
00730         return rtree
00731 
00732     @property
00733     def clade(self):
00734         """The first clade in this tree (not itself)."""
00735         return self.root
00736 
00737     def as_phyloxml(self, **kwargs):
00738         """Convert this tree to a PhyloXML-compatible Phylogeny.
00739 
00740         This lets you use the additional annotation types PhyloXML defines, and
00741         save this information when you write this tree as 'phyloxml'.
00742         """
00743         from Bio.Phylo.PhyloXML import Phylogeny
00744         return Phylogeny.from_tree(self, **kwargs)
00745 
00746     # XXX Compatibility: In Python 2.6+, **kwargs can be replaced with the named
00747     # keyword argument outgroup_branch_length=None
00748     def root_with_outgroup(self, outgroup_targets, *more_targets, **kwargs):
00749         """Reroot this tree with the outgroup clade containing outgroup_targets.
00750 
00751         Operates in-place.
00752 
00753         Edge cases:
00754 
00755         - If ``outgroup == self.root``, no change
00756         - If outgroup is terminal, create new bifurcating root node with a
00757           0-length branch to the outgroup
00758         - If outgroup is internal, use the given outgroup node as the new
00759           trifurcating root, keeping branches the same
00760         - If the original root was bifurcating, drop it from the tree,
00761           preserving total branch lengths
00762 
00763         :param outgroup_branch_length: length of the branch leading to the
00764             outgroup after rerooting. If not specified (None), then:
00765 
00766             - If the outgroup is an internal node (not a single terminal taxon),
00767               then use that node as the new root.
00768             - Otherwise, create a new root node as the parent of the outgroup.
00769 
00770         """
00771         # This raises a ValueError if any target is not in this tree
00772         # Otherwise, common_ancestor guarantees outgroup is in this tree
00773         outgroup = self.common_ancestor(outgroup_targets, *more_targets)
00774         outgroup_path = self.get_path(outgroup)
00775         if len(outgroup_path) == 0:
00776             # Outgroup is the current root -- no change
00777             return
00778 
00779         prev_blen = outgroup.branch_length or 0.0
00780         # Hideous kludge because Py2.x doesn't allow keyword args after *args
00781         outgroup_branch_length = kwargs.get('outgroup_branch_length')
00782         if outgroup_branch_length is not None:
00783             assert 0 <= outgroup_branch_length <= prev_blen, \
00784                     "outgroup_branch_length must be between 0 and the " \
00785                     "original length of the branch leading to the outgroup."
00786 
00787         if outgroup.is_terminal() or outgroup_branch_length is not None:
00788             # Create a new root with a 0-length branch to the outgroup
00789             outgroup.branch_length = outgroup_branch_length or 0.0
00790             new_root = self.root.__class__(
00791                     branch_length=self.root.branch_length, clades=[outgroup])
00792             # The first branch reversal (see the upcoming loop) is modified
00793             if len(outgroup_path) == 1:
00794                 # No nodes between the original root and outgroup to rearrange.
00795                 # Most of the code below will be skipped, but we still need
00796                 # 'new_parent' pointing at the new root.
00797                 new_parent = new_root
00798             else:
00799                 parent = outgroup_path.pop(-2)
00800                 # First iteration of reversing the path to the outgroup
00801                 parent.clades.pop(parent.clades.index(outgroup))
00802                 (prev_blen, parent.branch_length) = (parent.branch_length,
00803                         prev_blen - outgroup.branch_length)
00804                 new_root.clades.insert(0, parent)
00805                 new_parent = parent
00806         else:
00807             # Use the given outgroup node as the new (trifurcating) root
00808             new_root = outgroup
00809             new_root.branch_length = self.root.branch_length
00810             new_parent = new_root
00811 
00812         # Tracing the outgroup lineage backwards, reattach the subclades under a
00813         # new root clade. Reverse the branches directly above the outgroup in
00814         # the tree, but keep the descendants of those clades as they are.
00815         for parent in outgroup_path[-2::-1]:
00816             parent.clades.pop(parent.clades.index(new_parent))
00817             prev_blen, parent.branch_length = parent.branch_length, prev_blen
00818             new_parent.clades.insert(0, parent)
00819             new_parent = parent
00820 
00821         # Finally, handle the original root according to number of descendents
00822         old_root = self.root
00823         if outgroup in old_root.clades:
00824             assert len(outgroup_path) == 1
00825             old_root.clades.pop(old_root.clades.index(outgroup))
00826         else:
00827             old_root.clades.pop(old_root.clades.index(new_parent))
00828         if len(old_root) == 1:
00829             # Delete the old bifurcating root & add branch lengths
00830             ingroup = old_root.clades[0]
00831             if ingroup.branch_length:
00832                 ingroup.branch_length += prev_blen
00833             else:
00834                 ingroup.branch_length = prev_blen
00835             new_parent.clades.insert(0, ingroup)
00836             # ENH: If annotations are attached to old_root, do... something.
00837         else:
00838             # Keep the old trifurcating/polytomous root as an internal node
00839             old_root.branch_length = prev_blen
00840             new_parent.clades.insert(0, old_root)
00841 
00842         self.root = new_root
00843         self.rooted = True
00844         return
00845 
00846     def root_at_midpoint(self):
00847         """Root the tree at the midpoint of the two most distant taxa.
00848 
00849         This operates in-place, leaving a bifurcating root. The topology of the
00850         tree is otherwise retained, though no guarantees are made about the
00851         stability of clade/node/taxon ordering.
00852         """
00853         # Identify the largest pairwise distance
00854         max_distance = 0.0
00855         tips = self.get_terminals()
00856         for tip in tips:
00857             self.root_with_outgroup(tip)
00858             new_max = max(self.depths().iteritems(), key=lambda nd: nd[1])
00859             if new_max[1] > max_distance:
00860                 tip1 = tip
00861                 tip2 = new_max[0]
00862                 max_distance = new_max[1]
00863         self.root_with_outgroup(tip1)
00864         # Depth to go from the ingroup tip toward the outgroup tip
00865         root_remainder = 0.5 * (max_distance - (self.root.branch_length or 0))
00866         assert root_remainder >= 0
00867         # Identify the midpoint and reroot there.
00868         # Trace the path to the outgroup tip until all of the root depth has
00869         # been traveled/accounted for.
00870         for node in self.get_path(tip2):
00871             root_remainder -= node.branch_length
00872             if root_remainder < 0:
00873                 outgroup_node = node
00874                 outgroup_branch_length = -root_remainder
00875                 break
00876         else:
00877             raise ValueError("Somehow, failed to find the midpoint!")
00878         self.root_with_outgroup(outgroup_node,
00879                                 outgroup_branch_length=outgroup_branch_length)
00880 
00881     # Method assumed by TreeMixin
00882 
00883     def is_terminal(self):
00884         """True if the root of this tree is terminal."""
00885         return (not self.root.clades)
00886 
00887     # Convention from SeqRecord and Alignment classes
00888 
00889     def __format__(self, format_spec):
00890         """Serialize the tree as a string in the specified file format.
00891 
00892         This method supports the ``format`` built-in function added in Python
00893         2.6/3.0.
00894 
00895         :param format_spec: a lower-case string supported by `Bio.Phylo.write`
00896             as an output file format.
00897         """
00898         if format_spec:
00899             from StringIO import StringIO
00900             from Bio.Phylo import _io
00901             handle = StringIO()
00902             _io.write([self], handle, format_spec)
00903             return handle.getvalue()
00904         else:
00905             # Follow python convention and default to using __str__
00906             return str(self)
00907 
00908     def format(self, format):
00909         """Serialize the tree as a string in the specified file format.
00910 
00911         This duplicates the __format__ magic method for pre-2.6 Pythons.
00912         """
00913         return self.__format__(format)
00914 
00915     # Pretty-printer for the entire tree hierarchy
00916 
00917     def __str__(self):
00918         """String representation of the entire tree.
00919 
00920         Serializes each sub-clade recursively using ``repr`` to create a summary
00921         of the object structure.
00922         """
00923         TAB = '    '
00924         textlines = []
00925         def print_tree(obj, indent):
00926             """Recursively serialize sub-elements.
00927 
00928             This closes over textlines and modifies it in-place.
00929             """
00930             textlines.append(TAB*indent + repr(obj))
00931             indent += 1
00932             for attr in obj.__dict__:
00933                 child = getattr(obj, attr)
00934                 if isinstance(child, TreeElement):
00935                     print_tree(child, indent)
00936                 elif isinstance(child, list):
00937                     for elem in child:
00938                         if isinstance(elem, TreeElement):
00939                             print_tree(elem, indent)
00940         print_tree(self, 0)
00941         return '\n'.join(textlines)
00942 
00943 
00944 class Clade(TreeElement, TreeMixin):
00945     """A recursively defined sub-tree.
00946 
00947     :Parameters:
00948         branch_length : str
00949             The length of the branch leading to the root node of this clade.
00950         name : str
00951             The clade's name (a label).
00952         clades : list
00953             Sub-trees rooted directly under this tree's root.
00954         confidence : number
00955             Support.
00956         color : BranchColor
00957             The display color of the branch and descendents.
00958         width : number
00959             The display width of the branch and descendents.
00960     """
00961     def __init__(self, branch_length=None, name=None, clades=None,
00962             confidence=None, color=None, width=None):
00963         self.branch_length = branch_length
00964         self.name = name
00965         self.clades = clades or []
00966         self.confidence = confidence
00967         self.color = color
00968         self.width = width
00969 
00970     @property
00971     def root(self):
00972         """Allow TreeMixin methods to traverse clades properly."""
00973         return self
00974 
00975     def is_terminal(self):
00976         """True if this is a terminal (leaf) node."""
00977         return (not self.clades)
00978 
00979     # Sequence-type behavior methods
00980 
00981     def __getitem__(self, index):
00982         """Get clades by index (integer or slice)."""
00983         if isinstance(index, int) or isinstance(index, slice):
00984             return self.clades[index]
00985         ref = self
00986         for idx in index:
00987             ref = ref[idx]
00988         return ref
00989 
00990     def __iter__(self):
00991         """Iterate through this tree's direct descendent clades (sub-trees)."""
00992         return iter(self.clades)
00993 
00994     def __len__(self):
00995         """Number of clades directy under the root."""
00996         return len(self.clades)
00997 
00998     def __nonzero__(self):
00999         """Boolean value of an instance of this class.
01000 
01001         NB: If this method is not defined, but ``__len__``  is, then the object
01002         is considered true if the result of ``__len__()`` is nonzero. We want
01003         Clade instances to always be considered True.
01004         """
01005         return True
01006 
01007     def __str__(self):
01008         if self.name:
01009             return _sugar.trim_str(self.name, maxlen=40)
01010         return self.__class__.__name__
01011 
01012     # Syntax sugar for setting the branch color
01013     def _get_color(self):
01014         return self._color
01015 
01016     def _set_color(self, arg):
01017         if arg is None or isinstance(arg, BranchColor):
01018             self._color = arg
01019         elif isinstance(arg, basestring):
01020             if arg in BranchColor.color_names:
01021                 # Known color name
01022                 self._color = BranchColor.from_name(arg)
01023             elif arg.startswith('#') and len(arg) == 7:
01024                 # HTML-style hex string
01025                 self._color = BranchColor.from_hex(arg)
01026             else:
01027                 raise ValueError("invalid color string %s" % arg)
01028         elif hasattr(arg, '__iter__') and len(arg) == 3:
01029             # RGB triplet
01030             self._color = BranchColor(*arg)
01031         else:
01032             raise ValueError("invalid color value %s" % arg)
01033 
01034     color = property(_get_color, _set_color, doc="Branch color.")
01035 
01036 
01037 class BranchColor(object):
01038     """Indicates the color of a clade when rendered graphically.
01039 
01040     The color should be interpreted by client code (e.g. visualization
01041     programs) as applying to the whole clade, unless overwritten by the
01042     color(s) of sub-clades.
01043 
01044     Color values must be integers from 0 to 255.
01045     """
01046 
01047     color_names = {
01048             'red':      (255,   0,   0),
01049             'r':        (255,   0,   0),
01050             'yellow':   (255, 255,   0),
01051             'y':        (255, 255,   0),
01052             'green':    (  0, 128,   0),
01053             'g':        (  0, 128,   0),
01054             'cyan':     (  0, 255, 255),
01055             'c':        (  0, 255, 255),
01056             'blue':     (  0,   0, 255),
01057             'b':        (  0,   0, 255),
01058             'magenta':  (255,   0, 255),
01059             'm':        (255,   0, 255),
01060             'black':    (  0,   0,   0),
01061             'k':        (  0,   0,   0),
01062             'white':    (255, 255, 255),
01063             'w':        (255, 255, 255),
01064             # Names standardized in HTML/CSS spec
01065             # http://w3schools.com/html/html_colornames.asp
01066             'maroon':   (128,   0,   0),
01067             'olive':    (128, 128,   0),
01068             'lime':     (  0, 255,   0),
01069             'aqua':     (  0, 255, 255),
01070             'teal':     (  0, 128, 128),
01071             'navy':     (  0,   0, 128),
01072             'fuchsia':  (255,   0, 255),
01073             'purple':   (128,   0, 128),
01074             'silver':   (192, 192, 192),
01075             'gray':     (128, 128, 128),
01076             # More definitions from matplotlib/gcolor2
01077             'grey':     (128, 128, 128),
01078             'pink':     (255, 192, 203),
01079             'salmon':   (250, 128, 114),
01080             'orange':   (255, 165,   0),
01081             'gold':     (255, 215,   0),
01082             'tan':      (210, 180, 140),
01083             'brown':    (165,  42,  42),
01084             }
01085 
01086     def __init__(self, red, green, blue):
01087         for color in (red, green, blue):
01088             assert (isinstance(color, int) and
01089                     0 <= color <= 255
01090                     ), "Color values must be integers between 0 and 255."
01091         self.red = red
01092         self.green = green
01093         self.blue = blue
01094 
01095     @classmethod
01096     def from_hex(cls, hexstr):
01097         """Construct a BranchColor object from a hexadecimal string.
01098 
01099         The string format is the same style used in HTML and CSS, such as
01100         '#FF8000' for an RGB value of (255, 128, 0).
01101         """
01102         assert (isinstance(hexstr, basestring) and
01103                 hexstr.startswith('#') and
01104                 len(hexstr) == 7
01105                 ), "need a 24-bit hexadecimal string, e.g. #000000"
01106         def unpack(cc):
01107             return int('0x'+cc, base=16)
01108         RGB = hexstr[1:3], hexstr[3:5], hexstr[5:]
01109         return cls(*map(unpack, RGB))
01110 
01111     @classmethod
01112     def from_name(cls, colorname):
01113         """Construct a BranchColor object by the color's name."""
01114         return cls(*cls.color_names[colorname])
01115 
01116     def to_hex(self):
01117         """Return a 24-bit hexadecimal RGB representation of this color.
01118 
01119         The returned string is suitable for use in HTML/CSS, as a color
01120         parameter in matplotlib, and perhaps other situations.
01121 
01122         Example:
01123 
01124             >>> bc = BranchColor(12, 200, 100)
01125             >>> bc.to_hex()
01126             '#0cc864'
01127         """
01128         return '#' + hex(
01129                 self.red * (16**4)
01130                 + self.green * (16**2)
01131                 + self.blue)[2:].zfill(6)
01132 
01133     def to_rgb(self):
01134         """Return a tuple of RGB values (0 to 255) representing this color.
01135 
01136         Example:
01137 
01138             >>> bc = BranchColor(255, 165, 0)
01139             >>> bc.to_rgb()
01140             (255, 165, 0)
01141         """
01142         return (self.red, self.green, self.blue)
01143 
01144     def __repr__(self):
01145         """Preserve the standard RGB order when representing this object."""
01146         return (u'%s(red=%d, green=%d, blue=%d)'
01147                 % (self.__class__.__name__, self.red, self.green, self.blue))
01148 
01149     def __str__(self):
01150         """Show the color's RGB values."""
01151         return "(%d, %d, %d)" % (self.red, self.green, self.blue)
01152