Back to index

python-biopython  1.60
NewickIO.py
Go to the documentation of this file.
00001 # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com)
00002 # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox.
00003 # All rights reserved.
00004 # This code is part of the Biopython distribution and governed by its
00005 # license. Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 
00008 """I/O function wrappers for the Newick file format.
00009 
00010 See: http://evolution.genetics.washington.edu/phylip/newick_doc.html
00011 """
00012 __docformat__ = "restructuredtext en"
00013 
00014 import warnings
00015 
00016 from cStringIO import StringIO
00017 
00018 from Bio import BiopythonDeprecationWarning
00019 from Bio.Phylo import Newick
00020 
# Delimiters of a node comment embedded in a Newick node tag: the comment
# text sits between NODECOMMENT_START and the following NODECOMMENT_END.
# Definitions retrieved from Bio.Nexus.Trees
NODECOMMENT_START = '[&'
NODECOMMENT_END = ']'
00024 
00025 
class NewickError(Exception):
    """Signals that a Newick object could not be constructed.

    Raised by the parser in this module when the tree text is malformed.
    """
00029 
00030 
00031 # ---------------------------------------------------------
00032 # Public API
00033 
def parse(handle, **kwargs):
    """Lazily read every tree from a Newick file handle.

    Any extra keyword arguments are forwarded to `Parser.parse`.

    :returns: generator of Bio.Phylo.Newick.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
00040 
def write(trees, handle, plain=False, **kwargs):
    """Write trees in Newick format to the given file handle.

    ``plain`` and any extra keyword arguments are forwarded to
    `Writer.write`.

    :returns: number of trees written.
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
00047 
00048 
00049 # ---------------------------------------------------------
00050 # Input
00051 
class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in `Bio.Nexus.Trees`.
    """

    def __init__(self, handle):
        """Remember the handle to read Newick text from.

        :param handle: iterable yielding lines of text (e.g. an open file).
        """
        self.handle = handle
        # Defaults so the private helpers are usable before parse() has
        # started iterating; parse() overwrites both.
        self.values_are_confidence = False
        self.rooted = False

    @classmethod
    def from_string(cls, treetext):
        """Create a parser that reads from the string ``treetext``."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, values_are_confidence=False, rooted=False):
        """Parse the text stream this object was initialized with.

        Lines are joined (after stripping trailing whitespace) until one
        ends with ';', which terminates a tree.

        :param values_are_confidence: if true, a single numeric value in a
            node tag is stored as the clade's confidence rather than its
            branch length.
        :param rooted: passed on as the ``rooted`` argument of each Tree.
        :returns: generator of Newick.Tree objects.
        """
        self.values_are_confidence = values_are_confidence
        self.rooted = rooted    # XXX this attribute is useless
        pieces = []
        for line in self.handle:
            stripped = line.rstrip()
            if not stripped:
                # Blank lines contribute nothing to the tree text
                continue
            pieces.append(stripped)
            if stripped.endswith(';'):
                yield self._parse_tree(''.join(pieces), rooted)
                pieces = []
        if pieces:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(''.join(pieces), rooted)

    def _parse_tree(self, text, rooted):
        """Parse the text representation into a Tree object.

        :param text: Newick text of a single tree.
        :param rooted: passed as the ``rooted`` argument of Newick.Tree.
        """
        # XXX Pass **kwargs along from Parser.parse?
        return Newick.Tree(root=self._parse_subtree(text), rooted=rooted)

    def _parse_subtree(self, text):
        """Parse ``(a,b,c...)[[[xx]:]yy]`` into subcomponents, recursively.

        :returns: Clade for this (sub)tree, with its child clades attached.
        :raises NewickError: if the numbers of '(' and ')' differ.
        """
        text = text.strip().rstrip(';')
        if text.count('(') != text.count(')'):
            raise NewickError("Parentheses do not match in (sub)tree: " + text)
        # Text is now "(...)..." (balanced parens) or "..." (leaf node)
        if '(' not in text:
            # Leaf/terminal node -- recursion stops here
            return self._parse_tag(text)
        # Handle one layer of the nested subtree
        # XXX what if there's a paren in a comment or other string?
        close_posn = text.rfind(')')
        subtrees = []
        # Locate subtrees by counting nesting levels of parens; position 0
        # is taken to be the opening paren of this layer.
        plevel = 0
        prev = 1
        for posn in range(1, close_posn):
            char = text[posn]
            if char == '(':
                plevel += 1
            elif char == ')':
                plevel -= 1
            elif char == ',' and plevel == 0:
                subtrees.append(text[prev:posn])
                prev = posn + 1
        subtrees.append(text[prev:close_posn])
        # Construct a new clade from trailing text, then attach subclades
        clade = self._parse_tag(text[close_posn+1:])
        clade.clades = [self._parse_subtree(st) for st in subtrees]
        return clade

    def _parse_tag(self, text):
        """Extract the data for a node from text.

        The tag may hold a ``[&...]`` comment plus colon-separated parts;
        parts that convert to float are numeric values, the remaining part
        is the node name.

        :returns: Clade instance containing any available data
        :raises NewickError: if a comment is not closed, if the tag holds
            more than one non-numeric part, or more than two numeric values.
        """
        # Extract the comment
        comment = None
        comment_start = text.find(NODECOMMENT_START)
        if comment_start != -1:
            body_start = comment_start + len(NODECOMMENT_START)
            # Search only after the opening marker, so an unrelated ']'
            # earlier in the tag cannot be mistaken for the terminator.
            comment_end = text.find(NODECOMMENT_END, body_start)
            if comment_end == -1:
                raise NewickError('Error in tree description: '
                                  'Found %s without matching %s'
                                  % (NODECOMMENT_START, NODECOMMENT_END))
            comment = text[body_start:comment_end]
            text = (text[:comment_start]
                    + text[comment_end+len(NODECOMMENT_END):])
        clade = Newick.Clade(comment=comment)
        # Extract name (taxon), and optionally support, branch length
        # Float values are support and branch length, the string is name/taxon
        values = []
        for part in (t.strip() for t in text.split(':')):
            if not part:
                continue
            try:
                number = float(part)
            except ValueError:
                number = None
            if number is not None:
                values.append(number)
            elif clade.name is None:
                clade.name = part
            else:
                # Raised explicitly (not asserted) so the check survives -O
                raise NewickError("Two string taxonomies? in tag: " + text)
        if len(values) == 1:
            # Real branch length, or support as branch length
            if self.values_are_confidence:
                clade.confidence = values[0]
            else:
                clade.branch_length = values[0]
        elif len(values) == 2:
            # Two non-taxon values: support comes first. (Is that always so?)
            clade.confidence, clade.branch_length = values
        elif len(values) > 2:
            raise NewickError("Too many colons in tag: " + text)
        return clade
00155 
00156 
00157 # ---------------------------------------------------------
00158 # Output
00159 
class Writer(object):
    """Serialize trees as Newick text.

    Based on the writer in Bio.Nexus.Trees (str, to_string).
    """

    def __init__(self, trees):
        """Keep the iterable of trees to be written."""
        self.trees = trees

    def write(self, handle, **kwargs):
        """Write this instance's trees to a file handle, one per line.

        Keyword arguments are forwarded to `to_strings`.

        :returns: number of trees written.
        """
        written = 0
        for line in self.to_strings(**kwargs):
            handle.write(line + '\n')
            written += 1
        return written

    def to_strings(self, confidence_as_branch_length=False,
            branch_length_only=False, plain=False,
            plain_newick=True, ladderize=None, max_confidence=1.0,
            format_confidence='%1.2f', format_branch_length='%1.5f'):
        """Return an iterable of PAUP-compatible tree lines.

        :param confidence_as_branch_length: write each clade's confidence in
            the branch-length position (terminals get ``max_confidence``).
        :param branch_length_only: write branch lengths and no confidences.
        :param plain: write no node values at all; ignored when either of
            the two options above is set.
        :param plain_newick: if true, yield the bare Newick string; otherwise
            prefix it with ``tree <name> =`` and weight/rooted markers.
        :param ladderize: 'left'/'LEFT' or 'right'/'RIGHT' to call
            ``tree.ladderize`` on each tree before writing (this modifies
            the tree objects); any other value leaves the trees untouched.
        :param format_confidence: %-format string for confidence values.
        :param format_branch_length: %-format string for branch lengths.
        """
        # The branch-length options win over a conflicting plain=True
        plain = plain and not (confidence_as_branch_length
                               or branch_length_only)
        node_tag = self._info_factory(plain,
                confidence_as_branch_length, branch_length_only,
                max_confidence, format_confidence, format_branch_length)

        def render(node):
            """Build the Newick text of ``node`` and everything below it."""
            if node.is_terminal():
                return (node.name or '') + node_tag(node, terminal=True)
            children = ','.join(render(child) for child in node)
            return '(' + children + ')' + (node.name or '') + node_tag(node)

        # Nexus compatibility shim, kind of: decide once how to ladderize
        to_right = ladderize in ('right', 'RIGHT')
        reorder = to_right or ladderize in ('left', 'LEFT')

        for tree in self.trees:
            if reorder:
                tree.ladderize(reverse=to_right)
            newick = render(tree.root) + ';'
            if plain_newick:
                yield newick
            else:
                # Nexus-style (?) notation before the raw Newick tree
                fields = ['tree', (tree.name or 'a_tree'), '=']
                if tree.weight != 1:
                    fields.append('[&W%s]' % round(float(tree.weight), 3))
                if tree.rooted:
                    fields.append('[&R]')
                fields.append(newick)
                yield ' '.join(fields)

    def _info_factory(self, plain, confidence_as_branch_length,
            branch_length_only, max_confidence, format_confidence,
            format_branch_length):
        """Return a function that creates a nicely formatted node tag.

        The returned callable takes ``(clade, terminal=False)`` and returns
        the text to append after the node's name. The first true flag among
        ``plain``, ``confidence_as_branch_length`` and ``branch_length_only``
        selects the variant; with none set, both confidence and branch
        length are written.
        """
        if plain:
            # Plain tree only: nodes carry no values at all
            def no_info(clade, terminal=False):
                return ''
            return no_info

        if confidence_as_branch_length:
            # Support as branchlengths (eg. PAUP), ignore actual branchlengths
            def support_as_length(clade, terminal=False):
                # terminal branches have 100% support
                support = max_confidence if terminal else clade.confidence
                return ':' + (format_confidence % support)
            return support_as_length

        if branch_length_only:
            # Branch lengths only, support is ignored
            def length_only(clade, terminal=False):
                return ':' + (format_branch_length % clade.branch_length)
            return length_only

        # Support and branchlengths (e.g. .con tree of mrbayes)
        def support_and_length(clade, terminal=False):
            length = clade.branch_length or 0.0
            # Terminals, and clades without a confidence, get the length only
            support = None if terminal else getattr(clade, 'confidence', None)
            if support is None:
                return (':' + format_branch_length) % length
            return ((format_confidence + ':' + format_branch_length)
                    % (support, length))
        return support_and_length
00249