Back to index

python-biopython  1.60
PhyloXML.py
Go to the documentation of this file.
00001 # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Classes corresponding to phyloXML elements.
00007 
00008 See Also
00009 --------
00010 Official specification:
00011    http://phyloxml.org/ 
00012 Journal article:
00013     Han and Zmasek (2009), doi:10.1186/1471-2105-10-356
00014 """
00015 __docformat__ = "restructuredtext en"
00016 
00017 import re
00018 import warnings
00019 
00020 from Bio import Alphabet
00021 from Bio.Align import MultipleSeqAlignment
00022 from Bio.Seq import Seq
00023 from Bio.SeqFeature import SeqFeature, FeatureLocation
00024 from Bio.SeqRecord import SeqRecord
00025 from Bio import BiopythonWarning
00026 
00027 from Bio.Phylo import BaseTree
00028 
00029 
class PhyloXMLWarning(BiopythonWarning):
    """Warning category for data that does not comply with the phyloXML spec."""
00033 
00034 
def _check_str(text, testfunc):
    """Emit a PhyloXMLWarning when ``text`` fails the given test.

    ``testfunc`` is called with ``text`` and its result is interpreted as a
    boolean. A value of None for ``text`` is never tested and never warns.
    """
    if text is None or testfunc(text):
        return
    # stacklevel=2 attributes the warning to whoever called _check_str.
    warnings.warn("String %s doesn't match the given regexp" % text,
                  PhyloXMLWarning, stacklevel=2)
00040 
00041 
00042 # Core elements
00043 
class PhyloElement(BaseTree.TreeElement):
    """Common ancestor of every PhyloXML object defined in this module."""
00046 
00047 
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds any number of Phylogeny elements, optionally followed by elements
    from other namespaces.

    :Parameters:
        attributes : dict
            (XML namespace definitions); entries given here are merged over
            the built-in defaults
        phylogenies : list
            The phylogenetic trees
        other : list
            Arbitrary non-phyloXML elements, if any
    """
    def __init__(self, attributes, phylogenies=None, other=None):
        # Begin with the standard declarations, then let the caller's
        # entries override or extend them.
        merged = {
                "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", # standard
                "xmlns": "http://www.phyloxml.org",
                "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
                }
        if attributes:
            merged.update(attributes)
        self.attributes = merged
        self.phylogenies = phylogenies if phylogenies else []
        self.other = other if other else []

    def __getitem__(self, index):
        """Look up a phylogeny by integer position, slice, or tree name.

        Raises KeyError for any other kind of index, or when no tree carries
        the requested name.
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        # Name lookup: the first tree with a matching name wins.
        for candidate in self.phylogenies:
            if candidate.name == index:
                return candidate
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Yield the contained phylogenetic trees in order."""
        return iter(self.phylogenies)

    def __len__(self):
        """Count the phylogenetic trees held by this object."""
        return len(self.phylogenies)

    def __str__(self):
        tree_strings = [str(tree) for tree in self.phylogenies]
        return '%s([%s])' % (self.__class__.__name__,
                             ',\n'.join(tree_strings))
00096 
00097 
class Other(PhyloElement):
    """Holder for XML nodes in the tree that are not part of phyloXML.

    An instance normally carries either a text 'value' or a non-empty list of
    'children', not both -- but nothing here enforces that.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)
    """
    def __init__(self, tag, namespace=None, attributes=None, value=None,
            children=None):
        self.tag = tag
        self.namespace = namespace
        # Falsy arguments are swapped for a fresh empty container.
        self.attributes = attributes if attributes else {}
        self.value = value
        self.children = children if children else []

    def __iter__(self):
        """Yield each child node of this object (possibly none)."""
        return iter(self.children)
00128 
00129 
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        type : string
            stored unchanged on the ``type`` attribute
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)
    """
    def __init__(self, root=None, rooted=True,
            rerootable=None, branch_length_unit=None, type=None,
            # Child nodes
            name=None, id=None, description=None, date=None,
            # Collections
            confidences=None, clade_relations=None, sequence_relations=None,
            properties=None, other=None,
            ):
        # NOTE: this check is skipped when Python runs with -O.
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        # Collection arguments that are None (or empty) become fresh lists.
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        The root clade is converted with `Clade.from_clade`; ``rooted`` and
        ``name`` are copied, and a non-None ``tree.id`` is wrapped in an `Id`.

        Keyword arguments are named like the `Phylogeny` constructor
        parameters; they are written straight onto the new object's
        attributes after construction.
        """
        # Explicit branch instead of ``cond and x or None``, which would
        # silently yield None if the wrapped object ever evaluated as false.
        if tree.id is not None:
            tree_id = Id(str(tree.id))
        else:
            tree_id = None
        phy = cls(
                root=Clade.from_clade(tree.root),
                rooted=tree.rooted,
                name=tree.name,
                id=tree_id)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is first converted to a PhyloXML `Clade` and then wrapped
        via `Clade.to_phylogeny`. Keyword arguments are forwarded to that
        call, so they end up as attributes of the resulting `Phylogeny`.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        Keyword arguments are passed as the container's ``attributes`` dict.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Returns an empty `MultipleSeqAlignment` when no aligned sequence is
        found; otherwise the alphabet of the first aligned sequence is used
        for the whole alignment.
        """
        def is_aligned_seq(elem):
            # NOTE(review): assumes every Sequence has a non-None mol_seq;
            # confirm against the Sequence class.
            if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
                return True
            return False
        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        try:
            first_seq = seqs.next()
        except StopIteration:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        # ``seqs`` has already been advanced past the first match.
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: `Clade.confidence`, `Clade.taxonomy`
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        None clears the list; a float or int is wrapped in a `Confidence`.
        Raises ValueError for any other type, or when several confidence
        values are already present.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove every confidence value from this tree."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
00269 
00270 
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendents, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects
    """
    def __init__(self,
            # Attributes
            branch_length=None, id_source=None,
            # Child nodes
            name=None, width=None, color=None, node_id=None, events=None,
            binary_characters=None, date=None,
            # Collections
            confidences=None, taxonomies=None, sequences=None,
            distributions=None, references=None, properties=None, clades=None,
            other=None,
            ):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        # Collection arguments that are None (or empty) become fresh lists.
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Branch length, name, confidence, width and color are copied, and the
        sub-clades are converted recursively.

        Keyword arguments are named like the PhyloXML Clade constructor
        parameters; they are written straight onto the new object's
        attributes after the copy (top-level clade only).
        """
        new_clade = cls(branch_length=clade.branch_length,
                    name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        # Goes through the ``confidence`` property, which wraps plain numbers.
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        new_clade.color = (BranchColor(
                clade.color.red, clade.color.green, clade.color.blue)
                if clade.color else None)
        new_clade.__dict__.update(kwargs)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The clade becomes the root and its ``date`` is copied to the tree.
        Keyword arguments are written onto the new Phylogeny's attributes.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the sole confidence value, or None if there is none.

        Raises AttributeError when more than one value is stored.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        None clears the list; a float or int is wrapped in a `Confidence`.
        Raises ValueError for any other type, or when several confidence
        values are already present.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # Message names this class (it previously pointed at Phylogeny).
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove every confidence value from this clade."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the sole taxonomy, or None if there is none.

        Raises AttributeError when more than one taxonomy is stored.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy.

        Raises ValueError if ``value`` is not a `Taxonomy`, or when several
        taxonomies are already present.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # Message names this class (it previously pointed at Phylogeny,
            # which has no ``taxonomies`` attribute).
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)
00422 
00423 
00424 # PhyloXML wrapper for a special BaseTree attribute
00425 
class BranchColor(PhyloElement, BaseTree.BranchColor):
    """PhyloXML counterpart of `BaseTree.BranchColor`."""

    def __init__(self, *positional, **named):
        # Construction is handed to the BaseTree implementation unchanged.
        BaseTree.BranchColor.__init__(self, *positional, **named)
00429 
00430 
00431 # PhyloXML-specific complex types
00432 
class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: for ``UniProtKB:P17304`` the instance's ``value`` attribute is
    'P17304' and its ``source`` attribute is 'UniProtKB'.
    """
    def __init__(self, value, source):
        self.value = value
        self.source = source

    def __str__(self):
        """Render the accession as ``source:value``."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
00446 
00447 
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    Annotating through the optional 'ref' attribute is recommended.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        type : string
            stored unchanged on the ``type`` attribute
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link
    """
    # NOTE: used via .match(), so only the start of the string is anchored.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
            # Attributes
            ref=None, source=None, evidence=None, type=None,
            # Child nodes
            desc=None, confidence=None, uri=None,
            # Collection
            properties=None):
        # A non-conforming ref only triggers a warning; it is stored anyway.
        _check_str(ref, self.re_ref.match)
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        self.properties = properties if properties else []
00488 
00489 
class BinaryCharacters(PhyloElement):
    """Binary characters at the root of a clade.

    Records the names and/or counts of the characters that are present,
    gained, lost, and absent.
    """
    def __init__(self,
            # Attributes
            type=None, gained_count=None, lost_count=None, present_count=None,
            absent_count=None,
            # Child nodes (flattened into collections)
            gained=None, lost=None, present=None, absent=None):
        self.type = type
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Falsy collection arguments are swapped for fresh empty lists.
        self.gained = gained if gained else []
        self.lost = lost if lost else []
        self.present = present if present else []
        self.absent = absent if absent else []
00509 
00510 
00511 
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    For example, this could be used to describe multiple parents of a clade.

    :Parameters:
        type : str
            label for the kind of relationship
        id_ref_0 : str
            reference to the first clade
        id_ref_1 : str
            reference to the second clade
        distance : str
            distance between the two clades, if any
        confidence : Confidence
            support for this relationship, if any
    """
    def __init__(self, type, id_ref_0, id_ref_1,
            distance=None, confidence=None):
        # Attribute order is kept: distance first, then the required fields.
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
00531 
00532 
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    Instances compare, hash and do arithmetic by their ``value``; arithmetic
    returns plain numbers, not new Confidence objects.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'
    """
    def __init__(self, value, type='unknown'):
        self.value = value
        self.type = type

    # Comparison operators

    def __hash__(self):
        """Return the hash of the wrapped value.

        Equality is defined on ``value`` (see `__eq__`), so the hash must be
        too: objects that compare equal are required to hash equal. As with
        any value-hashed mutable object, do not change ``value`` while the
        instance is stored in a set or used as a dict key.
        """
        return hash(self.value)

    def __eq__(self, other):
        if isinstance(other, Confidence):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        if isinstance(other, Confidence):
            return self.value != other.value
        return self.value != other

    # Ordering -- see functools.total_ordering in Py2.7

    def __lt__(self, other):
        if isinstance(other, Confidence):
            return self.value < other.value
        return self.value < other

    def __le__(self, other):
        return self < other or self == other

    def __gt__(self, other):
        return not (self <= other)

    def __ge__(self, other):
        # Defined through __lt__ like the other orderings, so a Confidence
        # operand is unwrapped explicitly rather than via reflection.
        return not (self < other)

    # Arithmetic operators, including reverse

    def __add__(self, other):
        return self.value + other

    def __radd__(self, other):
        return other + self.value

    def __sub__(self, other):
        return self.value - other

    def __rsub__(self, other):
        return other - self.value

    def __mul__(self, other):
        return self.value * other

    def __rmul__(self, other):
        return other * self.value

    def __div__(self, other):
        """Classic division (Python 2 without ``__future__.division``).

        Uses the operator rather than ``value.__div__(other)``: calling the
        dunder directly returns NotImplemented for mixed int/float operands
        instead of coercing them, which surfaced as a TypeError.
        """
        return self.value / other

    def __rdiv__(self, other):
        return other / self.value

    def __truediv__(self, other):
        """Rational-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        return self.value / other

    def __rtruediv__(self, other):
        return other / self.value

    def __floordiv__(self, other):
        """Floor division (the ``//`` operator).

        Uses the operator rather than ``value.__floordiv__(other)`` so that
        mixed int/float operands are coerced instead of raising TypeError.
        """
        return self.value // other

    def __rfloordiv__(self, other):
        return other // self.value

    def __mod__(self, other):
        return self.value % other

    def __rmod__(self, other):
        return other % self.value

    def __divmod__(self, other):
        return divmod(self.value, other)

    def __rdivmod__(self, other):
        return divmod(other, self.value)

    def __pow__(self, other, modulo=None):
        if modulo is not None:
            return pow(self.value, other, modulo)
        return pow(self.value, other)

    def __rpow__(self, other):
        return pow(other, self.value)

    # Unary arithmetic operations: -, +, abs()

    def __neg__(self):
        return -self.value

    def __pos__(self):
        return self.value

    def __abs__(self):
        return abs(self.value)

    # Explicit coercion to numeric types: int, long, float

    def __float__(self):
        return float(self.value)

    def __int__(self):
        return int(self.value)

    def __long__(self):
        # Only ever invoked by Python 2's long(); ``long`` is a builtin there.
        return long(self.value)
00674 
00675 
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date may be numeric (the 'value' element), free text (the 'desc'
    element, e.g. 'Silurian'), or both. When a numeric value is given, using
    the 'unit' attribute as well is recommended.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value
    """
    def __init__(self, value=None, unit=None, desc=None,
            minimum=None, maximum=None):
        self.value = value
        self.unit = unit
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Return a human-readable form of the date.

        Preference order: "<value> <unit>" when a unit and a value are both
        set, then the description, then the bare class name.
        """
        if self.unit and self.value is not None:
            numeric = (self.value, self.unit)
            return '%s %s' % numeric
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
00710 
00711 
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Meant for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions
    """
    def __init__(self, desc=None, points=None, polygons=None):
        self.desc = desc
        # Falsy collection arguments are swapped for fresh empty lists.
        self.points = points if points else []
        self.polygons = polygons if polygons else []
00729 
00730 
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein
    """
    def __init__(self, length=None, domains=None):
        self.length = length
        # Default to an empty list, like every other collection attribute in
        # this module, so ``domains`` can always be iterated.
        self.domains = domains or []
00743 
00744 
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.

    Only instance attributes count as keys; methods and class-level
    attributes such as ``ok_type`` are not visible through the mapping
    interface.
    """
    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
                    'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
            losses=None, confidence=None):
        # An unrecognized type only triggers a warning; it is stored anyway.
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """Return (name, value) pairs for every attribute that is not None."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """Return the names of every attribute that is not None."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """Return every attribute value that is not None."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        return len(self.values())

    def __getitem__(self, key):
        """Return the value stored under ``key``.

        Raises KeyError if ``key`` is not an instance attribute or if its
        value is None.
        """
        # Look in the instance dict, the same place keys() reads from, so
        # that methods and class attributes are never returned as items.
        if key not in self.__dict__:
            raise KeyError(key)
        val = self.__dict__[key]
        if val is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return val

    def __setitem__(self, key, val):
        setattr(self, key, val)

    def __delitem__(self, key):
        # "Deleting" resets the attribute to None rather than removing it.
        setattr(self, key, None)

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, key):
        """True if ``key`` is an instance attribute whose value is not None."""
        return self.__dict__.get(key) is not None
00795 
00796 
class Id(PhyloElement):
    """A general-purpose identifier element.

    Stores the identifier's value and, optionally, the provider (or
    authority) that issued it, e.g. NCBI.
    """
    def __init__(self, value, provider=None):
        self.value = value
        self.provider = provider

    def __str__(self):
        """Return ``provider:value``, or just the value with no provider."""
        if self.provider is None:
            return self.value
        return '%s:%s' % (self.provider, self.value)
00811 
00812 
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)
    """
    # NOTE: used via .match(), so only the start of the string is anchored.
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # A non-conforming sequence only triggers a warning; it is stored
        # anyway.
        validator = self.re_value.match
        _check_str(value, validator)
        self.value = value
        self.is_aligned = is_aligned

    def __str__(self):
        """Return the raw sequence string."""
        return self.value
00832 
00833 
class Point(PhyloElement):
    """A geographic point: latitude, longitude and, optionally, altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'), e.g. 'WGS84' as
            used by Google's KML
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """
    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        # Reference system
        self.geodetic_datum = geodetic_datum
        # Horizontal position
        self.lat = lat
        self.long = long
        # Optional vertical position and its unit
        self.alt = alt
        self.alt_unit = alt_unit
00858 
00859 
class Polygon(PhyloElement):
    """A polygon given by its vertices (used by element 'Distribution').

    :param points: list of 3 or more 'Point' objects representing vertices.
    """
    def __init__(self, points=None):
        # Any falsy argument (None, empty list) becomes a fresh empty list.
        self.points = points if points else []

    def __str__(self):
        """Return the class name followed by the bracketed list of points."""
        vertices = ',\n'.join(str(pt) for pt in self.points)
        return '%s([%s])' % (self.__class__.__name__, vertices)
00871 
00872 
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which the property applies (e.g. 'node'
            for the parent node of a clade, 'parent_branch' for the parent
            branch of a clade, or just 'clade').
        datatype : string
            the type of the property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows attaching the property specifically to one element (on
            the xml-level)

    The ``ref``, ``applies_to``, ``datatype`` and ``unit`` arguments are
    checked on construction; a failed check only emits a warning.
    """
    # 'prefix:name' form shared by ``ref`` and ``unit``.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set([
        'phylogeny', 'clade', 'node', 'annotation', 'parent_branch', 'other',
        ])
    ok_datatype = set([
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time',
        'xsd:date', 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay',
        'xsd:gDay', 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary',
        'xsd:anyURI', 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger',
        ])

    def __init__(self, value, ref, applies_to, datatype,
            unit=None, id_ref=None):
        looks_like_ref = self.re_ref.match
        _check_str(ref, looks_like_ref)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, looks_like_ref)
        self.unit = unit
        self.id_ref = id_ref
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
00922 
00923 
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value : string
            the content of the domain element; the SeqFeature conversions
            below map it to and from ``SeqFeature.id``
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's ``id`` becomes ``value``, its non-fuzzy location
        bounds become ``start`` and ``end``, and the 'confidence' qualifier
        (None if absent) becomes ``confidence``.
        """
        # Build through cls so that subclasses get instances of themselves.
        return cls(feat.id,
                feat.location.nofuzzy_start,
                feat.location.nofuzzy_end,
                confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature from this domain.

        The feature spans ``start`` to ``end`` and takes ``value`` as its
        id; ``confidence`` is stored under the 'confidence' qualifier.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        # Guards only against the attribute having been removed; a None
        # confidence is still copied into the qualifiers.
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
00963 
00964 
class Reference(PhyloElement):
    """A literature reference attached to a clade.

    NB: Whenever possible, use the ``doi`` attribute instead of the free-text
    ``desc`` element.

    :Parameters:
        doi : string
            digital object identifier; checked against ``re_doi`` on
            construction, and a mismatch only emits a warning
        desc : string
            free-text description of the reference
    """
    # '<prefix>/<suffix>', each part made of letters, digits, '_' and '.'
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        looks_like_doi = self.re_doi.match
        _check_str(doi, looks_like_doi)
        self.doi = doi
        self.desc = desc
00977 
00978 
class Sequence(PhyloElement):
    """A molecular sequence (Protein, DNA, RNA) associated with a node.

    One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the
    taxonomy's ``id_source``) in case of multiple sequences and taxonomies per
    node.

    :Parameters:
        type : {'dna', 'rna', 'protein'}
            type of molecule this sequence represents
        id_ref : string
            reference to another resource
        id_source : string
            source for the reference
        symbol : string
            short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
        accession : Accession
            accession code for this sequence.
        name : string
            full name of the sequence, e.g. 'muscle Actin'
        location
            location of a sequence on a genome/chromosome.
        mol_seq : MolSeq
            the molecular sequence itself
        uri : Uri
            link
        annotations : list of Annotation objects
            annotations on this sequence
        domain_architecture : DomainArchitecture
            protein domains on this sequence
        other : list of Other objects
            non-phyloXML elements
    """
    # Maps the ``type`` attribute to the matching Biopython alphabet.
    alphabets = {'dna':     Alphabet.generic_dna,
                 'rna':     Alphabet.generic_rna,
                 'protein': Alphabet.generic_protein}
    re_symbol = re.compile(r'\S{1,10}')

    def __init__(self, 
            # Attributes
            type=None, id_ref=None, id_source=None,
            # Child nodes
            symbol=None, accession=None, name=None, location=None,
            mol_seq=None, uri=None, domain_architecture=None,
            # Collections
            annotations=None, other=None,
            ):
        # Both checks only warn on a mismatch; values are stored regardless.
        _check_str(type, self.alphabets.__contains__)
        _check_str(symbol, self.re_symbol.match)
        self.type = type
        self.id_ref = id_ref
        self.id_source = id_source
        self.symbol = symbol
        self.accession = accession
        self.name = name
        self.location = location
        self.mol_seq = mol_seq
        self.uri = uri
        self.domain_architecture = domain_architecture
        self.annotations = annotations or []
        self.other = other or []

    @classmethod
    def from_seqrecord(cls, record, is_aligned=None):
        """Create a new PhyloXML Sequence from a SeqRecord object.

        :Parameters:
            record : SeqRecord
                the record to convert
            is_aligned : bool
                whether the sequence is aligned; if None, it is taken to be
                aligned exactly when the record's alphabet is Gapped
        """
        if is_aligned is None:
            is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
        params = {
                'accession': Accession(record.id, ''),
                'symbol': record.name,
                'name': record.description,
                'mol_seq': MolSeq(str(record.seq), is_aligned),
                }
        # Leave 'type' unset when the alphabet is none of the known kinds.
        if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
            params['type'] = 'dna'
        elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
            params['type'] = 'rna'
        elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
            params['type'] = 'protein'

        # Unpack record.annotations
        for key in ('id_ref', 'id_source', 'location'):
            if key in record.annotations:
                params[key] = record.annotations[key]
        if isinstance(record.annotations.get('uri'), dict):
            params['uri'] = Uri(**record.annotations['uri'])
        # Build a Sequence.annotation object
        if record.annotations.get('annotations'):
            params['annotations'] = []
            for annot in record.annotations['annotations']:
                ann_args = {}
                for key in ('ref', 'source', 'evidence', 'type', 'desc'):
                    if key in annot:
                        ann_args[key] = annot[key]
                if isinstance(annot.get('confidence'), list):
                    ann_args['confidence'] = Confidence(
                                        *annot['confidence'])
                if isinstance(annot.get('properties'), list):
                    ann_args['properties'] = [Property(**prop)
                                        for prop in annot['properties']
                                        if isinstance(prop, dict)]
                params['annotations'].append(Annotation(**ann_args))

        # Unpack record.features
        if record.features:
            params['domain_architecture'] = DomainArchitecture(
                    length=len(record.seq),
                    domains=[ProteinDomain.from_seqfeature(feat)
                             for feat in record.features])

        # Build through cls so that subclasses get instances of themselves.
        return cls(**params)

    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.
        
        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }

        Entries whose value is None are left out of every dictionary above.
        Likewise the ``id``, ``name`` and ``description`` arguments are only
        passed to SeqRecord when the corresponding attribute is set.
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            # items() works on both Python 2 and 3 (iteritems() does not).
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        # str(None) would give the record the literal id 'None'; pass None
        # instead so clean_dict drops it and SeqRecord applies its default.
        if self.accession is not None:
            record_id = str(self.accession)
        else:
            record_id = None

        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                           **clean_dict({
                               'id':    record_id,
                               'name':  self.symbol,
                               'description': self.name,
                               # 'dbxrefs': None,
                               }))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
                'id_ref':       self.id_ref,
                'id_source':    self.id_source,
                'location':     self.location,
                'uri':          self.uri and clean_dict({
                                    'value': self.uri.value,
                                    'desc': self.uri.desc,
                                    'type': self.uri.type,
                                    }),
                'annotations':  self.annotations and [
                    clean_dict({
                        'ref':          ann.ref,
                        'source':       ann.source,
                        'evidence':     ann.evidence,
                        'type':         ann.type,
                        'confidence':   ann.confidence and [
                                            ann.confidence.value,
                                            ann.confidence.type],
                        'properties':   [clean_dict({
                                            'value':      prop.value,
                                            'ref':        prop.ref,
                                            'applies_to': prop.applies_to,
                                            'datatype':   prop.datatype,
                                            'unit':       prop.unit,
                                            'id_ref':     prop.id_ref })
                                         for prop in ann.properties],
                        }) for ann in self.annotations],
                })
        return seqrec

    def get_alphabet(self):
        """Return the alphabet matching this sequence's ``type``.

        Falls back to the generic alphabet for an unknown or unset type, and
        wraps the result in ``Alphabet.Gapped`` when ``mol_seq`` is marked as
        aligned.
        """
        alph = self.alphabets.get(self.type, Alphabet.generic_alphabet)
        if self.mol_seq and self.mol_seq.is_aligned:
            return Alphabet.Gapped(alph)
        return alph
01171 
01172 
class SequenceRelation(PhyloElement):
    """A typed relationship between two sequences.

    For example, an orthology is expressed with the 'type' attribute set to
    'orthology'.

    :Parameters:
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        type : restricted string
            describe the type of relationship; a value outside ``ok_type``
            only emits a warning
        confidence : Confidence
            confidence value for this relation
    """
    ok_type = set([
        'orthology', 'one_to_one_orthology', 'super_orthology',
        'paralogy', 'ultra_paralogy', 'xenology', 'unknown', 'other',
        ])

    def __init__(self, type, id_ref_0, id_ref_1,
            distance=None, confidence=None):
        is_known_type = self.ok_type.__contains__
        _check_str(type, is_known_type)
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
01202 
01203 
class Taxonomy(PhyloElement):
    """Taxonomic information for a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            keep the authority, such as 'J. G. Cooper, 1863', associated with
            the 'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements

    ``code`` and ``rank`` are checked on construction; a failed check only
    emits a warning.
    """
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
    ok_rank = set([
        'domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass',
        'superlegion', 'legion', 'sublegion', 'infralegion',
        'supercohort', 'cohort', 'subcohort', 'infracohort',
        'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily',
        'supertribe', 'tribe', 'subtribe', 'infratribe',
        'genus', 'subgenus', 'superspecies', 'species', 'subspecies',
        'variety', 'subvariety', 'form', 'subform', 'cultivar',
        'unknown', 'other',
        ])

    def __init__(self, 
            # Attributes
            id_source=None,
            # Child nodes
            id=None, code=None, scientific_name=None, authority=None,
            rank=None, uri=None,
            # Collections
            common_names=None, synonyms=None, other=None,
            ):
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        # Falsy collection arguments are replaced by fresh empty lists.
        self.common_names = common_names if common_names else []
        self.synonyms = synonyms if synonyms else []
        self.other = other if other else []

    def __str__(self):
        """Return the most specific identifying attribute that is set.

        Tried in order: code, scientific name, rank, then the id converted
        to a string; the class name is the last resort.
        """
        for label in (self.code, self.scientific_name, self.rank):
            if label is not None:
                return label
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
01278 
01279 
class Uri(PhyloElement):
    """A uniform resource identifier.

    In general, this is expected to be an URL (for example, to link to an image
    on a website, in which case the ``type`` attribute might be 'image' and
    ``desc`` might be 'image of a California sea hare').

    :Parameters:
        value
            the identifier itself
        desc
            description of the linked resource
        type
            kind of resource the identifier points to
    """
    def __init__(self, value, desc=None, type=None):
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return the value, or the repr of this object if the value is empty."""
        return self.value if self.value else repr(self)
01295         return repr(self)