Back to index

python-biopython  1.60
SeqFeature.py
Go to the documentation of this file.
00001 # Copyright 2000-2003 Jeff Chang.
00002 # Copyright 2001-2008 Brad Chapman.
00003 # Copyright 2005-2011 by Peter Cock.
00004 # Copyright 2006-2009 Michiel de Hoon.
00005 # All rights reserved.
00006 # This code is part of the Biopython distribution and governed by its
00007 # license.  Please see the LICENSE file that should have been included
00008 # as part of this package.
00009 """Represent a Sequence Feature holding info about a part of a sequence.
00010 
00011 This is heavily modeled after the Biocorba SeqFeature objects, and
00012 may be pretty biased towards GenBank stuff since I'm writing it
00013 for the GenBank parser output...
00014 
00015 What's here:
00016 
00017 Base class to hold a Feature.
00018 ----------------------------
00019 classes:
00020 o SeqFeature
00021 
00022 Hold information about a Reference.
00023 ----------------------------------
00024 
00025 This is an attempt to create a General class to hold Reference type
00026 information.
00027 
00028 classes:
00029 o Reference
00030 
00031 Specify locations of a feature on a Sequence.
00032 ---------------------------------------------
00033 
00034 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
00035 much the same way as Biocorba. This has the advantages of allowing us
00036 to handle fuzzy stuff in case anyone needs it, and also be compatible
00037 with Biocorba.
00038 
00039 classes:
00040 o FeatureLocation - Specify the start and end location of a feature.
00041 
00042 o ExactPosition - Specify the position as being exact.
00043 o WithinPosition - Specify a position occuring within some range.
00044 o BetweenPosition - Specify a position occuring between a range (OBSOLETE?).
00045 o BeforePosition - Specify the position as being found before some base.
00046 o AfterPosition - Specify the position as being found after some base.
00047 o OneOfPosition - Specify a position where the location can be multiple positions.
00048 """
00049 
00050 from Bio.Seq import MutableSeq, reverse_complement
00051 
00052 class SeqFeature(object):
00053     """Represent a Sequence Feature on an object.
00054 
00055     Attributes:
00056     o location - the location of the feature on the sequence (FeatureLocation)
00057     o type - the specified type of the feature (ie. CDS, exon, repeat...)
00058     o location_operator - a string specifying how this SeqFeature may
00059     be related to others. For example, in the example GenBank feature
00060     shown below, the location_operator would be "join"
00061     o strand - A value specifying on which strand (of a DNA sequence, for
00062     instance) the feature deals with. 1 indicates the plus strand, -1 
00063     indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
00064     while the default of None indicates that strand doesn't apply (dot in GFF3,
00065     e.g. features on proteins). Note this is a shortcut for accessing the
00066     strand property of the feature's location.
00067     o id - A string identifier for the feature.
00068     o ref - A reference to another sequence. This could be an accession
00069     number for some different sequence. Note this is a shortcut for the
00070     reference property of the feature's location.
00071     o ref_db - A different database for the reference accession number.
00072     Note this is a shortcut for the reference property of the location
00073     o qualifiers - A dictionary of qualifiers on the feature. These are
00074     analagous to the qualifiers from a GenBank feature table. The keys of
00075     the dictionary are qualifier names, the values are the qualifier
00076     values.
00077     o sub_features - Additional SeqFeatures which fall under this 'parent'
00078     feature. For instance, if we having something like:
00079 
00080     CDS    join(1..10,30..40,50..60)
00081 
00082     Then the top level feature would be of type 'CDS' from 1 to 60 (actually 0
00083     to 60 in Python counting) with location_operator='join', and the three sub-
00084     features would also be of type 'CDS', and would be from 1 to 10, 30 to
00085     40 and 50 to 60, respectively (although actually using Python counting).
00086 
00087     To get the nucleotide sequence for this CDS, you would need to take the
00088     parent sequence and do seq[0:10]+seq[29:40]+seq[49:60] (Python counting).
00089     Things are more complicated with strands and fuzzy positions. To save you
00090     dealing with all these special cases, the SeqFeature provides an extract
00091     method to do this for you.
00092     """
00093     def __init__(self, location = None, type = '', location_operator = '',
00094                  strand = None, id = "<unknown id>", 
00095                  qualifiers = None, sub_features = None,
00096                  ref = None, ref_db = None):
00097         """Initialize a SeqFeature on a Sequence.
00098 
00099         location can either be a FeatureLocation (with strand argument also
00100         given if required), or None.
00101 
00102         e.g. With no strand, on the forward strand, and on the reverse strand:
00103 
00104         >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
00105         >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
00106         >>> f1.strand == f1.location.strand == None
00107         True
00108         >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
00109         >>> f2.strand == f2.location.strand == +1
00110         True
00111         >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
00112         >>> f3.strand == f3.location.strand == -1
00113         True
00114 
00115         An invalid strand will trigger an exception:
00116 
00117         >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
00118         Traceback (most recent call last):
00119            ...
00120         ValueError: Strand should be +1, -1, 0 or None, not 2
00121 
00122         Similarly if set via the FeatureLocation directly:
00123 
00124         >>> loc4 = FeatureLocation(50, 60, strand=2)
00125         Traceback (most recent call last):
00126            ...
00127         ValueError: Strand should be +1, -1, 0 or None, not 2
00128 
00129         For exact start/end positions, an integer can be used (as shown above)
00130         as shorthand for the ExactPosition object. For non-exact locations, the
00131         FeatureLocation must be specified via the appropriate position objects.
00132         """
00133         if location is not None and not isinstance(location, FeatureLocation):
00134             raise TypeError("FeatureLocation (or None) required for the location")
00135         self.location = location
00136 
00137         self.type = type
00138         self.location_operator = location_operator
00139         if strand is not None:
00140             self.strand = strand
00141         self.id = id
00142         if qualifiers is None:
00143             qualifiers = {}
00144         self.qualifiers = qualifiers
00145         if sub_features is None:
00146             sub_features = []
00147         self.sub_features = sub_features
00148         if ref is not None:
00149             self.ref = ref
00150         if ref_db is not None:
00151             self.ref_db = ref_db
00152 
00153     def _get_strand(self):
00154         return self.location.strand
00155     def _set_strand(self, value):
00156         try:
00157             self.location.strand = value
00158         except AttributeError:
00159             if self.location is None:
00160                 if value is not None:
00161                     raise ValueError("Can't set strand without a location.")
00162             else:
00163                 raise
00164     strand = property(fget = _get_strand, fset = _set_strand,
00165                       doc = """Feature's strand
00166 
00167                             This is a shortcut for feature.location.strand
00168                             """)
00169 
00170     def _get_ref(self):
00171         return self.location.ref
00172     def _set_ref(self, value):
00173         try:
00174             self.location.ref = value
00175         except AttributeError:
00176             if self.location is None:
00177                 if value is not None:
00178                     raise ValueError("Can't set ref without a location.")
00179             else:
00180                 raise
00181     ref = property(fget = _get_ref, fset = _set_ref,
00182                    doc = """Feature location reference (e.g. accession).
00183 
00184                          This is a shortcut for feature.location.ref
00185                          """)
00186 
00187     def _get_ref_db(self):
00188         return self.location.ref_db
00189     def _set_ref_db(self, value):
00190         self.location.ref_db = value
00191     ref_db = property(fget = _get_ref_db, fset = _set_ref_db,
00192                       doc = """Feature location reference's database.
00193 
00194                             This is a shortcut for feature.location.ref_db
00195                             """)
00196 
00197     def __repr__(self):
00198         """A string representation of the record for debugging."""
00199         answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
00200         if self.type:
00201             answer += ", type=%s" % repr(self.type)
00202         if self.location_operator:
00203             answer += ", location_operator=%s" % repr(self.location_operator)
00204         if self.id and self.id != "<unknown id>":
00205             answer += ", id=%s" % repr(self.id)
00206         if self.ref:
00207             answer += ", ref=%s" % repr(self.ref)
00208         if self.ref_db:
00209             answer += ", ref_db=%s" % repr(self.ref_db)
00210         answer += ")"
00211         return answer
00212 
00213     def __str__(self):
00214         """A readable summary of the feature intended to be printed to screen.
00215         """
00216         out = "type: %s\n" % self.type
00217         out += "location: %s\n" % self.location
00218         if self.id and self.id != "<unknown id>":
00219             out += "id: %s\n" % self.id
00220         out += "qualifiers: \n"
00221         for qual_key in sorted(self.qualifiers):
00222             out += "    Key: %s, Value: %s\n" % (qual_key,
00223                                                self.qualifiers[qual_key])
00224         if len(self.sub_features) != 0:
00225             out += "Sub-Features\n"
00226             for sub_feature in self.sub_features:
00227                 out +="%s\n" % sub_feature
00228         return out
00229 
00230     def _shift(self, offset):
00231         """Returns a copy of the feature with its location shifted (PRIVATE).
00232 
00233         The annotation qaulifiers are copied."""
00234         return SeqFeature(location = self.location._shift(offset),
00235             type = self.type,
00236             location_operator = self.location_operator,
00237             id = self.id,
00238             qualifiers = dict(self.qualifiers.iteritems()),
00239             sub_features = [f._shift(offset) for f in self.sub_features])
00240 
00241     def _flip(self, length):
00242         """Returns a copy of the feature with its location flipped (PRIVATE).
00243         
00244         The argument length gives the length of the parent sequence. For
00245         example a location 0..20 (+1 strand) with parent length 30 becomes
00246         after flipping 10..30 (-1 strand). Strandless (None) or unknown
00247         strand (0) remain like that - just their end points are changed.
00248 
00249         The annotation qaulifiers are copied.
00250         """
00251         return SeqFeature(location = self.location._flip(length),
00252             type = self.type,
00253             location_operator = self.location_operator,
00254             id = self.id,
00255             qualifiers = dict(self.qualifiers.iteritems()),
00256             sub_features = [f._flip(length) for f in self.sub_features[::-1]])
00257 
00258     def extract(self, parent_sequence):
00259         """Extract feature sequence from the supplied parent sequence.
00260 
00261         The parent_sequence can be a Seq like object or a string, and will
00262         generally return an object of the same type. The exception to this is
00263         a MutableSeq as the parent sequence will return a Seq object.
00264 
00265         This should cope with complex locations including complements, joins
00266         and fuzzy positions. Even mixed strand features should work! This
00267         also covers features on protein sequences (e.g. domains), although
00268         here reverse strand features are not permitted.
00269 
00270         >>> from Bio.Seq import Seq
00271         >>> from Bio.Alphabet import generic_protein
00272         >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
00273         >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
00274         >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
00275         >>> f.extract(seq)
00276         Seq('VALIVIC', ProteinAlphabet())
00277 
00278         Note - currently only sub-features of type "join" are supported.
00279         """
00280         if isinstance(parent_sequence, MutableSeq):
00281             #This avoids complications with reverse complements
00282             #(the MutableSeq reverse complement acts in situ)
00283             parent_sequence = parent_sequence.toseq()
00284         if self.sub_features:
00285             if self.location_operator!="join":
00286                 raise ValueError(self.location_operator)
00287             if self.location.strand == -1:
00288                 #This is a special case given how the GenBank parser works.
00289                 #Must avoid doing the reverse complement twice.
00290                 parts = []
00291                 for f_sub in self.sub_features[::-1]:
00292                     assert f_sub.location.strand==-1
00293                     parts.append(f_sub.location.extract(parent_sequence))
00294             else:
00295                 #This copes with mixed strand features:
00296                 parts = [f_sub.location.extract(parent_sequence) \
00297                          for f_sub in self.sub_features]
00298             #We use addition rather than a join to avoid alphabet issues:
00299             f_seq = parts[0]
00300             for part in parts[1:] : f_seq += part
00301             return f_seq
00302         else:
00303             return self.location.extract(parent_sequence)
00304     
00305     def __nonzero__(self):
00306         """Returns True regardless of the length of the feature.
00307 
00308         This behaviour is for backwards compatibility, since until the
00309         __len__ method was added, a SeqFeature always evaluated as True.
00310 
00311         Note that in comparison, Seq objects, strings, lists, etc, will all
00312         evaluate to False if they have length zero.
00313 
00314         WARNING: The SeqFeature may in future evaluate to False when its
00315         length is zero (in order to better match normal python behaviour)!
00316         """
00317         return True
00318 
00319     def __len__(self):
00320         """Returns the length of the region described by a feature.
00321 
00322         >>> from Bio.Seq import Seq
00323         >>> from Bio.Alphabet import generic_protein
00324         >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
00325         >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
00326         >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
00327         >>> len(f)
00328         7
00329         >>> f.extract(seq)
00330         Seq('VALIVIC', ProteinAlphabet())
00331         >>> len(f.extract(seq))
00332         7
00333 
00334         For simple features without subfeatures this is the same as the region
00335         spanned (end position minus start position). However, for a feature
00336         defined by combining several subfeatures (e.g. a CDS as the join of
00337         several exons) the gaps are not counted (e.g. introns). This ensures
00338         that len(f) == len(f.extract(parent_seq)), and also makes sure things
00339         work properly with features wrapping the origin etc.
00340         """
00341         if self.sub_features:
00342             return sum(len(f) for f in self.sub_features)
00343         else:
00344             return len(self.location)
00345 
00346     def __iter__(self):
00347         """Iterate over the parent positions within the feature.
00348 
00349         The iteration order is strand aware, and can be thought of as moving
00350         along the feature using the parent sequence coordinates:
00351 
00352         >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
00353         >>> f = SeqFeature(FeatureLocation(5,10), type="domain", strand=-1)
00354         >>> len(f)
00355         5
00356         >>> for i in f: print i
00357         9
00358         8
00359         7
00360         6
00361         5
00362         >>> list(f)
00363         [9, 8, 7, 6, 5]
00364         """
00365         if self.sub_features:
00366             if self.strand == -1:
00367                 for f in self.sub_features[::-1]:
00368                     for i in f.location:
00369                         yield i
00370             else:
00371                 for f in self.sub_features:
00372                     for i in f.location:
00373                         yield i
00374         else:
00375             for i in self.location:
00376                 yield i
00377 
00378     def __contains__(self, value):
00379         """Check if an integer position is within the feature.
00380 
00381         >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
00382         >>> f = SeqFeature(FeatureLocation(5,10), type="domain", strand=-1)
00383         >>> len(f)
00384         5
00385         >>> [i for i in range(15) if i in f]
00386         [5, 6, 7, 8, 9]
00387 
00388         For example, to see which features include a SNP position, you could
00389         use this:
00390 
00391         >>> from Bio import SeqIO
00392         >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
00393         >>> for f in record.features:
00394         ...     if 1750 in f:
00395         ...         print f.type, f.location
00396         source [0:154478](+)
00397         gene [1716:4347](-)
00398         tRNA [1716:4347](-)
00399 
00400         Note that for a feature defined as a join of several subfeatures (e.g.
00401         the union of several exons) the gaps are not checked (e.g. introns).
00402         In this example, the tRNA location is defined in the GenBank file as
00403         complement(join(1717..1751,4311..4347)), so that position 1760 falls
00404         in the gap:
00405 
00406         >>> for f in record.features:
00407         ...     if 1760 in f:
00408         ...         print f.type, f.location
00409         source [0:154478](+)
00410         gene [1716:4347](-)
00411 
00412         Note that additional care may be required with fuzzy locations, for
00413         example just before a BeforePosition:
00414 
00415         >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
00416         >>> from Bio.SeqFeature import BeforePosition
00417         >>> f = SeqFeature(FeatureLocation(BeforePosition(3),8), type="domain")
00418         >>> len(f)
00419         5
00420         >>> [i for i in range(10) if i in f]
00421         [3, 4, 5, 6, 7]
00422         """
00423         if not isinstance(value, int):
00424             raise ValueError("Currently we only support checking for integer "
00425                              "positions being within a SeqFeature.")
00426         if self.sub_features:
00427             for f in self.sub_features:
00428                 if value in f:
00429                     return True
00430             return False
00431         else:
00432             return value in self.location
00433 
00434 # --- References
00435 
00436 # TODO -- Will this hold PubMed and Medline information decently?
00437 class Reference(object):
00438     """Represent a Generic Reference object.
00439 
00440     Attributes:
00441     o location - A list of Location objects specifying regions of
00442     the sequence that the references correspond to. If no locations are
00443     specified, the entire sequence is assumed.
00444     o authors - A big old string, or a list split by author, of authors
00445     for the reference.
00446     o title - The title of the reference.
00447     o journal - Journal the reference was published in.
00448     o medline_id - A medline reference for the article.
00449     o pubmed_id - A pubmed reference for the article.
00450     o comment - A place to stick any comments about the reference.
00451     """
00452     def __init__(self):
00453         self.location = []
00454         self.authors = ''
00455         self.consrtm = ''
00456         self.title = ''
00457         self.journal = ''
00458         self.medline_id = ''
00459         self.pubmed_id = ''
00460         self.comment = ''
00461 
00462     def __str__(self):
00463         """Output an informative string for debugging.
00464         """
00465         out = ""
00466         for single_location in self.location:
00467             out += "location: %s\n" % single_location
00468         out += "authors: %s\n" % self.authors
00469         if self.consrtm:
00470             out += "consrtm: %s\n" % self.consrtm
00471         out += "title: %s\n" % self.title
00472         out += "journal: %s\n" % self.journal
00473         out += "medline id: %s\n" % self.medline_id
00474         out += "pubmed id: %s\n" % self.pubmed_id
00475         out += "comment: %s\n" % self.comment
00476         return out
00477 
00478     def __repr__(self):
00479         #TODO - Update this is __init__ later accpets values
00480         return "%s(title=%s, ...)" % (self.__class__.__name__,
00481                                       repr(self.title))
00482 
00483 # --- Handling feature locations
00484 
00485 class FeatureLocation(object):
00486     """Specify the location of a feature along a sequence.
00487 
00488     This attempts to deal with fuzziness of position ends, but also
00489     make it easy to get the start and end in the 'normal' case (no
00490     fuzziness).
00491 
00492     You should access the start and end attributes with
00493     your_location.start and your_location.end. If the start and
00494     end are exact, this will return the positions, if not, we'll return
00495     the approriate Fuzzy class with info about the position and fuzziness.
00496 
00497     Note that the start and end location numbering follow Python's scheme,
00498     thus a GenBank entry of 123..150 (one based counting) becomes a location
00499     of [122:150] (zero based counting).
00500     """
00501     def __init__(self, start, end, strand=None, ref=None, ref_db=None):
00502         """Specify the start, end, strand etc of a sequence feature.
00503 
00504         start and end arguments specify the values where the feature begins
00505         and ends. These can either by any of the *Position objects that
00506         inherit from AbstractPosition, or can just be integers specifying the
00507         position. In the case of integers, the values are assumed to be
00508         exact and are converted in ExactPosition arguments. This is meant
00509         to make it easy to deal with non-fuzzy ends.
00510 
00511         i.e. Short form:
00512         
00513         >>> from Bio.SeqFeature import FeatureLocation
00514         >>> loc = FeatureLocation(5, 10, strand=-1)
00515         >>> print loc
00516         [5:10](-)
00517         
00518         Explicit form:
00519 
00520         >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
00521         >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1)
00522         >>> print loc
00523         [5:10](-)
00524 
00525         Other fuzzy positions are used similarly,
00526 
00527         >>> from Bio.SeqFeature import FeatureLocation
00528         >>> from Bio.SeqFeature import BeforePosition, AfterPosition
00529         >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1)
00530         >>> print loc2
00531         [<5:>10](-)
00532 
00533         For nucleotide features you will also want to specify the strand,
00534         use 1 for the forward (plus) strand, -1 for the reverse (negative)
00535         strand, 0 for stranded but strand unknown (? in GFF3), or None for
00536         when the strand does not apply (dot in GFF3), e.g. features on
00537         proteins.
00538 
00539         >>> loc = FeatureLocation(5, 10, strand=+1)
00540         >>> print loc
00541         [5:10](+)
00542         >>> print loc.strand
00543         1
00544 
00545         Normally feature locations are given relative to the parent
00546         sequence you are working with, but an explicit accession can
00547         be given with the optional ref and db_ref strings:
00548 
00549         >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1)
00550         >>> print loc
00551         AL391218.9[105172:108462](+)
00552         >>> print loc.ref
00553         AL391218.9
00554 
00555         """
00556         if isinstance(start, AbstractPosition):
00557             self._start = start
00558         elif isinstance(start, int):
00559             self._start = ExactPosition(start)
00560         else:
00561             raise TypeError(start)
00562         if isinstance(end, AbstractPosition):
00563             self._end = end
00564         elif isinstance(end, int):
00565             self._end = ExactPosition(end)
00566         else:
00567             raise TypeError(end)
00568         self.strand = strand
00569         self.ref = ref
00570         self.ref_db = ref_db
00571 
00572     def _get_strand(self):
00573         return self._strand
00574     def _set_strand(self, value):
00575         if value not in [+1, -1, 0, None]:
00576             raise ValueError("Strand should be +1, -1, 0 or None, not %r" \
00577                              % value)
00578         self._strand = value
00579     strand = property(fget = _get_strand, fset = _set_strand,
00580                       doc = "Strand of the location (+1, -1, 0 or None).")
00581 
00582     def __str__(self):
00583         """Returns a representation of the location (with python counting).
00584 
00585         For the simple case this uses the python splicing syntax, [122:150]
00586         (zero based counting) which GenBank would call 123..150 (one based
00587         counting).
00588         """
00589         answer = "[%s:%s]" % (self._start, self._end)
00590         if self.ref and self.ref_db:
00591             answer = "%s:%s%s" % (self.ref_db, self.ref, answer)
00592         elif self.ref:
00593             answer = self.ref + answer
00594         #Is ref_db without ref meaningful?
00595         if self.strand is None:
00596             return answer
00597         elif self.strand == +1:
00598             return answer + "(+)"
00599         elif self.strand == -1:
00600             return answer + "(-)"
00601         else:
00602             #strand = 0, stranded but strand unknown, ? in GFF3
00603             return answer + "(?)"
00604 
00605     def __repr__(self):
00606         """A string representation of the location for debugging."""
00607         optional = ""
00608         if self.strand is not None:
00609             optional += ", strand=%r" % self.strand
00610         if self.ref is not None:
00611             optional += ", ref=%r" % self.ref
00612         if self.ref_db is not None:
00613             optional += ", ref_db=%r" % self.ref_db
00614         return "%s(%r, %r%s)" \
00615                    % (self.__class__.__name__, self.start, self.end, optional)
00616 
00617     def __nonzero__(self):
00618         """Returns True regardless of the length of the feature.
00619 
00620         This behaviour is for backwards compatibility, since until the
00621         __len__ method was added, a FeatureLocation always evaluated as True.
00622 
00623         Note that in comparison, Seq objects, strings, lists, etc, will all
00624         evaluate to False if they have length zero.
00625 
00626         WARNING: The FeatureLocation may in future evaluate to False when its
00627         length is zero (in order to better match normal python behaviour)!
00628         """
00629         return True
00630 
00631     def __len__(self):
00632         """Returns the length of the region described by the FeatureLocation.
00633         
00634         Note that extra care may be needed for fuzzy locations, e.g.
00635 
00636         >>> from Bio.SeqFeature import FeatureLocation
00637         >>> from Bio.SeqFeature import BeforePosition, AfterPosition
00638         >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
00639         >>> len(loc)
00640         5
00641         """
00642         return int(self._end) - int(self._start)
00643 
00644     def __contains__(self, value):
00645         """Check if an integer position is within the FeatureLocation.
00646 
00647         Note that extra care may be needed for fuzzy locations, e.g.
00648 
00649         >>> from Bio.SeqFeature import FeatureLocation
00650         >>> from Bio.SeqFeature import BeforePosition, AfterPosition
00651         >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
00652         >>> len(loc)
00653         5
00654         >>> [i for i in range(15) if i in loc]
00655         [5, 6, 7, 8, 9]
00656         """
00657         if not isinstance(value, int):
00658             raise ValueError("Currently we only support checking for integer "
00659                              "positions being within a FeatureLocation.")
00660         if value < self._start or value >= self._end:
00661             return False
00662         else:
00663             return True
00664 
00665     def __iter__(self):
00666         """Iterate over the parent positions within the FeatureLocation.
00667 
00668         >>> from Bio.SeqFeature import FeatureLocation
00669         >>> from Bio.SeqFeature import BeforePosition, AfterPosition
00670         >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
00671         >>> len(loc)
00672         5
00673         >>> for i in loc: print i
00674         5
00675         6
00676         7
00677         8
00678         9
00679         >>> list(loc)
00680         [5, 6, 7, 8, 9]
00681         >>> [i for i in range(15) if i in loc]
00682         [5, 6, 7, 8, 9]
00683 
00684         Note this is strand aware:
00685 
00686         >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1)
00687         >>> list(loc)
00688         [9, 8, 7, 6, 5]
00689         """
00690         if self.strand == -1:
00691             for i in range(self._end - 1, self._start - 1, -1):
00692                 yield i
00693         else:
00694             for i in range(self._start, self._end):
00695                 yield i
00696 
00697     def _shift(self, offset):
00698         """Returns a copy of the location shifted by the offset (PRIVATE)."""
00699         if self.ref or self.ref_db:
00700             #TODO - Return self?
00701             raise ValueError("Feature references another sequence.")
00702         return FeatureLocation(start = self._start._shift(offset),
00703                                end = self._end._shift(offset),
00704                                strand = self.strand)
00705 
00706     def _flip(self, length):
00707         """Returns a copy of the location after the parent is reversed (PRIVATE)."""
00708         if self.ref or self.ref_db:
00709             #TODO - Return self?
00710             raise ValueError("Feature references another sequence.")
00711         #Note this will flip the start and end too!
00712         if self.strand == +1:
00713             flip_strand = -1
00714         elif self.strand == -1:
00715             flip_strand = +1
00716         else:
00717             #0 or None
00718             flip_strand = self.strand
00719         return FeatureLocation(start = self._end._flip(length),
00720                                end = self._start._flip(length),
00721                                strand = flip_strand)
00722 
00723     @property
00724     def start(self):
00725         """Start location (integer like, possibly a fuzzy position, read only)."""
00726         return self._start
00727 
00728     @property
00729     def end(self):
00730         """End location (integer like, possibly a fuzzy position, read only)."""
00731         return self._end
00732 
00733     @property
00734     def nofuzzy_start(self):
00735         """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).
00736 
00737         This is now a alias for int(feature.start), which should be
00738         used in preference -- unless you are trying to support old
00739         versions of Biopython.
00740         """
00741         return int(self._start)
00742 
00743     @property
00744     def nofuzzy_end(self):
00745         """End position (integer, approximated if fuzzy, read only) (OBSOLETE).
00746 
00747         This is now a alias for int(feature.end), which should be
00748         used in preference -- unless you are trying to support old
00749         versions of Biopython.  
00750         """
00751         return int(self._end)
00752 
00753 
00754     def extract(self, parent_sequence):
00755         """Extract feature sequence from the supplied parent sequence."""
00756         if self.ref or self.ref_db:
00757             #TODO - Take a dictionary as an optional argument?
00758             raise ValueError("Feature references another sequence.")
00759         if isinstance(parent_sequence, MutableSeq):
00760             #This avoids complications with reverse complements
00761             #(the MutableSeq reverse complement acts in situ)
00762            parent_sequence = parent_sequence.toseq()
00763         f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
00764         if self.strand == -1:
00765             try:
00766                 f_seq = f_seq.reverse_complement()
00767             except AttributeError:
00768                 assert isinstance(f_seq, str)
00769                 f_seq = reverse_complement(f_seq)
00770         return f_seq
00771 
00772 class AbstractPosition(object):
00773     """Abstract base class representing a position.
00774     """
00775 
00776     def __repr__(self):
00777         """String representation of the location for debugging."""
00778         return "%s(...)" % (self.__class__.__name__)
00779 
00780 class ExactPosition(int, AbstractPosition):
00781     """Specify the specific position of a boundary.
00782 
00783     o position - The position of the boundary.
00784     o extension - An optional argument which must be zero since we don't
00785     have an extension. The argument is provided so that the same number of
00786     arguments can be passed to all position types.
00787 
00788     In this case, there is no fuzziness associated with the position.
00789 
00790     >>> p = ExactPosition(5)
00791     >>> p
00792     ExactPosition(5)
00793     >>> print p
00794     5
00795 
00796     >>> isinstance(p, AbstractPosition)
00797     True
00798     >>> isinstance(p, int)
00799     True
00800 
00801     Integer comparisons and operations should work as expected:
00802 
00803     >>> p == 5
00804     True
00805     >>> p < 6
00806     True
00807     >>> p <= 5
00808     True
00809     >>> p + 10
00810     15
00811 
00812     """
00813     def __new__(cls, position, extension = 0):
00814         if extension != 0:
00815             raise AttributeError("Non-zero extension %s for exact position."
00816                                  % extension)
00817         return int.__new__(cls, position)
00818 
00819     def __repr__(self):
00820         """String representation of the ExactPosition location for debugging."""
00821         return "%s(%i)" % (self.__class__.__name__, int(self))
00822 
00823     @property
00824     def position(self):
00825         """Legacy attribute to get position as integer (OBSOLETE)."""
00826         return int(self)
00827 
00828     @property
00829     def extension(self):
00830         """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
00831         return 0
00832 
00833     def _shift(self, offset):
00834         #By default preserve any subclass
00835         return self.__class__(int(self) + offset)
00836 
00837     def _flip(self, length):
00838         #By default perserve any subclass
00839         return self.__class__(length - int(self))
00840 
00841 class UncertainPosition(ExactPosition):
00842     """Specify a specific position which is uncertain.
00843     
00844     This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
00845     XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.
00846     """
00847     pass
00848 
00849 class UnknownPosition(AbstractPosition):
00850     """Specify a specific position which is unknown (has no position).
00851 
00852     This is used in UniProt, e.g. ? or in the XML as unknown.
00853     """
00854 
00855     def __repr__(self):
00856         """String representation of the UnknownPosition location for debugging."""
00857         return "%s()" % self.__class__.__name__
00858 
00859     def __hash__(self):
00860         return hash(None)
00861 
00862     @property
00863     def position(self):
00864         """Legacy attribute to get position (None) (OBSOLETE)."""
00865         return None
00866 
00867     @property
00868     def extension(self):
00869         """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
00870         return 0
00871 
00872     def _shift(self, offset):
00873         return self
00874 
00875     def _flip(self, length):
00876         return self
00877         
00878 class WithinPosition(int, AbstractPosition):
00879     """Specify the position of a boundary within some coordinates.
00880 
00881     Arguments:
00882     o position - The default integer position
00883     o left - The start (left) position of the boundary
00884     o right - The end (right) position of the boundary
00885 
00886     This allows dealing with a position like ((1.4)..100). This
00887     indicates that the start of the sequence is somewhere between 1
00888     and 4. Since this is a start coordindate, it should acts like
00889     it is at position 1 (or in Python counting, 0).
00890 
00891     >>> p = WithinPosition(10,10,13)
00892     >>> p
00893     WithinPosition(10, left=10, right=13)
00894     >>> print p
00895     (10.13)
00896     >>> int(p)
00897     10
00898 
00899     Basic integer comparisons and operations should work as though
00900     this were a plain integer:
00901 
00902     >>> p == 10
00903     True
00904     >>> p in [9,10,11]
00905     True
00906     >>> p < 11
00907     True
00908     >>> p + 10
00909     20
00910 
00911     >>> isinstance(p, WithinPosition)
00912     True
00913     >>> isinstance(p, AbstractPosition)
00914     True
00915     >>> isinstance(p, int)
00916     True
00917 
00918     Note this also applies for comparison to other position objects,
00919     where again the integer behaviour is used:
00920 
00921     >>> p == 10
00922     True
00923     >>> p == ExactPosition(10)
00924     True
00925     >>> p == BeforePosition(10)
00926     True
00927     >>> p == AfterPosition(10)
00928     True
00929 
00930     If this were an end point, you would want the position to be 13:
00931 
00932     >>> p2 = WithinPosition(13,10,13)
00933     >>> p2
00934     WithinPosition(13, left=10, right=13)
00935     >>> print p2
00936     (10.13)
00937     >>> int(p2)
00938     13
00939     >>> p2 == 13
00940     True
00941     >>> p2 == ExactPosition(13)
00942     True
00943 
00944     The old legacy properties of position and extension give the
00945     starting/lower/left position as an integer, and the distance
00946     to the ending/higher/right position as an integer. Note that
00947     the position object will act like either the left or the right
00948     end-point depending on how it was created:
00949 
00950     >>> p.position == p2.position == 10
00951     True
00952     >>> p.extension == p2.extension == 3
00953     True
00954     >>> int(p) == int(p2)
00955     False
00956     >>> p == 10
00957     True
00958     >>> p2 == 13
00959     True
00960     
00961     """
00962     def __new__(cls, position, left, right):
00963         assert position==left or position==right
00964         obj = int.__new__(cls, position)
00965         obj._left = left
00966         obj._right = right
00967         return obj
00968 
00969     def __repr__(self):
00970         """String representation of the WithinPosition location for debugging."""
00971         return "%s(%i, left=%i, right=%i)" \
00972                % (self.__class__.__name__, int(self),
00973                   self._left, self._right)
00974 
00975     def __str__(self):
00976         return "(%s.%s)" % (self._left, self._right)
00977 
00978     @property
00979     def position(self):
00980         """Legacy attribute to get (left) position as integer (OBSOLETE)."""
00981         return self._left
00982 
00983     @property
00984     def extension(self):
00985         """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
00986         return self._right - self._left
00987 
00988     def _shift(self, offset):
00989         return self.__class__(int(self) + offset,
00990                               self._left + offset,
00991                               self._right + offset)
00992 
00993     def _flip(self, length):
00994         return self.__class__(length - int(self),
00995                               length - self._right,
00996                               length - self._left)
00997 
00998 class BetweenPosition(int, AbstractPosition):
00999     """Specify the position of a boundary between two coordinates (OBSOLETE?).
01000 
01001     Arguments:
01002     o position - The default integer position
01003     o left - The start (left) position of the boundary
01004     o right - The end (right) position of the boundary
01005 
01006     This allows dealing with a position like 123^456. This                                                  
01007     indicates that the start of the sequence is somewhere between
01008     123 and 456. It is up to the parser to set the position argument
01009     to either boundary point (depending on if this is being used as
01010     a start or end of the feature). For example as a feature end:
01011 
01012     >>> p = BetweenPosition(456, 123, 456)
01013     >>> p
01014     BetweenPosition(456, left=123, right=456)
01015     >>> print p
01016     (123^456)
01017     >>> int(p)
01018     456
01019 
01020     Integer equality and comparison use the given position,
01021 
01022     >>> p == 456
01023     True
01024     >>> p in [455, 456, 457]
01025     True
01026     >>> p > 300
01027     True
01028 
01029     The old legacy properties of position and extension give the
01030     starting/lower/left position as an integer, and the distance
01031     to the ending/higher/right position as an integer. Note that
01032     the position object will act like either the left or the right
01033     end-point depending on how it was created:
01034 
01035     >>> p2 = BetweenPosition(123, left=123, right=456)
01036     >>> p.position == p2.position == 123
01037     True
01038     >>> p.extension
01039     333
01040     >>> p2.extension
01041     333
01042     >>> p.extension == p2.extension == 333
01043     True
01044     >>> int(p) == int(p2)
01045     False
01046     >>> p == 456
01047     True
01048     >>> p2 == 123
01049     True
01050 
01051     Note this potentially surprising behaviour:
01052 
01053     >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
01054     True
01055     >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
01056     True
01057     >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
01058     True
01059 
01060     i.e. For equality (and sorting) the position objects behave like
01061     integers.
01062     """
01063     def __new__(cls, position, left, right):
01064         assert position==left or position==right
01065         obj = int.__new__(cls, position)
01066         obj._left = left
01067         obj._right = right
01068         return obj
01069 
01070     def __repr__(self):
01071         """String representation of the WithinPosition location for debugging."""
01072         return "%s(%i, left=%i, right=%i)" \
01073                % (self.__class__.__name__, int(self),
01074                   self._left, self._right)
01075 
01076     def __str__(self):
01077         return "(%s^%s)" % (self._left, self._right)
01078 
01079     @property
01080     def position(self):
01081         """Legacy attribute to get (left) position as integer (OBSOLETE)."""
01082         return self._left
01083 
01084     @property
01085     def extension(self):
01086         """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
01087         return self._right - self._left
01088 
01089     def _shift(self, offset):
01090         return self.__class__(int(self) + offset,
01091                               self._left + offset,
01092                               self._right + offset)
01093 
01094     def _flip(self, length):
01095         return self.__class__(length - int(self),
01096                               length - self._right,
01097                               length - self._left)
01098 
01099 class BeforePosition(int, AbstractPosition):
01100     """Specify a position where the actual location occurs before it.
01101 
01102     Arguments:
01103     o position - The upper boundary of where the location can occur.
01104     o extension - An optional argument which must be zero since we don't
01105     have an extension. The argument is provided so that the same number of
01106     arguments can be passed to all position types.
01107 
01108     This is used to specify positions like (<10..100) where the location
01109     occurs somewhere before position 10.
01110 
01111     >>> p = BeforePosition(5)
01112     >>> p
01113     BeforePosition(5)
01114     >>> print p
01115     <5
01116     >>> int(p)
01117     5
01118     >>> p + 10
01119     15
01120 
01121     Note this potentially surprising behaviour:
01122 
01123     >>> p == ExactPosition(5)
01124     True
01125     >>> p == AfterPosition(5)
01126     True
01127 
01128     Just remember that for equality and sorting the position objects act
01129     like integers.
01130     """
01131     #Subclasses int so can't use __init__
01132     def __new__(cls, position, extension = 0):
01133         if extension != 0:
01134             raise AttributeError("Non-zero extension %s for exact position."
01135                                  % extension)
01136         return int.__new__(cls, position)
01137 
01138     @property
01139     def position(self):
01140         """Legacy attribute to get position as integer (OBSOLETE)."""
01141         return int(self)
01142 
01143     @property
01144     def extension(self):
01145         """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
01146         return 0
01147 
01148     def __repr__(self):
01149         """A string representation of the location for debugging."""
01150         return "%s(%i)" % (self.__class__.__name__, int(self))
01151 
01152     def __str__(self):
01153         return "<%s" % self.position
01154 
01155     def _shift(self, offset):
01156         return self.__class__(int(self) + offset)
01157 
01158     def _flip(self, length):
01159         return AfterPosition(length - int(self))
01160 
01161 class AfterPosition(int, AbstractPosition):
01162     """Specify a position where the actual location is found after it.
01163 
01164     Arguments:
01165     o position - The lower boundary of where the location can occur.
01166     o extension - An optional argument which must be zero since we don't
01167     have an extension. The argument is provided so that the same number of
01168     arguments can be passed to all position types.
01169 
01170     This is used to specify positions like (>10..100) where the location
01171     occurs somewhere after position 10.
01172 
01173     >>> p = AfterPosition(7)
01174     >>> p
01175     AfterPosition(7)
01176     >>> print p
01177     >7
01178     >>> int(p)
01179     7
01180     >>> p + 10
01181     17
01182 
01183     >>> isinstance(p, AfterPosition)
01184     True
01185     >>> isinstance(p, AbstractPosition)
01186     True
01187     >>> isinstance(p, int)
01188     True
01189 
01190     Note this potentially surprising behaviour:
01191 
01192     >>> p == ExactPosition(7)
01193     True
01194     >>> p == BeforePosition(7)
01195     True
01196 
01197     Just remember that for equality and sorting the position objects act
01198     like integers.
01199     """
01200     #Subclasses int so can't use __init__
01201     def __new__(cls, position, extension = 0):
01202         if extension != 0:
01203             raise AttributeError("Non-zero extension %s for exact position."
01204                                  % extension)
01205         return int.__new__(cls, position)
01206 
01207     @property
01208     def position(self):
01209         """Legacy attribute to get position as integer (OBSOLETE)."""
01210         return int(self)
01211 
01212     @property
01213     def extension(self):
01214         """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
01215         return 0
01216 
01217     def __repr__(self):
01218         """A string representation of the location for debugging."""
01219         return "%s(%i)" % (self.__class__.__name__, int(self))
01220 
01221     def __str__(self):
01222         return ">%s" % self.position
01223 
01224     def _shift(self, offset):
01225         return self.__class__(int(self) + offset)
01226 
01227     def _flip(self, length):
01228         return BeforePosition(length - int(self))
01229 
01230 
01231 class OneOfPosition(int, AbstractPosition):
01232     """Specify a position where the location can be multiple positions.
01233 
01234     This models the GenBank 'one-of(1888,1901)' function, and tries
01235     to make this fit within the Biopython Position models. If this was
01236     a start position it should act like 1888, but as an end position 1901.
01237 
01238     >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
01239     >>> p
01240     OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
01241     >>> int(p)
01242     1888
01243 
01244     Interget comparisons and operators act like using int(p),
01245 
01246     >>> p == 1888
01247     True
01248     >>> p <= 1888
01249     True
01250     >>> p > 1888
01251     False
01252     >>> p + 100
01253     1988
01254 
01255     >>> isinstance(p, OneOfPosition)
01256     True
01257     >>> isinstance(p, AbstractPosition)
01258     True
01259     >>> isinstance(p, int)
01260     True
01261 
01262     The old legacy properties of position and extension give the
01263     starting/lowest/left-most position as an integer, and the
01264     distance to the ending/highest/right-most position as an integer.
01265     Note that the position object will act like one of the list of
01266     possible locations depending on how it was created:
01267 
01268     >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
01269     >>> p.position == p2.position == 1888
01270     True
01271     >>> p.extension == p2.extension == 13
01272     True
01273     >>> int(p) == int(p2)
01274     False
01275     >>> p == 1888
01276     True
01277     >>> p2 == 1901
01278     True
01279 
01280     """
01281     def __new__(cls, position, choices):
01282         """Initialize with a set of posssible positions.
01283 
01284         position_list is a list of AbstractPosition derived objects,
01285         specifying possible locations.
01286 
01287         position is an integer specifying the default behaviour.
01288         """
01289         assert position in choices
01290         obj = int.__new__(cls, position)
01291         obj.position_choices = choices
01292         return obj
01293 
01294     @property
01295     def position(self):
01296         """Legacy attribute to get (left) position as integer (OBSOLETE)."""
01297         return min(int(pos) for pos in self.position_choices)
01298 
01299     @property
01300     def extension(self):
01301         """Legacy attribute to get extension as integer (OBSOLETE)."""
01302         positions = [int(pos) for pos in self.position_choices]
01303         return max(positions) - min(positions)
01304 
01305     def __repr__(self):
01306         """String representation of the OneOfPosition location for debugging."""
01307         return "%s(%i, choices=%r)" % (self.__class__.__name__, \
01308                                        int(self), self.position_choices)
01309 
01310     def __str__(self):
01311         out = "one-of("
01312         for position in self.position_choices:
01313             out += "%s," % position
01314         # replace the last comma with the closing parenthesis
01315         out = out[:-1] + ")"
01316         return out
01317 
01318     def _shift(self, offset):
01319         return self.__class__(int(self) + offset,
01320                               [p._shift(offset) for p in self.position_choices])
01321 
01322     def _flip(self, length):
01323         return self.__class__(length - int(self),
01324                               [p._flip(length) for p in self.position_choices[::-1]])
01325 
01326 
01327 class PositionGap(object):
01328     """Simple class to hold information about a gap between positions.
01329     """
01330     def __init__(self, gap_size):
01331         """Intialize with a position object containing the gap information.
01332         """
01333         self.gap_size = gap_size
01334 
01335     def __repr__(self):
01336         """A string representation of the position gap for debugging."""
01337         return "%s(%s)" % (self.__class__.__name__, repr(self.gap_size))
01338     
01339     def __str__(self):
01340         out = "gap(%s)" % self.gap_size
01341         return out
01342 
01343 def _test():
01344     """Run the Bio.SeqFeature module's doctests (PRIVATE).
01345 
01346     This will try and locate the unit tests directory, and run the doctests
01347     from there in order that the relative paths used in the examples work.
01348     """
01349     import doctest
01350     import os
01351     if os.path.isdir(os.path.join("..","Tests")):
01352         print "Runing doctests..."
01353         cur_dir = os.path.abspath(os.curdir)
01354         os.chdir(os.path.join("..","Tests"))
01355         doctest.testmod()
01356         os.chdir(cur_dir)
01357         del cur_dir
01358         print "Done"
01359     elif os.path.isdir(os.path.join("Tests")) :
01360         print "Runing doctests..."
01361         cur_dir = os.path.abspath(os.curdir)
01362         os.chdir(os.path.join("Tests"))
01363         doctest.testmod()
01364         os.chdir(cur_dir)
01365         del cur_dir
01366         print "Done"
01367 
01368 
01369 if __name__ == "__main__":
01370     _test()