Back to index

python-biopython  1.60
BioSeq.py
Go to the documentation of this file.
00001 # Copyright 2002 by Andrew Dalke.  All rights reserved.
00002 # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved.
00003 # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved.
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 #
00008 # Note that BioSQL (including the database schema and scripts) is
00009 # available and licensed separately.  Please consult www.biosql.org
00010 """Implementations of Biopython-like Seq objects on top of BioSQL.
00011 
00012 This allows retrival of items stored in a BioSQL database using
00013 a biopython-like SeqRecord and Seq interface.
00014 
00015 Note: Currently we do not support recording per-letter-annotations
00016 (like quality scores) in BioSQL.
00017 """
00018 
00019 from Bio import Alphabet
00020 from Bio.Seq import Seq, UnknownSeq
00021 from Bio.SeqRecord import SeqRecord, _RestrictedDict
00022 from Bio import SeqFeature
00023 
00024 class DBSeq(Seq):  # This implements the biopython Seq interface
00025     def __init__(self, primary_id, adaptor, alphabet, start, length):
00026         """Create a new DBSeq object referring to a BioSQL entry.
00027 
00028         You wouldn't normally create a DBSeq object yourself, this is done
00029         for you when retreiving a DBSeqRecord object from the database.
00030         """
00031         self.primary_id = primary_id
00032         self.adaptor = adaptor
00033         self.alphabet = alphabet
00034         self._length = length
00035         self.start = start
00036 
00037     def __len__(self):
00038         return self._length
00039     
00040     def __getitem__(self, index) :                 # Seq API requirement
00041         #Note since Python 2.0, __getslice__ is deprecated
00042         #and __getitem__ is used instead.
00043         #See http://docs.python.org/ref/sequence-methods.html
00044         if isinstance(index, int):
00045             #Return a single letter as a string
00046             i = index
00047             if i < 0:
00048                 if -i > self._length:
00049                     raise IndexError(i)
00050                 i = i + self._length
00051             elif i >= self._length:
00052                 raise IndexError(i)            
00053             return self.adaptor.get_subseq_as_string(self.primary_id,
00054                                                      self.start + i,
00055                                                      self.start + i + 1)
00056         if not isinstance(index, slice):
00057             raise ValueError("Unexpected index type")
00058 
00059         #Return the (sub)sequence as another DBSeq or Seq object
00060         #(see the Seq obect's __getitem__ method)
00061         if index.start is None:
00062             i=0
00063         else:
00064             i = index.start
00065         if i < 0:
00066             #Map to equavilent positive index
00067             if -i > self._length:
00068                 raise IndexError(i)
00069             i = i + self._length
00070         elif i >= self._length:
00071             #Trivial case, should return empty string!
00072             i = self._length
00073 
00074         if index.stop is None:
00075             j = self._length
00076         else:
00077             j = index.stop
00078         if j < 0:
00079             #Map to equavilent positive index
00080             if -j > self._length:
00081                 raise IndexError(j)
00082             j = j + self._length
00083         elif j >= self._length:
00084             j = self._length
00085 
00086         if i >= j:
00087             #Trivial case, empty string.
00088             return Seq("", self.alphabet)
00089         elif index.step is None or index.step == 1:
00090             #Easy case - can return a DBSeq with the start and end adjusted
00091             return self.__class__(self.primary_id, self.adaptor, self.alphabet,
00092                                   self.start + i, j - i)
00093         else:
00094             #Tricky.  Will have to create a Seq object because of the stride
00095             full = self.adaptor.get_subseq_as_string(self.primary_id,
00096                                                      self.start + i,
00097                                                      self.start + j)
00098             return Seq(full[::index.step], self.alphabet)
00099         
00100     def tostring(self):
00101         """Returns the full sequence as a python string.
00102 
00103         Although not formally deprecated, you are now encouraged to use
00104         str(my_seq) instead of my_seq.tostring()."""
00105         return self.adaptor.get_subseq_as_string(self.primary_id,
00106                                                  self.start,
00107                                                  self.start + self._length)
00108     def __str__(self):
00109         """Returns the full sequence as a python string."""
00110         return self.adaptor.get_subseq_as_string(self.primary_id,
00111                                                  self.start,
00112                                                  self.start + self._length)
00113 
00114     data = property(tostring, doc="Sequence as string (DEPRECATED)")
00115 
00116     def toseq(self):
00117         """Returns the full sequence as a Seq object."""
00118         #Note - the method name copies that of the MutableSeq object
00119         return Seq(str(self), self.alphabet)
00120 
00121     def __add__(self, other):
00122         #Let the Seq object deal with the alphabet issues etc
00123         return self.toseq() + other
00124 
00125     def __radd__(self, other):
00126         #Let the Seq object deal with the alphabet issues etc
00127         return other + self.toseq()
00128 
00129 
00130 def _retrieve_seq(adaptor, primary_id):
00131     #The database schema ensures there will be only one matching
00132     #row in the table.
00133 
00134     #If an UnknownSeq was recorded, seq will be NULL,
00135     #but length will be populated.  This means length(seq)
00136     #will return None.
00137     seqs = adaptor.execute_and_fetchall(
00138         "SELECT alphabet, length, length(seq) FROM biosequence" \
00139         " WHERE bioentry_id = %s", (primary_id,))
00140     if not seqs : return
00141     assert len(seqs) == 1        
00142     moltype, given_length, length = seqs[0]
00143 
00144     try:
00145         length = int(length)
00146         given_length = int(length)
00147         assert length == given_length
00148         have_seq = True
00149     except TypeError:
00150         assert length is None
00151         seqs = adaptor.execute_and_fetchall(
00152             "SELECT alphabet, length, seq FROM biosequence" \
00153             " WHERE bioentry_id = %s", (primary_id,))
00154         assert len(seqs) == 1
00155         moltype, given_length, seq = seqs[0]
00156         assert seq is None or seq==""
00157         length = int(given_length)
00158         have_seq = False
00159         del seq
00160     del given_length
00161         
00162     moltype = moltype.lower() #might be upper case in database
00163     #We have no way of knowing if these sequences will use IUPAC
00164     #alphabets, and we certainly can't assume they are unambiguous!
00165     if moltype == "dna":
00166         alphabet = Alphabet.generic_dna
00167     elif moltype == "rna":
00168         alphabet = Alphabet.generic_rna
00169     elif moltype == "protein":
00170         alphabet = Alphabet.generic_protein
00171     elif moltype == "unknown":
00172         #This is used in BioSQL/Loader.py and would happen
00173         #for any generic or nucleotide alphabets.
00174         alphabet = Alphabet.single_letter_alphabet
00175     else:
00176         raise AssertionError("Unknown moltype: %s" % moltype)
00177 
00178     if have_seq:
00179         return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
00180     else:
00181         return UnknownSeq(length, alphabet)
00182 
00183 def _retrieve_dbxrefs(adaptor, primary_id):
00184     """Retrieve the database cross references for the sequence."""
00185     _dbxrefs = []
00186     dbxrefs = adaptor.execute_and_fetchall(
00187         "SELECT dbname, accession, version" \
00188         " FROM bioentry_dbxref join dbxref using (dbxref_id)" \
00189         " WHERE bioentry_id = %s" \
00190         " ORDER BY rank", (primary_id,))
00191     for dbname, accession, version in dbxrefs:
00192         if version and version != "0":
00193             v = "%s.%s" % (accession, version)
00194         else:
00195             v = accession
00196         _dbxrefs.append("%s:%s" % (dbname, v))
00197     return _dbxrefs
00198 
00199 def _retrieve_features(adaptor, primary_id):
00200     sql = "SELECT seqfeature_id, type.name, rank" \
00201           " FROM seqfeature join term type on (type_term_id = type.term_id)" \
00202           " WHERE bioentry_id = %s" \
00203           " ORDER BY rank"
00204     results = adaptor.execute_and_fetchall(sql, (primary_id,))
00205     seq_feature_list = []
00206     for seqfeature_id, seqfeature_type, seqfeature_rank in results:
00207         # Get qualifiers [except for db_xref which is stored separately]
00208         qvs = adaptor.execute_and_fetchall(
00209             "SELECT name, value" \
00210             " FROM seqfeature_qualifier_value  join term using (term_id)" \
00211             " WHERE seqfeature_id = %s" \
00212             " ORDER BY rank", (seqfeature_id,))
00213         qualifiers = {}
00214         for qv_name, qv_value in qvs:
00215             qualifiers.setdefault(qv_name, []).append(qv_value)
00216         # Get db_xrefs [special case of qualifiers]
00217         qvs = adaptor.execute_and_fetchall(
00218             "SELECT dbxref.dbname, dbxref.accession" \
00219             " FROM dbxref join seqfeature_dbxref using (dbxref_id)" \
00220             " WHERE seqfeature_dbxref.seqfeature_id = %s" \
00221             " ORDER BY rank", (seqfeature_id,))
00222         for qv_name, qv_value in qvs:
00223             value = "%s:%s" % (qv_name, qv_value)
00224             qualifiers.setdefault("db_xref", []).append(value)
00225         # Get locations
00226         results = adaptor.execute_and_fetchall(
00227             "SELECT location_id, start_pos, end_pos, strand" \
00228             " FROM location" \
00229             " WHERE seqfeature_id = %s" \
00230             " ORDER BY rank", (seqfeature_id,))
00231         locations = []
00232         # convert to Python standard form
00233         # Convert strand = 0 to strand = None
00234         # re: comment in Loader.py:
00235         # Biopython uses None when we don't know strand information but
00236         # BioSQL requires something (non null) and sets this as zero
00237         # So we'll use the strand or 0 if Biopython spits out None
00238         for location_id, start, end, strand in results:
00239             if start:
00240                 start -= 1
00241             if strand == 0:
00242                 strand = None
00243             if strand not in (+1, -1, None):
00244                 raise ValueError("Invalid strand %s found in database for " \
00245                                  "seqfeature_id %s" % (strand, seqfeature_id))
00246             if end < start:
00247                 import warnings
00248                 warnings.warn("Inverted location start/end (%i and %i) for " \
00249                               "seqfeature_id %s" % (start, end, seqfeature_id))
00250             locations.append( (location_id, start, end, strand) )
00251         # Get possible remote reference information
00252         remote_results = adaptor.execute_and_fetchall(
00253             "SELECT location_id, dbname, accession, version" \
00254             " FROM location join dbxref using (dbxref_id)" \
00255             " WHERE seqfeature_id = %s", (seqfeature_id,))
00256         lookup = {}
00257         for location_id, dbname, accession, version in remote_results:
00258             if version and version != "0":
00259                 v = "%s.%s" % (accession, version)
00260             else:
00261                 v = accession
00262             # subfeature remote location db_ref are stored as a empty string when
00263             # not present
00264             if dbname == "":
00265                 dbname = None
00266             lookup[location_id] = (dbname, v)
00267         
00268         feature = SeqFeature.SeqFeature(type = seqfeature_type)
00269         feature._seqfeature_id = seqfeature_id #Store the key as a private property
00270         feature.qualifiers = qualifiers
00271         if len(locations) == 0:
00272             pass
00273         elif len(locations) == 1:
00274             location_id, start, end, strand = locations[0]
00275             #See Bug 2677, we currently don't record the location_operator
00276             #For consistency with older versions Biopython, default to "".
00277             feature.location_operator = \
00278                 _retrieve_location_qualifier_value(adaptor, location_id)
00279             dbname, version = lookup.get(location_id, (None, None))
00280             feature.location = SeqFeature.FeatureLocation(start, end)
00281             feature.strand = strand
00282             feature.ref_db = dbname
00283             feature.ref = version
00284         else:
00285             assert feature.sub_features == []
00286             for location in locations:
00287                 location_id, start, end, strand = location
00288                 dbname, version = lookup.get(location_id, (None, None))
00289                 subfeature = SeqFeature.SeqFeature()
00290                 subfeature.type = seqfeature_type
00291                 subfeature.location_operator = \
00292                     _retrieve_location_qualifier_value(adaptor, location_id)
00293                 #TODO - See Bug 2677 - we don't yet record location_operator,
00294                 #so for consistency with older versions of Biopython default
00295                 #to assuming its a join.
00296                 if not subfeature.location_operator:
00297                     subfeature.location_operator="join"
00298                 subfeature.location = SeqFeature.FeatureLocation(start, end)
00299                 subfeature.strand = strand
00300                 subfeature.ref_db = dbname
00301                 subfeature.ref = version
00302                 feature.sub_features.append(subfeature)
00303             # Assuming that the feature loc.op is the same as the sub_feature
00304             # loc.op:
00305             feature.location_operator = \
00306                 feature.sub_features[0].location_operator
00307             # Locations are in order, but because of remote locations for
00308             # sub-features they are not necessarily in numerical order:
00309             start = locations[0][1]
00310             end = locations[-1][2]
00311             feature.location = SeqFeature.FeatureLocation(start, end)
00312             # To get the parent strand (as done when parsing GenBank files),
00313             # need to consider evil mixed strand examples like this,
00314             # join(complement(69611..69724),139856..140087,140625..140650)
00315             strands = set(sf.strand for sf in feature.sub_features)
00316             if len(strands)==1:
00317                 feature.strand = feature.sub_features[0].strand
00318             else:
00319                 feature.strand = None # i.e. mixed strands
00320 
00321         seq_feature_list.append(feature)
00322 
00323     return seq_feature_list
00324 
00325 def _retrieve_location_qualifier_value(adaptor, location_id):
00326     value = adaptor.execute_and_fetch_col0(
00327         "SELECT value FROM location_qualifier_value" \
00328         " WHERE location_id = %s", (location_id,))
00329     try:
00330         return value[0] 
00331     except IndexError:
00332         return ""
00333 
00334 def _retrieve_annotations(adaptor, primary_id, taxon_id):
00335     annotations = {}
00336     annotations.update(_retrieve_qualifier_value(adaptor, primary_id))
00337     annotations.update(_retrieve_reference(adaptor, primary_id))
00338     annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id))
00339     annotations.update(_retrieve_comment(adaptor, primary_id))
00340     # Convert values into strings in cases of unicode from the database.
00341     # BioSQL could eventually be expanded to be unicode aware.
00342     str_anns = {}
00343     for key, val in annotations.items():
00344         if isinstance(val, list):
00345             val = [_make_unicode_into_string(x) for x in val]
00346         elif isinstance(val, unicode):
00347             val = str(val)
00348         str_anns[key] = val
00349     return str_anns
00350 
00351 def _make_unicode_into_string(text):
00352     if isinstance(text, unicode):
00353         return str(text)
00354     else :
00355         return text
00356 
00357 def _retrieve_qualifier_value(adaptor, primary_id):
00358     qvs = adaptor.execute_and_fetchall(
00359         "SELECT name, value" \
00360         " FROM bioentry_qualifier_value JOIN term USING (term_id)" \
00361         " WHERE bioentry_id = %s" \
00362         " ORDER BY rank", (primary_id,))
00363     qualifiers = {}
00364     for name, value in qvs:
00365         if name == "keyword": name = "keywords"
00366         #See handling of "date" in Loader.py
00367         elif name == "date_changed": name = "date"
00368         elif name == "secondary_accession": name = "accessions"
00369         qualifiers.setdefault(name, []).append(value)
00370     return qualifiers
00371 
00372 def _retrieve_reference(adaptor, primary_id):
00373     # XXX dbxref_qualifier_value
00374  
00375     refs = adaptor.execute_and_fetchall(
00376         "SELECT start_pos, end_pos, " \
00377         " location, title, authors," \
00378         " dbname, accession" \
00379         " FROM bioentry_reference" \
00380         " JOIN reference USING (reference_id)" \
00381         " LEFT JOIN dbxref USING (dbxref_id)" \
00382         " WHERE bioentry_id = %s" \
00383         " ORDER BY rank", (primary_id,))
00384     references = []
00385     for start, end, location, title, authors, dbname, accession in refs:
00386         reference = SeqFeature.Reference()
00387         #If the start/end are missing, reference.location is an empty list
00388         if (start is not None) or (end is not None):
00389             if start is not None: start -= 1 #python counting
00390             reference.location = [SeqFeature.FeatureLocation(start, end)]
00391         #Don't replace the default "" with None.
00392         if authors : reference.authors = authors
00393         if title : reference.title = title
00394         reference.journal = location
00395         if dbname == 'PUBMED':
00396             reference.pubmed_id = accession
00397         elif dbname == 'MEDLINE':
00398             reference.medline_id = accession
00399         references.append(reference)
00400     if references:
00401         return {'references': references}
00402     else:
00403         return {}
00404 
00405 def _retrieve_taxon(adaptor, primary_id, taxon_id):
00406     a = {}
00407     common_names = adaptor.execute_and_fetch_col0(
00408         "SELECT name FROM taxon_name WHERE taxon_id = %s" \
00409         " AND name_class = 'genbank common name'", (taxon_id,))
00410     if common_names:
00411         a['source'] = common_names[0]
00412     scientific_names = adaptor.execute_and_fetch_col0(
00413         "SELECT name FROM taxon_name WHERE taxon_id = %s" \
00414         " AND name_class = 'scientific name'", (taxon_id,))
00415     if scientific_names:
00416         a['organism'] = scientific_names[0]
00417     ncbi_taxids = adaptor.execute_and_fetch_col0(
00418         "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,))
00419     if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0":
00420         a['ncbi_taxid'] = ncbi_taxids[0]
00421 
00422     #Old code used the left/right values in the taxon table to get the
00423     #taxonomy lineage in one SQL command.  This was actually very slow,
00424     #and would fail if the (optional) left/right values were missing.
00425     #
00426     #The following code is based on a contribution from Eric Gibert, and
00427     #relies on the taxon table's parent_taxon_id field only (ignoring the
00428     #optional left/right values).  This means that it has to make a
00429     #separate SQL query for each entry in the lineage, but it does still
00430     #appear to be *much* faster.  See Bug 2494. 
00431     taxonomy = []
00432     while taxon_id:
00433         name, rank, parent_taxon_id = adaptor.execute_one(
00434         "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" \
00435         " FROM taxon, taxon_name" \
00436         " WHERE taxon.taxon_id=taxon_name.taxon_id" \
00437         " AND taxon_name.name_class='scientific name'" \
00438         " AND taxon.taxon_id = %s", (taxon_id,))
00439         if taxon_id == parent_taxon_id:
00440             # If the taxon table has been populated by the BioSQL script
00441             # load_ncbi_taxonomy.pl this is how top parent nodes are stored.
00442             # Personally, I would have used a NULL parent_taxon_id here.
00443             break
00444         if rank != "no rank":
00445             #For consistency with older versions of Biopython, we are only
00446             #interested in taxonomy entries with a stated rank.
00447             #Add this to the start of the lineage list.
00448             taxonomy.insert(0, name)
00449         taxon_id = parent_taxon_id
00450 
00451     if taxonomy:
00452         a['taxonomy'] = taxonomy
00453     return a
00454 
00455 def _retrieve_comment(adaptor, primary_id):
00456     qvs = adaptor.execute_and_fetchall(
00457         "SELECT comment_text FROM comment" \
00458         " WHERE bioentry_id=%s" \
00459         " ORDER BY rank", (primary_id,))
00460     comments = [comm[0] for comm in qvs]
00461     #Don't want to add an empty list...
00462     if comments:
00463         return {"comment": comments}
00464     else:
00465         return {}
00466 
00467 class DBSeqRecord(SeqRecord):
00468     """BioSQL equivalent of the biopython SeqRecord object.
00469     """
00470 
00471     def __init__(self, adaptor, primary_id):
00472         self._adaptor = adaptor
00473         self._primary_id = primary_id
00474 
00475         (self._biodatabase_id, self._taxon_id, self.name,
00476          accession, version, self._identifier,
00477          self._division, self.description) = self._adaptor.execute_one(
00478             "SELECT biodatabase_id, taxon_id, name, accession, version," \
00479             " identifier, division, description" \
00480             " FROM bioentry" \
00481             " WHERE bioentry_id = %s", (self._primary_id,))
00482         if version and version != "0":
00483             self.id = "%s.%s" % (accession, version)
00484         else:
00485             self.id = accession
00486         #We don't yet record any per-letter-annotations in the
00487         #BioSQL database, but we should set this property up
00488         #for completeness (and the __str__ method).
00489         try:
00490             length = len(self.seq)
00491         except:
00492             #Could be no sequence in the database!
00493             length = 0
00494         self._per_letter_annotations = _RestrictedDict(length=length)
00495 
00496     def __get_seq(self):
00497         if not hasattr(self, "_seq"):
00498             self._seq = _retrieve_seq(self._adaptor, self._primary_id)
00499         return self._seq
00500     def __set_seq(self, seq): self._seq = seq
00501     def __del_seq(self):      del self._seq
00502     seq = property(__get_seq, __set_seq, __del_seq, "Seq object")
00503 
00504     def __get_dbxrefs(self):
00505         if not hasattr(self,"_dbxrefs"):
00506             self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
00507         return self._dbxrefs
00508     def __set_dbxrefs(self, dbxrefs): self._dbxrefs = dbxrefs
00509     def __del_dbxrefs(self):      del self._dbxrefs
00510     dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs,
00511                        "Database cross references")
00512 
00513     def __get_features(self):
00514         if not hasattr(self, "_features"):
00515             self._features = _retrieve_features(self._adaptor,
00516                                                 self._primary_id)
00517         return self._features
00518     def __set_features(self, features): self._features = features
00519     def __del_features(self):      del self._features
00520     features = property(__get_features, __set_features, __del_features,
00521                         "Features")
00522 
00523     def __get_annotations(self):
00524         if not hasattr(self, "_annotations"):
00525             self._annotations = _retrieve_annotations(self._adaptor,
00526                                                       self._primary_id,
00527                                                       self._taxon_id)
00528             if self._identifier:
00529                 self._annotations["gi"] = self._identifier
00530             if self._division:
00531                 self._annotations["data_file_division"] = self._division
00532         return self._annotations
00533     def __set_annotations(self, annotations): self._annotations = annotations
00534     def __del_annotations(self): del self._annotations
00535     annotations = property(__get_annotations, __set_annotations,
00536                            __del_annotations, "Annotations")