Back to index

python-biopython  1.60
Classes | Functions | Variables
BioSQL.BioSeq Namespace Reference

Classes

class  DBSeq
class  DBSeqRecord

Functions

def __init__
def __len__
def __getitem__
def tostring
def __str__
def toseq
def __add__
def __radd__
def _retrieve_seq
def _retrieve_dbxrefs
def _retrieve_features
def _retrieve_location_qualifier_value
def _retrieve_annotations
def _make_unicode_into_string
def _retrieve_qualifier_value
def _retrieve_reference
def _retrieve_taxon
def _retrieve_comment

Variables

 primary_id
 adaptor
 alphabet
 _length
 start

Function Documentation

def BioSQL.BioSeq.__add__ (   self,
  other 
)

Definition at line 121 of file BioSeq.py.

00121 
def __add__(self, other):
    """Return this sequence followed by ``other``.

    The database-backed sequence is first converted with ``toseq()``; the
    addition itself (including any alphabet handling) is then left to the
    object that ``toseq()`` returned.
    """
    as_seq = self.toseq()
    return as_seq + other

def BioSQL.BioSeq.__getitem__ (   self,
  index 
)

Definition at line 40 of file BioSeq.py.

00040 
def __getitem__(self, index):                 # Seq API requirement
    """Return one letter (integer index) or a sub-sequence (slice).

    Integer index: the single letter is fetched through the adaptor and
    returned as a string; IndexError is raised when the position is
    outside the sequence.

    Slice: negative bounds are mapped to their positive equivalent (a
    negative bound further back than the start raises IndexError) and
    bounds past the end are clamped to the length.  An empty range gives
    an empty Seq, a step of None or 1 gives a new object of this class
    with adjusted start/length (no sequence data is fetched), and any
    other step fetches the range and returns a strided Seq.

    Anything else raises ValueError("Unexpected index type").
    """
    #Note since Python 2.0, __getslice__ is deprecated and __getitem__
    #is used instead.
    #See http://docs.python.org/ref/sequence-methods.html
    if isinstance(index, int):
        size = self._length
        pos = index
        if pos < 0:
            if pos < -size:
                raise IndexError(pos)
            pos += size
        elif pos >= size:
            raise IndexError(pos)
        #Single letter, returned as a string
        offset = self.start + pos
        return self.adaptor.get_subseq_as_string(self.primary_id,
                                                 offset,
                                                 offset + 1)

    if not isinstance(index, slice):
        raise ValueError("Unexpected index type")

    size = self._length

    def to_offset(bound, default):
        #Turn one slice bound into an offset within [0, size]
        if bound is None:
            bound = default
        if bound < 0:
            if bound < -size:
                raise IndexError(bound)
            return bound + size
        if bound >= size:
            #Past the end, clamp
            return size
        return bound

    begin = to_offset(index.start, 0)
    end = to_offset(index.stop, size)

    if begin >= end:
        #Trivial case, empty sequence.
        return Seq("", self.alphabet)

    step = index.step
    if step is None or step == 1:
        #Easy case - same database entry, only the window changes
        return self.__class__(self.primary_id, self.adaptor, self.alphabet,
                              self.start + begin, end - begin)

    #A stride needs the actual letters, so the result is a plain Seq
    chunk = self.adaptor.get_subseq_as_string(self.primary_id,
                                              self.start + begin,
                                              self.start + end)
    return Seq(chunk[::step], self.alphabet)
        
def BioSQL.BioSeq.__init__ (   self,
  primary_id,
  adaptor,
  alphabet,
  start,
  length 
)
Create a new DBSeq object referring to a BioSQL entry.

You wouldn't normally create a DBSeq object yourself; this is done
for you when retrieving a DBSeqRecord object from the database.

Definition at line 25 of file BioSeq.py.

00025 
def __init__(self, primary_id, adaptor, alphabet, start, length):
    """Create a new DBSeq object referring to a BioSQL entry.

    You wouldn't normally create a DBSeq object yourself; this is done
    for you when retrieving a DBSeqRecord object from the database.

    The arguments are only stored here; no database access takes place.
    ``length`` is kept as the private ``_length`` attribute.
    """
    #Which entry, and the adaptor used to reach it
    self.primary_id, self.adaptor = primary_id, adaptor
    self.alphabet = alphabet
    #The window of the stored sequence that this object represents
    self.start, self._length = start, length

Here is the caller graph for this function:

def BioSQL.BioSeq.__len__ (   self)

Definition at line 37 of file BioSeq.py.

00037 
def __len__(self):
    """Return the stored sequence length (the ``_length`` attribute)."""
    return self._length
    
def BioSQL.BioSeq.__radd__ (   self,
  other 
)

Definition at line 125 of file BioSeq.py.

00125 
def __radd__(self, other):
    """Return ``other`` followed by this sequence.

    The database-backed sequence is first converted with ``toseq()``; the
    addition itself (including any alphabet handling) is then left to the
    objects being added.
    """
    as_seq = self.toseq()
    return other + as_seq
00129 

def BioSQL.BioSeq.__str__ (   self)
Returns the full sequence as a python string.

Definition at line 108 of file BioSeq.py.

00108 
def __str__(self):
    """Returns the full sequence as a python string."""
    #Ask the adaptor for everything between our start and start + length
    first = self.start
    last = first + self._length
    return self.adaptor.get_subseq_as_string(self.primary_id, first, last)

def BioSQL.BioSeq._make_unicode_into_string (   text) [private]

Definition at line 351 of file BioSeq.py.

00351 
def _make_unicode_into_string(text):
    """Return ``text`` as a plain ``str`` if it is a unicode object.

    Any other value is returned unchanged.  On interpreters that have no
    separate ``unicode`` builtin (Python 3) there is nothing to convert,
    so the value is returned as-is instead of failing with a NameError.
    """
    try:
        unicode_type = unicode
    except NameError:
        #No unicode builtin here, str is already the text type
        return text
    if isinstance(text, unicode_type):
        return str(text)
    return text

Here is the caller graph for this function:

def BioSQL.BioSeq._retrieve_annotations (   adaptor,
  primary_id,
  taxon_id 
) [private]

Definition at line 334 of file BioSeq.py.

00334 
def _retrieve_annotations(adaptor, primary_id, taxon_id):
    """Collect the annotations dictionary for one bioentry.

    Merges, in this order, the results of _retrieve_qualifier_value,
    _retrieve_reference, _retrieve_taxon and _retrieve_comment (later
    results overwrite earlier ones on a key clash), then converts unicode
    values, or unicode items inside list values, into plain strings.
    """
    merged = {}
    for part in (_retrieve_qualifier_value(adaptor, primary_id),
                 _retrieve_reference(adaptor, primary_id),
                 _retrieve_taxon(adaptor, primary_id, taxon_id),
                 _retrieve_comment(adaptor, primary_id)):
        merged.update(part)
    # Convert values into strings in cases of unicode from the database.
    # BioSQL could eventually be expanded to be unicode aware.
    str_anns = {}
    for key, val in merged.items():
        if isinstance(val, list):
            str_anns[key] = [_make_unicode_into_string(x) for x in val]
        else:
            str_anns[key] = _make_unicode_into_string(val)
    return str_anns

Here is the call graph for this function:

def BioSQL.BioSeq._retrieve_comment (   adaptor,
  primary_id 
) [private]

Definition at line 455 of file BioSeq.py.

00455 
def _retrieve_comment(adaptor, primary_id):
    """Fetch the comment texts of a bioentry, ordered by rank.

    Returns {"comment": [text, ...]} or an empty dictionary when the
    entry has no comments.
    """
    rows = adaptor.execute_and_fetchall(
        "SELECT comment_text FROM comment"
        " WHERE bioentry_id=%s"
        " ORDER BY rank", (primary_id,))
    comments = [row[0] for row in rows]
    if not comments:
        #Don't want to add an empty list...
        return {}
    return {"comment": comments}

Here is the caller graph for this function:

def BioSQL.BioSeq._retrieve_dbxrefs (   adaptor,
  primary_id 
) [private]
Retrieve the database cross references for the sequence.

Definition at line 183 of file BioSeq.py.

00183 
def _retrieve_dbxrefs(adaptor, primary_id):
    """Retrieve the database cross references for the sequence.

    Returns a list of "dbname:accession" strings in rank order; the
    accession is extended to "accession.version" when the version is
    set and is not "0".
    """
    rows = adaptor.execute_and_fetchall(
        "SELECT dbname, accession, version"
        " FROM bioentry_dbxref join dbxref using (dbxref_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank", (primary_id,))
    cross_refs = []
    for dbname, accession, version in rows:
        if not version or version == "0":
            label = accession
        else:
            label = "%s.%s" % (accession, version)
        cross_refs.append("%s:%s" % (dbname, label))
    return cross_refs

def BioSQL.BioSeq._retrieve_features (   adaptor,
  primary_id 
) [private]

Definition at line 199 of file BioSeq.py.

00199 
def _retrieve_features(adaptor, primary_id):
    """Build the SeqFeature objects stored for one bioentry.

    For every seqfeature row (in rank order) this loads its qualifiers,
    its db_xrefs (stored separately, added under the "db_xref" qualifier),
    its locations and any remote reference attached to a location.  A
    feature with one location gets that location directly; a feature with
    several locations gets one sub-feature per location plus an overall
    location running from the first start to the last end.

    Raises ValueError if a stored strand is not +1, -1 or 0, and issues a
    warning for a location whose end is before its start.
    """
    feature_rows = adaptor.execute_and_fetchall(
        "SELECT seqfeature_id, type.name, rank"
        " FROM seqfeature join term type on (type_term_id = type.term_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank", (primary_id,))
    seq_feature_list = []
    for seqfeature_id, seqfeature_type, seqfeature_rank in feature_rows:
        # Get qualifiers [except for db_xref which is stored separately]
        qualifier_rows = adaptor.execute_and_fetchall(
            "SELECT name, value"
            " FROM seqfeature_qualifier_value  join term using (term_id)"
            " WHERE seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        qualifiers = {}
        for qv_name, qv_value in qualifier_rows:
            qualifiers.setdefault(qv_name, []).append(qv_value)

        # Get db_xrefs [special case of qualifiers]
        xref_rows = adaptor.execute_and_fetchall(
            "SELECT dbxref.dbname, dbxref.accession"
            " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
            " WHERE seqfeature_dbxref.seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        for xref_db, xref_accession in xref_rows:
            qualifiers.setdefault("db_xref", []).append(
                "%s:%s" % (xref_db, xref_accession))

        # Get locations
        location_rows = adaptor.execute_and_fetchall(
            "SELECT location_id, start_pos, end_pos, strand"
            " FROM location"
            " WHERE seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        # Convert to Python standard form, and strand = 0 to strand = None.
        # re: comment in Loader.py:
        # Biopython uses None when we don't know strand information but
        # BioSQL requires something (non null) and sets this as zero
        # So we'll use the strand or 0 if Biopython spits out None
        locations = []
        for location_id, start, end, strand in location_rows:
            if start:
                start -= 1
            if strand == 0:
                strand = None
            if strand not in (+1, -1, None):
                raise ValueError("Invalid strand %s found in database for "
                                 "seqfeature_id %s" % (strand, seqfeature_id))
            if end < start:
                import warnings
                warnings.warn("Inverted location start/end (%i and %i) for "
                              "seqfeature_id %s" % (start, end, seqfeature_id))
            locations.append((location_id, start, end, strand))

        # Get possible remote reference information
        remote_rows = adaptor.execute_and_fetchall(
            "SELECT location_id, dbname, accession, version"
            " FROM location join dbxref using (dbxref_id)"
            " WHERE seqfeature_id = %s", (seqfeature_id,))
        lookup = {}
        for location_id, dbname, accession, version in remote_rows:
            if not version or version == "0":
                versioned = accession
            else:
                versioned = "%s.%s" % (accession, version)
            # subfeature remote location db_ref are stored as a empty
            # string when not present
            if dbname == "":
                dbname = None
            lookup[location_id] = (dbname, versioned)

        feature = SeqFeature.SeqFeature(type = seqfeature_type)
        #Store the key as a private property
        feature._seqfeature_id = seqfeature_id
        feature.qualifiers = qualifiers

        location_count = len(locations)
        if location_count == 1:
            location_id, start, end, strand = locations[0]
            #See Bug 2677, we currently don't record the location_operator
            #For consistency with older versions Biopython, default to "".
            feature.location_operator = \
                _retrieve_location_qualifier_value(adaptor, location_id)
            dbname, version = lookup.get(location_id, (None, None))
            feature.location = SeqFeature.FeatureLocation(start, end)
            feature.strand = strand
            feature.ref_db = dbname
            feature.ref = version
        elif location_count > 1:
            assert feature.sub_features == []
            for location_id, start, end, strand in locations:
                dbname, version = lookup.get(location_id, (None, None))
                subfeature = SeqFeature.SeqFeature()
                subfeature.type = seqfeature_type
                #TODO - See Bug 2677 - we don't yet record location_operator,
                #so for consistency with older versions of Biopython default
                #to assuming its a join.
                subfeature.location_operator = \
                    _retrieve_location_qualifier_value(adaptor, location_id) \
                    or "join"
                subfeature.location = SeqFeature.FeatureLocation(start, end)
                subfeature.strand = strand
                subfeature.ref_db = dbname
                subfeature.ref = version
                feature.sub_features.append(subfeature)
            # Assuming that the feature loc.op is the same as the sub_feature
            # loc.op:
            feature.location_operator = \
                feature.sub_features[0].location_operator
            # Locations are in order, but because of remote locations for
            # sub-features they are not necessarily in numerical order:
            feature.location = SeqFeature.FeatureLocation(locations[0][1],
                                                          locations[-1][2])
            # To get the parent strand (as done when parsing GenBank files),
            # need to consider evil mixed strand examples like this,
            # join(complement(69611..69724),139856..140087,140625..140650)
            strands = set(sf.strand for sf in feature.sub_features)
            if len(strands) == 1:
                feature.strand = feature.sub_features[0].strand
            else:
                feature.strand = None # i.e. mixed strands

        seq_feature_list.append(feature)

    return seq_feature_list

Here is the call graph for this function:

def BioSQL.BioSeq._retrieve_location_qualifier_value (   adaptor,
  location_id 
) [private]

Definition at line 325 of file BioSeq.py.

00325 
def _retrieve_location_qualifier_value(adaptor, location_id):
    """Return the first qualifier value stored for a location.

    An empty string is returned when the location has no qualifier value.
    """
    matches = adaptor.execute_and_fetch_col0(
        "SELECT value FROM location_qualifier_value"
        " WHERE location_id = %s", (location_id,))
    try:
        first = matches[0]
    except IndexError:
        return ""
    return first

Here is the caller graph for this function:

def BioSQL.BioSeq._retrieve_qualifier_value (   adaptor,
  primary_id 
) [private]

Definition at line 357 of file BioSeq.py.

00357 
def _retrieve_qualifier_value(adaptor, primary_id):
    """Fetch the qualifier values of a bioentry, grouped by term name.

    Returns a dictionary mapping each name to the list of its values in
    rank order.  Three names are renamed on the way out: "keyword" to
    "keywords", "date_changed" to "date" and "secondary_accession" to
    "accessions".
    """
    #See handling of "date" in Loader.py
    renamed = {"keyword": "keywords",
               "date_changed": "date",
               "secondary_accession": "accessions"}
    rows = adaptor.execute_and_fetchall(
        "SELECT name, value"
        " FROM bioentry_qualifier_value JOIN term USING (term_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank", (primary_id,))
    qualifiers = {}
    for name, value in rows:
        qualifiers.setdefault(renamed.get(name, name), []).append(value)
    return qualifiers

Here is the caller graph for this function:

def BioSQL.BioSeq._retrieve_reference (   adaptor,
  primary_id 
) [private]

Definition at line 372 of file BioSeq.py.

00372 
def _retrieve_reference(adaptor, primary_id):
    """Fetch the literature references of a bioentry, ordered by rank.

    Returns {'references': [Reference, ...]} or an empty dictionary when
    the entry has no references.  The stored start position is reduced by
    one (python counting) before the location is built.
    """
    # XXX dbxref_qualifier_value

    rows = adaptor.execute_and_fetchall(
        "SELECT start_pos, end_pos, "
        " location, title, authors,"
        " dbname, accession"
        " FROM bioentry_reference"
        " JOIN reference USING (reference_id)"
        " LEFT JOIN dbxref USING (dbxref_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank", (primary_id,))
    references = []
    for start, end, location, title, authors, dbname, accession in rows:
        reference = SeqFeature.Reference()
        #If the start/end are missing, reference.location is an empty list
        if not (start is None and end is None):
            if start is not None:
                start -= 1 #python counting
            reference.location = [SeqFeature.FeatureLocation(start, end)]
        #Don't replace the default "" with None.
        if authors:
            reference.authors = authors
        if title:
            reference.title = title
        reference.journal = location
        if dbname == 'PUBMED':
            reference.pubmed_id = accession
        elif dbname == 'MEDLINE':
            reference.medline_id = accession
        references.append(reference)
    if not references:
        return {}
    return {'references': references}

Here is the caller graph for this function:

def BioSQL.BioSeq._retrieve_seq (   adaptor,
  primary_id 
) [private]

Definition at line 130 of file BioSeq.py.

00130 
def _retrieve_seq(adaptor, primary_id):
    """Return the sequence object for a bioentry, or None if it has none.

    Returns a DBSeq (sequence data is left in the database) when the
    biosequence row holds a sequence, and an UnknownSeq of the recorded
    length when only the length was stored.

    Raises AssertionError when the alphabet stored in the row is not one
    of dna, rna, protein or unknown (case-insensitive), or when the
    recorded length disagrees with the length of the stored sequence.
    """
    #The database schema ensures there will be only one matching
    #row in the table.

    #If an UnknownSeq was recorded, seq will be NULL,
    #but length will be populated.  This means length(seq)
    #will return None.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence"
        " WHERE bioentry_id = %s", (primary_id,))
    if not seqs:
        return None
    assert len(seqs) == 1
    moltype, given_length, length = seqs[0]

    try:
        length = int(length)
    except TypeError:
        #length(seq) was NULL, so only the length column is available
        assert length is None
        seqs = adaptor.execute_and_fetchall(
            "SELECT alphabet, length, seq FROM biosequence"
            " WHERE bioentry_id = %s", (primary_id,))
        assert len(seqs) == 1
        moltype, given_length, seq = seqs[0]
        assert seq is None or seq == ""
        length = int(given_length)
        have_seq = False
    else:
        #Compare against the length column itself.  (This used to convert
        #the computed length twice, so the check could never fail.)  A row
        #with no recorded length is accepted as before.  Raised explicitly
        #so the check is not stripped when running with -O.
        if given_length is not None and int(given_length) != length:
            raise AssertionError(
                "Recorded length %s does not match sequence length %i"
                % (given_length, length))
        have_seq = True

    moltype = moltype.lower() #might be upper case in database
    #We have no way of knowing if these sequences will use IUPAC
    #alphabets, and we certainly can't assume they are unambiguous!
    if moltype == "dna":
        alphabet = Alphabet.generic_dna
    elif moltype == "rna":
        alphabet = Alphabet.generic_rna
    elif moltype == "protein":
        alphabet = Alphabet.generic_protein
    elif moltype == "unknown":
        #This is used in BioSQL/Loader.py and would happen
        #for any generic or nucleotide alphabets.
        alphabet = Alphabet.single_letter_alphabet
    else:
        raise AssertionError("Unknown moltype: %s" % moltype)

    if have_seq:
        return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
    else:
        return UnknownSeq(length, alphabet)

def BioSQL.BioSeq._retrieve_taxon (   adaptor,
  primary_id,
  taxon_id 
) [private]

Definition at line 405 of file BioSeq.py.

00405 
def _retrieve_taxon(adaptor, primary_id, taxon_id):
    """Build the taxonomy related annotations for a taxon.

    The returned dictionary may hold 'source' (genbank common name),
    'organism' (scientific name), 'ncbi_taxid' and 'taxonomy' (lineage
    names, most distant ancestor first); each key is only present when a
    value was found.  ``primary_id`` is not used.
    """
    annotations = {}
    common_names = adaptor.execute_and_fetch_col0(
        "SELECT name FROM taxon_name WHERE taxon_id = %s"
        " AND name_class = 'genbank common name'", (taxon_id,))
    if common_names:
        annotations['source'] = common_names[0]
    scientific_names = adaptor.execute_and_fetch_col0(
        "SELECT name FROM taxon_name WHERE taxon_id = %s"
        " AND name_class = 'scientific name'", (taxon_id,))
    if scientific_names:
        annotations['organism'] = scientific_names[0]
    ncbi_taxids = adaptor.execute_and_fetch_col0(
        "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,))
    if ncbi_taxids:
        ncbi_taxid = ncbi_taxids[0]
        if ncbi_taxid and ncbi_taxid != "0":
            annotations['ncbi_taxid'] = ncbi_taxid

    #Old code used the left/right values in the taxon table to get the
    #taxonomy lineage in one SQL command.  This was actually very slow,
    #and would fail if the (optional) left/right values were missing.
    #
    #The following code is based on a contribution from Eric Gibert, and
    #relies on the taxon table's parent_taxon_id field only (ignoring the
    #optional left/right values).  This means that it has to make a
    #separate SQL query for each entry in the lineage, but it does still
    #appear to be *much* faster.  See Bug 2494.
    lineage = []  #collected child first, reversed at the end
    current_id = taxon_id
    while current_id:
        name, rank, parent_id = adaptor.execute_one(
            "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id"
            " FROM taxon, taxon_name"
            " WHERE taxon.taxon_id=taxon_name.taxon_id"
            " AND taxon_name.name_class='scientific name'"
            " AND taxon.taxon_id = %s", (current_id,))
        if current_id == parent_id:
            # If the taxon table has been populated by the BioSQL script
            # load_ncbi_taxonomy.pl this is how top parent nodes are stored.
            # Personally, I would have used a NULL parent_taxon_id here.
            break
        if rank != "no rank":
            #For consistency with older versions of Biopython, we are only
            #interested in taxonomy entries with a stated rank.
            lineage.append(name)
        current_id = parent_id

    if lineage:
        lineage.reverse()
        annotations['taxonomy'] = lineage
    return annotations

Here is the caller graph for this function:

def BioSQL.BioSeq.toseq (   self)
Returns the full sequence as a Seq object.

Definition at line 116 of file BioSeq.py.

00116 
def toseq(self):
    """Returns the full sequence as a Seq object."""
    #Note - the method name copies that of the MutableSeq object
    letters = str(self)
    return Seq(letters, self.alphabet)

def BioSQL.BioSeq.tostring (   self)
Returns the full sequence as a python string.

Although not formally deprecated, you are now encouraged to use
str(my_seq) instead of my_seq.tostring().

Definition at line 100 of file BioSeq.py.

00100 
def tostring(self):
    """Returns the full sequence as a python string.

    Although not formally deprecated, you are now encouraged to use
    str(my_seq) instead of my_seq.tostring()."""
    #Ask the adaptor for everything between our start and start + length
    begin = self.start
    stop = begin + self._length
    return self.adaptor.get_subseq_as_string(self.primary_id, begin, stop)

Here is the caller graph for this function:


Variable Documentation

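BioSQL.BioSeq._length

Definition at line 34 of file BioSeq.py.

BioSQL.BioSeq.adaptor

Definition at line 32 of file BioSeq.py.

BioSQL.BioSeq.alphabet

Definition at line 33 of file BioSeq.py.

BioSQL.BioSeq.primary_id

Definition at line 31 of file BioSeq.py.

BioSQL.BioSeq.start

Definition at line 35 of file BioSeq.py.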