Back to index

python-biopython  1.60
Public Member Functions | Public Attributes | Private Member Functions
BioSQL.Loader.DatabaseLoader Class Reference

List of all members.

Public Member Functions

def __init__
def load_seqrecord

Public Attributes

 adaptor
 dbid
 fetch_NCBI_taxonomy

Private Member Functions

def _get_ontology_id
def _get_term_id
def _add_dbxref
def _get_taxon_id
def _fix_name_class
def _get_taxon_id_from_ncbi_taxon_id
def _get_taxon_id_from_ncbi_lineage
def _load_bioentry_table
def _load_bioentry_date
def _load_biosequence
def _load_comment
def _load_annotations
def _load_reference
def _load_seqfeature
def _load_seqfeature_basic
def _load_seqfeature_locations
def _insert_seqfeature_location
def _load_seqfeature_qualifiers
def _load_seqfeature_dbxref
def _get_dbxref_id
def _get_seqfeature_dbxref
def _add_seqfeature_dbxref
def _load_dbxrefs
def _get_bioentry_dbxref
def _add_bioentry_dbxref

Detailed Description

Object used to load SeqRecord objects into a BioSQL database.

Definition at line 29 of file Loader.py.


Constructor & Destructor Documentation

def BioSQL.Loader.DatabaseLoader.__init__ (   self,
  adaptor,
  dbid,
  fetch_NCBI_taxonomy = False 
)
Initialize with connection information for the database.

Creating a DatabaseLoader object is normally handled via the
BioSeqDatabase DBServer object, for example:

from BioSQL import BioSeqDatabase
server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse",
         passwd = "biosql", host = "localhost", db="test_biosql")
try:
    db = server["test"]
except KeyError:
    db = server.new_database("test", description="For testing GBrowse")

Definition at line 31 of file Loader.py.

00031 
00032     def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
00033         """Initialize with connection information for the database.
00034 
00035         Creating a DatabaseLoader object is normally handled via the
00036         BioSeqDatabase DBServer object, for example:
00037 
00038         from BioSQL import BioSeqDatabase
00039         server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse",
00040                          passwd = "biosql", host = "localhost", db="test_biosql")
00041         try:
00042             db = server["test"]
00043         except KeyError:
00044             db = server.new_database("test", description="For testing GBrowse")
00045         """
00046         self.adaptor = adaptor
00047         self.dbid = dbid
00048         self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
    

Member Function Documentation

def BioSQL.Loader.DatabaseLoader._add_bioentry_dbxref (   self,
  bioentry_id,
  dbxref_id,
  rank 
) [private]
Insert a bioentry_dbxref row and return the bioentry_id and
    dbxref_id

Definition at line 1032 of file Loader.py.

01032 
01033     def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
01034         """ Insert a bioentry_dbxref row and return the seqfeature_id and
01035             dbxref_id
01036         """
01037         sql = r'INSERT INTO bioentry_dbxref ' \
01038               '(bioentry_id,dbxref_id,rank) VALUES ' \
01039               '(%s, %s, %s)'
01040         self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
01041         return (bioentry_id, dbxref_id)
            

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._add_dbxref (   self,
  dbname,
  accession,
  version 
) [private]
Insert a dbxref and return its id.

Definition at line 122 of file Loader.py.

00122 
00123     def _add_dbxref(self, dbname, accession, version):
00124        """Insert a dbxref and return its id."""
00125        
00126        self.adaptor.execute(
00127            "INSERT INTO dbxref(dbname, accession, version)" \
00128            " VALUES (%s, %s, %s)", (dbname, accession, version))
00129        return self.adaptor.last_id("dbxref")
           

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._add_seqfeature_dbxref (   self,
  seqfeature_id,
  dbxref_id,
  rank 
) [private]
Insert a seqfeature_dbxref row and return the seqfeature_id and
    dbxref_id

Definition at line 981 of file Loader.py.

00981 
00982     def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
00983         """ Insert a seqfeature_dbxref row and return the seqfeature_id and
00984             dbxref_id
00985         """
00986         sql = r'INSERT INTO seqfeature_dbxref ' \
00987               '(seqfeature_id, dbxref_id, rank) VALUES' \
00988               r'(%s, %s, %s)'
00989         self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
00990         return (seqfeature_id, dbxref_id)

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._fix_name_class (   self,
  entrez_name 
) [private]
Map Entrez name terms to those used in taxdump (PRIVATE).

We need to make this conversion to match the taxon_name.name_class
values used by the BioSQL load_ncbi_taxonomy.pl script.

e.g.
"ScientificName" -> "scientific name",
"EquivalentName" -> "equivalent name",
"Synonym" -> "synonym",

Definition at line 297 of file Loader.py.

00297 
00298     def _fix_name_class(self, entrez_name):
00299         """Map Entrez name terms to those used in taxdump (PRIVATE).
00300 
00301         We need to make this conversion to match the taxon_name.name_class
00302         values used by the BioSQL load_ncbi_taxonomy.pl script.
00303         
00304         e.g.
00305         "ScientificName" -> "scientific name",
00306         "EquivalentName" -> "equivalent name",
00307         "Synonym" -> "synonym",
00308         """
00309         #Add any special cases here:
00310         #
00311         #known = {}
00312         #try:
00313         #    return known[entrez_name]
00314         #except KeyError:
00315         #    pass
00316 
00317         #Try automatically by adding spaces before each capital
00318         def add_space(letter):
00319             if letter.isupper():
00320                 return " "+letter.lower()
00321             else:
00322                 return letter
00323         answer = "".join([add_space(letter) for letter in entrez_name]).strip()
00324         assert answer == answer.lower()
00325         return answer

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_bioentry_dbxref (   self,
  bioentry_id,
  dbxref_id,
  rank 
) [private]
Check for a pre-existing bioentry_dbxref entry with the passed
    bioentry_id and dbxref_id.  If one does not exist, insert new
    data

Definition at line 1015 of file Loader.py.

01015 
01016     def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
01017         """ Check for a pre-existing bioentry_dbxref entry with the passed
01018             seqfeature_id and dbxref_id.  If one does not exist, insert new
01019             data
01020 
01021         """
01022         # Check for an existing record
01023         sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \
01024               r"WHERE bioentry_id = %s AND dbxref_id = %s"
01025         result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id,
01026                                                            dbxref_id))
01027         # If there was a record, return without executing anything, else create
01028         # the record and return
01029         if result:
01030             return result
01031         return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_dbxref_id (   self,
  db,
  accession 
) [private]
_get_dbxref_id(self, db, accession) -> Int

    o db          String, the name of the external database containing
          the accession number

    o accession   String, the accession of the dbxref data

    Finds and returns the dbxref_id for the passed data.  The method
    attempts to find an existing record first, and inserts the data
    if there is no record.

Definition at line 942 of file Loader.py.

00942 
00943     def _get_dbxref_id(self, db, accession):
00944         """ _get_dbxref_id(self, db, accession) -> Int
00945 
00946             o db          String, the name of the external database containing
00947                           the accession number
00948 
00949             o accession   String, the accession of the dbxref data
00950 
00951             Finds and returns the dbxref_id for the passed data.  The method
00952             attempts to find an existing record first, and inserts the data
00953             if there is no record.
00954         """
00955         # Check for an existing record
00956         sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \
00957               r'AND accession = %s'
00958         dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
00959         # If there was a record, return the dbxref_id, else create the
00960         # record and return the created dbxref_id
00961         if dbxref_id:
00962             return dbxref_id[0]
00963         return self._add_dbxref(db, accession, 0)

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_ontology_id (   self,
  name,
  definition = None 
) [private]
Returns the identifier for the named ontology (PRIVATE).

This looks through the ontology table for the given entry name.
If it is not found, a row is added for this ontology (using the
definition if supplied).  In either case, the id corresponding to
the provided name is returned, so that you can reference it in
another table.

Definition at line 65 of file Loader.py.

00065 
00066     def _get_ontology_id(self, name, definition=None):
00067         """Returns the identifier for the named ontology (PRIVATE).
00068 
00069         This looks through the onotology table for a the given entry name.
00070         If it is not found, a row is added for this ontology (using the
00071         definition if supplied).  In either case, the id corresponding to
00072         the provided name is returned, so that you can reference it in
00073         another table.
00074         """
00075         oids = self.adaptor.execute_and_fetch_col0(
00076             "SELECT ontology_id FROM ontology WHERE name = %s",
00077             (name,))
00078         if oids:
00079             return oids[0]
00080         self.adaptor.execute(
00081             "INSERT INTO ontology(name, definition) VALUES (%s, %s)",
00082             (name, definition))
00083         return self.adaptor.last_id("ontology")
00084 
    

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_seqfeature_dbxref (   self,
  seqfeature_id,
  dbxref_id,
  rank 
) [private]
Check for a pre-existing seqfeature_dbxref entry with the passed
    seqfeature_id and dbxref_id.  If one does not exist, insert new
    data

Definition at line 964 of file Loader.py.

00964 
00965     def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
00966         """ Check for a pre-existing seqfeature_dbxref entry with the passed
00967             seqfeature_id and dbxref_id.  If one does not exist, insert new
00968             data
00969 
00970         """
00971         # Check for an existing record
00972         sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \
00973               r"WHERE seqfeature_id = %s AND dbxref_id = %s"
00974         result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id,
00975                                                            dbxref_id))
00976         # If there was a record, return without executing anything, else create
00977         # the record and return
00978         if result:
00979             return result
00980         return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_taxon_id (   self,
  record 
) [private]
Get the taxon id for this record (PRIVATE).

record - a SeqRecord object

This searches the taxon/taxon_name tables using the
NCBI taxon ID, scientific name and common name to find
the matching taxon table entry's id.

If the species isn't in the taxon table, and we have at least
one of the NCBI taxon ID, scientific name or common name,
a minimal stub entry is created in the table.

Returns the taxon id (database key for the taxon table,
not an NCBI taxon ID), or None if the taxonomy information
is missing.

See also the BioSQL script load_ncbi_taxonomy.pl which
will populate and update the taxon/taxon_name tables
with the latest information from the NCBI.

Definition at line 130 of file Loader.py.

00130 
00131     def _get_taxon_id(self, record):
00132         """Get the taxon id for this record (PRIVATE).
00133 
00134         record - a SeqRecord object
00135 
00136         This searches the taxon/taxon_name tables using the
00137         NCBI taxon ID, scientific name and common name to find
00138         the matching taxon table entry's id.
00139         
00140         If the species isn't in the taxon table, and we have at
00141         least the NCBI taxon ID, scientific name or common name,
00142         at least a minimal stub entry is created in the table.
00143 
00144         Returns the taxon id (database key for the taxon table,
00145         not an NCBI taxon ID), or None if the taxonomy information
00146         is missing.
00147 
00148         See also the BioSQL script load_ncbi_taxonomy.pl which
00149         will populate and update the taxon/taxon_name tables
00150         with the latest information from the NCBI.
00151         """
00152         
00153         # To find the NCBI taxid, first check for a top level annotation
00154         ncbi_taxon_id = None
00155         if "ncbi_taxid" in record.annotations:
00156             #Could be a list of IDs.
00157             if isinstance(record.annotations["ncbi_taxid"],list):
00158                 if len(record.annotations["ncbi_taxid"])==1:
00159                     ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
00160             else:
00161                 ncbi_taxon_id = record.annotations["ncbi_taxid"]
00162         if not ncbi_taxon_id:
00163             # Secondly, look for a source feature
00164             for f in record.features:
00165                 if f.type == 'source':
00166                     quals = getattr(f, 'qualifiers', {})
00167                     if "db_xref" in quals:
00168                         for db_xref in f.qualifiers["db_xref"]:
00169                             if db_xref.startswith("taxon:"):
00170                                 ncbi_taxon_id = int(db_xref[6:])
00171                                 break
00172                 if ncbi_taxon_id: break
00173 
00174         try:
00175             scientific_name = record.annotations["organism"][:255]
00176         except KeyError:
00177             scientific_name = None
00178         try:
00179             common_name = record.annotations["source"][:255]
00180         except KeyError:
00181             common_name = None
00182         # Note: The maximum length for taxon names in the schema is 255.
00183         # Cropping it now should help in getting a match when searching,
00184         # and avoids an error if we try and add these to the database.
00185 
00186 
00187         if ncbi_taxon_id:
00188             #Good, we have the NCBI taxon to go on - this is unambiguous :)
00189             #Note that the scientific name and common name will only be
00190             #used if we have to record a stub entry.
00191             return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id,
00192                                                          scientific_name,
00193                                                          common_name)
00194         
00195         if not common_name and not scientific_name:
00196             # Nothing to go on... and there is no point adding
00197             # a new entry to the database.  We'll just leave this
00198             # sequence's taxon as a NULL in the database.
00199             return None
00200 
00201         # Next, we'll try to find a match based on the species name
00202         # (stored in GenBank files as the organism and/or the source).
00203         if scientific_name:
00204             taxa = self.adaptor.execute_and_fetch_col0(
00205                 "SELECT taxon_id FROM taxon_name" \
00206                 " WHERE name_class = 'scientific name' AND name = %s",
00207                 (scientific_name,))
00208             if taxa:
00209                 #Good, mapped the scientific name to a taxon table entry
00210                 return taxa[0]
00211 
00212         # Last chance...
00213         if common_name:
00214             taxa = self.adaptor.execute_and_fetch_col0(
00215                 "SELECT DISTINCT taxon_id FROM taxon_name" \
00216                 " WHERE name = %s",
00217                 (common_name,))
00218             #Its natural that several distinct taxa will have the same common
00219             #name - in which case we can't resolve the taxon uniquely.
00220             if len(taxa) > 1:
00221                 raise ValueError("Taxa: %d species have name %r" % (
00222                     len(taxa),
00223                     common_name))
00224             if taxa:
00225                 #Good, mapped the common name to a taxon table entry
00226                 return taxa[0]
00227 
00228         # At this point, as far as we can tell, this species isn't
00229         # in the taxon table already.  So we'll have to add it.
00230         # We don't have an NCBI taxonomy ID, so if we do record just
00231         # a stub entry, there is no simple way to fix this later.
00232         #
00233         # TODO - Should we try searching the NCBI taxonomy using the
00234         # species name?
00235         #
00236         # OK, let's try inserting the species.
00237         # Chances are we don't have enough information ...
00238         # Furthermore, it won't be in the hierarchy.
00239 
00240         lineage = []
00241         for c in record.annotations.get("taxonomy", []):
00242             lineage.append([None, None, c])
00243         if lineage:
00244             lineage[-1][1] = "genus"
00245         lineage.append([None, "species", record.annotations["organism"]])
00246         # XXX do we have them?
00247         if "subspecies" in record.annotations:
00248             lineage.append([None, "subspecies",
00249                             record.annotations["subspecies"]])
00250         if "variant" in record.annotations:
00251             lineage.append([None, "varietas",
00252                             record.annotations["variant"]])
00253         lineage[-1][0] = ncbi_taxon_id
00254         
00255         left_value = self.adaptor.execute_one(
00256             "SELECT MAX(left_value) FROM taxon")[0]
00257         if not left_value:
00258             left_value = 0
00259         left_value += 1
00260         
00261         # XXX -- Brad: Fixing this for now in an ugly way because
00262         # I am getting overlaps for right_values. I need to dig into this
00263         # more to actually understand how it works. I'm not sure it is
00264         # actually working right anyhow.
00265         right_start_value = self.adaptor.execute_one(
00266             "SELECT MAX(right_value) FROM taxon")[0]
00267         if not right_start_value:
00268             right_start_value = 0
00269         right_value = right_start_value + 2 * len(lineage) - 1
00270 
00271         parent_taxon_id = None
00272         for taxon in lineage:
00273             self.adaptor.execute(
00274                 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
00275                 " left_value, right_value)" \
00276                 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id,
00277                                                  taxon[0],
00278                                                  taxon[1],
00279                                                  left_value,
00280                                                  right_value))
00281             taxon_id = self.adaptor.last_id("taxon")
00282             self.adaptor.execute(
00283                 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00284                 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255]))
00285             #Note the name field is limited to 255, some SwissProt files
00286             #have a multi-species name which can be longer.  So truncate this.
00287             left_value += 1
00288             right_value -= 1
00289             parent_taxon_id = taxon_id
00290         if common_name:
00291             self.adaptor.execute(
00292                 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00293                 "VALUES (%s, %s, 'common name')", (
00294                 taxon_id, common_name))
00295 
00296         return taxon_id

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_taxon_id_from_ncbi_lineage (   self,
  taxonomic_lineage 
) [private]
This is recursive! (PRIVATE).

taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez

The first dictionary in the list is the taxonomy root; the last would be the species.
Each dictionary includes:
- TaxID (string, NCBI taxon id)
- Rank (string, e.g. "species", "genus", ..., "phylum", ...)
- ScientificName (string)
(and that is all at the time of writing)

This method will record all the lineage given, returning the taxon id
(database key, not NCBI taxon id) of the final entry (the species).

Definition at line 437 of file Loader.py.

00437 
00438     def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
00439         """This is recursive! (PRIVATE).
00440 
00441         taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
00442 
00443         First dictionary in list is the taxonomy root, highest would be the species.
00444         Each dictionary includes:
00445         - TaxID (string, NCBI taxon id)
00446         - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
00447         - ScientificName (string)
00448         (and that is all at the time of writing)
00449 
00450         This method will record all the lineage given, returning the the taxon id
00451         (database key, not NCBI taxon id) of the final entry (the species).
00452         """
00453         ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"]
00454 
00455         #Is this in the database already?  Check the taxon table...
00456         taxon_id = self.adaptor.execute_and_fetch_col0(
00457             "SELECT taxon_id FROM taxon" \
00458             " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id)
00459         if taxon_id:
00460             # we could verify that the Scientific Name etc in the database
00461             # is the same and update it or print a warning if not...
00462             if isinstance(taxon_id, list):
00463                 assert len(taxon_id)==1
00464                 return taxon_id[0]
00465             else:
00466                 return taxon_id
00467 
00468         #We have to record this.
00469         if len(taxonomic_lineage) > 1:
00470             #Use recursion to find out the taxon id (database key) of the parent.
00471             parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
00472             assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id)
00473         else:
00474             parent_taxon_id = None
00475 
00476         # INSERT new taxon
00477         rank = taxonomic_lineage[-1].get("Rank", None)
00478         self.adaptor.execute(
00479                 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"\
00480                 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank))
00481         taxon_id = self.adaptor.last_id("taxon")
00482         assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id)
00483         # ... and its name in taxon_name
00484         scientific_name = taxonomic_lineage[-1].get("ScientificName", None)
00485         if scientific_name:
00486             self.adaptor.execute(
00487                     "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00488                     " VALUES (%s, %s, 'scientific name')", (taxon_id, 
00489                                                             scientific_name[:255]))
00490         return taxon_id
00491 

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_taxon_id_from_ncbi_taxon_id (   self,
  ncbi_taxon_id,
  scientific_name = None,
  common_name = None 
) [private]
Get the taxon id for this record from the NCBI taxon ID (PRIVATE).

ncbi_taxon_id - NCBI taxon id (string or integer)
scientific_name - string, used if a stub entry is recorded
common_name - string, used if a stub entry is recorded

This searches the taxon table using ONLY the NCBI taxon ID
to find the matching taxon table entry's ID (database key).

If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
flag is true, Biopython will attempt to go online using Bio.Entrez
to fetch the official NCBI lineage, recursing up the tree until an
existing entry is found in the database or the full lineage has been
fetched.

Otherwise the NCBI taxon ID, scientific name and common name are
recorded as a minimal stub entry in the taxon and taxon_name tables.
Any partial information about the lineage from the SeqRecord is NOT
recorded.  This should mean that (re)running the BioSQL script
load_ncbi_taxonomy.pl can fill in the taxonomy lineage.

Returns the taxon id (database key for the taxon table, not
an NCBI taxon ID).

Definition at line 328 of file Loader.py.

00328 
00329                                          common_name = None):
00330         """Get the taxon id for this record from the NCBI taxon ID (PRIVATE).
00331 
00332         ncbi_taxon_id - string containing an NCBI taxon id
00333         scientific_name - string, used if a stub entry is recorded
00334         common_name - string, used if a stub entry is recorded
00335         
00336         This searches the taxon table using ONLY the NCBI taxon ID
00337         to find the matching taxon table entry's ID (database key).
00338         
00339         If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
00340         flag is true, Biopython will attempt to go online using Bio.Entrez
00341         to fetch the official NCBI lineage, recursing up the tree until an
00342         existing entry is found in the database or the full lineage has been
00343         fetched.
00344 
00345         Otherwise the NCBI taxon ID, scientific name and common name are
00346         recorded as a minimal stub entry in the taxon and taxon_name tables.
00347         Any partial information about the lineage from the SeqRecord is NOT
00348         recorded.  This should mean that (re)running the BioSQL script
00349         load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
00350 
00351         Returns the taxon id (database key for the taxon table, not
00352         an NCBI taxon ID).
00353         """
00354         assert ncbi_taxon_id
00355 
00356         taxon_id = self.adaptor.execute_and_fetch_col0(
00357             "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s",
00358             (ncbi_taxon_id,))
00359         if taxon_id:
00360             #Good, we have mapped the NCBI taxid to a taxon table entry
00361             return taxon_id[0]
00362 
00363         # At this point, as far as we can tell, this species isn't
00364         # in the taxon table already.  So we'll have to add it.
00365 
00366         parent_taxon_id = None
00367         rank = "species"
00368         genetic_code = None
00369         mito_genetic_code = None
00370         species_names = []
00371         if scientific_name:
00372             species_names.append(("scientific name", scientific_name))
00373         if common_name:
00374             species_names.append(("common name", common_name))
00375         
00376         if self.fetch_NCBI_taxonomy:
00377             #Go online to get the parent taxon ID!
00378             handle = Entrez.efetch(db="taxonomy",id=ncbi_taxon_id,retmode="XML")
00379             taxonomic_record = Entrez.read(handle)
00380             if len(taxonomic_record) == 1:
00381                 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \
00382                        "%s versus %s" % (taxonomic_record[0]["TaxId"],
00383                                          ncbi_taxon_id)
00384                 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( \
00385                                             taxonomic_record[0]["LineageEx"])
00386                 rank = taxonomic_record[0]["Rank"]
00387                 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"]
00388                 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"]
00389                 species_names = [("scientific name",
00390                                   taxonomic_record[0]["ScientificName"])]
00391                 try:
00392                     for name_class, names in taxonomic_record[0]["OtherNames"].iteritems():
00393                         name_class = self._fix_name_class(name_class)
00394                         if not isinstance(names, list):
00395                             #The Entrez parser seems to return single entry
00396                             #lists as just a string which is annoying.
00397                             names = [names]
00398                         for name in names:
00399                             #Want to ignore complex things like ClassCDE entries
00400                             if isinstance(name, basestring):
00401                                 species_names.append((name_class, name))
00402                 except KeyError:
00403                     #OtherNames isn't always present,
00404                     #e.g. NCBI taxon 41205, Bromheadia finlaysoniana
00405                     pass
00406         else:
00407             pass
00408             # If we are not allowed to go online, we will record the bare minimum;
00409             # as long as the NCBI taxon id is present, then (re)running
00410             # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage
00411             # (and update the species names).
00412             #
00413             # I am NOT going to try and record the lineage, even if it
00414             # is in the record annotation as a list of names, as we won't
00415             # know the NCBI taxon IDs for these parent nodes.
00416 
00417         self.adaptor.execute(
00418             "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
00419             " genetic_code, mito_genetic_code, left_value, right_value)" \
00420             " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id,
00421                                                      ncbi_taxon_id,
00422                                                      rank,
00423                                                      genetic_code,
00424                                                      mito_genetic_code,
00425                                                      None,
00426                                                      None))
00427         taxon_id = self.adaptor.last_id("taxon")
00428 
00429         #Record the scientific name, common name, etc
00430         for name_class, name in species_names:
00431             self.adaptor.execute(
00432                 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00433                 " VALUES (%s, %s, %s)", (taxon_id, 
00434                                          name[:255], 
00435                                          name_class))
00436         return taxon_id

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._get_term_id (   self,
  name,
  ontology_id = None,
  definition = None,
  identifier = None 
) [private]
Get the id that corresponds to a term (PRIVATE).

This looks through the term table for the given term. If it
is not found, a new id corresponding to this term is created.
In either case, the id corresponding to that term is returned, so
that you can reference it in another table.

The ontology_id should be used to disambiguate the term.

Definition at line 89 of file Loader.py.

00089 
00090                      identifier=None):
00091         """Get the id that corresponds to a term (PRIVATE).
00092 
00093         This looks through the term table for the given term. If it
00094         is not found, a new id corresponding to this term is created.
00095         In either case, the id corresponding to that term is returned, so
00096         that you can reference it in another table.
00097 
00098         The ontology_id should be used to disambiguate the term.
00099         """
00100 
00101         # try to get the term id
00102         sql = r"SELECT term_id FROM term " \
00103               r"WHERE name = %s"
00104         fields = [name]
00105         if ontology_id:
00106             sql += ' AND ontology_id = %s'
00107             fields.append(ontology_id)
00108         id_results = self.adaptor.execute_and_fetchall(sql, fields)
00109         # something is wrong
00110         if len(id_results) > 1:
00111             raise ValueError("Multiple term ids for %s: %r" % 
00112                              (name, id_results))
00113         elif len(id_results) == 1:
00114             return id_results[0][0]
00115         else:
00116             sql = r"INSERT INTO term (name, definition," \
00117                   r" identifier, ontology_id)" \
00118                   r" VALUES (%s, %s, %s, %s)"
00119             self.adaptor.execute(sql, (name, definition,
00120                                        identifier, ontology_id))
00121             return self.adaptor.last_id("term")

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._insert_seqfeature_location (   self,
  feature,
  rank,
  seqfeature_id 
) [private]
Add a location of a SeqFeature to the seqfeature_location table (PRIVATE).

TODO - Add location_operators to location_qualifier_value.

Definition at line 810 of file Loader.py.

00810 
00811     def _insert_seqfeature_location(self, feature, rank, seqfeature_id):
00812         """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE).
00813 
00814         TODO - Add location_operators to location_qualifier_value.
00815         """
00816         # convert biopython locations to the 1-based location system
00817         # used in bioSQL
00818         # XXX This could also handle fuzzies
00819         start = int(feature.location.start) + 1
00820         end = int(feature.location.end)
00821 
00822         # Biopython uses None when we don't know strand information but
00823         # BioSQL requires something (non null) and sets this as zero
00824         # So we'll use the strand or 0 if Biopython spits out None
00825         strand = feature.strand or 0
00826 
00827         # TODO - Record an ontology term for the location (location.term_id)
00828         # which for now like BioPerl we'll leave as NULL.
00829         # This might allow us to record "between" positions properly, but I
00830         # doesn't really see how it could work for before/after fuzzy positions
00831         loc_term_id = None
00832 
00833         if feature.ref:
00834             # sub_feature remote locations when they are in the same db as the current
00835             # record do not have a value for ref_db, which the SeqFeature object
00836             # stores as None. BioSQL schema requires a varchar and is not NULL 
00837             dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref)
00838         else:
00839             dbxref_id = None
00840 
00841         sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \
00842               r"start_pos, end_pos, strand, rank) " \
00843               r"VALUES (%s, %s, %s, %s, %s, %s, %s)"
00844         self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id,
00845                                    start, end, strand, rank))
00846 
00847         """
00848         # See Bug 2677
00849         # TODO - Record the location_operator (e.g. "join" or "order")
00850         # using the location_qualifier_value table (which we and BioPerl
00851         # have historically left empty).
00852         # Note this will need an ontology term for the location qualifer
00853         # (location_qualifier_value.term_id) for which oddly the schema
00854         # does not allow NULL.
00855         if feature.location_operator:
00856             #e.g. "join" (common),
00857             #or "order" (see Tests/GenBank/protein_refseq2.gb)
00858             location_id = self.adaptor.last_id('location')
00859             loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
00860             sql = r"INSERT INTO location_qualifier_value" \
00861                   r"(location_id, term_id, value)" \
00862                   r"VALUES (%s, %s, %s)"
00863             self.adaptor.execute(sql, (location_id, loc_qual_term_id,
00864                                        feature.location_operator))
00865         """

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_annotations (   self,
  record,
  bioentry_id 
) [private]
Record a SeqRecord's misc annotations in the database (PRIVATE).

The annotation strings are recorded in the bioentry_qualifier_value
table, except for special cases like the reference, comment and
taxonomy which are handled with their own tables.

record - a SeqRecord object with an annotations dictionary
bioentry_id - corresponding database identifier

Definition at line 638 of file Loader.py.

00638 
00639     def _load_annotations(self, record, bioentry_id):
00640         """Record a SeqRecord's misc annotations in the database (PRIVATE).
00641 
00642         The annotation strings are recorded in the bioentry_qualifier_value
00643         table, except for special cases like the reference, comment and
00644         taxonomy which are handled with their own tables.
00645 
00646         record - a SeqRecord object with an annotations dictionary
00647         bioentry_id - corresponding database identifier
00648         """
00649         mono_sql = "INSERT INTO bioentry_qualifier_value" \
00650                    "(bioentry_id, term_id, value)" \
00651                    " VALUES (%s, %s, %s)"
00652         many_sql = "INSERT INTO bioentry_qualifier_value" \
00653                    "(bioentry_id, term_id, value, rank)" \
00654                    " VALUES (%s, %s, %s, %s)"
00655         tag_ontology_id = self._get_ontology_id('Annotation Tags')
00656         for key, value in record.annotations.iteritems():
00657             if key in ["references", "comment", "ncbi_taxid", "date"]:
00658                 #Handled separately
00659                 continue
00660             term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
00661             if isinstance(value, list) or isinstance(value, tuple):
00662                 rank = 0
00663                 for entry in value:
00664                     if isinstance(entry, str) or isinstance(entry, int):
00665                         #Easy case
00666                         rank += 1
00667                         self.adaptor.execute(many_sql, \
00668                                      (bioentry_id, term_id, str(entry), rank))
00669                     else:
00670                         pass
00671                         #print "Ignoring annotation '%s' sub-entry of type '%s'" \
00672                         #      % (key, str(type(entry)))
00673             elif isinstance(value, str) or isinstance(value, int):
00674                 #Have a simple single entry, leave rank as the DB default
00675                 self.adaptor.execute(mono_sql, \
00676                                      (bioentry_id, term_id, str(value)))
00677             else:
00678                 pass
00679                 #print "Ignoring annotation '%s' entry of type '%s'" \
00680                 #      % (key, type(value))
00681 

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_bioentry_date (   self,
  record,
  bioentry_id 
) [private]
Add the effective date of the entry into the database.

record - a SeqRecord object with an annotated date
bioentry_id - corresponding database identifier

Definition at line 565 of file Loader.py.

00565 
00566     def _load_bioentry_date(self, record, bioentry_id):
00567         """Add the effective date of the entry into the database.
00568 
00569         record - a SeqRecord object with an annotated date
00570         bioentry_id - corresponding database identifier
00571         """
00572         # dates are GenBank style, like:
00573         # 14-SEP-2000
00574         date = record.annotations.get("date",
00575                                       strftime("%d-%b-%Y", gmtime()).upper())
00576         if isinstance(date, list) : date = date[0]
00577         annotation_tags_id = self._get_ontology_id("Annotation Tags")
00578         date_id = self._get_term_id("date_changed", annotation_tags_id)
00579         sql = r"INSERT INTO bioentry_qualifier_value" \
00580               r" (bioentry_id, term_id, value, rank)" \
00581               r" VALUES (%s, %s, %s, 1)" 
00582         self.adaptor.execute(sql, (bioentry_id, date_id, date))

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_bioentry_table (   self,
  record 
) [private]
Fill the bioentry table with sequence information (PRIVATE).

record - SeqRecord object to add to the database.

Definition at line 492 of file Loader.py.

00492 
00493     def _load_bioentry_table(self, record):
00494         """Fill the bioentry table with sequence information (PRIVATE).
00495 
00496         record - SeqRecord object to add to the database.
00497         """
00498         # get the pertinent info and insert it
00499         
00500         if record.id.count(".") == 1: # try to get a version from the id
00501             #This assumes the string is something like "XXXXXXXX.123"
00502             accession, version = record.id.split('.')
00503             try:
00504                 version = int(version)
00505             except ValueError:
00506                 accession = record.id
00507                 version = 0
00508         else: # otherwise just use a version of 0
00509             accession = record.id
00510             version = 0
00511 
00512         if "accessions" in record.annotations \
00513         and isinstance(record.annotations["accessions"],list) \
00514         and record.annotations["accessions"]:
00515             #Take the first accession (one if there is more than one)
00516             accession = record.annotations["accessions"][0]
00517 
00518         #Find the taxon id (this is not just the NCBI Taxon ID)
00519         #NOTE - If the species isn't defined in the taxon table,
00520         #a new minimal entry is created.
00521         taxon_id = self._get_taxon_id(record)
00522 
00523         if "gi" in record.annotations:
00524             identifier = record.annotations["gi"]
00525         else:
00526             identifier = record.id
00527 
00528         #Allow description and division to default to NULL as in BioPerl.
00529         description = getattr(record, 'description', None)
00530         division = record.annotations.get("data_file_division", None)
00531         
00532         sql = """
00533         INSERT INTO bioentry (
00534          biodatabase_id,
00535          taxon_id,
00536          name,
00537          accession,
00538          identifier,
00539          division,
00540          description,
00541          version)
00542         VALUES (
00543          %s,
00544          %s,
00545          %s,
00546          %s,
00547          %s,
00548          %s,
00549          %s,
00550          %s)"""
00551         #print self.dbid, taxon_id, record.name, accession, identifier, \
00552         #        division, description, version
00553         self.adaptor.execute(sql, (self.dbid,
00554                                    taxon_id,
00555                                    record.name, 
00556                                    accession,
00557                                    identifier,
00558                                    division,
00559                                    description,
00560                                    version))
00561         # now retrieve the id for the bioentry
00562         bioentry_id = self.adaptor.last_id('bioentry')
00563 
00564         return bioentry_id

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_biosequence (   self,
  record,
  bioentry_id 
) [private]
Record a SeqRecord's sequence and alphabet in the database (PRIVATE).

record - a SeqRecord object with a seq property
bioentry_id - corresponding database identifier

Definition at line 583 of file Loader.py.

00583 
00584     def _load_biosequence(self, record, bioentry_id):
00585         """Record a SeqRecord's sequence and alphabet in the database (PRIVATE).
00586 
00587         record - a SeqRecord object with a seq property
00588         bioentry_id - corresponding database identifier
00589         """
00590         if record.seq is None:
00591             #The biosequence table entry is optional, so if we haven't
00592             #got a sequence, we don't need to write to the table.
00593             return
00594         
00595         # determine the string representation of the alphabet
00596         if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
00597             alphabet = "dna"
00598         elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
00599             alphabet = "rna"
00600         elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
00601             alphabet = "protein"
00602         else:
00603             alphabet = "unknown"
00604 
00605         if isinstance(record.seq, UnknownSeq):
00606             seq_str = None
00607         else:
00608             seq_str = str(record.seq)
00609 
00610         sql = r"INSERT INTO biosequence (bioentry_id, version, " \
00611               r"length, seq, alphabet) " \
00612               r"VALUES (%s, 0, %s, %s, %s)"
00613         self.adaptor.execute(sql, (bioentry_id,
00614                                    len(record.seq),
00615                                    seq_str,
00616                                    alphabet))

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_comment (   self,
  record,
  bioentry_id 
) [private]
Record a SeqRecord's annotated comment in the database (PRIVATE).

record - a SeqRecord object with an annotated comment
bioentry_id - corresponding database identifier

Definition at line 617 of file Loader.py.

00617 
00618     def _load_comment(self, record, bioentry_id):
00619         """Record a SeqRecord's annotated comment in the database (PRIVATE).
00620 
00621         record - a SeqRecord object with an annotated comment
00622         bioentry_id - corresponding database identifier
00623         """
00624         comments = record.annotations.get('comment')
00625         if not comments:
00626             return
00627         if not isinstance(comments, list):
00628             #It should be a string then...
00629             comments = [comments]
00630 
00631         for index, comment in enumerate(comments):
00632             comment = comment.replace('\n', ' ')
00633             #TODO - Store each line as a separate entry?  This would preserve
00634             #the newlines, but we should check BioPerl etc to be consistent.
00635             sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \
00636                   " VALUES (%s, %s, %s)"
00637             self.adaptor.execute(sql, (bioentry_id, comment, index+1))
        

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_dbxrefs (   self,
  record,
  bioentry_id 
) [private]
Load any sequence level cross references into the database (PRIVATE).

See table bioentry_dbxref.

Definition at line 991 of file Loader.py.

00991 
00992     def _load_dbxrefs(self, record, bioentry_id):
00993         """Load any sequence level cross references into the database (PRIVATE).
00994 
00995         See table bioentry_dbxref."""
00996         for rank, value in enumerate(record.dbxrefs):
00997             # Split the DB:accession string at first colon.
00998             # We have to cope with things like:
00999             # "MGD:MGI:892" (db="MGD", accession="MGI:892")
01000             # "GO:GO:123" (db="GO", accession="GO:123")
01001             #
01002             # Annoyingly I have seen the NCBI use both the style
01003             # "GO:GO:123" and "GO:123" in different vintages.
01004             assert value.count("\n")==0
01005             try:
01006                 db, accession = value.split(':',1)
01007                 db = db.strip()
01008                 accession = accession.strip()
01009             except:
01010                 raise ValueError("Parsing of dbxrefs list failed: '%s'" % value)
01011             # Get the dbxref_id value for the dbxref data
01012             dbxref_id = self._get_dbxref_id(db, accession)
01013             # Insert the bioentry_dbxref  data
01014             self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank+1)

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_reference (   self,
  reference,
  rank,
  bioentry_id 
) [private]
Record one of a SeqRecord's annotated references in the database (PRIVATE).

reference - the annotated reference to store
rank - position of the reference; it is stored in the database as rank + 1
bioentry_id - corresponding database identifier

Definition at line 682 of file Loader.py.

00682 
00683     def _load_reference(self, reference, rank, bioentry_id):
00684         """Record a SeqRecord's annotated references in the database (PRIVATE).
00685 
00686         record - a SeqRecord object with annotated references
00687         bioentry_id - corresponding database identifier
00688         """
00689 
00690         refs = None
00691         if reference.medline_id:
00692             refs = self.adaptor.execute_and_fetch_col0(
00693                 "SELECT reference_id" \
00694                 "  FROM reference JOIN dbxref USING (dbxref_id)" \
00695                 " WHERE dbname = 'MEDLINE' AND accession = %s",
00696                 (reference.medline_id,))
00697         if not refs and reference.pubmed_id:
00698             refs = self.adaptor.execute_and_fetch_col0(
00699                 "SELECT reference_id" \
00700                 "  FROM reference JOIN dbxref USING (dbxref_id)" \
00701                 " WHERE dbname = 'PUBMED' AND accession = %s",
00702                 (reference.pubmed_id,))
00703         if not refs:
00704             s = []
00705             for f in reference.authors, reference.title, reference.journal:
00706                 s.append(f or "<undef>")
00707             crc = crc64("".join(s))
00708             refs = self.adaptor.execute_and_fetch_col0(
00709                 "SELECT reference_id FROM reference" \
00710                   r" WHERE crc = %s", (crc,))
00711         if not refs:
00712             if reference.medline_id:
00713                 dbxref_id = self._add_dbxref("MEDLINE",
00714                                              reference.medline_id, 0)
00715             elif reference.pubmed_id:
00716                 dbxref_id = self._add_dbxref("PUBMED",
00717                                              reference.pubmed_id, 0)
00718             else:
00719                 dbxref_id = None
00720             authors = reference.authors or None
00721             title =  reference.title or None
00722             #The location/journal field cannot be Null, so default
00723             #to an empty string rather than None:
00724             journal = reference.journal or ""
00725             self.adaptor.execute(
00726                 "INSERT INTO reference (dbxref_id, location," \
00727                 " title, authors, crc)" \
00728                 " VALUES (%s, %s, %s, %s, %s)",
00729                 (dbxref_id, journal, title,
00730                  authors, crc))
00731             reference_id = self.adaptor.last_id("reference")
00732         else:
00733             reference_id = refs[0]
00734 
00735         if reference.location:
00736             start = 1 + int(str(reference.location[0].start))
00737             end = int(str(reference.location[0].end))
00738         else:
00739             start = None
00740             end = None
00741         
00742         sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
00743               " start_pos, end_pos, rank)" \
00744               " VALUES (%s, %s, %s, %s, %s)"
00745         self.adaptor.execute(sql, (bioentry_id, reference_id,
00746                                    start, end, rank + 1))
        

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_seqfeature (   self,
  feature,
  feature_rank,
  bioentry_id 
) [private]
Load a biopython SeqFeature into the database (PRIVATE).

Definition at line 747 of file Loader.py.

00747 
00748     def _load_seqfeature(self, feature, feature_rank, bioentry_id):
00749         """Load a biopython SeqFeature into the database (PRIVATE).
00750         """
00751         seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank,
00752                                                     bioentry_id)
00753         self._load_seqfeature_locations(feature, seqfeature_id)
00754         self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_seqfeature_basic (   self,
  feature_type,
  feature_rank,
  bioentry_id 
) [private]
Load the first tables of a seqfeature and returns the id (PRIVATE).

This loads the "key" of the seqfeature (ie. CDS, gene) and
the basic seqfeature table itself.

Definition at line 755 of file Loader.py.

00755 
00756     def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
00757         """Load the first tables of a seqfeature and returns the id (PRIVATE).
00758 
00759         This loads the "key" of the seqfeature (ie. CDS, gene) and
00760         the basic seqfeature table itself.
00761         """
00762         ontology_id = self._get_ontology_id('SeqFeature Keys')
00763         seqfeature_key_id = self._get_term_id(feature_type,
00764                                               ontology_id = ontology_id)
00765         # XXX source is always EMBL/GenBank/SwissProt here; it should depend on
00766         # the record (how?)
00767         source_cat_id = self._get_ontology_id('SeqFeature Sources')
00768         source_term_id = self._get_term_id('EMBL/GenBank/SwissProt',
00769                                       ontology_id = source_cat_id)
00770         
00771         sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \
00772               r"source_term_id, rank) VALUES (%s, %s, %s, %s)"
00773         self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
00774                                    source_term_id, feature_rank + 1))
00775         seqfeature_id = self.adaptor.last_id('seqfeature')
00776 
00777         return seqfeature_id

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_seqfeature_dbxref (   self,
  dbxrefs,
  seqfeature_id 
) [private]
Add database crossreferences of a SeqFeature to the database (PRIVATE).

    o dbxrefs           List, dbxref data from the source file in the
                format <database>:<accession>

    o seqfeature_id     Int, the identifier for the seqfeature in the
                seqfeature table

    Insert dbxref qualifier data for a seqfeature into the
    seqfeature_dbxref and, if required, dbxref tables.
    The dbxref_id qualifier/value sets go into the dbxref table
    as dbname, accession, version tuples, with dbxref.dbxref_id
    being automatically assigned, and into the seqfeature_dbxref
    table as seqfeature_id, dbxref_id, and rank tuples

Definition at line 905 of file Loader.py.

00905 
00906     def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
00907         """Add database crossreferences of a SeqFeature to the database (PRIVATE).
00908 
00909             o dbxrefs           List, dbxref data from the source file in the
00910                                 format <database>:<accession>
00911 
00912             o seqfeature_id     Int, the identifier for the seqfeature in the
00913                                 seqfeature table
00914 
00915             Insert dbxref qualifier data for a seqfeature into the
00916             seqfeature_dbxref and, if required, dbxref tables.
00917             The dbxref_id qualifier/value sets go into the dbxref table
00918             as dbname, accession, version tuples, with dbxref.dbxref_id
00919             being automatically assigned, and into the seqfeature_dbxref
00920             table as seqfeature_id, dbxref_id, and rank tuples
00921         """
00922         # NOTE - In older versions of Biopython, we would map the GenBank
00923         # db_xref "name", for example "GI" to "GeneIndex", and give a warning
00924         # for any unknown terms.  This was a long term maintainance problem,
00925         # and differed from BioPerl and BioJava's implementation.  See bug 2405
00926         for rank, value in enumerate(dbxrefs):
00927             # Split the DB:accession format string at colons.  We have to
00928             # account for multiple-line and multiple-accession entries
00929             try:
00930                 dbxref_data = value.replace(' ','').replace('\n','').split(':')
00931                 db = dbxref_data[0]
00932                 accessions = dbxref_data[1:]
00933             except:
00934                 raise ValueError("Parsing of db_xref failed: '%s'" % value)
00935             # Loop over all the grabbed accessions, and attempt to fill the
00936             # table
00937             for accession in accessions:
00938                 # Get the dbxref_id value for the dbxref data
00939                 dbxref_id = self._get_dbxref_id(db, accession)
00940                 # Insert the seqfeature_dbxref data
00941                 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
        

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_seqfeature_locations (   self,
  feature,
  seqfeature_id 
) [private]
Load all of the locations for a SeqFeature into tables (PRIVATE).

This adds the locations related to the SeqFeature into the
seqfeature_location table. Fuzzies are not handled right now.
For a simple location, ie (1..2), we have a single table row
with seq_start = 1, seq_end = 2, location_rank = 1.

For split locations, ie (1..2, 3..4, 5..6) we would have three
table rows with:
    start = 1, end = 2, rank = 1
    start = 3, end = 4, rank = 2
    start = 5, end = 6, rank = 3

Definition at line 778 of file Loader.py.

00778 
00779     def _load_seqfeature_locations(self, feature, seqfeature_id):
00780         """Load all of the locations for a SeqFeature into tables (PRIVATE).
00781 
00782         This adds the locations related to the SeqFeature into the
00783         seqfeature_location table. Fuzzies are not handled right now.
00784         For a simple location, ie (1..2), we have a single table row
00785         with seq_start = 1, seq_end = 2, location_rank = 1.
00786 
00787         For split locations, ie (1..2, 3..4, 5..6) we would have three
00788         row tables with:
00789             start = 1, end = 2, rank = 1
00790             start = 3, end = 4, rank = 2
00791             start = 5, end = 6, rank = 3
00792         """
00793         # TODO - Record an ontology for the locations (using location.term_id)
00794         # which for now as in BioPerl we leave defaulting to NULL.
00795         if feature.location_operator and feature.location_operator != "join":
00796             # e.g. order locations... we don't record "order" so it
00797             # will become a "join" on reloading. What does BioPerl do?
00798             import warnings
00799             warnings.warn("%s location operators are not fully supported" \
00800                           % feature.location_operator)
00801         
00802         # two cases, a simple location or a split location
00803         if not feature.sub_features:    # simple location
00804             self._insert_seqfeature_location(feature, 1, seqfeature_id)
00805         else: # split location
00806             for rank, cur_feature in enumerate(feature.sub_features):
00807                 self._insert_seqfeature_location(cur_feature,
00808                                                  rank + 1,
00809                                                  seqfeature_id)

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader._load_seqfeature_qualifiers (   self,
  qualifiers,
  seqfeature_id 
) [private]
Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).

Qualifiers should be a dictionary of the form:
    {key : [value1, value2]}

Definition at line 866 of file Loader.py.

00866 
00867     def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
00868         """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).
00869 
00870         Qualifiers should be a dictionary of the form:
00871             {key : [value1, value2]}
00872         """
00873         tag_ontology_id = self._get_ontology_id('Annotation Tags')
00874         for qualifier_key in qualifiers:
00875             # Treat db_xref qualifiers differently to sequence annotation
00876             # qualifiers by populating the seqfeature_dbxref and dbxref
00877             # tables.  Other qualifiers go into the seqfeature_qualifier_value
00878             # and (if new) term tables.
00879             if qualifier_key != 'db_xref':
00880                 qualifier_key_id = self._get_term_id(qualifier_key,
00881                                                   ontology_id=tag_ontology_id)
00882                 # now add all of the values to their table
00883                 entries = qualifiers[qualifier_key]
00884                 if not isinstance(entries, list):
00885                     # Could be a plain string, or an int or a float.
00886                     # However, we exect a list of strings here.
00887                     entries = [entries]
00888                 for qual_value_rank in range(len(entries)):
00889                     qualifier_value = entries[qual_value_rank]
00890                     sql = r"INSERT INTO seqfeature_qualifier_value "\
00891                           r" (seqfeature_id, term_id, rank, value) VALUES"\
00892                           r" (%s, %s, %s, %s)"
00893                     self.adaptor.execute(sql, (seqfeature_id,
00894                                                qualifier_key_id,
00895                                                qual_value_rank + 1,
00896                                                qualifier_value))
00897             else:
00898                 # The dbxref_id qualifier/value sets go into the dbxref table
00899                 # as dbname, accession, version tuples, with dbxref.dbxref_id
00900                 # being automatically assigned, and into the seqfeature_dbxref
00901                 # table as seqfeature_id, dbxref_id, and rank tuples
00902                 self._load_seqfeature_dbxref(qualifiers[qualifier_key],
00903                                              seqfeature_id)
00904 

Here is the call graph for this function:

Here is the caller graph for this function:

def BioSQL.Loader.DatabaseLoader.load_seqrecord (   self,
  record 
)
Load a Biopython SeqRecord into the database.

Definition at line 49 of file Loader.py.

00049 
00050     def load_seqrecord(self, record):
00051         """Load a Biopython SeqRecord into the database.
00052         """
00053         bioentry_id = self._load_bioentry_table(record)
00054         self._load_bioentry_date(record, bioentry_id)
00055         self._load_biosequence(record, bioentry_id)
00056         self._load_comment(record, bioentry_id)
00057         self._load_dbxrefs(record, bioentry_id)
00058         references = record.annotations.get('references', ())
00059         for reference, rank in zip(references, range(len(references))):
00060             self._load_reference(reference, rank, bioentry_id)
00061         self._load_annotations(record, bioentry_id)
00062         for seq_feature_num in range(len(record.features)):
00063             seq_feature = record.features[seq_feature_num]
00064             self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)

Here is the call graph for this function:


Member Data Documentation

Definition at line 45 of file Loader.py.

Definition at line 46 of file Loader.py.

Definition at line 47 of file Loader.py.


The documentation for this class was generated from the following file: