Back to index

python-biopython  1.60
Loader.py
Go to the documentation of this file.
00001 # Copyright 2002 by Andrew Dalke.  All rights reserved.
00002 # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved.
00003 # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved.
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 #
00008 # Note that BioSQL (including the database schema and scripts) is
00009 # available and licensed separately.  Please consult www.biosql.org
00010 
00011 """Load biopython objects into a BioSQL database for persistent storage.
00012 
00013 This code makes it possible to store biopython objects in a relational
00014 database and then retrieve them back. You shouldn't use any of the
00015 classes in this module directly. Rather, call the load() method on
00016 a database object.
00017 """
00018 # standard modules
00019 from time import gmtime, strftime
00020 
00021 # biopython
00022 from Bio import Alphabet
00023 from Bio.SeqUtils.CheckSum import crc64
00024 from Bio import Entrez
00025 from Bio.Seq import UnknownSeq
00026 
00027 from Bio._py3k import _is_int_or_long
00028 
00029 class DatabaseLoader:
00030     """Object used to load SeqRecord objects into a BioSQL database."""
00031     def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
00032         """Initialize with connection information for the database.
00033 
00034         Creating a DatabaseLoader object is normally handled via the
00035         BioSeqDatabase DBServer object, for example:
00036 
00037         from BioSQL import BioSeqDatabase
00038         server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse",
00039                          passwd = "biosql", host = "localhost", db="test_biosql")
00040         try:
00041             db = server["test"]
00042         except KeyError:
00043             db = server.new_database("test", description="For testing GBrowse")
00044         """
00045         self.adaptor = adaptor
00046         self.dbid = dbid
00047         self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
00048     
00049     def load_seqrecord(self, record):
00050         """Load a Biopython SeqRecord into the database.
00051         """
00052         bioentry_id = self._load_bioentry_table(record)
00053         self._load_bioentry_date(record, bioentry_id)
00054         self._load_biosequence(record, bioentry_id)
00055         self._load_comment(record, bioentry_id)
00056         self._load_dbxrefs(record, bioentry_id)
00057         references = record.annotations.get('references', ())
00058         for reference, rank in zip(references, range(len(references))):
00059             self._load_reference(reference, rank, bioentry_id)
00060         self._load_annotations(record, bioentry_id)
00061         for seq_feature_num in range(len(record.features)):
00062             seq_feature = record.features[seq_feature_num]
00063             self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
00064 
00065     def _get_ontology_id(self, name, definition=None):
00066         """Returns the identifier for the named ontology (PRIVATE).
00067 
00068         This looks through the onotology table for a the given entry name.
00069         If it is not found, a row is added for this ontology (using the
00070         definition if supplied).  In either case, the id corresponding to
00071         the provided name is returned, so that you can reference it in
00072         another table.
00073         """
00074         oids = self.adaptor.execute_and_fetch_col0(
00075             "SELECT ontology_id FROM ontology WHERE name = %s",
00076             (name,))
00077         if oids:
00078             return oids[0]
00079         self.adaptor.execute(
00080             "INSERT INTO ontology(name, definition) VALUES (%s, %s)",
00081             (name, definition))
00082         return self.adaptor.last_id("ontology")
00083 
00084     
00085     def _get_term_id(self,
00086                      name,
00087                      ontology_id=None,
00088                      definition=None,
00089                      identifier=None):
00090         """Get the id that corresponds to a term (PRIVATE).
00091 
00092         This looks through the term table for a the given term. If it
00093         is not found, a new id corresponding to this term is created.
00094         In either case, the id corresponding to that term is returned, so
00095         that you can reference it in another table.
00096 
00097         The ontology_id should be used to disambiguate the term.
00098         """
00099 
00100         # try to get the term id
00101         sql = r"SELECT term_id FROM term " \
00102               r"WHERE name = %s"
00103         fields = [name]
00104         if ontology_id:
00105             sql += ' AND ontology_id = %s'
00106             fields.append(ontology_id)
00107         id_results = self.adaptor.execute_and_fetchall(sql, fields)
00108         # something is wrong
00109         if len(id_results) > 1:
00110             raise ValueError("Multiple term ids for %s: %r" % 
00111                              (name, id_results))
00112         elif len(id_results) == 1:
00113             return id_results[0][0]
00114         else:
00115             sql = r"INSERT INTO term (name, definition," \
00116                   r" identifier, ontology_id)" \
00117                   r" VALUES (%s, %s, %s, %s)"
00118             self.adaptor.execute(sql, (name, definition,
00119                                        identifier, ontology_id))
00120             return self.adaptor.last_id("term")
00121 
00122     def _add_dbxref(self, dbname, accession, version):
00123        """Insert a dbxref and return its id."""
00124        
00125        self.adaptor.execute(
00126            "INSERT INTO dbxref(dbname, accession, version)" \
00127            " VALUES (%s, %s, %s)", (dbname, accession, version))
00128        return self.adaptor.last_id("dbxref")
00129            
00130     def _get_taxon_id(self, record):
00131         """Get the taxon id for this record (PRIVATE).
00132 
00133         record - a SeqRecord object
00134 
00135         This searches the taxon/taxon_name tables using the
00136         NCBI taxon ID, scientific name and common name to find
00137         the matching taxon table entry's id.
00138         
00139         If the species isn't in the taxon table, and we have at
00140         least the NCBI taxon ID, scientific name or common name,
00141         at least a minimal stub entry is created in the table.
00142 
00143         Returns the taxon id (database key for the taxon table,
00144         not an NCBI taxon ID), or None if the taxonomy information
00145         is missing.
00146 
00147         See also the BioSQL script load_ncbi_taxonomy.pl which
00148         will populate and update the taxon/taxon_name tables
00149         with the latest information from the NCBI.
00150         """
00151         
00152         # To find the NCBI taxid, first check for a top level annotation
00153         ncbi_taxon_id = None
00154         if "ncbi_taxid" in record.annotations:
00155             #Could be a list of IDs.
00156             if isinstance(record.annotations["ncbi_taxid"],list):
00157                 if len(record.annotations["ncbi_taxid"])==1:
00158                     ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
00159             else:
00160                 ncbi_taxon_id = record.annotations["ncbi_taxid"]
00161         if not ncbi_taxon_id:
00162             # Secondly, look for a source feature
00163             for f in record.features:
00164                 if f.type == 'source':
00165                     quals = getattr(f, 'qualifiers', {})
00166                     if "db_xref" in quals:
00167                         for db_xref in f.qualifiers["db_xref"]:
00168                             if db_xref.startswith("taxon:"):
00169                                 ncbi_taxon_id = int(db_xref[6:])
00170                                 break
00171                 if ncbi_taxon_id: break
00172 
00173         try:
00174             scientific_name = record.annotations["organism"][:255]
00175         except KeyError:
00176             scientific_name = None
00177         try:
00178             common_name = record.annotations["source"][:255]
00179         except KeyError:
00180             common_name = None
00181         # Note: The maximum length for taxon names in the schema is 255.
00182         # Cropping it now should help in getting a match when searching,
00183         # and avoids an error if we try and add these to the database.
00184 
00185 
00186         if ncbi_taxon_id:
00187             #Good, we have the NCBI taxon to go on - this is unambiguous :)
00188             #Note that the scientific name and common name will only be
00189             #used if we have to record a stub entry.
00190             return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id,
00191                                                          scientific_name,
00192                                                          common_name)
00193         
00194         if not common_name and not scientific_name:
00195             # Nothing to go on... and there is no point adding
00196             # a new entry to the database.  We'll just leave this
00197             # sequence's taxon as a NULL in the database.
00198             return None
00199 
00200         # Next, we'll try to find a match based on the species name
00201         # (stored in GenBank files as the organism and/or the source).
00202         if scientific_name:
00203             taxa = self.adaptor.execute_and_fetch_col0(
00204                 "SELECT taxon_id FROM taxon_name" \
00205                 " WHERE name_class = 'scientific name' AND name = %s",
00206                 (scientific_name,))
00207             if taxa:
00208                 #Good, mapped the scientific name to a taxon table entry
00209                 return taxa[0]
00210 
00211         # Last chance...
00212         if common_name:
00213             taxa = self.adaptor.execute_and_fetch_col0(
00214                 "SELECT DISTINCT taxon_id FROM taxon_name" \
00215                 " WHERE name = %s",
00216                 (common_name,))
00217             #Its natural that several distinct taxa will have the same common
00218             #name - in which case we can't resolve the taxon uniquely.
00219             if len(taxa) > 1:
00220                 raise ValueError("Taxa: %d species have name %r" % (
00221                     len(taxa),
00222                     common_name))
00223             if taxa:
00224                 #Good, mapped the common name to a taxon table entry
00225                 return taxa[0]
00226 
00227         # At this point, as far as we can tell, this species isn't
00228         # in the taxon table already.  So we'll have to add it.
00229         # We don't have an NCBI taxonomy ID, so if we do record just
00230         # a stub entry, there is no simple way to fix this later.
00231         #
00232         # TODO - Should we try searching the NCBI taxonomy using the
00233         # species name?
00234         #
00235         # OK, let's try inserting the species.
00236         # Chances are we don't have enough information ...
00237         # Furthermore, it won't be in the hierarchy.
00238 
00239         lineage = []
00240         for c in record.annotations.get("taxonomy", []):
00241             lineage.append([None, None, c])
00242         if lineage:
00243             lineage[-1][1] = "genus"
00244         lineage.append([None, "species", record.annotations["organism"]])
00245         # XXX do we have them?
00246         if "subspecies" in record.annotations:
00247             lineage.append([None, "subspecies",
00248                             record.annotations["subspecies"]])
00249         if "variant" in record.annotations:
00250             lineage.append([None, "varietas",
00251                             record.annotations["variant"]])
00252         lineage[-1][0] = ncbi_taxon_id
00253         
00254         left_value = self.adaptor.execute_one(
00255             "SELECT MAX(left_value) FROM taxon")[0]
00256         if not left_value:
00257             left_value = 0
00258         left_value += 1
00259         
00260         # XXX -- Brad: Fixing this for now in an ugly way because
00261         # I am getting overlaps for right_values. I need to dig into this
00262         # more to actually understand how it works. I'm not sure it is
00263         # actually working right anyhow.
00264         right_start_value = self.adaptor.execute_one(
00265             "SELECT MAX(right_value) FROM taxon")[0]
00266         if not right_start_value:
00267             right_start_value = 0
00268         right_value = right_start_value + 2 * len(lineage) - 1
00269 
00270         parent_taxon_id = None
00271         for taxon in lineage:
00272             self.adaptor.execute(
00273                 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
00274                 " left_value, right_value)" \
00275                 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id,
00276                                                  taxon[0],
00277                                                  taxon[1],
00278                                                  left_value,
00279                                                  right_value))
00280             taxon_id = self.adaptor.last_id("taxon")
00281             self.adaptor.execute(
00282                 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00283                 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255]))
00284             #Note the name field is limited to 255, some SwissProt files
00285             #have a multi-species name which can be longer.  So truncate this.
00286             left_value += 1
00287             right_value -= 1
00288             parent_taxon_id = taxon_id
00289         if common_name:
00290             self.adaptor.execute(
00291                 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00292                 "VALUES (%s, %s, 'common name')", (
00293                 taxon_id, common_name))
00294 
00295         return taxon_id
00296 
00297     def _fix_name_class(self, entrez_name):
00298         """Map Entrez name terms to those used in taxdump (PRIVATE).
00299 
00300         We need to make this conversion to match the taxon_name.name_class
00301         values used by the BioSQL load_ncbi_taxonomy.pl script.
00302         
00303         e.g.
00304         "ScientificName" -> "scientific name",
00305         "EquivalentName" -> "equivalent name",
00306         "Synonym" -> "synonym",
00307         """
00308         #Add any special cases here:
00309         #
00310         #known = {}
00311         #try:
00312         #    return known[entrez_name]
00313         #except KeyError:
00314         #    pass
00315 
00316         #Try automatically by adding spaces before each capital
00317         def add_space(letter):
00318             if letter.isupper():
00319                 return " "+letter.lower()
00320             else:
00321                 return letter
00322         answer = "".join([add_space(letter) for letter in entrez_name]).strip()
00323         assert answer == answer.lower()
00324         return answer
00325 
00326     def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id,
00327                                          scientific_name = None,
00328                                          common_name = None):
00329         """Get the taxon id for this record from the NCBI taxon ID (PRIVATE).
00330 
00331         ncbi_taxon_id - string containing an NCBI taxon id
00332         scientific_name - string, used if a stub entry is recorded
00333         common_name - string, used if a stub entry is recorded
00334         
00335         This searches the taxon table using ONLY the NCBI taxon ID
00336         to find the matching taxon table entry's ID (database key).
00337         
00338         If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
00339         flag is true, Biopython will attempt to go online using Bio.Entrez
00340         to fetch the official NCBI lineage, recursing up the tree until an
00341         existing entry is found in the database or the full lineage has been
00342         fetched.
00343 
00344         Otherwise the NCBI taxon ID, scientific name and common name are
00345         recorded as a minimal stub entry in the taxon and taxon_name tables.
00346         Any partial information about the lineage from the SeqRecord is NOT
00347         recorded.  This should mean that (re)running the BioSQL script
00348         load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
00349 
00350         Returns the taxon id (database key for the taxon table, not
00351         an NCBI taxon ID).
00352         """
00353         assert ncbi_taxon_id
00354 
00355         taxon_id = self.adaptor.execute_and_fetch_col0(
00356             "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s",
00357             (ncbi_taxon_id,))
00358         if taxon_id:
00359             #Good, we have mapped the NCBI taxid to a taxon table entry
00360             return taxon_id[0]
00361 
00362         # At this point, as far as we can tell, this species isn't
00363         # in the taxon table already.  So we'll have to add it.
00364 
00365         parent_taxon_id = None
00366         rank = "species"
00367         genetic_code = None
00368         mito_genetic_code = None
00369         species_names = []
00370         if scientific_name:
00371             species_names.append(("scientific name", scientific_name))
00372         if common_name:
00373             species_names.append(("common name", common_name))
00374         
00375         if self.fetch_NCBI_taxonomy:
00376             #Go online to get the parent taxon ID!
00377             handle = Entrez.efetch(db="taxonomy",id=ncbi_taxon_id,retmode="XML")
00378             taxonomic_record = Entrez.read(handle)
00379             if len(taxonomic_record) == 1:
00380                 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \
00381                        "%s versus %s" % (taxonomic_record[0]["TaxId"],
00382                                          ncbi_taxon_id)
00383                 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( \
00384                                             taxonomic_record[0]["LineageEx"])
00385                 rank = taxonomic_record[0]["Rank"]
00386                 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"]
00387                 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"]
00388                 species_names = [("scientific name",
00389                                   taxonomic_record[0]["ScientificName"])]
00390                 try:
00391                     for name_class, names in taxonomic_record[0]["OtherNames"].iteritems():
00392                         name_class = self._fix_name_class(name_class)
00393                         if not isinstance(names, list):
00394                             #The Entrez parser seems to return single entry
00395                             #lists as just a string which is annoying.
00396                             names = [names]
00397                         for name in names:
00398                             #Want to ignore complex things like ClassCDE entries
00399                             if isinstance(name, basestring):
00400                                 species_names.append((name_class, name))
00401                 except KeyError:
00402                     #OtherNames isn't always present,
00403                     #e.g. NCBI taxon 41205, Bromheadia finlaysoniana
00404                     pass
00405         else:
00406             pass
00407             # If we are not allowed to go online, we will record the bare minimum;
00408             # as long as the NCBI taxon id is present, then (re)running
00409             # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage
00410             # (and update the species names).
00411             #
00412             # I am NOT going to try and record the lineage, even if it
00413             # is in the record annotation as a list of names, as we won't
00414             # know the NCBI taxon IDs for these parent nodes.
00415 
00416         self.adaptor.execute(
00417             "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
00418             " genetic_code, mito_genetic_code, left_value, right_value)" \
00419             " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id,
00420                                                      ncbi_taxon_id,
00421                                                      rank,
00422                                                      genetic_code,
00423                                                      mito_genetic_code,
00424                                                      None,
00425                                                      None))
00426         taxon_id = self.adaptor.last_id("taxon")
00427 
00428         #Record the scientific name, common name, etc
00429         for name_class, name in species_names:
00430             self.adaptor.execute(
00431                 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00432                 " VALUES (%s, %s, %s)", (taxon_id, 
00433                                          name[:255], 
00434                                          name_class))
00435         return taxon_id
00436 
00437     def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
00438         """This is recursive! (PRIVATE).
00439 
00440         taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
00441 
00442         First dictionary in list is the taxonomy root, highest would be the species.
00443         Each dictionary includes:
00444         - TaxID (string, NCBI taxon id)
00445         - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
00446         - ScientificName (string)
00447         (and that is all at the time of writing)
00448 
00449         This method will record all the lineage given, returning the the taxon id
00450         (database key, not NCBI taxon id) of the final entry (the species).
00451         """
00452         ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"]
00453 
00454         #Is this in the database already?  Check the taxon table...
00455         taxon_id = self.adaptor.execute_and_fetch_col0(
00456             "SELECT taxon_id FROM taxon" \
00457             " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id)
00458         if taxon_id:
00459             # we could verify that the Scientific Name etc in the database
00460             # is the same and update it or print a warning if not...
00461             if isinstance(taxon_id, list):
00462                 assert len(taxon_id)==1
00463                 return taxon_id[0]
00464             else:
00465                 return taxon_id
00466 
00467         #We have to record this.
00468         if len(taxonomic_lineage) > 1:
00469             #Use recursion to find out the taxon id (database key) of the parent.
00470             parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
00471             assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id)
00472         else:
00473             parent_taxon_id = None
00474 
00475         # INSERT new taxon
00476         rank = taxonomic_lineage[-1].get("Rank", None)
00477         self.adaptor.execute(
00478                 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"\
00479                 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank))
00480         taxon_id = self.adaptor.last_id("taxon")
00481         assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id)
00482         # ... and its name in taxon_name
00483         scientific_name = taxonomic_lineage[-1].get("ScientificName", None)
00484         if scientific_name:
00485             self.adaptor.execute(
00486                     "INSERT INTO taxon_name(taxon_id, name, name_class)" \
00487                     " VALUES (%s, %s, 'scientific name')", (taxon_id, 
00488                                                             scientific_name[:255]))
00489         return taxon_id
00490 
00491 
00492     def _load_bioentry_table(self, record):
00493         """Fill the bioentry table with sequence information (PRIVATE).
00494 
00495         record - SeqRecord object to add to the database.
00496         """
00497         # get the pertinent info and insert it
00498         
00499         if record.id.count(".") == 1: # try to get a version from the id
00500             #This assumes the string is something like "XXXXXXXX.123"
00501             accession, version = record.id.split('.')
00502             try:
00503                 version = int(version)
00504             except ValueError:
00505                 accession = record.id
00506                 version = 0
00507         else: # otherwise just use a version of 0
00508             accession = record.id
00509             version = 0
00510 
00511         if "accessions" in record.annotations \
00512         and isinstance(record.annotations["accessions"],list) \
00513         and record.annotations["accessions"]:
00514             #Take the first accession (one if there is more than one)
00515             accession = record.annotations["accessions"][0]
00516 
00517         #Find the taxon id (this is not just the NCBI Taxon ID)
00518         #NOTE - If the species isn't defined in the taxon table,
00519         #a new minimal entry is created.
00520         taxon_id = self._get_taxon_id(record)
00521 
00522         if "gi" in record.annotations:
00523             identifier = record.annotations["gi"]
00524         else:
00525             identifier = record.id
00526 
00527         #Allow description and division to default to NULL as in BioPerl.
00528         description = getattr(record, 'description', None)
00529         division = record.annotations.get("data_file_division", None)
00530         
00531         sql = """
00532         INSERT INTO bioentry (
00533          biodatabase_id,
00534          taxon_id,
00535          name,
00536          accession,
00537          identifier,
00538          division,
00539          description,
00540          version)
00541         VALUES (
00542          %s,
00543          %s,
00544          %s,
00545          %s,
00546          %s,
00547          %s,
00548          %s,
00549          %s)"""
00550         #print self.dbid, taxon_id, record.name, accession, identifier, \
00551         #        division, description, version
00552         self.adaptor.execute(sql, (self.dbid,
00553                                    taxon_id,
00554                                    record.name, 
00555                                    accession,
00556                                    identifier,
00557                                    division,
00558                                    description,
00559                                    version))
00560         # now retrieve the id for the bioentry
00561         bioentry_id = self.adaptor.last_id('bioentry')
00562 
00563         return bioentry_id
00564 
00565     def _load_bioentry_date(self, record, bioentry_id):
00566         """Add the effective date of the entry into the database.
00567 
00568         record - a SeqRecord object with an annotated date
00569         bioentry_id - corresponding database identifier
00570         """
00571         # dates are GenBank style, like:
00572         # 14-SEP-2000
00573         date = record.annotations.get("date",
00574                                       strftime("%d-%b-%Y", gmtime()).upper())
00575         if isinstance(date, list) : date = date[0]
00576         annotation_tags_id = self._get_ontology_id("Annotation Tags")
00577         date_id = self._get_term_id("date_changed", annotation_tags_id)
00578         sql = r"INSERT INTO bioentry_qualifier_value" \
00579               r" (bioentry_id, term_id, value, rank)" \
00580               r" VALUES (%s, %s, %s, 1)" 
00581         self.adaptor.execute(sql, (bioentry_id, date_id, date))
00582 
00583     def _load_biosequence(self, record, bioentry_id):
00584         """Record a SeqRecord's sequence and alphabet in the database (PRIVATE).
00585 
00586         record - a SeqRecord object with a seq property
00587         bioentry_id - corresponding database identifier
00588         """
00589         if record.seq is None:
00590             #The biosequence table entry is optional, so if we haven't
00591             #got a sequence, we don't need to write to the table.
00592             return
00593         
00594         # determine the string representation of the alphabet
00595         if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
00596             alphabet = "dna"
00597         elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
00598             alphabet = "rna"
00599         elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
00600             alphabet = "protein"
00601         else:
00602             alphabet = "unknown"
00603 
00604         if isinstance(record.seq, UnknownSeq):
00605             seq_str = None
00606         else:
00607             seq_str = str(record.seq)
00608 
00609         sql = r"INSERT INTO biosequence (bioentry_id, version, " \
00610               r"length, seq, alphabet) " \
00611               r"VALUES (%s, 0, %s, %s, %s)"
00612         self.adaptor.execute(sql, (bioentry_id,
00613                                    len(record.seq),
00614                                    seq_str,
00615                                    alphabet))
00616 
00617     def _load_comment(self, record, bioentry_id):
00618         """Record a SeqRecord's annotated comment in the database (PRIVATE).
00619 
00620         record - a SeqRecord object with an annotated comment
00621         bioentry_id - corresponding database identifier
00622         """
00623         comments = record.annotations.get('comment')
00624         if not comments:
00625             return
00626         if not isinstance(comments, list):
00627             #It should be a string then...
00628             comments = [comments]
00629 
00630         for index, comment in enumerate(comments):
00631             comment = comment.replace('\n', ' ')
00632             #TODO - Store each line as a separate entry?  This would preserve
00633             #the newlines, but we should check BioPerl etc to be consistent.
00634             sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \
00635                   " VALUES (%s, %s, %s)"
00636             self.adaptor.execute(sql, (bioentry_id, comment, index+1))
00637         
00638     def _load_annotations(self, record, bioentry_id):
00639         """Record a SeqRecord's misc annotations in the database (PRIVATE).
00640 
00641         The annotation strings are recorded in the bioentry_qualifier_value
00642         table, except for special cases like the reference, comment and
00643         taxonomy which are handled with their own tables.
00644 
00645         record - a SeqRecord object with an annotations dictionary
00646         bioentry_id - corresponding database identifier
00647         """
00648         mono_sql = "INSERT INTO bioentry_qualifier_value" \
00649                    "(bioentry_id, term_id, value)" \
00650                    " VALUES (%s, %s, %s)"
00651         many_sql = "INSERT INTO bioentry_qualifier_value" \
00652                    "(bioentry_id, term_id, value, rank)" \
00653                    " VALUES (%s, %s, %s, %s)"
00654         tag_ontology_id = self._get_ontology_id('Annotation Tags')
00655         for key, value in record.annotations.iteritems():
00656             if key in ["references", "comment", "ncbi_taxid", "date"]:
00657                 #Handled separately
00658                 continue
00659             term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
00660             if isinstance(value, list) or isinstance(value, tuple):
00661                 rank = 0
00662                 for entry in value:
00663                     if isinstance(entry, str) or isinstance(entry, int):
00664                         #Easy case
00665                         rank += 1
00666                         self.adaptor.execute(many_sql, \
00667                                      (bioentry_id, term_id, str(entry), rank))
00668                     else:
00669                         pass
00670                         #print "Ignoring annotation '%s' sub-entry of type '%s'" \
00671                         #      % (key, str(type(entry)))
00672             elif isinstance(value, str) or isinstance(value, int):
00673                 #Have a simple single entry, leave rank as the DB default
00674                 self.adaptor.execute(mono_sql, \
00675                                      (bioentry_id, term_id, str(value)))
00676             else:
00677                 pass
00678                 #print "Ignoring annotation '%s' entry of type '%s'" \
00679                 #      % (key, type(value))
00680 
00681 
00682     def _load_reference(self, reference, rank, bioentry_id):
00683         """Record a SeqRecord's annotated references in the database (PRIVATE).
00684 
00685         record - a SeqRecord object with annotated references
00686         bioentry_id - corresponding database identifier
00687         """
00688 
00689         refs = None
00690         if reference.medline_id:
00691             refs = self.adaptor.execute_and_fetch_col0(
00692                 "SELECT reference_id" \
00693                 "  FROM reference JOIN dbxref USING (dbxref_id)" \
00694                 " WHERE dbname = 'MEDLINE' AND accession = %s",
00695                 (reference.medline_id,))
00696         if not refs and reference.pubmed_id:
00697             refs = self.adaptor.execute_and_fetch_col0(
00698                 "SELECT reference_id" \
00699                 "  FROM reference JOIN dbxref USING (dbxref_id)" \
00700                 " WHERE dbname = 'PUBMED' AND accession = %s",
00701                 (reference.pubmed_id,))
00702         if not refs:
00703             s = []
00704             for f in reference.authors, reference.title, reference.journal:
00705                 s.append(f or "<undef>")
00706             crc = crc64("".join(s))
00707             refs = self.adaptor.execute_and_fetch_col0(
00708                 "SELECT reference_id FROM reference" \
00709                   r" WHERE crc = %s", (crc,))
00710         if not refs:
00711             if reference.medline_id:
00712                 dbxref_id = self._add_dbxref("MEDLINE",
00713                                              reference.medline_id, 0)
00714             elif reference.pubmed_id:
00715                 dbxref_id = self._add_dbxref("PUBMED",
00716                                              reference.pubmed_id, 0)
00717             else:
00718                 dbxref_id = None
00719             authors = reference.authors or None
00720             title =  reference.title or None
00721             #The location/journal field cannot be Null, so default
00722             #to an empty string rather than None:
00723             journal = reference.journal or ""
00724             self.adaptor.execute(
00725                 "INSERT INTO reference (dbxref_id, location," \
00726                 " title, authors, crc)" \
00727                 " VALUES (%s, %s, %s, %s, %s)",
00728                 (dbxref_id, journal, title,
00729                  authors, crc))
00730             reference_id = self.adaptor.last_id("reference")
00731         else:
00732             reference_id = refs[0]
00733 
00734         if reference.location:
00735             start = 1 + int(str(reference.location[0].start))
00736             end = int(str(reference.location[0].end))
00737         else:
00738             start = None
00739             end = None
00740         
00741         sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
00742               " start_pos, end_pos, rank)" \
00743               " VALUES (%s, %s, %s, %s, %s)"
00744         self.adaptor.execute(sql, (bioentry_id, reference_id,
00745                                    start, end, rank + 1))
00746         
00747     def _load_seqfeature(self, feature, feature_rank, bioentry_id):
00748         """Load a biopython SeqFeature into the database (PRIVATE).
00749         """
00750         seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank,
00751                                                     bioentry_id)
00752         self._load_seqfeature_locations(feature, seqfeature_id)
00753         self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
00754 
00755     def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
00756         """Load the first tables of a seqfeature and returns the id (PRIVATE).
00757 
00758         This loads the "key" of the seqfeature (ie. CDS, gene) and
00759         the basic seqfeature table itself.
00760         """
00761         ontology_id = self._get_ontology_id('SeqFeature Keys')
00762         seqfeature_key_id = self._get_term_id(feature_type,
00763                                               ontology_id = ontology_id)
00764         # XXX source is always EMBL/GenBank/SwissProt here; it should depend on
00765         # the record (how?)
00766         source_cat_id = self._get_ontology_id('SeqFeature Sources')
00767         source_term_id = self._get_term_id('EMBL/GenBank/SwissProt',
00768                                       ontology_id = source_cat_id)
00769         
00770         sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \
00771               r"source_term_id, rank) VALUES (%s, %s, %s, %s)"
00772         self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
00773                                    source_term_id, feature_rank + 1))
00774         seqfeature_id = self.adaptor.last_id('seqfeature')
00775 
00776         return seqfeature_id
00777 
00778     def _load_seqfeature_locations(self, feature, seqfeature_id):
00779         """Load all of the locations for a SeqFeature into tables (PRIVATE).
00780 
00781         This adds the locations related to the SeqFeature into the
00782         seqfeature_location table. Fuzzies are not handled right now.
00783         For a simple location, ie (1..2), we have a single table row
00784         with seq_start = 1, seq_end = 2, location_rank = 1.
00785 
00786         For split locations, ie (1..2, 3..4, 5..6) we would have three
00787         row tables with:
00788             start = 1, end = 2, rank = 1
00789             start = 3, end = 4, rank = 2
00790             start = 5, end = 6, rank = 3
00791         """
00792         # TODO - Record an ontology for the locations (using location.term_id)
00793         # which for now as in BioPerl we leave defaulting to NULL.
00794         if feature.location_operator and feature.location_operator != "join":
00795             # e.g. order locations... we don't record "order" so it
00796             # will become a "join" on reloading. What does BioPerl do?
00797             import warnings
00798             warnings.warn("%s location operators are not fully supported" \
00799                           % feature.location_operator)
00800         
00801         # two cases, a simple location or a split location
00802         if not feature.sub_features:    # simple location
00803             self._insert_seqfeature_location(feature, 1, seqfeature_id)
00804         else: # split location
00805             for rank, cur_feature in enumerate(feature.sub_features):
00806                 self._insert_seqfeature_location(cur_feature,
00807                                                  rank + 1,
00808                                                  seqfeature_id)
00809 
00810     def _insert_seqfeature_location(self, feature, rank, seqfeature_id):
00811         """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE).
00812 
00813         TODO - Add location_operators to location_qualifier_value.
00814         """
00815         # convert biopython locations to the 1-based location system
00816         # used in bioSQL
00817         # XXX This could also handle fuzzies
00818         start = int(feature.location.start) + 1
00819         end = int(feature.location.end)
00820 
00821         # Biopython uses None when we don't know strand information but
00822         # BioSQL requires something (non null) and sets this as zero
00823         # So we'll use the strand or 0 if Biopython spits out None
00824         strand = feature.strand or 0
00825 
00826         # TODO - Record an ontology term for the location (location.term_id)
00827         # which for now like BioPerl we'll leave as NULL.
00828         # This might allow us to record "between" positions properly, but I
00829         # doesn't really see how it could work for before/after fuzzy positions
00830         loc_term_id = None
00831 
00832         if feature.ref:
00833             # sub_feature remote locations when they are in the same db as the current
00834             # record do not have a value for ref_db, which the SeqFeature object
00835             # stores as None. BioSQL schema requires a varchar and is not NULL 
00836             dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref)
00837         else:
00838             dbxref_id = None
00839 
00840         sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \
00841               r"start_pos, end_pos, strand, rank) " \
00842               r"VALUES (%s, %s, %s, %s, %s, %s, %s)"
00843         self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id,
00844                                    start, end, strand, rank))
00845 
00846         """
00847         # See Bug 2677
00848         # TODO - Record the location_operator (e.g. "join" or "order")
00849         # using the location_qualifier_value table (which we and BioPerl
00850         # have historically left empty).
00851         # Note this will need an ontology term for the location qualifer
00852         # (location_qualifier_value.term_id) for which oddly the schema
00853         # does not allow NULL.
00854         if feature.location_operator:
00855             #e.g. "join" (common),
00856             #or "order" (see Tests/GenBank/protein_refseq2.gb)
00857             location_id = self.adaptor.last_id('location')
00858             loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
00859             sql = r"INSERT INTO location_qualifier_value" \
00860                   r"(location_id, term_id, value)" \
00861                   r"VALUES (%s, %s, %s)"
00862             self.adaptor.execute(sql, (location_id, loc_qual_term_id,
00863                                        feature.location_operator))
00864         """
00865 
00866     def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
00867         """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).
00868 
00869         Qualifiers should be a dictionary of the form:
00870             {key : [value1, value2]}
00871         """
00872         tag_ontology_id = self._get_ontology_id('Annotation Tags')
00873         for qualifier_key in qualifiers:
00874             # Treat db_xref qualifiers differently to sequence annotation
00875             # qualifiers by populating the seqfeature_dbxref and dbxref
00876             # tables.  Other qualifiers go into the seqfeature_qualifier_value
00877             # and (if new) term tables.
00878             if qualifier_key != 'db_xref':
00879                 qualifier_key_id = self._get_term_id(qualifier_key,
00880                                                   ontology_id=tag_ontology_id)
00881                 # now add all of the values to their table
00882                 entries = qualifiers[qualifier_key]
00883                 if not isinstance(entries, list):
00884                     # Could be a plain string, or an int or a float.
00885                     # However, we exect a list of strings here.
00886                     entries = [entries]
00887                 for qual_value_rank in range(len(entries)):
00888                     qualifier_value = entries[qual_value_rank]
00889                     sql = r"INSERT INTO seqfeature_qualifier_value "\
00890                           r" (seqfeature_id, term_id, rank, value) VALUES"\
00891                           r" (%s, %s, %s, %s)"
00892                     self.adaptor.execute(sql, (seqfeature_id,
00893                                                qualifier_key_id,
00894                                                qual_value_rank + 1,
00895                                                qualifier_value))
00896             else:
00897                 # The dbxref_id qualifier/value sets go into the dbxref table
00898                 # as dbname, accession, version tuples, with dbxref.dbxref_id
00899                 # being automatically assigned, and into the seqfeature_dbxref
00900                 # table as seqfeature_id, dbxref_id, and rank tuples
00901                 self._load_seqfeature_dbxref(qualifiers[qualifier_key],
00902                                              seqfeature_id)
00903 
00904 
00905     def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
00906         """Add database crossreferences of a SeqFeature to the database (PRIVATE).
00907 
00908             o dbxrefs           List, dbxref data from the source file in the
00909                                 format <database>:<accession>
00910 
00911             o seqfeature_id     Int, the identifier for the seqfeature in the
00912                                 seqfeature table
00913 
00914             Insert dbxref qualifier data for a seqfeature into the
00915             seqfeature_dbxref and, if required, dbxref tables.
00916             The dbxref_id qualifier/value sets go into the dbxref table
00917             as dbname, accession, version tuples, with dbxref.dbxref_id
00918             being automatically assigned, and into the seqfeature_dbxref
00919             table as seqfeature_id, dbxref_id, and rank tuples
00920         """
00921         # NOTE - In older versions of Biopython, we would map the GenBank
00922         # db_xref "name", for example "GI" to "GeneIndex", and give a warning
00923         # for any unknown terms.  This was a long term maintainance problem,
00924         # and differed from BioPerl and BioJava's implementation.  See bug 2405
00925         for rank, value in enumerate(dbxrefs):
00926             # Split the DB:accession format string at colons.  We have to
00927             # account for multiple-line and multiple-accession entries
00928             try:
00929                 dbxref_data = value.replace(' ','').replace('\n','').split(':')
00930                 db = dbxref_data[0]
00931                 accessions = dbxref_data[1:]
00932             except:
00933                 raise ValueError("Parsing of db_xref failed: '%s'" % value)
00934             # Loop over all the grabbed accessions, and attempt to fill the
00935             # table
00936             for accession in accessions:
00937                 # Get the dbxref_id value for the dbxref data
00938                 dbxref_id = self._get_dbxref_id(db, accession)
00939                 # Insert the seqfeature_dbxref data
00940                 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
00941         
00942     def _get_dbxref_id(self, db, accession):
00943         """ _get_dbxref_id(self, db, accession) -> Int
00944 
00945             o db          String, the name of the external database containing
00946                           the accession number
00947 
00948             o accession   String, the accession of the dbxref data
00949 
00950             Finds and returns the dbxref_id for the passed data.  The method
00951             attempts to find an existing record first, and inserts the data
00952             if there is no record.
00953         """
00954         # Check for an existing record
00955         sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \
00956               r'AND accession = %s'
00957         dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
00958         # If there was a record, return the dbxref_id, else create the
00959         # record and return the created dbxref_id
00960         if dbxref_id:
00961             return dbxref_id[0]
00962         return self._add_dbxref(db, accession, 0)
00963 
00964     def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
00965         """ Check for a pre-existing seqfeature_dbxref entry with the passed
00966             seqfeature_id and dbxref_id.  If one does not exist, insert new
00967             data
00968 
00969         """
00970         # Check for an existing record
00971         sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \
00972               r"WHERE seqfeature_id = %s AND dbxref_id = %s"
00973         result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id,
00974                                                            dbxref_id))
00975         # If there was a record, return without executing anything, else create
00976         # the record and return
00977         if result:
00978             return result
00979         return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
00980 
00981     def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
00982         """ Insert a seqfeature_dbxref row and return the seqfeature_id and
00983             dbxref_id
00984         """
00985         sql = r'INSERT INTO seqfeature_dbxref ' \
00986               '(seqfeature_id, dbxref_id, rank) VALUES' \
00987               r'(%s, %s, %s)'
00988         self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
00989         return (seqfeature_id, dbxref_id)
00990 
00991     def _load_dbxrefs(self, record, bioentry_id):
00992         """Load any sequence level cross references into the database (PRIVATE).
00993 
00994         See table bioentry_dbxref."""
00995         for rank, value in enumerate(record.dbxrefs):
00996             # Split the DB:accession string at first colon.
00997             # We have to cope with things like:
00998             # "MGD:MGI:892" (db="MGD", accession="MGI:892")
00999             # "GO:GO:123" (db="GO", accession="GO:123")
01000             #
01001             # Annoyingly I have seen the NCBI use both the style
01002             # "GO:GO:123" and "GO:123" in different vintages.
01003             assert value.count("\n")==0
01004             try:
01005                 db, accession = value.split(':',1)
01006                 db = db.strip()
01007                 accession = accession.strip()
01008             except:
01009                 raise ValueError("Parsing of dbxrefs list failed: '%s'" % value)
01010             # Get the dbxref_id value for the dbxref data
01011             dbxref_id = self._get_dbxref_id(db, accession)
01012             # Insert the bioentry_dbxref  data
01013             self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank+1)
01014 
01015     def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
01016         """ Check for a pre-existing bioentry_dbxref entry with the passed
01017             seqfeature_id and dbxref_id.  If one does not exist, insert new
01018             data
01019 
01020         """
01021         # Check for an existing record
01022         sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \
01023               r"WHERE bioentry_id = %s AND dbxref_id = %s"
01024         result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id,
01025                                                            dbxref_id))
01026         # If there was a record, return without executing anything, else create
01027         # the record and return
01028         if result:
01029             return result
01030         return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
01031 
01032     def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
01033         """ Insert a bioentry_dbxref row and return the seqfeature_id and
01034             dbxref_id
01035         """
01036         sql = r'INSERT INTO bioentry_dbxref ' \
01037               '(bioentry_id,dbxref_id,rank) VALUES ' \
01038               '(%s, %s, %s)'
01039         self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
01040         return (bioentry_id, dbxref_id)
01041             
01042 class DatabaseRemover:
01043     """Complement the Loader functionality by fully removing a database.
01044 
01045     This probably isn't really useful for normal purposes, since you
01046     can just do a:
01047         DROP DATABASE db_name
01048     and then recreate the database. But, it's really useful for testing
01049     purposes.
01050 
01051     YB: now use the cascaded deletions
01052     """
01053     def __init__(self, adaptor, dbid):
01054         """Initialize with a database id and adaptor connection.
01055         """
01056         self.adaptor = adaptor
01057         self.dbid = dbid
01058 
01059     def remove(self):
01060         """Remove everything related to the given database id.
01061         """
01062         sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s"
01063         self.adaptor.execute(sql, (self.dbid,))
01064         sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s"
01065         self.adaptor.execute(sql, (self.dbid,))
01066