Back to index

python-biopython  1.60
test_BioSQL_SeqIO.py
Go to the documentation of this file.
00001 # This code is part of the Biopython distribution and governed by its
00002 # license.  Please see the LICENSE file that should have been included
00003 # as part of this package.
00004 
00005 """Testing BioSQL with BioSQL
00006 
00007 Uses Bio.SeqIO to parse files, and then loads them into a BioSQL database,
00008 and checks we can retreive them again.
00009 
00010 Goals:
00011     Make sure that BioSQL preserves SeqRecord objects.
00012 """
00013 import os
00014 
00015 from Bio import MissingExternalDependencyError
00016 from Bio import SeqIO
00017 from Bio.Seq import UnknownSeq
00018 from StringIO import StringIO
00019 
00020 from BioSQL import BioSeqDatabase
00021 from BioSQL import BioSeq
00022 
00023 # This testing suite should try to detect whether a valid database
00024 # installation exists on this computer.  Only run the tests if it
00025 # does.
00026 try:
00027     from setup_BioSQL import DBDRIVER, DBTYPE
00028     from setup_BioSQL import DBHOST, DBUSER, DBPASSWD, TESTDB
00029     from setup_BioSQL import DBSCHEMA, SQL_FILE
00030 except (NameError, ImportError):
00031     message = "Check settings in Tests/setup_BioSQL.py "\
00032               "if you plan to use BioSQL."
00033     raise MissingExternalDependencyError(message)
00034 
00035 from seq_tests_common import checksum_summary, compare_record
00036 
00037 db_name = "biosql-seqio-test"
00038 #####################################################################
00039 
00040 #This list was based on a selection from test_SeqIO.py
00041 test_files = [ \
00042 #Following nucleic examples are also used in test_SeqIO_FastaIO.py
00043     ("fasta",  False, 'Fasta/lupine.nu', 1),
00044     ("fasta",  False, 'Fasta/elderberry.nu', 1),
00045     ("fasta",  False, 'Fasta/phlox.nu', 1),
00046     ("fasta",  False, 'Fasta/centaurea.nu', 1),
00047     ("fasta",  False, 'Fasta/wisteria.nu', 1),
00048     ("fasta",  False, 'Fasta/sweetpea.nu', 1),
00049     ("fasta",  False, 'Fasta/lavender.nu', 1),
00050 #Following protein examples are also used in test_SeqIO_FastaIO.py
00051     ("fasta",  False, 'Fasta/aster.pro', 1),
00052     ("fasta",  False, 'Fasta/loveliesbleeding.pro', 1),
00053     ("fasta",  False, 'Fasta/rose.pro', 1),
00054     ("fasta",  False, 'Fasta/rosemary.pro', 1),
00055 #Following examples are also used in test_SeqIO.py
00056     ("fasta",  False, 'Fasta/f001', 1), #Protein
00057     ("fasta",  False, 'Fasta/f002', 3), #DNA
00058     #("fasta", False, 'Fasta/f003', 2), #Protein with comments
00059     ("fasta",  False, 'Fasta/fa01', 2), #Protein with gaps
00060 #Following examples are also used in test_GFF.py
00061     ("fasta",  False, 'GFF/NC_001802.fna', 1), #upper case
00062     ("fasta",  True,  'GFF/multi.fna', 3), #Trivial nucleotide alignment
00063 #Following example is also used in test_registry.py
00064     ("fasta",  False, 'Registry/seqs.fasta', 2), #contains blank line
00065 #Following examples are also used in test_SwissProt.py
00066     ("swiss",  False, 'SwissProt/sp001', 1),
00067     ("swiss",  False, 'SwissProt/sp002', 1),
00068     ("swiss",  False, 'SwissProt/sp003', 1),
00069     ("swiss",  False, 'SwissProt/sp004', 1),
00070     ("swiss",  False, 'SwissProt/sp005', 1),
00071     ("swiss",  False, 'SwissProt/sp006', 1),
00072     ("swiss",  False, 'SwissProt/sp007', 1),
00073     ("swiss",  False, 'SwissProt/sp008', 1),
00074     ("swiss",  False, 'SwissProt/sp009', 1),
00075     ("swiss",  False, 'SwissProt/sp010', 1),
00076     ("swiss",  False, 'SwissProt/sp011', 1),
00077     ("swiss",  False, 'SwissProt/sp012', 1),
00078     ("swiss",  False, 'SwissProt/sp013', 1),
00079     ("swiss",  False, 'SwissProt/sp014', 1),
00080     ("swiss",  False, 'SwissProt/sp015', 1),
00081     ("swiss",  False, 'SwissProt/sp016', 1),
00082 #Following example is also used in test_registry.py
00083     ("swiss",  False, 'Registry/EDD_RAT.dat', 1),
00084 #Following examples are also used in test_GenBank.py
00085     ("genbank",False, 'GenBank/noref.gb', 1),
00086     ("genbank",False, 'GenBank/cor6_6.gb', 6),
00087     ("genbank",False, 'GenBank/iro.gb', 1),
00088     ("genbank",False, 'GenBank/pri1.gb', 1),
00089     ("genbank",False, 'GenBank/arab1.gb', 1),
00090     #protein_refseq.gb had malformed db_xref, fixed in protein_refseq2.gb
00091     #("genbank",False, 'GenBank/protein_refseq.gb', 1), 
00092     ("genbank",False, 'GenBank/protein_refseq2.gb', 1),
00093     ("genbank",False, 'GenBank/extra_keywords.gb', 1),
00094     ("genbank",False, 'GenBank/one_of.gb', 1),
00095     ("genbank",False, 'GenBank/NT_019265.gb', 1),
00096     ("genbank",False, 'GenBank/origin_line.gb', 1),
00097     ("genbank",False, 'GenBank/blank_seq.gb', 1),
00098     ("genbank",False, 'GenBank/dbsource_wrap.gb', 1),
00099     ("genbank",False, 'GenBank/NC_005816.gb', 1),
00100 # The next example is a truncated copy of gbvrl1.seq from
00101 # ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
00102 # This includes an NCBI header, and the first three records:
00103     ("genbank",False, 'GenBank/gbvrl1_start.seq', 3),
00104 #Following files are also used in test_GFF.py
00105     ("genbank",False, 'GFF/NC_001422.gbk', 1),
00106 #Following files are currently only used here and test_SeqIO:
00107     ("embl",      False, 'EMBL/TRBG361.embl', 1),
00108     ("embl",      False, 'EMBL/DD231055_edited.embl', 1),
00109     ("embl",      False, 'EMBL/SC10H5.embl', 1), # Pre 2006 style ID line
00110     ("embl",      False, 'EMBL/U87107.embl', 1), # Old ID line with SV line
00111     ]
00112 
00113 #####################################################################
00114 
00115 #TODO - Should we re-use the create_database() function currently
00116 #       defined in test_BioSQL.py here too?  This would allow us
00117 #       to deal with the error of an unknown database...
00118 #
00119 #print "Creating database"
00120 #from setup_BioSQL import create_database
00121 #create_database()
00122 
00123 print "Connecting to database"
00124 try:
00125     server = BioSeqDatabase.open_database(driver = DBDRIVER,
00126                                       user = DBUSER, passwd = DBPASSWD,
00127                                       host = DBHOST, db = TESTDB)
00128 except Exception, e:
00129     message = "Connection failed, check settings in Tests/setup_BioSQL.py "\
00130               "if you plan to use BioSQL: %s" % str(e)
00131     raise MissingExternalDependencyError(message)
00132 
00133 print "Removing existing sub-database '%s' (if exists)" % db_name
00134 if db_name in server.keys():
00135     #Might exist from a failed test run...
00136     #db = server[db_name]
00137     server.remove_database(db_name)
00138     server.commit()
00139 
00140 print "(Re)creating empty sub-database '%s'" % db_name
00141 db = server.new_database(db_name)
00142 
00143 db_count = 0
00144 for (t_format, t_alignment, t_filename, t_count) in test_files:
00145     print "Testing loading from %s format file %s" % (t_format, t_filename)
00146     assert os.path.isfile(t_filename), t_filename
00147 
00148     iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
00149     count = db.load(iterator)
00150     assert count == t_count
00151     db_count += count
00152     
00153     #print " - Committing %i records" % count
00154     server.commit()
00155     
00156     iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
00157     for record in iterator:
00158         print " - %s, %s" % (checksum_summary(record), record.id)
00159 
00160         key = record.name
00161         print " - Retrieving by name/display_id '%s'," % key,
00162         db_rec = db.lookup(name=key)
00163         compare_record(record, db_rec)
00164         db_rec = db.lookup(display_id=key)
00165         compare_record(record, db_rec)
00166         print "OK"
00167 
00168         key = record.id
00169         if key.count(".")==1 and key.split(".")[1].isdigit():
00170             print " - Retrieving by version '%s'," % key,
00171             db_rec = db.lookup(version=key)
00172             compare_record(record, db_rec)
00173             print "OK"
00174         
00175         if "accessions" in record.annotations:
00176             accs = sorted(set(record.annotations["accessions"]))
00177             for key in accs:
00178                 assert key, "Blank accession in annotation %s" % repr(accs)
00179                 try:
00180                     print " - Retrieving by accession '%s'," % key,
00181                     db_rec = db.lookup(accession=key)
00182                     compare_record(record, db_rec)
00183                     print "OK"
00184                 except IndexError:
00185                     print "Failed"
00186                     pass
00187 
00188         if "gi" in record.annotations:
00189             key = record.annotations['gi']
00190             if key != record.id:
00191                 print " - Retrieving by GI '%s'," % key,
00192                 db_rec = db.lookup(primary_id=key)
00193                 compare_record(record, db_rec)
00194                 print "OK"
00195 
00196     assert db_count == len(db), "%i vs %i" % (count, len(db))
00197     assert db_count == len(db.keys())
00198     assert db_count == len(db.values())
00199     assert db_count == len(db.items())
00200 
00201 for db_rec in db.itervalues():
00202     assert isinstance(db_rec, BioSeq.DBSeqRecord)
00203 for key, db_rec in db.iteritems():
00204     assert isinstance(db_rec, BioSeq.DBSeqRecord)
00205     compare_record(db_rec, db[key])
00206 
00207 print "Removing (deleting) '%s'" % db_name
00208 server.remove_database(db_name)
00209 
00210 print "Committing remaining changes"
00211 server.commit()
00212 
00213 print "Closing connection"
00214 server.close()