Back to index

python-biopython  1.60
seq_tests_common.py
Go to the documentation of this file.
00001 # This code is part of the Biopython distribution and governed by its
00002 # license.  Please see the LICENSE file that should have been included
00003 # as part of this package.
00004 from Bio.Seq import UnknownSeq
00005 from Bio.SeqUtils.CheckSum import seguid
00006 from Bio.SeqFeature import ExactPosition, FeatureLocation, SeqFeature
00007 from Bio.SeqRecord import SeqRecord
00008 
00009 def checksum_summary(record):
00010     if isinstance(record.seq, UnknownSeq):
00011         return repr(record.seq)
00012     if len(record.seq) < 25:
00013         short = record.seq.tostring()
00014     else:
00015         short = record.seq.tostring()[:19] \
00016               + "..." + record.seq.tostring()[-3:]
00017     return "%s [%s] len %i" \
00018            % (short, seguid(record.seq), len(record.seq))
00019 
00020 def compare_reference(old_r, new_r):
00021     """Compare two Reference objects
00022 
00023     Note new_r is assumed to be a BioSQL DBSeqRecord, due to limitations
00024     of the BioSQL table structure.
00025     """
00026     assert old_r.title == new_r.title, \
00027            "%s vs %s" % (old_r.title, new_r.title)
00028     assert old_r.authors == new_r.authors, \
00029            "%s vs %s" % (old_r.authors, new_r.authors)
00030     assert old_r.journal == new_r.journal, \
00031            "%s vs %s" % (old_r.journal, new_r.journal)
00032     assert old_r.medline_id == new_r.medline_id, \
00033            "%s vs %s" % (old_r.medline_id, new_r.medline_id)
00034 
00035     if old_r.pubmed_id and new_r.pubmed_id:
00036         assert old_r.pubmed_id == new_r.pubmed_id
00037         #Looking at BioSQL/BioSeq.py function _retrieve_reference
00038         #it seems that it will get either the MEDLINE or PUBMED,
00039         #but not both.  I *think* the current schema does not allow
00040         #us to store both... must confirm this.
00041     
00042     #TODO - assert old_r.comment == new_r.comment
00043     #Looking at the tables, I *think* the current schema does not
00044     #allow us to store a reference comment.  Must confirm this.
00045     assert old_r.comment == new_r.comment or new_r.comment == "", \
00046                           "%r vs %r" % (old_r.comment, new_r.comment)
00047 
00048     #TODO - assert old_r.consrtm == new_r.consrtm
00049     #Looking at the tables, I *think* the current schema does not
00050     #allow us to store a consortium.
00051     assert old_r.consrtm == new_r.consrtm or new_r.consrtm == ""
00052     
00053     if len(old_r.location) == 0:
00054         assert len(new_r.location) == 0
00055     else:
00056         #BioSQL can only store ONE location!
00057         #TODO - Check BioPerl with a GenBank file with multiple ref locations
00058         assert isinstance(old_r.location[0], FeatureLocation)
00059         assert isinstance(new_r.location[0], FeatureLocation)
00060         assert old_r.location[0].start == new_r.location[0].start and \
00061                old_r.location[0].end == new_r.location[0].end
00062 
00063     return True
00064 
00065 def compare_feature(old_f, new_f):
00066     """Compare two SeqFeature objects"""
00067     assert isinstance(old_f, SeqFeature)
00068     assert isinstance(new_f, SeqFeature)
00069 
00070     assert old_f.type == new_f.type, \
00071         "%s -> %s" % (old_f.type, new_f.type) 
00072     
00073     assert old_f.strand == new_f.strand, \
00074         "%s -> %s" % (old_f.strand, new_f.strand)
00075 
00076     assert old_f.ref == new_f.ref, \
00077         "%s -> %s" % (old_f.ref, new_f.ref)
00078 
00079     assert old_f.ref_db == new_f.ref_db, \
00080         "%s -> %s" % (old_f.ref_db, new_f.ref_db)
00081 
00082     #TODO - BioSQL does not store/retrieve feature's id (Bug 2526)   
00083     assert old_f.id == new_f.id or new_f.id == "<unknown id>"
00084 
00085     #TODO - Work out how the location_qualifier_value table should
00086     #be used, given BioPerl seems to ignore it (Bug 2766)
00087     #assert old_f.location_operator == new_f.location_operator, \
00088     #        "%s -> %s" % (old_f.location_operator, new_f.location_operator)
00089     
00090     # We dont store fuzzy locations:
00091     try:
00092         assert str(old_f.location) == str(new_f.location), \
00093            "%s -> %s" % (str(old_f.location), str(new_f.location))
00094     except AssertionError, e:
00095         if isinstance(old_f.location.start, ExactPosition) and \
00096             isinstance(old_f.location.end, ExactPosition):
00097             # Its not a problem with fuzzy locations, re-raise 
00098             raise e
00099         else:
00100             assert old_f.location.nofuzzy_start == \
00101                     new_f.location.nofuzzy_start, \
00102                     "%s -> %s" % (old_f.location.nofuzzy_start, \
00103                                   new_f.location.nofuzzy_start)
00104             assert old_f.location.nofuzzy_end == \
00105                     new_f.location.nofuzzy_end, \
00106                     "%s -> %s" % (old_f.location.nofuzzy_end, \
00107                                   new_f.location.nofuzzy_end)
00108 
00109     assert len(old_f.sub_features) == len(new_f.sub_features), \
00110         "number of sub_features: %s -> %s" % \
00111         (len(old_f.sub_features), len(new_f.sub_features))
00112     
00113     for old_sub, new_sub in zip(old_f.sub_features, new_f.sub_features):
00114         
00115         assert old_sub.type == new_sub.type, \
00116             "%s -> %s" % (old_sub.type, new_sub.type)
00117         
00118         assert old_sub.strand == new_sub.strand, \
00119             "%s -> %s" % (old_sub.strand, new_sub.strand)
00120 
00121         assert old_sub.ref == new_sub.ref, \
00122             "%s -> %s" % (old_sub.ref, new_sub.ref)
00123 
00124         assert old_sub.ref_db == new_sub.ref_db, \
00125             "%s -> %s" % (old_sub.ref_db, new_sub.ref_db)
00126 
00127         #TODO - Work out how the location_qualifier_value table should
00128         #be used, given BioPerl seems to ignore it (Bug 2766)
00129         #assert old_sub.location_operator == new_sub.location_operator, \
00130         #    "%s -> %s" % (old_sub.location_operator, new_sub.location_operator)
00131 
00132         # Compare sub-feature Locations:
00133         # 
00134         # BioSQL currently does not store fuzzy locations, but instead stores
00135         # them as FeatureLocation.nofuzzy_start FeatureLocation.nofuzzy_end.
00136         # The vast majority of cases will be comparisons of ExactPosition
00137         # class locations, so we'll try that first and catch the exceptions.
00138 
00139         try:
00140             assert str(old_sub.location) == str(new_sub.location), \
00141                "%s -> %s" % (str(old_sub.location), str(new_sub.location))
00142         except AssertionError, e:
00143             if isinstance(old_sub.location.start, ExactPosition) and \
00144                 isinstance(old_sub.location.end, ExactPosition):
00145                 # Its not a problem with fuzzy locations, re-raise 
00146                 raise e
00147             else:
00148                 #At least one of the locations is fuzzy
00149                 assert old_sub.location.nofuzzy_start == \
00150                        new_sub.location.nofuzzy_start, \
00151                        "%s -> %s" % (old_sub.location.nofuzzy_start, \
00152                                      new_sub.location.nofuzzy_start)
00153                 assert old_sub.location.nofuzzy_end == \
00154                        new_sub.location.nofuzzy_end, \
00155                        "%s -> %s" % (old_sub.location.nofuzzy_end, \
00156                                      new_sub.location.nofuzzy_end)
00157 
00158     assert len(old_f.qualifiers) == len(new_f.qualifiers)    
00159     assert set(old_f.qualifiers) == set(new_f.qualifiers)
00160     for key in old_f.qualifiers:
00161         if isinstance(old_f.qualifiers[key], str):
00162             if isinstance(new_f.qualifiers[key], str):
00163                 assert old_f.qualifiers[key] == new_f.qualifiers[key]
00164             elif isinstance(new_f.qualifiers[key], list):
00165                 #Maybe a string turning into a list of strings?
00166                 assert [old_f.qualifiers[key]] == new_f.qualifiers[key], \
00167                         "%s -> %s" \
00168                         % (repr(old_f.qualifiers[key]),
00169                            repr(new_f.qualifiers[key]))
00170             else:
00171                 assert False, "Problem with feature's '%s' qualifier" & key
00172         else:
00173             #Should both be lists of strings...
00174             assert old_f.qualifiers[key] == new_f.qualifiers[key], \
00175                 "%s -> %s" % (old_f.qualifiers[key], new_f.qualifiers[key])
00176     return True
00177 
00178 def compare_sequence(old, new):
00179     """Compare two Seq or DBSeq objects"""
00180     assert len(old) == len(new), "%i vs %i" % (len(old), len(new))
00181     assert old.tostring() == new.tostring()
00182 
00183     if isinstance(old, UnknownSeq):
00184         assert isinstance(new, UnknownSeq)
00185     else:
00186         assert not isinstance(new, UnknownSeq)
00187 
00188     ln = len(old)
00189     s = old.tostring()
00190     assert isinstance(s, str)
00191 
00192     #Don't check every single element; for long sequences
00193     #this takes far far far too long to run!
00194     #Test both positive and negative indices
00195     if ln < 50:
00196         indices = range(-ln,ln)
00197     else:
00198         #A selection of end cases, and the mid point
00199         indices = [-ln,-ln+1,-(ln//2),-1,0,1,ln//2,ln-2,ln-1]
00200 
00201     #Test element access,    
00202     for i in indices:
00203         expected = s[i]
00204         assert expected == old[i]
00205         assert expected == new[i]
00206 
00207     #Test slices
00208     indices.append(ln) #check copes with overflows
00209     indices.append(ln+1000) #check copes with overflows
00210     for i in indices:
00211         for j in indices:
00212             expected = s[i:j]
00213             assert expected == old[i:j].tostring(), \
00214                    "Slice %s vs %s" % (repr(expected), repr(old[i:j]))
00215             assert expected == new[i:j].tostring(), \
00216                    "Slice %s vs %s" % (repr(expected), repr(new[i:j]))
00217             #Slicing with step of 1 should make no difference.
00218             #Slicing with step 3 might be useful for codons.
00219             for step in [1,3]:
00220                 expected = s[i:j:step]
00221                 assert expected == old[i:j:step].tostring()
00222                 assert expected == new[i:j:step].tostring()
00223 
00224         #Check automatic end points
00225         expected = s[i:]
00226         assert expected == old[i:].tostring()
00227         assert expected == new[i:].tostring()
00228                 
00229         expected = s[:i]
00230         assert expected == old[:i].tostring()
00231         assert expected == new[:i].tostring()
00232 
00233     #Check "copy" splice
00234     assert s == old[:].tostring()
00235     assert s == new[:].tostring()
00236     return True
00237 
00238 def compare_features(old_list, new_list):
00239     assert isinstance(old_list, list)
00240     assert isinstance(new_list, list)
00241     assert len(old_list) == len(new_list)
00242     for old_f, new_f in zip(old_list, new_list):
00243         if not compare_feature(old_f, new_f):
00244             return False
00245     return True
00246         
00247 def compare_record(old, new):
00248     """Compare two SeqRecord or DBSeqRecord objects"""
00249     assert isinstance(old, SeqRecord)
00250     assert isinstance(new, SeqRecord)
00251     #Sequence:
00252     compare_sequence(old.seq, new.seq)
00253     #Basics:
00254     assert old.id == new.id
00255     assert old.name == new.name
00256     assert old.description == new.description
00257     assert old.dbxrefs == new.dbxrefs, \
00258            "dbxrefs mismatch\nOld: %s\nNew: %s" \
00259            % (old.dbxrefs, new.dbxrefs)
00260     #Features:
00261     if not compare_features(old.features, new.features):
00262         return False
00263 
00264     #Annotation:
00265     #We are expecting to see some "extra" annotations appearing,
00266     #such as 'cross_references', 'dates', 'data_file_division',
00267     #'ncbi_taxon' and 'gi'.
00268     #TODO - address these, see Bug 2681?
00269     new_keys = set(new.annotations).difference(old.annotations)
00270     new_keys = new_keys.difference(['cross_references', 'date', 
00271                                     'data_file_division', 'ncbi_taxid', 'gi'])
00272     assert not new_keys, "Unexpected new annotation keys: %s" \
00273            % ", ".join(new_keys)
00274     missing_keys = set(old.annotations).difference(new.annotations)
00275     missing_keys = missing_keys.difference(['ncbi_taxid', # Can't store chimeras
00276                                             ])
00277     assert not missing_keys, "Unexpectedly missing annotation keys: %s" \
00278            % ", ".join(missing_keys)
00279     
00280     #In the short term, just compare any shared keys:
00281     for key in set(old.annotations).intersection(new.annotations):
00282         if key == "references":
00283             assert len(old.annotations[key]) == len(new.annotations[key])
00284             for old_r, new_r in zip(old.annotations[key], new.annotations[key]):
00285                 compare_reference(old_r, new_r)
00286         elif key == "comment":
00287             #Turn them both into containing strings for comparison - due to
00288             #line wrapping in GenBank etc we don't really expect the white
00289             #space to be 100% the same.
00290             if isinstance(old.annotations[key], list):
00291                 old_comment = " ".join(old.annotations[key])
00292             else:
00293                 old_comment = old.annotations[key]
00294             if isinstance(new.annotations[key], list):
00295                 new_comment = " ".join(new.annotations[key])
00296             else:
00297                 new_comment = new.annotations[key]
00298             old_comment = old_comment.replace("\n"," ").replace("  ", " ")
00299             new_comment = new_comment.replace("\n"," ").replace("  ", " ")
00300             assert old_comment == new_comment, \
00301                 "Comment annotation changed by load/retrieve\n" \
00302                 "Was:%s\nNow:%s" \
00303                 % (repr(old_comment), repr(new_comment))
00304         elif key in ["taxonomy", "organism", "source"]:
00305             #If there is a taxon id recorded, these fields get overwritten
00306             #by data from the taxon/taxon_name tables.  There is no
00307             #guarantee that they will be identical after a load/retrieve.
00308             assert isinstance(new.annotations[key], basestring) \
00309                 or isinstance(new.annotations[key], list)
00310         elif type(old.annotations[key]) == type(new.annotations[key]):
00311             assert old.annotations[key] == new.annotations[key], \
00312                 "Annotation '%s' changed by load/retrieve\nWas:%s\nNow:%s" \
00313                 % (key, old.annotations[key], new.annotations[key])
00314         elif isinstance(old.annotations[key], str) \
00315         and isinstance(new.annotations[key], list):
00316             #Any annotation which is a single string gets turned into
00317             #a list containing one string by BioSQL at the moment.
00318             assert [old.annotations[key]] == new.annotations[key], \
00319                 "Annotation '%s' changed by load/retrieve\nWas:%s\nNow:%s" \
00320                 % (key, old.annotations[key], new.annotations[key])
00321         elif isinstance(old.annotations[key], list) \
00322         and isinstance(new.annotations[key], str):
00323             assert old.annotations[key] == [new.annotations[key]], \
00324                 "Annotation '%s' changed by load/retrieve\nWas:%s\nNow:%s" \
00325                 % (key, old.annotations[key], new.annotations[key])
00326     return True
00327 
00328 def compare_records(old_list, new_list):
00329     assert isinstance(old_list, list)
00330     assert isinstance(new_list, list)
00331     assert len(old_list) == len(new_list)
00332     for old_r, new_r in zip(old_list, new_list):
00333         if not compare_record(old_r, new_r):
00334             return False
00335     return True