Back to index

python-biopython  1.60
test_GenBank.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 """Test the GenBank parser and make sure everything is working smoothly.
00003 """
00004 # standard library
00005 import os
00006 import cStringIO
00007 
00008 # GenBank stuff to test
00009 from Bio import GenBank
00010 from Bio.GenBank import utils
00011 
00012 gb_file_dir = os.path.join(os.getcwd(), 'GenBank')
00013 
00014 test_files = ['noref.gb', 'cor6_6.gb', 'iro.gb', 'pri1.gb', 'arab1.gb',
00015               'protein_refseq.gb', 'extra_keywords.gb', 'one_of.gb',
00016               'NT_019265.gb', 'origin_line.gb', 'blank_seq.gb',
00017               'dbsource_wrap.gb', 'gbvrl1_start.seq', 'NC_005816.gb']
00018 
00019 # We only test writing on a subset of the examples:
00020 write_format_files = ['noref.gb', 'cor6_6.gb', 'iro.gb', 'pri1.gb', 'arab1.gb',
00021                       'extra_keywords.gb', 'one_of.gb', 'origin_line.gb']
00022 # don't test writing on protein_refseq, since it is horribly nasty
00023 # don't test writing on the CONTIG refseq, because the wrapping of
00024 # locations won't work exactly
00025 # don't test writing on blank_seq because it lacks a sequence type
00026 # don't test dbsource_wrap because it is a junky RefSeq file
00027 
00028 files_to_parse = []
00029 for file in test_files:
00030     files_to_parse.append(os.path.join(gb_file_dir, file))
00031 
00032 # parse the bioperl test files
00033 # comment this out for now -- there are a bunch of junky records in here
00034 # that no longer exist in GenBank -- do we really need to support those?
00035 # files_to_parse = [os.path.join(os.getcwd(), 'GenBank', 'bioperl_test.gb')]
00036 
00037 # parse the biojava test files
00038 # files_to_parse += [os.path.join(os.getcwd(), 'GenBank', 'biojava_test.gb')]
00039 
00040 # test the parsers
00041 feature_parser = GenBank.FeatureParser(debug_level = 0)
00042 record_parser = GenBank.RecordParser(debug_level = 0)
00043 
00044 all_parsers = [feature_parser, record_parser]
00045 print "Testing parsers..."
00046 for parser in all_parsers:
00047     for filename in files_to_parse:
00048         if not os.path.isfile(filename):
00049             print "Missing test input file: %s" % filename
00050             continue
00051         
00052         handle = open(filename, 'r')
00053         iterator = GenBank.Iterator(handle, parser)
00054         
00055         while 1:
00056             cur_record = iterator.next()
00057 
00058             if cur_record is None:
00059                 break
00060 
00061             if isinstance(parser, GenBank.FeatureParser):
00062                 print "***Record from %s with the FeatureParser" \
00063                       % filename.split(os.path.sep)[-1]
00064                 print "Seq:", repr(cur_record.seq)
00065                 print "Id:", cur_record.id
00066                 print "Name:", cur_record.name
00067                 print "Description", cur_record.description
00068                 print "Annotations***"
00069                 ann_keys = cur_record.annotations.keys()
00070                 ann_keys.sort()
00071                 for ann_key in ann_keys:
00072                     if ann_key != 'references':
00073                         print "Key: %s" % ann_key
00074                         print "Value: %s" % \
00075                               cur_record.annotations[ann_key]
00076                     else:
00077                         print "References*"
00078                         for reference in cur_record.annotations[ann_key]:
00079                             print str(reference) 
00080                 print "Feaures"
00081                 for feature in cur_record.features:
00082                     print feature
00083                 print "DB cross refs", cur_record.dbxrefs
00084             elif isinstance(parser, GenBank.RecordParser):
00085                 print "***Record from %s with the RecordParser" \
00086                       % filename.split(os.path.sep)[-1]
00087                 print "sequence length: %i" % len(cur_record.sequence)
00088                 print "locus:", cur_record.locus
00089                 print "definition:", cur_record.definition
00090                 print "accession:", cur_record.accession
00091                 for reference in cur_record.references:
00092                     print "reference title:", reference.title
00093 
00094                 for feature in cur_record.features:
00095                     print "feature key:", feature.key
00096                     print "location:", feature.location
00097                     print "num qualifiers:", len(feature.qualifiers)
00098                     for qualifier in feature.qualifiers:
00099                         print "key:", qualifier.key, "value:", qualifier.value
00100                          
00101         handle.close()
00102         
00103 #The dictionaries code has been deprecated
00104 #print "Testing dictionaries..."
00105 #...
00106 
00107 # test writing GenBank format
00108 print "Testing writing GenBank format..."
00109 
00110 def do_comparison(good_record, test_record):
00111     """Compare two records to see if they are the same.
00112 
00113     Ths compares the two GenBank record, and will raise an AssertionError
00114     if two lines do not match, showing the non-matching lines.
00115     """
00116     good_handle = cStringIO.StringIO(good_record)
00117     test_handle = cStringIO.StringIO(test_record)
00118 
00119     while 1:
00120         good_line = good_handle.readline()
00121         test_line = test_handle.readline()
00122 
00123         if not(good_line) and not(test_line):
00124             break
00125         if not(good_line):
00126             raise AssertionError("Extra info in Test: `%s`" % test_line)
00127         if not(test_line):
00128             raise AssertionError("Extra info in Expected: `%s`" % good_line)
00129         test_normalized = ' '.join([x for x in test_line.split() if x])
00130         good_normalized = ' '.join([x for x in good_line.split() if x])
00131         assert test_normalized == good_normalized, \
00132                "Expected does not match Test.\nExpect:`%s`\nTest  :`%s`\n" % \
00133                (good_line, test_line)
00134     
00135 def t_write_format():
00136     record_parser = GenBank.RecordParser(debug_level = 0)
00137 
00138     for file in write_format_files:
00139         print "Testing GenBank writing for %s..." % os.path.basename(file)
00140         cur_handle = open(os.path.join("GenBank", file), "r")
00141         compare_handle = open(os.path.join("GenBank", file), "r")
00142         
00143         iterator = GenBank.Iterator(cur_handle, record_parser)
00144         compare_iterator = GenBank.Iterator(compare_handle)
00145         
00146         while 1:
00147             cur_record = iterator.next()
00148             compare_record = compare_iterator.next()
00149             
00150             if cur_record is None or compare_record is None:
00151                 break
00152 
00153             print "\tTesting for %s" % cur_record.version
00154 
00155             output_record = str(cur_record) + "\n"
00156             do_comparison(compare_record, output_record)
00157 
00158         cur_handle.close()
00159         compare_handle.close()
00160 
00161 t_write_format()
00162 
00163 def t_cleaning_features():
00164     """Test the ability to clean up feature values.
00165     """
00166     parser = GenBank.FeatureParser(feature_cleaner = \
00167                                    utils.FeatureValueCleaner())
00168     handle = open(os.path.join("GenBank", "arab1.gb"))
00169     iterator = GenBank.Iterator(handle, parser)
00170 
00171     first_record = iterator.next()
00172     
00173     # test for cleaning of translation
00174     translation_feature = first_record.features[1]
00175     test_trans = translation_feature.qualifiers["translation"][0]
00176     assert test_trans.find(" ") == -1, \
00177       "Did not clean spaces out of the translation"
00178     assert test_trans.find("\012") == -1, \
00179       "Did not clean newlines out of the translation"
00180 
00181     handle.close()
00182 
00183 print "Testing feature cleaning..."
00184 t_cleaning_features()
00185 
00186 def t_ensembl_locus():
00187     line = "LOCUS       HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011\n"
00188     s = GenBank.Scanner.GenBankScanner()
00189     c = GenBank._FeatureConsumer(True)
00190     s._feed_first_line(c, line)
00191     assert c.data.name == "HG531_PATCH", c.data.name
00192     assert c._expected_size == 1000000, c._expected_size
00193 
00194     line = "LOCUS       HG531_PATCH 759984 bp DNA HTG 18-JUN-2011\n"
00195     s = GenBank.Scanner.GenBankScanner()
00196     c = GenBank._FeatureConsumer(True)
00197     s._feed_first_line(c, line)
00198     assert c.data.name == "HG531_PATCH", c.data.name
00199     assert c._expected_size == 759984, c._expected_size
00200 
00201     line = "LOCUS       HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011\n"
00202     s = GenBank.Scanner.GenBankScanner()
00203     c = GenBank._FeatureConsumer(True)
00204     s._feed_first_line(c, line)
00205     assert c.data.name == "HG506_HG1000_1_PATCH", c.data.name
00206     assert c._expected_size == 814959, c._expected_size
00207 
00208     line = "LOCUS       HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011\n"
00209     s = GenBank.Scanner.GenBankScanner()
00210     c = GenBank._FeatureConsumer(True)
00211     s._feed_first_line(c, line)
00212     assert c.data.name == "HG506_HG1000_1_PATCH", c.data.name
00213     assert c._expected_size == 1219964, c._expected_size
00214 
00215     print "Done"
00216 print "Testing EnsEMBL LOCUS lines..."
00217 t_ensembl_locus()