Back to index

python-biopython  1.60
InsdcIO.py
Go to the documentation of this file.
00001 # Copyright 2007-2011 by Peter Cock.  All rights reserved.
00002 #
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package..
00006 
00007 """Bio.SeqIO support for the "genbank" and "embl" file formats.
00008 
00009 You are expected to use this module via the Bio.SeqIO functions.
00010 Note that internally this module calls Bio.GenBank to do the actual
00011 parsing of GenBank, EMBL and IMGT files.
00012 
00013 See also:
00014 
00015 International Nucleotide Sequence Database Collaboration
00016 http://www.insdc.org/
00017  
00018 GenBank
00019 http://www.ncbi.nlm.nih.gov/Genbank/
00020 
00021 EMBL Nucleotide Sequence Database
00022 http://www.ebi.ac.uk/embl/
00023 
00024 DDBJ (DNA Data Bank of Japan)
00025 http://www.ddbj.nig.ac.jp/
00026 
00027 IMGT (use a variant of EMBL format with longer feature indents)
00028 http://imgt.cines.fr/download/LIGM-DB/userman_doc.html
00029 http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html
00030 http://www.ebi.ac.uk/imgt/hla/docs/manual.html
00031 
00032 """
00033 
00034 from Bio.Seq import UnknownSeq
00035 from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner
00036 from Bio import Alphabet
00037 from Interfaces import SequentialSequenceWriter
00038 from Bio import SeqFeature
00039 
00040 from Bio._py3k import _is_int_or_long
00041 
00042 # NOTE
00043 # ====
00044 # The "brains" for parsing GenBank, EMBL and IMGT files (and any
00045 # other flat file variants from the INSDC in future) is in
00046 # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank)
00047 # However, all the writing code is in this file.
00048 
00049 
00050 def GenBankIterator(handle):
00051     """Breaks up a Genbank file into SeqRecord objects.
00052 
00053     Every section from the LOCUS line to the terminating // becomes
00054     a single SeqRecord with associated annotation and features.
00055     
00056     Note that for genomes or chromosomes, there is typically only
00057     one record."""
00058     #This calls a generator function:
00059     return GenBankScanner(debug=0).parse_records(handle)
00060 
00061 def EmblIterator(handle):
00062     """Breaks up an EMBL file into SeqRecord objects.
00063 
00064     Every section from the LOCUS line to the terminating // becomes
00065     a single SeqRecord with associated annotation and features.
00066     
00067     Note that for genomes or chromosomes, there is typically only
00068     one record."""
00069     #This calls a generator function:
00070     return EmblScanner(debug=0).parse_records(handle)
00071 
00072 def ImgtIterator(handle):
00073     """Breaks up an IMGT file into SeqRecord objects.
00074 
00075     Every section from the LOCUS line to the terminating // becomes
00076     a single SeqRecord with associated annotation and features.
00077     
00078     Note that for genomes or chromosomes, there is typically only
00079     one record."""
00080     #This calls a generator function:
00081     return _ImgtScanner(debug=0).parse_records(handle)
00082 
00083 def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
00084     """Breaks up a Genbank file into SeqRecord objects for each CDS feature.
00085 
00086     Every section from the LOCUS line to the terminating // can contain
00087     many CDS features.  These are returned as with the stated amino acid
00088     translation sequence (if given).
00089     """
00090     #This calls a generator function:
00091     return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
00092     
00093 def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
00094     """Breaks up a EMBL file into SeqRecord objects for each CDS feature.
00095 
00096     Every section from the LOCUS line to the terminating // can contain
00097     many CDS features.  These are returned as with the stated amino acid
00098     translation sequence (if given).
00099     """
00100     #This calls a generator function:
00101     return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
00102 
00103 def _insdc_feature_position_string(pos, offset=0):
00104     """Build a GenBank/EMBL position string (PRIVATE).
00105 
00106     Use offset=1 to add one to convert a start position from python counting.
00107     """
00108     if isinstance(pos, SeqFeature.ExactPosition):
00109         return "%i" % (pos.position+offset)
00110     elif isinstance(pos, SeqFeature.WithinPosition):
00111         return "(%i.%i)" % (pos.position + offset,
00112                             pos.position + pos.extension + offset)
00113     elif isinstance(pos, SeqFeature.BetweenPosition):
00114         return "(%i^%i)" % (pos.position + offset,
00115                             pos.position + pos.extension + offset)
00116     elif isinstance(pos, SeqFeature.BeforePosition):
00117         return "<%i" % (pos.position + offset)
00118     elif isinstance(pos, SeqFeature.AfterPosition):
00119         return ">%i" % (pos.position + offset)
00120     elif isinstance(pos, SeqFeature.OneOfPosition):
00121         return "one-of(%s)" \
00122                % ",".join([_insdc_feature_position_string(p,offset) \
00123                            for p in pos.position_choices])
00124     elif isinstance(pos, SeqFeature.AbstractPosition):
00125         raise NotImplementedError("Please report this as a bug in Biopython.")
00126     else:
00127         raise ValueError("Expected a SeqFeature position object.")
00128 
00129 
00130 def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
00131     if location.ref:
00132         ref = "%s:" % location.ref
00133     else:
00134         ref = ""
00135     assert not location.ref_db
00136     if isinstance(location.start, SeqFeature.ExactPosition) \
00137     and isinstance(location.end, SeqFeature.ExactPosition) \
00138     and location.start.position == location.end.position:
00139         #Special case, for 12:12 return 12^13
00140         #(a zero length slice, meaning the point between two letters)
00141         if location.end.position == rec_length:
00142             #Very special case, for a between position at the end of a
00143             #sequence (used on some circular genomes, Bug 3098) we have
00144             #N:N so return N^1
00145             return "%s%i^1" % (ref, rec_length)
00146         else:
00147             return "%s%i^%i" % (ref, location.end.position,
00148                                 location.end.position+1)
00149     if isinstance(location.start, SeqFeature.ExactPosition) \
00150     and isinstance(location.end, SeqFeature.ExactPosition) \
00151     and location.start.position+1 == location.end.position:
00152         #Special case, for 11:12 return 12 rather than 12..12
00153         #(a length one slice, meaning a single letter)
00154         return "%s%i" % (ref, location.end.position)
00155     elif isinstance(location.start, SeqFeature.UnknownPosition) \
00156     or isinstance(location.end, SeqFeature.UnknownPosition):
00157         #Special case for features from SwissProt/UniProt files
00158         if isinstance(location.start, SeqFeature.UnknownPosition) \
00159         and isinstance(location.end, SeqFeature.UnknownPosition):
00160             #import warnings
00161             #warnings.warn("Feature with unknown location")
00162             #return "?"
00163             raise ValueError("Feature with unknown location")
00164         elif isinstance(location.start, SeqFeature.UnknownPosition):
00165             #Treat the unknown start position as a BeforePosition
00166             return "%s<%i..%s" \
00167                 % (ref,
00168                    location.nofuzzy_end,
00169                    _insdc_feature_position_string(location.end))
00170         else:
00171             #Treat the unknown end position as an AfterPosition
00172             return "%s%s..>%i" \
00173                 % (ref,
00174                    _insdc_feature_position_string(location.start),
00175                    location.nofuzzy_start)
00176     else:
00177         #Typical case, e.g. 12..15 gets mapped to 11:15
00178         return ref \
00179                + _insdc_feature_position_string(location.start, +1) \
00180                + ".." + \
00181                _insdc_feature_position_string(location.end)
00182 
00183 def _insdc_feature_location_string(feature, rec_length):
00184     """Build a GenBank/EMBL location string from a SeqFeature (PRIVATE).
00185 
00186     There is a choice of how to show joins on the reverse complement strand,
00187     GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
00188     "join(complement(20,100),complement(1,10))" instead (but appears to have
00189     now adopted the GenBank convention). Notice that the order of the entries
00190     is reversed! This function therefore uses the first form. In this situation
00191     we expect the parent feature and the two children to all be marked as
00192     strand == -1, and in the order 0:10 then 19:100.
00193 
00194     Also need to consider dual-strand examples like these from the Arabidopsis
00195     thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650)
00196     gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057)
00197     which is further complicated by a splice:
00198     join(complement(69611..69724),139856..140087,140625..140650)
00199 
00200     For mixed this mixed strand feature, the parent SeqFeature should have
00201     no strand (either 0 or None) while the child features should have either
00202     strand +1 or -1 as appropriate, and be listed in the order given here.
00203     """
00204 
00205     if not feature.sub_features:
00206         #Non-recursive.
00207         #assert feature.location_operator == "", \
00208         #       "%s has no subfeatures but location_operator %s" \
00209         #       % (repr(feature), feature.location_operator)
00210         location = _insdc_location_string_ignoring_strand_and_subfeatures(feature.location, rec_length)
00211         if feature.strand == -1:
00212             location = "complement(%s)" % location
00213         return location
00214     # As noted above, treat reverse complement strand features carefully:
00215     if feature.strand == -1:
00216         for f in feature.sub_features:
00217             if f.strand != -1:
00218                 raise ValueError("Inconsistent strands: %r for parent, %r for child" \
00219                                  % (feature.strand, f.strand))
00220         return "complement(%s(%s))" \
00221                % (feature.location_operator,
00222                   ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(f.location, rec_length) \
00223                            for f in feature.sub_features))
00224     #if feature.strand == +1:
00225     #    for f in feature.sub_features:
00226     #        assert f.strand == +1
00227     #This covers typical forward strand features, and also an evil mixed strand:
00228     assert feature.location_operator != ""
00229     return  "%s(%s)" % (feature.location_operator,
00230                         ",".join([_insdc_feature_location_string(f, rec_length) \
00231                                   for f in feature.sub_features]))
00232 
00233 
00234 class _InsdcWriter(SequentialSequenceWriter):
00235     """Base class for GenBank and EMBL writers (PRIVATE)."""
00236     MAX_WIDTH = 80
00237     QUALIFIER_INDENT = 21
00238     QUALIFIER_INDENT_STR = " "*QUALIFIER_INDENT
00239     QUALIFIER_INDENT_TMP = "     %s                " # 21 if %s is empty
00240 
00241     def _write_feature_qualifier(self, key, value=None, quote=None):
00242         if not value:
00243             self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key))
00244             return
00245         #Quick hack with no line wrapping, may be useful for testing:
00246         #self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value))
00247         if quote is None:
00248             #Try to mimic unwritten rules about when quotes can be left out:
00249             if _is_int_or_long(value):
00250                 quote = False
00251             else:
00252                 quote = True
00253         if quote:
00254             line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value)
00255         else:
00256             line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value)
00257         if len(line) <= self.MAX_WIDTH:
00258             self.handle.write(line+"\n")
00259             return
00260         while line.lstrip():
00261             if len(line) <= self.MAX_WIDTH:
00262                 self.handle.write(line+"\n")
00263                 return
00264             #Insert line break...
00265             for index in range(min(len(line)-1, self.MAX_WIDTH),
00266                                self.QUALIFIER_INDENT+1,-1):
00267                 if line[index] == " " : break
00268             if line[index] != " ":
00269                 #No nice place to break...
00270                 index = self.MAX_WIDTH
00271             assert index <= self.MAX_WIDTH
00272             self.handle.write(line[:index] + "\n")
00273             line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
00274 
00275     def _wrap_location(self, location):
00276         """Split a feature location into lines (break at commas)."""
00277         #TODO - Rewrite this not to recurse!
00278         length = self.MAX_WIDTH - self.QUALIFIER_INDENT
00279         if len(location) <= length:
00280             return location
00281         index = location[:length].rfind(",")
00282         if index == -1:
00283             #No good place to split (!)
00284             import warnings
00285             warnings.warn("Couldn't split location:\n%s" % location)
00286             return location
00287         return location[:index+1] + "\n" + \
00288                self.QUALIFIER_INDENT_STR + self._wrap_location(location[index+1:])
00289 
00290     def _write_feature(self, feature, record_length):
00291         """Write a single SeqFeature object to features table."""
00292         assert feature.type, feature
00293         location = _insdc_feature_location_string(feature, record_length)
00294         f_type = feature.type.replace(" ","_")
00295         line = (self.QUALIFIER_INDENT_TMP  % f_type)[:self.QUALIFIER_INDENT] \
00296                + self._wrap_location(location) + "\n"
00297         self.handle.write(line)
00298         #Now the qualifiers...
00299         for key, values in feature.qualifiers.iteritems():
00300             if isinstance(values, list) or isinstance(values, tuple):
00301                 for value in values:
00302                     self._write_feature_qualifier(key, value)
00303             elif values:
00304                 #String, int, etc
00305                 self._write_feature_qualifier(key, values)
00306             else:
00307                 #e.g. a /psuedo entry
00308                 self._write_feature_qualifier(key)
00309 
00310     def _get_annotation_str(self, record, key, default=".", just_first=False):
00311         """Get an annotation dictionary entry (as a string).
00312 
00313         Some entries are lists, in which case if just_first=True the first entry
00314         is returned.  If just_first=False (default) this verifies there is only
00315         one entry before returning it."""
00316         try:
00317             answer = record.annotations[key]
00318         except KeyError:
00319             return default
00320         if isinstance(answer, list):
00321             if not just_first : assert len(answer) == 1
00322             return str(answer[0])
00323         else:
00324             return str(answer)
00325 
00326     def _split_multi_line(self, text, max_len):
00327         """Returns a list of strings.
00328         
00329         Any single words which are too long get returned as a whole line
00330         (e.g. URLs) without an exception or warning.
00331         """
00332         #TODO - Do the line spliting while preserving white space?
00333         text = text.strip()
00334         if len(text) <= max_len:
00335             return [text]
00336 
00337         words = text.split()
00338         text = ""
00339         while words and len(text) + 1 + len(words[0]) <= max_len:
00340             text += " " + words.pop(0)
00341             text = text.strip()
00342         #assert len(text) <= max_len
00343         answer = [text]
00344         while words:
00345             text = words.pop(0)
00346             while words and len(text) + 1 + len(words[0]) <= max_len:
00347                 text += " " + words.pop(0)
00348                 text = text.strip()
00349             #assert len(text) <= max_len
00350             answer.append(text)
00351         assert not words
00352         return answer
00353 
00354     def _split_contig(self, record, max_len):
00355         "Returns a list of strings, splits on commas."""
00356         #TODO - Merge this with _write_multi_line method?
00357         #It would need the addition of the comma splitting logic...
00358         #are there any other cases where that would be sensible?
00359         contig = record.annotations.get("contig", "")
00360         if isinstance(contig, list) or isinstance(contig, tuple):
00361             contig = "".join(contig)
00362         contig = self.clean(contig)
00363         i = 0
00364         answer = []
00365         while contig:
00366             if len(contig) > max_len:
00367                 #Split lines at the commas
00368                 pos = contig[:max_len-1].rfind(",")
00369                 if pos == -1:
00370                     raise ValueError("Could not break up CONTIG")
00371                 text, contig = contig[:pos+1], contig[pos+1:]
00372             else:
00373                 text, contig = contig, ""
00374             answer.append(text)
00375         return answer
00376 
00377 class GenBankWriter(_InsdcWriter):
00378     HEADER_WIDTH = 12
00379     QUALIFIER_INDENT = 21
00380     
00381     def _write_single_line(self, tag, text):
00382         "Used in the the 'header' of each GenBank record."""
00383         assert len(tag) < self.HEADER_WIDTH
00384         if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH:
00385             import warnings
00386             warnings.warn("Annotation %r too long for %s line" % (text, tag))
00387         self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH),
00388                                       text.replace("\n", " ")))
00389 
00390     def _write_multi_line(self, tag, text):
00391         "Used in the the 'header' of each GenBank record."""
00392         #TODO - Do the line spliting while preserving white space?
00393         max_len = self.MAX_WIDTH - self.HEADER_WIDTH
00394         lines = self._split_multi_line(text, max_len)
00395         self._write_single_line(tag, lines[0])
00396         for line in lines[1:]:
00397             self._write_single_line("", line)
00398 
00399     def _write_multi_entries(self, tag, text_list):
00400         #used for DBLINK and any similar later line types.
00401         #If the list of strings is empty, nothing is written.
00402         for i, text in enumerate(text_list):
00403             if i == 0:
00404                 self._write_single_line(tag, text)
00405             else:
00406                 self._write_single_line("", text)
00407 
00408     def _get_date(self, record) :
00409         default = "01-JAN-1980"
00410         try :
00411             date = record.annotations["date"]
00412         except KeyError :
00413             return default
00414         #Cope with a list of one string:
00415         if isinstance(date, list) and len(date)==1 :
00416             date = date[0]
00417         #TODO - allow a Python date object
00418         if not isinstance(date, basestring) or len(date) != 11 \
00419         or date[2] != "-" or date[6] != "-" \
00420         or not date[:2].isdigit() or not date[7:].isdigit() \
00421         or int(date[:2]) > 31 \
00422         or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
00423                              "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"] :
00424             #TODO - Check is a valid date (e.g. not 31 Feb)
00425             return default
00426         return date
00427 
00428     def _get_data_division(self, record):
00429         try:
00430             division = record.annotations["data_file_division"]
00431         except KeyError:
00432             division = "UNK"
00433         if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT",
00434                         "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS",
00435                         "GSS", "HTG", "HTC", "ENV", "CON"]:
00436             #Good, already GenBank style
00437             #    PRI - primate sequences
00438             #    ROD - rodent sequences
00439             #    MAM - other mammalian sequences
00440             #    VRT - other vertebrate sequences
00441             #    INV - invertebrate sequences
00442             #    PLN - plant, fungal, and algal sequences
00443             #    BCT - bacterial sequences [plus archea]
00444             #    VRL - viral sequences
00445             #    PHG - bacteriophage sequences
00446             #    SYN - synthetic sequences
00447             #    UNA - unannotated sequences
00448             #    EST - EST sequences (expressed sequence tags) 
00449             #    PAT - patent sequences
00450             #    STS - STS sequences (sequence tagged sites) 
00451             #    GSS - GSS sequences (genome survey sequences) 
00452             #    HTG - HTGS sequences (high throughput genomic sequences) 
00453             #    HTC - HTC sequences (high throughput cDNA sequences) 
00454             #    ENV - Environmental sampling sequences
00455             #    CON - Constructed sequences
00456             #
00457             #(plus UNK for unknown)
00458             pass
00459         else:
00460             #See if this is in EMBL style:
00461             #    Division                 Code
00462             #    -----------------        ----
00463             #    Bacteriophage            PHG - common
00464             #    Environmental Sample     ENV - common
00465             #    Fungal                   FUN - map to PLN (plants + fungal)
00466             #    Human                    HUM - map to PRI (primates)
00467             #    Invertebrate             INV - common
00468             #    Other Mammal             MAM - common
00469             #    Other Vertebrate         VRT - common
00470             #    Mus musculus             MUS - map to ROD (rodent)
00471             #    Plant                    PLN - common
00472             #    Prokaryote               PRO - map to BCT (poor name)
00473             #    Other Rodent             ROD - common
00474             #    Synthetic                SYN - common
00475             #    Transgenic               TGN - ??? map to SYN ???
00476             #    Unclassified             UNC - map to UNK
00477             #    Viral                    VRL - common
00478             #
00479             #(plus XXX for submiting which we can map to UNK)
00480             embl_to_gbk = {"FUN":"PLN",
00481                            "HUM":"PRI",
00482                            "MUS":"ROD",
00483                            "PRO":"BCT",
00484                            "UNC":"UNK",
00485                            "XXX":"UNK",
00486                            }
00487             try:
00488                 division = embl_to_gbk[division]
00489             except KeyError:
00490                 division = "UNK"
00491         assert len(division)==3
00492         return division
00493 
00494     def _write_the_first_line(self, record):
00495         """Write the LOCUS line."""
00496         
00497         locus = record.name
00498         if not locus or locus == "<unknown name>":
00499             locus = record.id
00500         if not locus or locus == "<unknown id>":
00501             locus = self._get_annotation_str(record, "accession", just_first=True)
00502         if len(locus) > 16:
00503             raise ValueError("Locus identifier %r is too long" % str(locus))
00504 
00505         if len(record) > 99999999999:
00506             #Currently GenBank only officially support up to 350000, but
00507             #the length field can take eleven digits
00508             raise ValueError("Sequence too long!")
00509 
00510         #Get the base alphabet (underneath any Gapped or StopCodon encoding)
00511         a = Alphabet._get_base_alphabet(record.seq.alphabet)
00512         if not isinstance(a, Alphabet.Alphabet):
00513             raise TypeError("Invalid alphabet")
00514         elif isinstance(a, Alphabet.ProteinAlphabet):
00515             units = "aa"
00516         elif isinstance(a, Alphabet.NucleotideAlphabet):
00517             units = "bp"
00518         else:
00519             #Must be something like NucleotideAlphabet or
00520             #just the generic Alphabet (default for fasta files)
00521             raise ValueError("Need a Nucleotide or Protein alphabet")
00522 
00523         #Get the molecule type
00524         #TODO - record this explicitly in the parser?
00525         if isinstance(a, Alphabet.ProteinAlphabet):
00526             mol_type = ""
00527         elif isinstance(a, Alphabet.DNAAlphabet):
00528             mol_type = "DNA"
00529         elif isinstance(a, Alphabet.RNAAlphabet):
00530             mol_type = "RNA"
00531         else:
00532             #Must be something like NucleotideAlphabet or
00533             #just the generic Alphabet (default for fasta files)
00534             raise ValueError("Need a DNA, RNA or Protein alphabet")
00535         
00536         division = self._get_data_division(record)
00537         
00538         assert len(units) == 2
00539         assert len(division) == 3
00540         #TODO - date
00541         #TODO - mol_type
00542         line = "LOCUS       %s %s %s    %s           %s %s\n" \
00543                      % (locus.ljust(16),
00544                         str(len(record)).rjust(11),
00545                         units,
00546                         mol_type.ljust(6),
00547                         division,
00548                         self._get_date(record))
00549         assert len(line) == 79+1, repr(line) #plus one for new line
00550 
00551         assert line[12:28].rstrip() == locus, \
00552                'LOCUS line does not contain the locus at the expected position:\n' + line
00553         assert line[28:29] == " "
00554         assert line[29:40].lstrip() == str(len(record)), \
00555                'LOCUS line does not contain the length at the expected position:\n' + line
00556 
00557         #Tests copied from Bio.GenBank.Scanner
00558         assert line[40:44] in [' bp ', ' aa '] , \
00559                'LOCUS line does not contain size units at expected position:\n' + line
00560         assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], \
00561                'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
00562         assert line[47:54].strip() == "" \
00563         or line[47:54].strip().find('DNA') != -1 \
00564         or line[47:54].strip().find('RNA') != -1, \
00565                'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
00566         assert line[54:55] == ' ', \
00567                'LOCUS line does not contain space at position 55:\n' + line
00568         assert line[55:63].strip() in ['', 'linear', 'circular'], \
00569                'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
00570         assert line[63:64] == ' ', \
00571                'LOCUS line does not contain space at position 64:\n' + line
00572         assert line[67:68] == ' ', \
00573                'LOCUS line does not contain space at position 68:\n' + line
00574         assert line[70:71] == '-', \
00575                'LOCUS line does not contain - at position 71 in date:\n' + line
00576         assert line[74:75] == '-', \
00577                'LOCUS line does not contain - at position 75 in date:\n' + line
00578 
00579         self.handle.write(line)
00580 
00581     def _write_references(self, record):
00582         number = 0
00583         for ref in record.annotations["references"]:
00584             if not isinstance(ref, SeqFeature.Reference):
00585                 continue
00586             number += 1
00587             data = str(number)
00588             #TODO - support more complex record reference locations?
00589             if ref.location and len(ref.location)==1:
00590                 a = Alphabet._get_base_alphabet(record.seq.alphabet)
00591                 if isinstance(a, Alphabet.ProteinAlphabet):
00592                     units = "residues"
00593                 else:
00594                     units = "bases"
00595                 data += "  (%s %i to %i)" % (units,
00596                                              ref.location[0].nofuzzy_start+1,
00597                                              ref.location[0].nofuzzy_end)
00598             self._write_single_line("REFERENCE", data)
00599             if ref.authors:
00600                 #We store the AUTHORS data as a single string
00601                 self._write_multi_line("  AUTHORS", ref.authors)
00602             if ref.consrtm:
00603                 #We store the consortium as a single string
00604                 self._write_multi_line("  CONSRTM", ref.consrtm)
00605             if ref.title:
00606                 #We store the title as a single string
00607                 self._write_multi_line("  TITLE", ref.title)
00608             if ref.journal:
00609                 #We store this as a single string - holds the journal name,
00610                 #volume, year, and page numbers of the citation
00611                 self._write_multi_line("  JOURNAL", ref.journal)
00612             if ref.medline_id:
00613                 #This line type is obsolete and was removed from the GenBank
00614                 #flatfile format in April 2005. Should we write it?
00615                 #Note this has a two space indent:
00616                 self._write_multi_line("  MEDLINE", ref.medline_id)
00617             if ref.pubmed_id:
00618                 #Note this has a THREE space indent:
00619                 self._write_multi_line("   PUBMED", ref.pubmed_id)
00620             if ref.comment:
00621                 self._write_multi_line("  REMARK", ref.comment)
00622             
00623 
00624     def _write_comment(self, record):
00625         #This is a bit complicated due to the range of possible
00626         #ways people might have done their annotation...
00627         #Currently the parser uses a single string with newlines.
00628         #A list of lines is also reasonable.
00629         #A single (long) string is perhaps the most natural of all.
00630         #This means we may need to deal with line wrapping.
00631         comment = record.annotations["comment"]
00632         if isinstance(comment, basestring):
00633             lines = comment.split("\n")
00634         elif isinstance(comment, list) or isinstance(comment, tuple):
00635             lines = comment
00636         else:
00637             raise ValueError("Could not understand comment annotation")
00638         self._write_multi_line("COMMENT", lines[0])
00639         for line in lines[1:]:
00640             self._write_multi_line("", line)
00641 
00642     def _write_contig(self, record):
00643         max_len = self.MAX_WIDTH - self.HEADER_WIDTH
00644         lines = self._split_contig(record, max_len)
00645         self._write_single_line("CONTIG", lines[0])
00646         for text in lines[1:] :
00647             self._write_single_line("", text)
00648 
00649     def _write_sequence(self, record):
00650         #Loosely based on code from Howard Salis
00651         #TODO - Force lower case?
00652         LETTERS_PER_LINE = 60
00653         SEQUENCE_INDENT = 9
00654 
00655         if isinstance(record.seq, UnknownSeq):
00656             #We have already recorded the length, and there is no need
00657             #to record a long sequence of NNNNNNN...NNN or whatever.
00658             if "contig" in record.annotations:
00659                 self._write_contig(record)
00660             else:
00661                 self.handle.write("ORIGIN\n")
00662             return
00663 
00664         #Catches sequence being None:
00665         data = self._get_seq_string(record).lower()
00666         seq_len = len(data)
00667         self.handle.write("ORIGIN\n")
00668         for line_number in range(0, seq_len, LETTERS_PER_LINE):
00669             self.handle.write(str(line_number+1).rjust(SEQUENCE_INDENT))
00670             for words in range(line_number,
00671                                min(line_number+LETTERS_PER_LINE, seq_len), 10):
00672                 self.handle.write(" %s" % data[words:words+10])
00673             self.handle.write("\n")
00674         
00675     def write_record(self, record):
00676         """Write a single record to the output file."""
00677         handle = self.handle
00678         self._write_the_first_line(record)
00679 
00680         accession = self._get_annotation_str(record, "accession",
00681                                              record.id.split(".", 1)[0],
00682                                              just_first=True)
00683         acc_with_version = accession
00684         if record.id.startswith(accession+"."):
00685             try:
00686                 acc_with_version = "%s.%i" \
00687                                    % (accession,
00688                                       int(record.id.split(".", 1)[1]))
00689             except ValueError:
00690                 pass
00691         gi = self._get_annotation_str(record, "gi", just_first=True)
00692 
00693         descr = record.description
00694         if descr == "<unknown description>" : descr = "."
00695         self._write_multi_line("DEFINITION", descr)
00696         
00697         self._write_single_line("ACCESSION", accession)
00698         if gi != ".":
00699             self._write_single_line("VERSION", "%s  GI:%s" \
00700                                     % (acc_with_version, gi))
00701         else:
00702             self._write_single_line("VERSION", "%s" % (acc_with_version))
00703 
00704         #The NCBI only expect two types of link so far,
00705         #e.g. "Project:28471" and "Trace Assembly Archive:123456"
00706         #TODO - Filter the dbxrefs list to just these?
00707         self._write_multi_entries("DBLINK", record.dbxrefs)
00708 
00709         try:
00710             #List of strings
00711             #Keywords should be given separated with semi colons,
00712             keywords = "; ".join(record.annotations["keywords"])
00713             #with a trailing period:
00714             if not keywords.endswith(".") :
00715                 keywords += "."
00716         except KeyError:
00717             #If no keywords, there should be just a period:
00718             keywords = "."
00719         self._write_multi_line("KEYWORDS", keywords)
00720 
00721         if "segment" in record.annotations:
00722             #Deal with SEGMENT line found only in segmented records,
00723             #e.g. AH000819
00724             segment = record.annotations["segment"]
00725             if isinstance(segment, list):
00726                 assert len(segment)==1, segment
00727                 segment = segment[0]
00728             self._write_single_line("SEGMENT", segment)
00729 
00730         self._write_multi_line("SOURCE", \
00731                                 self._get_annotation_str(record, "source"))
00732         #The ORGANISM line MUST be a single line, as any continuation is the taxonomy
00733         org = self._get_annotation_str(record, "organism")
00734         if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH:
00735             org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
00736         self._write_single_line("  ORGANISM", org)
00737         try:
00738             #List of strings
00739             #Taxonomy should be given separated with semi colons,
00740             taxonomy = "; ".join(record.annotations["taxonomy"])
00741             #with a trailing period:
00742             if not taxonomy.endswith(".") :
00743                 taxonomy += "."
00744         except KeyError:
00745             taxonomy = "."
00746         self._write_multi_line("", taxonomy)
00747 
00748         if "references" in record.annotations:
00749             self._write_references(record)
00750 
00751         if "comment" in record.annotations:
00752             self._write_comment(record)
00753 
00754         handle.write("FEATURES             Location/Qualifiers\n")
00755         rec_length = len(record)
00756         for feature in record.features:
00757             self._write_feature(feature, rec_length) 
00758         self._write_sequence(record)
00759         handle.write("//\n")
00760 
00761 class EmblWriter(_InsdcWriter):
00762     HEADER_WIDTH = 5
00763     QUALIFIER_INDENT = 21
00764     QUALIFIER_INDENT_STR = "FT" + " "*(QUALIFIER_INDENT-2)
00765     QUALIFIER_INDENT_TMP = "FT   %s                " # 21 if %s is empty
00766     FEATURE_HEADER = "FH   Key             Location/Qualifiers\n"
00767     
00768     def _write_contig(self, record):
00769         max_len = self.MAX_WIDTH - self.HEADER_WIDTH
00770         lines = self._split_contig(record, max_len)
00771         for text in lines:
00772             self._write_single_line("CO", text)
00773 
00774     def _write_sequence(self, record):
00775         LETTERS_PER_BLOCK = 10
00776         BLOCKS_PER_LINE = 6
00777         LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE
00778         POSITION_PADDING = 10
00779         handle = self.handle #save looking up this multiple times
00780         
00781         if isinstance(record.seq, UnknownSeq):
00782             #We have already recorded the length, and there is no need
00783             #to record a long sequence of NNNNNNN...NNN or whatever.
00784             if "contig" in record.annotations:
00785                 self._write_contig(record)
00786             else:
00787                 #TODO - Can the sequence just be left out as in GenBank files?
00788                 handle.write("SQ   \n")
00789             return
00790 
00791         #Catches sequence being None
00792         data = self._get_seq_string(record).lower()
00793         seq_len = len(data)
00794 
00795         #Get the base alphabet (underneath any Gapped or StopCodon encoding)
00796         a = Alphabet._get_base_alphabet(record.seq.alphabet)
00797         if isinstance(a, Alphabet.DNAAlphabet):
00798             #TODO - What if we have RNA?
00799             a_count = data.count('A') + data.count('a')
00800             c_count = data.count('C') + data.count('c')
00801             g_count = data.count('G') + data.count('g')
00802             t_count = data.count('T') + data.count('t')
00803             other = seq_len - (a_count + c_count + g_count + t_count)
00804             handle.write("SQ   Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" \
00805                          % (seq_len, a_count, c_count, g_count, t_count, other))
00806         else:
00807             handle.write("SQ   \n")
00808         
00809         for line_number in range(0, seq_len // LETTERS_PER_LINE):
00810             handle.write("    ") #Just four, not five
00811             for block in range(BLOCKS_PER_LINE) :
00812                 index = LETTERS_PER_LINE*line_number + LETTERS_PER_BLOCK*block
00813                 handle.write((" %s" % data[index:index+LETTERS_PER_BLOCK]))
00814             handle.write(str((line_number+1)
00815                              *LETTERS_PER_LINE).rjust(POSITION_PADDING))
00816             handle.write("\n")
00817         if seq_len % LETTERS_PER_LINE:
00818             #Final (partial) line
00819             line_number = (seq_len // LETTERS_PER_LINE)
00820             handle.write("    ") #Just four, not five
00821             for block in range(BLOCKS_PER_LINE) :
00822                 index = LETTERS_PER_LINE*line_number + LETTERS_PER_BLOCK*block
00823                 handle.write((" %s" % data[index:index+LETTERS_PER_BLOCK]).ljust(11))
00824             handle.write(str(seq_len).rjust(POSITION_PADDING))
00825             handle.write("\n")
00826 
00827     def _write_single_line(self, tag, text):
00828         assert len(tag)==2
00829         line = tag+"   "+text
00830         if len(text) > self.MAX_WIDTH:
00831             import warnings
00832             warnings.warn("Line %r too long" % line)
00833         self.handle.write(line+"\n")
00834 
00835     def _write_multi_line(self, tag, text):
00836         max_len = self.MAX_WIDTH - self.HEADER_WIDTH
00837         lines = self._split_multi_line(text, max_len)
00838         for line in lines :
00839             self._write_single_line(tag, line)
00840         
00841     def _write_the_first_lines(self, record):
00842         """Write the ID and AC lines."""
00843         if "." in record.id and record.id.rsplit(".", 1)[1].isdigit():
00844             version = "SV " + record.id.rsplit(".", 1)[1]
00845             accession = self._get_annotation_str(record, "accession",
00846                                                  record.id.rsplit(".", 1)[0],
00847                                                  just_first=True)
00848         else :
00849             version = ""
00850             accession = self._get_annotation_str(record, "accession",
00851                                                  record.id,
00852                                                  just_first=True)
00853         
00854         if ";" in accession :
00855             raise ValueError("Cannot have semi-colon in EMBL accession, %s" \
00856                              % repr(str(accession)))
00857         if " " in accession :
00858             #This is out of practicallity... might it be allowed?
00859             raise ValueError("Cannot have spaces in EMBL accession, %s" \
00860                              % repr(str(accession)))
00861 
00862         #Get the molecule type
00863         #TODO - record this explicitly in the parser?
00864         #Get the base alphabet (underneath any Gapped or StopCodon encoding)
00865         a = Alphabet._get_base_alphabet(record.seq.alphabet)
00866         if not isinstance(a, Alphabet.Alphabet):
00867             raise TypeError("Invalid alphabet")
00868         elif isinstance(a, Alphabet.DNAAlphabet):
00869             mol_type = "DNA"
00870             units = "BP"
00871         elif isinstance(a, Alphabet.RNAAlphabet):
00872             mol_type = "RNA"
00873             units = "BP"
00874         elif isinstance(a, Alphabet.ProteinAlphabet):
00875             mol_type = "PROTEIN"
00876             units = "AA"
00877         else:
00878             #Must be something like NucleotideAlphabet
00879             raise ValueError("Need a DNA, RNA or Protein alphabet")
00880 
00881         #Get the taxonomy division
00882         division = self._get_data_division(record)
00883 
00884         #TODO - Full ID line
00885         handle = self.handle
00886         #ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
00887         #1. Primary accession number
00888         #2. Sequence version number
00889         #3. Topology: 'circular' or 'linear'
00890         #4. Molecule type
00891         #5. Data class
00892         #6. Taxonomic division
00893         #7. Sequence length
00894         self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." \
00895                                 % (accession, version, mol_type,
00896                                    division, len(record), units))
00897         handle.write("XX\n")
00898         self._write_single_line("AC", accession+";")
00899         handle.write("XX\n")
00900 
00901     def _get_data_division(self, record):
00902         try:
00903             division = record.annotations["data_file_division"]
00904         except KeyError:
00905             division = "UNC"
00906         if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT",
00907                         "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC",
00908                         "VRL", "XXX"]:
00909             #Good, already EMBL style
00910             #    Division                 Code
00911             #    -----------------        ----
00912             #    Bacteriophage            PHG
00913             #    Environmental Sample     ENV
00914             #    Fungal                   FUN
00915             #    Human                    HUM
00916             #    Invertebrate             INV
00917             #    Other Mammal             MAM
00918             #    Other Vertebrate         VRT
00919             #    Mus musculus             MUS
00920             #    Plant                    PLN
00921             #    Prokaryote               PRO
00922             #    Other Rodent             ROD
00923             #    Synthetic                SYN
00924             #    Transgenic               TGN
00925             #    Unclassified             UNC (i.e. unknown)
00926             #    Viral                    VRL
00927             #
00928             #(plus XXX used for submiting data to EMBL)
00929             pass
00930         else:
00931             #See if this is in GenBank style & can be converted.
00932             #Generally a problem as the GenBank groups are wider
00933             #than those of EMBL. Note that GenBank use "BCT" for
00934             #both bacteria and acherea thus this maps to EMBL's
00935             #"PRO" nicely.
00936             gbk_to_embl = {"BCT":"PRO",
00937                            "UNK":"UNC",
00938                            }
00939             try:
00940                 division = gbk_to_embl[division]
00941             except KeyError:
00942                 division = "UNC"
00943         assert len(division)==3
00944         return division
00945 
00946     def _write_references(self, record):
00947         #The order should be RN, RC, RP, RX, RG, RA, RT, RL
00948         number = 0
00949         for ref in record.annotations["references"]:
00950             if not isinstance(ref, SeqFeature.Reference):
00951                 continue
00952             number += 1
00953             self._write_single_line("RN", "[%i]" % number)
00954             #TODO - support for RC line (needed in parser too)
00955             #TODO - support more complex record reference locations?
00956             if ref.location and len(ref.location)==1:
00957                 self._write_single_line("RP", "%i-%i" % (ref.location[0].nofuzzy_start+1,
00958                                                          ref.location[0].nofuzzy_end))
00959             #TODO - record any DOI or AGRICOLA identifier in the reference object?
00960             if ref.pubmed_id:
00961                 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id)
00962             if ref.consrtm:
00963                 self._write_single_line("RG", "%s" % ref.consrtm)
00964             if ref.authors:
00965                 #We store the AUTHORS data as a single string
00966                 self._write_multi_line("RA", ref.authors+";")
00967             if ref.title:
00968                 #We store the title as a single string
00969                 self._write_multi_line("RT", '"%s";' % ref.title)
00970             if ref.journal:
00971                 #We store this as a single string - holds the journal name,
00972                 #volume, year, and page numbers of the citation
00973                 self._write_multi_line("RL", ref.journal)
00974             self.handle.write("XX\n")
00975 
00976     def _write_comment(self, record):
00977         #This is a bit complicated due to the range of possible
00978         #ways people might have done their annotation...
00979         #Currently the parser uses a single string with newlines.
00980         #A list of lines is also reasonable.
00981         #A single (long) string is perhaps the most natural of all.
00982         #This means we may need to deal with line wrapping.
00983         comment = record.annotations["comment"]
00984         if isinstance(comment, basestring):
00985             lines = comment.split("\n")
00986         elif isinstance(comment, list) or isinstance(comment, tuple):
00987             lines = comment
00988         else:
00989             raise ValueError("Could not understand comment annotation")
00990         #TODO - Merge this with the GenBank comment code?
00991         if not lines : return
00992         for line in lines:
00993             self._write_multi_line("CC", line)
00994         self.handle.write("XX\n")
00995 
00996     def write_record(self, record):
00997         """Write a single record to the output file."""
00998 
00999         handle = self.handle
01000         self._write_the_first_lines(record)
01001 
01002         #PR line (0 or 1 lines only), project identifier
01003         for xref in record.dbxrefs:
01004             if xref.startswith("Project:"):
01005                 self._write_single_line("PR", xref+";")
01006                 handle.write("XX\n")
01007                 break
01008 
01009         #TODO - DT lines (date)
01010 
01011         descr = record.description
01012         if descr == "<unknown description>" : descr = "."
01013         self._write_multi_line("DE", descr)
01014         handle.write("XX\n")
01015 
01016         #Should this be "source" or "organism"?
01017         self._write_multi_line("OS", self._get_annotation_str(record, "organism"))
01018         try:
01019             #List of strings
01020             taxonomy = "; ".join(record.annotations["taxonomy"]) + "."
01021         except KeyError:
01022             taxonomy = "."
01023         self._write_multi_line("OC", taxonomy)
01024         handle.write("XX\n")
01025 
01026         if "references" in record.annotations:
01027             self._write_references(record)
01028 
01029         if "comment" in record.annotations:
01030             self._write_comment(record)
01031 
01032         handle.write(self.FEATURE_HEADER)
01033         rec_length = len(record)
01034         for feature in record.features:
01035             self._write_feature(feature, rec_length)
01036 
01037         self._write_sequence(record)
01038         handle.write("//\n")
01039 
01040 class ImgtWriter(EmblWriter):
01041     HEADER_WIDTH = 5
01042     QUALIFIER_INDENT = 25 # Not 21 as in EMBL
01043     QUALIFIER_INDENT_STR = "FT" + " "*(QUALIFIER_INDENT-2)
01044     QUALIFIER_INDENT_TMP = "FT   %s                    " # 25 if %s is empty
01045     FEATURE_HEADER = "FH   Key                 Location/Qualifiers\n"
01046 
01047 if __name__ == "__main__":
01048     print "Quick self test"
01049     import os
01050     from StringIO import StringIO
01051 
01052     def compare_record(old, new):
01053         if old.id != new.id and old.name != new.name:
01054             raise ValueError("'%s' or '%s' vs '%s' or '%s' records" \
01055                              % (old.id, old.name, new.id, new.name))
01056         if len(old.seq) != len(new.seq):
01057             raise ValueError("%i vs %i" % (len(old.seq), len(new.seq)))
01058         if str(old.seq).upper() != str(new.seq).upper():
01059             if len(old.seq) < 200:
01060                 raise ValueError("'%s' vs '%s'" % (old.seq, new.seq))
01061             else:
01062                 raise ValueError("'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100]))
01063         if old.features and new.features:
01064             return compare_features(old.features, new.features)
01065         #Just insist on at least one word in common:
01066         if (old.description or new.description) \
01067         and not set(old.description.split()).intersection(new.description.split()):
01068             raise ValueError("%s versus %s" \
01069                              % (repr(old.description), repr(new.description)))
01070         #TODO - check annotation
01071         if "contig" in old.annotations:
01072             assert old.annotations["contig"] == \
01073                    new.annotations["contig"]
01074         return True
01075 
01076     def compare_records(old_list, new_list):
01077         """Check two lists of SeqRecords agree, raises a ValueError if mismatch."""
01078         if len(old_list) != len(new_list):
01079             raise ValueError("%i vs %i records" % (len(old_list), len(new_list)))
01080         for old, new in zip(old_list, new_list):
01081             if not compare_record(old, new):
01082                 return False
01083         return True
01084 
01085     def compare_feature(old, new, ignore_sub_features=False):
01086         """Check two SeqFeatures agree."""
01087         if old.type != new.type:
01088             raise ValueError("Type %s versus %s" % (old.type, new.type))
01089         if old.location.nofuzzy_start != new.location.nofuzzy_start \
01090         or old.location.nofuzzy_end != new.location.nofuzzy_end:
01091             raise ValueError("%s versus %s:\n%s\nvs:\n%s" \
01092                              % (old.location, new.location, str(old), str(new)))
01093         if old.strand != new.strand:
01094             raise ValueError("Different strand:\n%s\nvs:\n%s" % (str(old), str(new)))
01095         if old.location.start != new.location.start:
01096             raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" \
01097                              % (old.location.start, new.location.start, str(old), str(new)))
01098         if old.location.end != new.location.end:
01099             raise ValueError("End %s versus %s:\n%s\nvs:\n%s" \
01100                              % (old.location.end, new.location.end, str(old), str(new)))
01101         if not ignore_sub_features:
01102             if len(old.sub_features) != len(new.sub_features):
01103                 raise ValueError("Different sub features")
01104             for a, b in zip(old.sub_features, new.sub_features):
01105                 if not compare_feature(a, b):
01106                     return False
01107         #This only checks key shared qualifiers
01108         #Would a white list be easier?
01109         #for key in ["name", "gene", "translation", "codon_table", "codon_start", "locus_tag"]:
01110         for key in set(old.qualifiers).intersection(new.qualifiers):
01111             if key in ["db_xref", "protein_id", "product", "note"]:
01112                 #EMBL and GenBank files are use different references/notes/etc
01113                 continue
01114             if old.qualifiers[key] != new.qualifiers[key]:
01115                 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" \
01116                                  % (key, old.qualifiers[key], new.qualifiers[key]))
01117         return True
01118 
01119     def compare_features(old_list, new_list, ignore_sub_features=False):
01120         """Check two lists of SeqFeatures agree, raises a ValueError if mismatch."""
01121         if len(old_list) != len(new_list):
01122             raise ValueError("%i vs %i features" % (len(old_list), len(new_list)))
01123         for old, new in zip(old_list, new_list):
01124             #This assumes they are in the same order
01125             if not compare_feature(old, new, ignore_sub_features):
01126                 return False
01127         return True
01128 
01129     def check_genbank_writer(records):
01130         handle = StringIO()
01131         GenBankWriter(handle).write_file(records)
01132         handle.seek(0)
01133 
01134         records2 = list(GenBankIterator(handle))
01135         assert compare_records(records, records2)
01136 
01137     def check_embl_writer(records):
01138         handle = StringIO()
01139         try:
01140             EmblWriter(handle).write_file(records)
01141         except ValueError, err:
01142             print err
01143             return
01144         handle.seek(0)
01145 
01146         records2 = list(EmblIterator(handle))
01147         assert compare_records(records, records2)
01148 
01149     for filename in os.listdir("../../Tests/GenBank"):
01150         if not filename.endswith(".gbk") and not filename.endswith(".gb"):
01151             continue
01152         print filename
01153         
01154         handle = open("../../Tests/GenBank/%s" % filename)
01155         records = list(GenBankIterator(handle))
01156         handle.close()
01157 
01158         check_genbank_writer(records)
01159         check_embl_writer(records)
01160 
01161     for filename in os.listdir("../../Tests/EMBL"):
01162         if not filename.endswith(".embl"):
01163             continue
01164         print filename
01165         
01166         handle = open("../../Tests/EMBL/%s" % filename)
01167         records = list(EmblIterator(handle))
01168         handle.close()
01169 
01170         check_genbank_writer(records)
01171         check_embl_writer(records)
01172 
01173     from Bio import SeqIO
01174     for filename in os.listdir("../../Tests/SwissProt"):
01175         if not filename.startswith("sp"):
01176             continue
01177         print filename
01178         
01179         handle = open("../../Tests/SwissProt/%s" % filename)
01180         records = list(SeqIO.parse(handle, "swiss"))
01181         handle.close()
01182 
01183         check_genbank_writer(records)
01184