Back to index

python-biopython  1.60
Scanner.py
Go to the documentation of this file.
00001 # Copyright 2007-2010 by Peter Cock.  All rights reserved.
00002 # Revisions copyright 2010 by Uri Laserson.  All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 """Internal code for parsing GenBank and EMBL files (PRIVATE).
00007 
00008 This code is NOT intended for direct use.  It provides a basic scanner
00009 (for use with a event consumer such as Bio.GenBank._FeatureConsumer)
00010 to parse a GenBank or EMBL file (with their shared INSDC feature table).
00011 
00012 It is used by Bio.GenBank to parse GenBank files
00013 It is also used by Bio.SeqIO to parse GenBank and EMBL files
00014 
00015 Feature Table Documentation:
00016 http://www.insdc.org/files/feature_table.html
00017 http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
00018 ftp://ftp.ncbi.nih.gov/genbank/docs/
00019 """
00020 # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
00021 # These are GenBank files that summarize the content of a project, and provide lists of
00022 # scaffold and contig files in the project. These will be in annotations['wgs'] and
00023 # annotations['wgs_scafld']. These GenBank files do not have sequences. See
00024 # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
00025 # http://is.gd/nNgk
00026 # for more details of this format, and an example.
00027 # Added by Ying Huang & Iddo Friedberg
00028 
00029 import warnings
00030 import re
00031 from Bio.Seq import Seq
00032 from Bio.SeqRecord import SeqRecord
00033 from Bio.Alphabet import generic_protein
00034 
00035 class InsdcScanner(object):
00036     """Basic functions for breaking up a GenBank/EMBL file into sub sections.
00037 
00038     The International Nucleotide Sequence Database Collaboration (INSDC)
00039     between the DDBJ, EMBL, and GenBank.  These organisations all use the
00040     same "Feature Table" layout in their plain text flat file formats.
00041 
00042     However, the header and sequence sections of an EMBL file are very
00043     different in layout to those produced by GenBank/DDBJ."""
00044 
00045     #These constants get redefined with sensible values in the sub classes:
00046     RECORD_START = "XXX"  # "LOCUS       " or "ID   "
00047     HEADER_WIDTH = 3   # 12 or 5
00048     FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
00049     FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
00050     FEATURE_QUALIFIER_INDENT = 0
00051     FEATURE_QUALIFIER_SPACER = ""
00052     SEQUENCE_HEADERS=["XXX"] #with right hand side spaces removed
00053 
00054     def __init__(self, debug=0):
00055         assert len(self.RECORD_START)==self.HEADER_WIDTH
00056         for marker in self.SEQUENCE_HEADERS:
00057             assert marker==marker.rstrip()
00058         assert len(self.FEATURE_QUALIFIER_SPACER)==self.FEATURE_QUALIFIER_INDENT
00059         self.debug = debug
00060         self.line = None
00061 
00062     def set_handle(self, handle):
00063         self.handle = handle
00064         self.line = ""
00065 
00066     def find_start(self):
00067         """Read in lines until find the ID/LOCUS line, which is returned.
00068         
00069         Any preamble (such as the header used by the NCBI on *.seq.gz archives)
00070         will we ignored."""
00071         while True:
00072             if self.line:
00073                 line = self.line
00074                 self.line = ""
00075             else:
00076                 line = self.handle.readline()
00077             if not line:
00078                 if self.debug : print "End of file"
00079                 return None
00080             if line[:self.HEADER_WIDTH]==self.RECORD_START:
00081                 if self.debug > 1: print "Found the start of a record:\n" + line
00082                 break
00083             line = line.rstrip()
00084             if line == "//":
00085                 if self.debug > 1: print "Skipping // marking end of last record"
00086             elif line == "":
00087                 if self.debug > 1: print "Skipping blank line before record"
00088             else:
00089                 #Ignore any header before the first ID/LOCUS line.
00090                 if self.debug > 1:
00091                         print "Skipping header line before record:\n" + line
00092         self.line = line
00093         return line
00094 
00095     def parse_header(self):
00096         """Return list of strings making up the header
00097 
00098         New line characters are removed.
00099 
00100         Assumes you have just read in the ID/LOCUS line.
00101         """
00102         assert self.line[:self.HEADER_WIDTH]==self.RECORD_START, \
00103                "Not at start of record"
00104         
00105         header_lines = []
00106         while True:
00107             line = self.handle.readline()
00108             if not line:
00109                 raise ValueError("Premature end of line during sequence data")
00110             line = line.rstrip()
00111             if line in self.FEATURE_START_MARKERS:
00112                 if self.debug : print "Found header table"
00113                 break
00114             #if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH]:
00115             #    if self.debug : print "Found header table (?)"
00116             #    break
00117             if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
00118                 if self.debug : print "Found start of sequence"
00119                 break
00120             if line == "//":
00121                 raise ValueError("Premature end of sequence data marker '//' found")
00122             header_lines.append(line)
00123         self.line = line
00124         return header_lines
00125 
00126     def parse_features(self, skip=False):
00127         """Return list of tuples for the features (if present)
00128 
00129         Each feature is returned as a tuple (key, location, qualifiers)
00130         where key and location are strings (e.g. "CDS" and
00131         "complement(join(490883..490885,1..879))") while qualifiers
00132         is a list of two string tuples (feature qualifier keys and values).
00133 
00134         Assumes you have already read to the start of the features table.
00135         """
00136         if self.line.rstrip() not in self.FEATURE_START_MARKERS:
00137             if self.debug : print "Didn't find any feature table"
00138             return []
00139         
00140         while self.line.rstrip() in self.FEATURE_START_MARKERS:
00141             self.line = self.handle.readline()
00142 
00143         features = []
00144         line = self.line
00145         while True:
00146             if not line:
00147                 raise ValueError("Premature end of line during features table")
00148             if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
00149                 if self.debug : print "Found start of sequence"
00150                 break
00151             line = line.rstrip()
00152             if line == "//":
00153                 raise ValueError("Premature end of features table, marker '//' found")
00154             if line in self.FEATURE_END_MARKERS:
00155                 if self.debug : print "Found end of features"
00156                 line = self.handle.readline()
00157                 break
00158             if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "":
00159                 #This is an empty feature line between qualifiers. Empty
00160                 #feature lines within qualifiers are handled below (ignored).
00161                 line = self.handle.readline()
00162                 continue
00163             
00164             if skip:
00165                 line = self.handle.readline()
00166                 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER:
00167                     line = self.handle.readline()
00168             else:
00169                 #Build up a list of the lines making up this feature:
00170                 if line[self.FEATURE_QUALIFIER_INDENT]!=" " \
00171                 and " " in line[self.FEATURE_QUALIFIER_INDENT:]:
00172                     #The feature table design enforces a length limit on the feature keys.
00173                     #Some third party files (e.g. IGMT's EMBL like files) solve this by
00174                     #over indenting the location and qualifiers.
00175                     feature_key, line = line[2:].strip().split(None,1)
00176                     feature_lines = [line]
00177                     warnings.warn("Overindented %s feature?" % feature_key)
00178                 else:
00179                     feature_key = line[2:self.FEATURE_QUALIFIER_INDENT].strip()
00180                     feature_lines = [line[self.FEATURE_QUALIFIER_INDENT:]]
00181                 line = self.handle.readline()
00182                 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \
00183                 or line.rstrip() == "" : # cope with blank lines in the midst of a feature
00184                     #Use strip to remove any harmless trailing white space AND and leading
00185                     #white space (e.g. out of spec files with too much intentation)
00186                     feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip())
00187                     line = self.handle.readline()
00188                 features.append(self.parse_feature(feature_key, feature_lines))
00189         self.line = line
00190         return features
00191 
00192     def parse_feature(self, feature_key, lines):
00193         """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)
00194 
00195         For example given this GenBank feature:
00196 
00197              CDS             complement(join(490883..490885,1..879))
00198                              /locus_tag="NEQ001"
00199                              /note="conserved hypothetical [Methanococcus jannaschii];
00200                              COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
00201                              localization signal; IPR002743: Protein of unknown
00202                              function DUF57"
00203                              /codon_start=1
00204                              /transl_table=11
00205                              /product="hypothetical protein"
00206                              /protein_id="NP_963295.1"
00207                              /db_xref="GI:41614797"
00208                              /db_xref="GeneID:2732620"
00209                              /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
00210                              EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
00211                              KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
00212                              IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
00213                              EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
00214                              LNSMGFGFVNTKKNSAR"
00215 
00216         Then should give input key="CDS" and the rest of the data as a list of strings
00217         lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
00218         where the leading spaces and trailing newlines have been removed.
00219 
00220         Returns tuple containing: (key as string, location string, qualifiers as list)
00221         as follows for this example:
00222 
00223         key = "CDS", string
00224         location = "complement(join(490883..490885,1..879))", string
00225         qualifiers = list of string tuples:
00226 
00227         [('locus_tag', '"NEQ001"'),
00228          ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
00229          ('codon_start', '1'),
00230          ('transl_table', '11'),
00231          ('product', '"hypothetical protein"'),
00232          ('protein_id', '"NP_963295.1"'),
00233          ('db_xref', '"GI:41614797"'),
00234          ('db_xref', '"GeneID:2732620"'),
00235          ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]
00236 
00237         In the above example, the "note" and "translation" were edited for compactness,
00238         and they would contain multiple new line characters (displayed above as \n)
00239 
00240         If a qualifier is quoted (in this case, everything except codon_start and
00241         transl_table) then the quotes are NOT removed.
00242 
00243         Note that no whitespace is removed.
00244         """
00245         #Skip any blank lines
00246         iterator = iter(filter(None, lines))
00247         try:
00248             line = iterator.next()
00249 
00250             feature_location = line.strip()
00251             while feature_location[-1:]==",":
00252                 #Multiline location, still more to come!
00253                 line = iterator.next()
00254                 feature_location += line.strip()
00255 
00256             qualifiers=[]
00257 
00258             for i, line in enumerate(iterator):
00259                 # check for extra wrapping of the location closing parentheses
00260                 if i == 0 and line.startswith(")"):
00261                     feature_location += line.strip()
00262                 elif line[0]=="/":
00263                     #New qualifier
00264                     i = line.find("=")
00265                     key = line[1:i] #does not work if i==-1
00266                     value = line[i+1:] #we ignore 'value' if i==-1
00267                     if i==-1:
00268                         #Qualifier with no key, e.g. /pseudo
00269                         key = line[1:]
00270                         qualifiers.append((key,None))
00271                     elif not value:
00272                         #ApE can output /note=
00273                         qualifiers.append((key,""))
00274                     elif value[0]=='"':
00275                         #Quoted...
00276                         if value[-1]!='"' or value!='"':
00277                             #No closing quote on the first line...
00278                             while value[-1] != '"':
00279                                 value += "\n" + iterator.next()
00280                         else:
00281                             #One single line (quoted)
00282                             assert value == '"'
00283                             if self.debug : print "Quoted line %s:%s" % (key, value)
00284                         #DO NOT remove the quotes...
00285                         qualifiers.append((key,value))
00286                     else:
00287                         #Unquoted
00288                         #if debug : print "Unquoted line %s:%s" % (key,value)
00289                         qualifiers.append((key,value))
00290                 else:
00291                     #Unquoted continuation
00292                     assert len(qualifiers) > 0
00293                     assert key==qualifiers[-1][0]
00294                     #if debug : print "Unquoted Cont %s:%s" % (key, line)
00295                     qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
00296             return (feature_key, feature_location, qualifiers)
00297         except StopIteration:
00298             #Bummer
00299             raise ValueError("Problem with '%s' feature:\n%s" \
00300                               % (feature_key, "\n".join(lines)))
00301 
00302     def parse_footer(self):
00303         """returns a tuple containing a list of any misc strings, and the sequence"""
00304         #This is a basic bit of code to scan and discard the sequence,
00305         #which was useful when developing the sub classes.
00306         if self.line in self.FEATURE_END_MARKERS:
00307             while self.line[:self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
00308                 self.line = self.handle.readline()
00309                 if not self.line:
00310                     raise ValueError("Premature end of file")
00311                 self.line = self.line.rstrip()
00312             
00313         assert self.line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS, \
00314                "Not at start of sequence"
00315         while True:
00316             line = self.handle.readline()
00317             if not line : raise ValueError("Premature end of line during sequence data")
00318             line = line.rstrip()
00319             if line == "//" : break
00320         self.line = line
00321         return ([],"") #Dummy values!
00322 
00323     def _feed_first_line(self, consumer, line):
00324         """Handle the LOCUS/ID line, passing data to the comsumer
00325         
00326         This should be implemented by the EMBL / GenBank specific subclass
00327         
00328         Used by the parse_records() and parse() methods.
00329         """
00330         pass
00331 
00332     def _feed_header_lines(self, consumer, lines):
00333         """Handle the header lines (list of strings), passing data to the comsumer
00334         
00335         This should be implemented by the EMBL / GenBank specific subclass
00336         
00337         Used by the parse_records() and parse() methods.
00338         """
00339         pass
00340 
00341 
00342     def _feed_feature_table(self, consumer, feature_tuples):
00343         """Handle the feature table (list of tuples), passing data to the comsumer
00344         
00345         Used by the parse_records() and parse() methods.
00346         """
00347         consumer.start_feature_table()
00348         for feature_key, location_string, qualifiers in feature_tuples:
00349             consumer.feature_key(feature_key)
00350             consumer.location(location_string)
00351             for q_key, q_value in qualifiers:
00352                 if q_value is None:
00353                     consumer.feature_qualifier(q_key, q_value)
00354                 else:
00355                     consumer.feature_qualifier(q_key, q_value.replace("\n"," "))
00356 
00357 
00358     def _feed_misc_lines(self, consumer, lines):
00359         """Handle any lines between features and sequence (list of strings), passing data to the consumer
00360         
00361         This should be implemented by the EMBL / GenBank specific subclass
00362         
00363         Used by the parse_records() and parse() methods.
00364         """
00365         pass
00366 
00367     def feed(self, handle, consumer, do_features=True):
00368         """Feed a set of data into the consumer.
00369 
00370         This method is intended for use with the "old" code in Bio.GenBank
00371 
00372         Arguments:
00373         handle - A handle with the information to parse.
00374         consumer - The consumer that should be informed of events.
00375         do_features - Boolean, should the features be parsed?
00376                       Skipping the features can be much faster.
00377 
00378         Return values:
00379         true  - Passed a record
00380         false - Did not find a record
00381         """        
00382         #Should work with both EMBL and GenBank files provided the
00383         #equivalent Bio.GenBank._FeatureConsumer methods are called...
00384         self.set_handle(handle)
00385         if not self.find_start():
00386             #Could not find (another) record
00387             consumer.data=None
00388             return False
00389                        
00390         #We use the above class methods to parse the file into a simplified format.
00391         #The first line, header lines and any misc lines after the features will be
00392         #dealt with by GenBank / EMBL specific derived classes.
00393 
00394         #First line and header:
00395         self._feed_first_line(consumer, self.line)
00396         self._feed_header_lines(consumer, self.parse_header())
00397 
00398         #Features (common to both EMBL and GenBank):
00399         if do_features:
00400             self._feed_feature_table(consumer, self.parse_features(skip=False))
00401         else:
00402             self.parse_features(skip=True) # ignore the data
00403         
00404         #Footer and sequence
00405         misc_lines, sequence_string = self.parse_footer()
00406         self._feed_misc_lines(consumer, misc_lines)
00407 
00408         consumer.sequence(sequence_string)
00409         #Calls to consumer.base_number() do nothing anyway
00410         consumer.record_end("//")
00411 
00412         assert self.line == "//"
00413 
00414         #And we are done
00415         return True
00416 
00417     def parse(self, handle, do_features=True):
00418         """Returns a SeqRecord (with SeqFeatures if do_features=True)
00419 
00420         See also the method parse_records() for use on multi-record files.
00421         """
00422         from Bio.GenBank import _FeatureConsumer
00423         from Bio.GenBank.utils import FeatureValueCleaner
00424 
00425         consumer = _FeatureConsumer(use_fuzziness = 1, 
00426                     feature_cleaner = FeatureValueCleaner())
00427 
00428         if self.feed(handle, consumer, do_features):
00429             return consumer.data
00430         else:
00431             return None
00432 
00433     
00434     def parse_records(self, handle, do_features=True):
00435         """Returns a SeqRecord object iterator
00436 
00437         Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
00438 
00439         The SeqRecord objects include SeqFeatures if do_features=True
00440         
00441         This method is intended for use in Bio.SeqIO
00442         """
00443         #This is a generator function
00444         while True:
00445             record = self.parse(handle, do_features)
00446             if record is None : break
00447             assert record.id is not None
00448             assert record.name != "<unknown name>"
00449             assert record.description != "<unknown description>"
00450             yield record
00451 
00452     def parse_cds_features(self, handle,
00453                            alphabet=generic_protein,
00454                            tags2id=('protein_id','locus_tag','product')):
00455         """Returns SeqRecord object iterator
00456 
00457         Each CDS feature becomes a SeqRecord.
00458 
00459         alphabet - Used for any sequence found in a translation field.
00460         tags2id  - Tupple of three strings, the feature keys to use
00461                    for the record id, name and description,
00462 
00463         This method is intended for use in Bio.SeqIO
00464         """
00465         self.set_handle(handle)
00466         while self.find_start():
00467             #Got an EMBL or GenBank record...
00468             self.parse_header() # ignore header lines!
00469             feature_tuples = self.parse_features()
00470             #self.parse_footer() # ignore footer lines!
00471             while True:
00472                 line = self.handle.readline()
00473                 if not line : break
00474                 if line[:2]=="//" : break
00475             self.line = line.rstrip()
00476 
00477             #Now go though those features...
00478             for key, location_string, qualifiers in feature_tuples:
00479                 if key=="CDS":
00480                     #Create SeqRecord
00481                     #================
00482                     #SeqRecord objects cannot be created with annotations, they
00483                     #must be added afterwards.  So create an empty record and
00484                     #then populate it:
00485                     record = SeqRecord(seq=None)
00486                     annotations = record.annotations
00487 
00488                     #Should we add a location object to the annotations?
00489                     #I *think* that only makes sense for SeqFeatures with their
00490                     #sub features...
00491                     annotations['raw_location'] = location_string.replace(' ','')
00492 
00493                     for (qualifier_name, qualifier_data) in qualifiers:
00494                         if qualifier_data is not None \
00495                         and qualifier_data[0]=='"' and qualifier_data[-1]=='"':
00496                             #Remove quotes
00497                             qualifier_data = qualifier_data[1:-1]
00498                         #Append the data to the annotation qualifier...
00499                         if qualifier_name == "translation":
00500                             assert record.seq is None, "Multiple translations!"
00501                             record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
00502                         elif qualifier_name == "db_xref":
00503                             #its a list, possibly empty.  Its safe to extend
00504                             record.dbxrefs.append(qualifier_data)
00505                         else:
00506                             if qualifier_data is not None:
00507                                 qualifier_data = qualifier_data.replace("\n"," ").replace("  "," ")
00508                             try:
00509                                 annotations[qualifier_name] += " " + qualifier_data
00510                             except KeyError:
00511                                 #Not an addition to existing data, its the first bit
00512                                 annotations[qualifier_name]= qualifier_data
00513                         
00514                     #Fill in the ID, Name, Description
00515                     #=================================
00516                     try:
00517                         record.id = annotations[tags2id[0]]
00518                     except KeyError:
00519                         pass
00520                     try:
00521                         record.name = annotations[tags2id[1]]
00522                     except KeyError:
00523                         pass
00524                     try:
00525                         record.description = annotations[tags2id[2]]
00526                     except KeyError:
00527                         pass
00528 
00529                     yield record
00530 
00531 
00532 class EmblScanner(InsdcScanner):
00533     """For extracting chunks of information in EMBL files"""
00534 
00535     RECORD_START = "ID   "
00536     HEADER_WIDTH = 5
00537     FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers","FH"]
00538     FEATURE_END_MARKERS = ["XX"] #XX can also mark the end of many things!
00539     FEATURE_QUALIFIER_INDENT = 21
00540     FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
00541     SEQUENCE_HEADERS=["SQ", "CO"] #Remove trailing spaces
00542 
00543     def parse_footer(self):
00544         """returns a tuple containing a list of any misc strings, and the sequence"""
00545         assert self.line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS, \
00546             "Eh? '%s'" % self.line
00547 
00548         #Note that the SQ line can be split into several lines...
00549         misc_lines = []
00550         while self.line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
00551             misc_lines.append(self.line)
00552             self.line = self.handle.readline()
00553             if not self.line:
00554                 raise ValueError("Premature end of file")
00555             self.line = self.line.rstrip()
00556 
00557         assert self.line[:self.HEADER_WIDTH] == " " * self.HEADER_WIDTH \
00558                or self.line.strip() == '//', repr(self.line)
00559         
00560         seq_lines = []
00561         line = self.line
00562         while True:
00563             if not line:
00564                 raise ValueError("Premature end of file in sequence data")
00565             line = line.strip()
00566             if not line:
00567                 raise ValueError("Blank line in sequence data")
00568             if line=='//':
00569                 break
00570             assert self.line[:self.HEADER_WIDTH] == " " * self.HEADER_WIDTH, \
00571                    repr(self.line)
00572             #Remove tailing number now, remove spaces later
00573             seq_lines.append(line.rsplit(None,1)[0])
00574             line = self.handle.readline()
00575         self.line = line
00576         return (misc_lines, "".join(seq_lines).replace(" ", ""))
00577 
00578     def _feed_first_line(self, consumer, line):
00579         assert line[:self.HEADER_WIDTH].rstrip() == "ID"
00580         if line[self.HEADER_WIDTH:].count(";") == 6:
00581             #Looks like the semi colon separated style introduced in 2006
00582             self._feed_first_line_new(consumer, line)
00583         elif line[self.HEADER_WIDTH:].count(";") == 3:
00584             #Looks like the pre 2006 style
00585             self._feed_first_line_old(consumer, line)
00586         else:
00587             raise ValueError('Did not recognise the ID line layout:\n' + line)
00588 
00589     def _feed_first_line_old(self, consumer, line):
00590         #Expects an ID line in the style before 2006, e.g.
00591         #ID   SC10H5 standard; DNA; PRO; 4870 BP.
00592         #ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.
00593         assert line[:self.HEADER_WIDTH].rstrip() == "ID"
00594         fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
00595         fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
00596         fields = [entry.strip() for entry in fields]
00597         """
00598         The tokens represent:
00599            0. Primary accession number
00600            (space sep)
00601            1. ??? (e.g. standard)
00602            (semi-colon)
00603            2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
00604            3. Taxonomic division (e.g. 'PRO')
00605            4. Sequence length (e.g. '4639675 BP.')
00606         """
00607         consumer.locus(fields[0]) #Should we also call the accession consumer?
00608         consumer.residue_type(fields[2])
00609         consumer.data_file_division(fields[3])
00610         self._feed_seq_length(consumer, fields[4])        
00611 
00612     def _feed_first_line_new(self, consumer, line):
00613         #Expects an ID line in the style introduced in 2006, e.g.
00614         #ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
00615         #ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
00616         assert line[:self.HEADER_WIDTH].rstrip() == "ID"
00617         fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
00618         assert len(fields) == 7
00619         """
00620         The tokens represent:
00621            0. Primary accession number
00622            1. Sequence version number
00623            2. Topology: 'circular' or 'linear'
00624            3. Molecule type (e.g. 'genomic DNA')
00625            4. Data class (e.g. 'STD')
00626            5. Taxonomic division (e.g. 'PRO')
00627            6. Sequence length (e.g. '4639675 BP.')
00628         """
00629 
00630         consumer.locus(fields[0])
00631 
00632         #Call the accession consumer now, to make sure we record
00633         #something as the record.id, in case there is no AC line
00634         consumer.accession(fields[0])
00635 
00636         #TODO - How to deal with the version field?  At the moment the consumer
00637         #will try and use this for the ID which isn't ideal for EMBL files.
00638         version_parts = fields[1].split()
00639         if len(version_parts)==2 \
00640         and version_parts[0]=="SV" \
00641         and version_parts[1].isdigit():
00642             consumer.version_suffix(version_parts[1])
00643 
00644         #Based on how the old GenBank parser worked, merge these two:
00645         consumer.residue_type(" ".join(fields[2:4])) #TODO - Store as two fields?
00646 
00647         #consumer.xxx(fields[4]) #TODO - What should we do with the data class?
00648 
00649         consumer.data_file_division(fields[5])
00650 
00651         self._feed_seq_length(consumer, fields[6])
00652 
00653     def _feed_seq_length(self, consumer, text):
00654         length_parts = text.split()
00655         assert len(length_parts) == 2
00656         assert length_parts[1].upper() in ["BP", "BP.", "AA."]
00657         consumer.size(length_parts[0])
00658 
00659     def _feed_header_lines(self, consumer, lines):
00660         EMBL_INDENT = self.HEADER_WIDTH
00661         EMBL_SPACER = " "  * EMBL_INDENT
00662         consumer_dict = {
00663             'AC' : 'accession',
00664             'SV' : 'version', # SV line removed in June 2006, now part of ID line
00665             'DE' : 'definition',
00666             #'RN' : 'reference_num',
00667             #'RC' : reference comment... TODO
00668             #'RP' : 'reference_bases',
00669             #'RX' : reference cross reference... DOI or Pubmed
00670             'RG' : 'consrtm', #optional consortium
00671             #'RA' : 'authors',
00672             #'RT' : 'title',
00673             'RL' : 'journal',
00674             'OS' : 'organism',
00675             'OC' : 'taxonomy',
00676             #'DR' : data reference
00677             'CC' : 'comment',
00678             #'XX' : splitter
00679         }
00680         #We have to handle the following specially:
00681         #RX (depending on reference type...)
00682         for line in lines:
00683             line_type = line[:EMBL_INDENT].strip()
00684             data = line[EMBL_INDENT:].strip()
00685             if line_type == 'XX':
00686                 pass
00687             elif line_type == 'RN':
00688                 # Reformat reference numbers for the GenBank based consumer
00689                 # e.g. '[1]' becomes '1'
00690                 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
00691                 consumer.reference_num(data)
00692             elif line_type == 'RP':
00693                 # Reformat reference numbers for the GenBank based consumer
00694                 # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
00695                 # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
00696                 parts = [bases.replace("-"," to ").strip() for bases in data.split(",")]
00697                 consumer.reference_bases("(bases %s)" % "; ".join(parts))
00698             elif line_type == 'RT':
00699                 #Remove the enclosing quotes and trailing semi colon.
00700                 #Note the title can be split over multiple lines.
00701                 if data.startswith('"'):
00702                     data = data[1:]
00703                 if data.endswith('";'):
00704                     data = data[:-2]
00705                 consumer.title(data)
00706             elif line_type == 'RX':
00707                 # EMBL support three reference types at the moment:
00708                 # - PUBMED    PUBMED bibliographic database (NLM)
00709                 # - DOI       Digital Object Identifier (International DOI Foundation)
00710                 # - AGRICOLA  US National Agriculture Library (NAL) of the US Department
00711                 #             of Agriculture (USDA)
00712                 #
00713                 # Format:
00714                 # RX  resource_identifier; identifier.
00715                 #
00716                 # e.g.
00717                 # RX   DOI; 10.1016/0024-3205(83)90010-3.
00718                 # RX   PUBMED; 264242.
00719                 #
00720                 # Currently our reference object only supports PUBMED and MEDLINE
00721                 # (as these were in GenBank files?).
00722                 key, value = data.split(";",1)
00723                 if value.endswith(".") : value = value[:-1]
00724                 value = value.strip()
00725                 if key == "PUBMED":
00726                     consumer.pubmed_id(value)
00727                 #TODO - Handle other reference types (here and in BioSQL bindings)
00728             elif line_type == 'CC':
00729                 # Have to pass a list of strings for this one (not just a string)
00730                 consumer.comment([data])
00731             elif line_type == 'DR':
00732                 # Database Cross-reference, format:
00733                 # DR   database_identifier; primary_identifier; secondary_identifier.
00734                 #
00735                 # e.g.
00736                 # DR   MGI; 98599; Tcrb-V4.
00737                 #
00738                 # TODO - How should we store any secondary identifier?
00739                 parts = data.rstrip(".").split(";")
00740                 #Turn it into "database_identifier:primary_identifier" to
00741                 #mimic the GenBank parser. e.g. "MGI:98599"
00742                 consumer.dblink("%s:%s" % (parts[0].strip(),
00743                                            parts[1].strip()))
00744             elif line_type == 'RA':
00745                 # Remove trailing ; at end of authors list
00746                 consumer.authors(data.rstrip(";"))
00747             elif line_type == 'PR':
00748                 # Remove trailing ; at end of the project reference
00749                 # In GenBank files this corresponds to the old PROJECT
00750                 # line which is being replaced with the DBLINK line.
00751                 consumer.project(data.rstrip(";"))
00752             elif line_type in consumer_dict:
00753                 #Its a semi-automatic entry!
00754                 getattr(consumer, consumer_dict[line_type])(data)
00755             else:
00756                 if self.debug:
00757                     print "Ignoring EMBL header line:\n%s" % line
00758 
00759     def _feed_misc_lines(self, consumer, lines):
00760         #TODO - Should we do something with the information on the SQ line(s)?
00761         lines.append("")
00762         line_iter = iter(lines)
00763         try:
00764             for line in line_iter:
00765                 if line.startswith("CO   "):
00766                     line = line[5:].strip()
00767                     contig_location = line
00768                     while True:
00769                         line = line_iter.next()
00770                         if not line:
00771                             break
00772                         elif line.startswith("CO   "):
00773                             #Don't need to preseve the whitespace here.
00774                             contig_location += line[5:].strip()
00775                         else:
00776                             raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
00777                     consumer.contig_location(contig_location)
00778             return
00779         except StopIteration:
00780             raise ValueError("Problem in misc lines before sequence")
00781 
00782 
00783 class _ImgtScanner(EmblScanner):
00784     """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).
00785     
00786     IMGT files are like EMBL files but in order to allow longer feature types
00787     the features should be indented by 25 characters not 21 characters. In
00788     practice the IMGT flat files tend to use either 21 or 25 characters, so we
00789     must cope with both.
00790     
00791     This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
00792     """
00793 
00794     FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers",
00795                              "FH   Key             Location/Qualifiers (from EMBL)",
00796                              "FH   Key                 Location/Qualifiers",
00797                              "FH"]
00798 
00799     def parse_features(self, skip=False):
00800         """Return list of tuples for the features (if present)
00801 
00802         Each feature is returned as a tuple (key, location, qualifiers)
00803         where key and location are strings (e.g. "CDS" and
00804         "complement(join(490883..490885,1..879))") while qualifiers
00805         is a list of two string tuples (feature qualifier keys and values).
00806 
00807         Assumes you have already read to the start of the features table.
00808         """
00809         if self.line.rstrip() not in self.FEATURE_START_MARKERS:
00810             if self.debug : print "Didn't find any feature table"
00811             return []
00812         
00813         while self.line.rstrip() in self.FEATURE_START_MARKERS:
00814             self.line = self.handle.readline()
00815 
00816         bad_position_re = re.compile(r'([0-9]+)>{1}')
00817         
00818         features = []
00819         line = self.line
00820         while True:
00821             if not line:
00822                 raise ValueError("Premature end of line during features table")
00823             if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
00824                 if self.debug : print "Found start of sequence"
00825                 break
00826             line = line.rstrip()
00827             if line == "//":
00828                 raise ValueError("Premature end of features table, marker '//' found")
00829             if line in self.FEATURE_END_MARKERS:
00830                 if self.debug : print "Found end of features"
00831                 line = self.handle.readline()
00832                 break
00833             if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "":
00834                 #This is an empty feature line between qualifiers. Empty
00835                 #feature lines within qualifiers are handled below (ignored).
00836                 line = self.handle.readline()
00837                 continue
00838 
00839             if skip:
00840                 line = self.handle.readline()
00841                 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER:
00842                     line = self.handle.readline()
00843             else:
00844                 assert line[:2] == "FT"
00845                 try:
00846                     feature_key, location_start = line[2:].strip().split()
00847                 except ValueError:
00848                     #e.g. "FT   TRANSMEMBRANE-REGION2163..2240\n"
00849                     #Assume indent of 25 as per IMGT spec, with the location
00850                     #start in column 26 (one-based).
00851                     feature_key = line[2:25].strip()
00852                     location_start = line[25:].strip()
00853                 feature_lines = [location_start]
00854                 line = self.handle.readline()
00855                 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \
00856                 or line.rstrip() == "" : # cope with blank lines in the midst of a feature
00857                     #Use strip to remove any harmless trailing white space AND and leading
00858                     #white space (copes with 21 or 26 indents and orther variants)
00859                     assert line[:2] == "FT"
00860                     feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip())
00861                     line = self.handle.readline()
00862                 feature_key, location, qualifiers = \
00863                                 self.parse_feature(feature_key, feature_lines)
00864                 #Try to handle known problems with IMGT locations here:
00865                 if ">" in location:
00866                     #Nasty hack for common IMGT bug, should be >123 not 123>
00867                     #in a location string. At least here the meaning is clear, 
00868                     #and since it is so common I don't want to issue a warning
00869                     #warnings.warn("Feature location %s is invalid, "
00870                     #              "moving greater than sign before position"
00871                     #              % location)
00872                     location = bad_position_re.sub(r'>\1',location)
00873                 features.append((feature_key, location, qualifiers))
00874         self.line = line
00875         return features
00876 
00877 class GenBankScanner(InsdcScanner):
00878     """For extracting chunks of information in GenBank files"""
00879 
00880     RECORD_START = "LOCUS       "
00881     HEADER_WIDTH = 12
00882     FEATURE_START_MARKERS = ["FEATURES             Location/Qualifiers","FEATURES"]
00883     FEATURE_END_MARKERS = []
00884     FEATURE_QUALIFIER_INDENT = 21
00885     FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
00886     SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed
00887 
00888     def parse_footer(self):
00889         """returns a tuple containing a list of any misc strings, and the sequence"""
00890         assert self.line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS, \
00891                "Eh? '%s'" % self.line
00892 
00893         misc_lines = []
00894         while self.line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS \
00895         or self.line[:self.HEADER_WIDTH] == " "*self.HEADER_WIDTH \
00896         or "WGS" == self.line[:3]:
00897             misc_lines.append(self.line.rstrip())
00898             self.line = self.handle.readline()
00899             if not self.line:
00900                 raise ValueError("Premature end of file")
00901             self.line = self.line
00902 
00903         assert self.line[:self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS, \
00904                "Eh? '%s'" % self.line
00905 
00906         #Now just consume the sequence lines until reach the // marker
00907         #or a CONTIG line
00908         seq_lines = []
00909         line = self.line
00910         while True:
00911             if not line:
00912                 raise ValueError("Premature end of file in sequence data")
00913             line = line.rstrip()
00914             if not line:
00915                 import warnings
00916                 warnings.warn("Blank line in sequence data")
00917                 line = self.handle.readline()
00918                 continue
00919             if line=='//':
00920                 break
00921             if line.find('CONTIG')==0:
00922                 break
00923             if len(line) > 9 and  line[9:10]!=' ':
00924                 raise ValueError("Sequence line mal-formed, '%s'" % line)
00925             seq_lines.append(line[10:]) #remove spaces later
00926             line = self.handle.readline()
00927 
00928         self.line = line
00929         #Seq("".join(seq_lines), self.alphabet)
00930         return (misc_lines,"".join(seq_lines).replace(" ",""))
00931 
00932     def _feed_first_line(self, consumer, line):
00933         """Scan over and parse GenBank LOCUS line (PRIVATE).
00934 
00935         This must cope with several variants, primarily the old and new column
00936         based standards from GenBank. Additionally EnsEMBL produces GenBank
00937         files where the LOCUS line is space separated rather that following
00938         the column based layout.
00939 
00940         We also try to cope with GenBank like files with partial LOCUS lines.
00941         """
00942         #####################################
00943         # LOCUS line                        #
00944         #####################################
00945         GENBANK_INDENT = self.HEADER_WIDTH
00946         GENBANK_SPACER = " "*GENBANK_INDENT
00947         assert line[0:GENBANK_INDENT] == 'LOCUS       ', \
00948                'LOCUS line does not start correctly:\n' + line
00949 
00950         #Have to break up the locus line, and handle the different bits of it.
00951         #There are at least two different versions of the locus line...
00952         if line[29:33] in [' bp ', ' aa ',' rc '] and line[55:62] == '       ':
00953             #Old... note we insist on the 55:62 being empty to avoid trying
00954             #to parse space separated LOCUS lines from Ensembl etc, see below.
00955             #
00956             #    Positions  Contents
00957             #    ---------  --------
00958             #    00:06      LOCUS
00959             #    06:12      spaces
00960             #    12:??      Locus name
00961             #    ??:??      space
00962             #    ??:29      Length of sequence, right-justified
00963             #    29:33      space, bp, space
00964             #    33:41      strand type
00965             #    41:42      space
00966             #    42:51      Blank (implies linear), linear or circular
00967             #    51:52      space
00968             #    52:55      The division code (e.g. BCT, VRL, INV)
00969             #    55:62      space
00970             #    62:73      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
00971             #
00972             #assert line[29:33] in [' bp ', ' aa ',' rc '] , \
00973             #       'LOCUS line does not contain size units at expected position:\n' + line
00974             assert line[41:42] == ' ', \
00975                    'LOCUS line does not contain space at position 42:\n' + line
00976             assert line[42:51].strip() in ['','linear','circular'], \
00977                    'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
00978             assert line[51:52] == ' ', \
00979                    'LOCUS line does not contain space at position 52:\n' + line
00980             #assert line[55:62] == '       ', \
00981             #      'LOCUS line does not contain spaces from position 56 to 62:\n' + line
00982             if line[62:73].strip():
00983                 assert line[64:65] == '-', \
00984                        'LOCUS line does not contain - at position 65 in date:\n' + line
00985                 assert line[68:69] == '-', \
00986                        'LOCUS line does not contain - at position 69 in date:\n' + line
00987 
00988             name_and_length_str = line[GENBANK_INDENT:29]
00989             while name_and_length_str.find('  ')!=-1:
00990                 name_and_length_str = name_and_length_str.replace('  ',' ')
00991             name_and_length = name_and_length_str.split(' ')
00992             assert len(name_and_length)<=2, \
00993                    'Cannot parse the name and length in the LOCUS line:\n' + line
00994             assert len(name_and_length)!=1, \
00995                    'Name and length collide in the LOCUS line:\n' + line
00996                    #Should be possible to split them based on position, if
00997                    #a clear definition of the standard exists THAT AGREES with
00998                    #existing files.
00999             consumer.locus(name_and_length[0])
01000             consumer.size(name_and_length[1])
01001             #consumer.residue_type(line[33:41].strip())
01002 
01003             if line[33:51].strip() == "" and line[29:33] == ' aa ':
01004                 #Amino acids -> protein (even if there is no residue type given)
01005                 #We want to use a protein alphabet in this case, rather than a
01006                 #generic one. Not sure if this is the best way to achieve this,
01007                 #but it works because the scanner checks for this:
01008                 consumer.residue_type("PROTEIN")
01009             else:
01010                 consumer.residue_type(line[33:51].strip())
01011 
01012             consumer.data_file_division(line[52:55])
01013             if line[62:73].strip():
01014                 consumer.date(line[62:73])
01015         elif line[40:44] in [' bp ', ' aa ',' rc '] \
01016         and line[54:64].strip() in ['','linear','circular']:
01017             #New... linear/circular/big blank test should avoid EnsEMBL style
01018             #LOCUS line being treated like a proper column based LOCUS line.
01019             #
01020             #    Positions  Contents
01021             #    ---------  --------
01022             #    00:06      LOCUS
01023             #    06:12      spaces
01024             #    12:??      Locus name
01025             #    ??:??      space
01026             #    ??:40      Length of sequence, right-justified
01027             #    40:44      space, bp, space
01028             #    44:47      Blank, ss-, ds-, ms-
01029             #    47:54      Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA
01030             #    54:55      space
01031             #    55:63      Blank (implies linear), linear or circular
01032             #    63:64      space
01033             #    64:67      The division code (e.g. BCT, VRL, INV)
01034             #    67:68      space
01035             #    68:79      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
01036             #
01037             assert line[40:44] in [' bp ', ' aa ',' rc '] , \
01038                    'LOCUS line does not contain size units at expected position:\n' + line
01039             assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], \
01040                    'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
01041             assert line[47:54].strip() == "" \
01042             or line[47:54].strip().find('DNA') != -1 \
01043             or line[47:54].strip().find('RNA') != -1, \
01044                    'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
01045             assert line[54:55] == ' ', \
01046                    'LOCUS line does not contain space at position 55:\n' + line
01047             assert line[55:63].strip() in ['','linear','circular'], \
01048                    'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
01049             assert line[63:64] == ' ', \
01050                    'LOCUS line does not contain space at position 64:\n' + line
01051             assert line[67:68] == ' ', \
01052                    'LOCUS line does not contain space at position 68:\n' + line
01053             if line[68:79].strip():
01054                 assert line[70:71] == '-', \
01055                        'LOCUS line does not contain - at position 71 in date:\n' + line
01056                 assert line[74:75] == '-', \
01057                        'LOCUS line does not contain - at position 75 in date:\n' + line
01058 
01059             name_and_length_str = line[GENBANK_INDENT:40]
01060             while name_and_length_str.find('  ')!=-1:
01061                 name_and_length_str = name_and_length_str.replace('  ',' ')
01062             name_and_length = name_and_length_str.split(' ')
01063             assert len(name_and_length)<=2, \
01064                    'Cannot parse the name and length in the LOCUS line:\n' + line
01065             assert len(name_and_length)!=1, \
01066                    'Name and length collide in the LOCUS line:\n' + line
01067                    #Should be possible to split them based on position, if
01068                    #a clear definition of the stand exists THAT AGREES with
01069                    #existing files.
01070             consumer.locus(name_and_length[0])
01071             consumer.size(name_and_length[1])
01072 
01073             if line[44:54].strip() == "" and line[40:44] == ' aa ':
01074                 #Amino acids -> protein (even if there is no residue type given)
01075                 #We want to use a protein alphabet in this case, rather than a
01076                 #generic one. Not sure if this is the best way to achieve this,
01077                 #but it works because the scanner checks for this:
01078                 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
01079             else:
01080                 consumer.residue_type(line[44:63].strip())
01081 
01082             consumer.data_file_division(line[64:67])
01083             if line[68:79].strip():
01084                 consumer.date(line[68:79])
01085         elif line[GENBANK_INDENT:].strip().count(" ")==0 : 
01086             #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
01087             #
01088             #e.g.
01089             #
01090             #    "LOCUS       U00096"
01091             #
01092             #rather than:
01093             #
01094             #    "LOCUS       U00096               4639675 bp    DNA     circular BCT"
01095             #
01096             #    Positions  Contents
01097             #    ---------  --------
01098             #    00:06      LOCUS
01099             #    06:12      spaces
01100             #    12:??      Locus name
01101             if line[GENBANK_INDENT:].strip() != "":
01102                 consumer.locus(line[GENBANK_INDENT:].strip())
01103             else:
01104                 #Must just have just "LOCUS       ", is this even legitimate?
01105                 #We should be able to continue parsing... we need real world testcases!
01106                 warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line)
01107         elif len(line.split())==7 and line.split()[3] in ["aa","bp"]:
01108             #Cope with EnsEMBL genbank files which use space separation rather
01109             #than the expected column based layout. e.g.
01110             #LOCUS       HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011
01111             #LOCUS       HG531_PATCH 759984 bp DNA HTG 18-JUN-2011
01112             #LOCUS       HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011
01113             #LOCUS       HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011
01114             #Notice that the 'bp' can occur in the position expected by either
01115             #the old or the new fixed column standards (parsed above).
01116             splitline = line.split()
01117             consumer.locus(splitline[1])
01118             consumer.size(splitline[2])
01119             consumer.residue_type(splitline[4])
01120             consumer.data_file_division(splitline[5])
01121             consumer.date(splitline[6])
01122         elif len(line.split())>=4 and line.split()[3] in ["aa","bp"]:
01123             #Cope with EMBOSS seqret output where it seems the locus id can cause
01124             #the other fields to overflow.  We just IGNORE the other fields!
01125             warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
01126             consumer.locus(line.split()[1])
01127             consumer.size(line.split()[2])
01128         elif len(line.split())>=4 and line.split()[-1] in ["aa","bp"]:
01129             #Cope with psuedo-GenBank files like this:
01130             #   "LOCUS       RNA5 complete       1718 bp"
01131             #Treat everything between LOCUS and the size as the identifier.
01132             warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
01133             consumer.locus(line[5:].rsplit(None,2)[0].strip())
01134             consumer.size(line.split()[-2])
01135         else:
01136             raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
01137 
01138 
01139     def _feed_header_lines(self, consumer, lines):
01140         #Following dictionary maps GenBank lines to the associated
01141         #consumer methods - the special cases like LOCUS where one
01142         #genbank line triggers several consumer calls have to be
01143         #handled individually.
01144         GENBANK_INDENT = self.HEADER_WIDTH
01145         GENBANK_SPACER = " "*GENBANK_INDENT
01146         consumer_dict = {
01147             'DEFINITION' : 'definition',
01148             'ACCESSION'  : 'accession',
01149             'NID'        : 'nid',
01150             'PID'        : 'pid',
01151             'DBSOURCE'   : 'db_source',
01152             'KEYWORDS'   : 'keywords',
01153             'SEGMENT'    : 'segment',
01154             'SOURCE'     : 'source',
01155             'AUTHORS'    : 'authors',
01156             'CONSRTM'    : 'consrtm',
01157             'PROJECT'    : 'project',
01158             'DBLINK'     : 'dblink',
01159             'TITLE'      : 'title',
01160             'JOURNAL'    : 'journal',
01161             'MEDLINE'    : 'medline_id',
01162             'PUBMED'     : 'pubmed_id',
01163             'REMARK'     : 'remark'}
01164         #We have to handle the following specially:
01165         #ORIGIN (locus, size, residue_type, data_file_division and date)
01166         #COMMENT (comment)
01167         #VERSION (version and gi)
01168         #REFERENCE (eference_num and reference_bases)
01169         #ORGANISM (organism and taxonomy)
01170         lines = filter(None,lines)
01171         lines.append("") #helps avoid getting StopIteration all the time
01172         line_iter = iter(lines)
01173         try:
01174             line = line_iter.next()
01175             while True:
01176                 if not line : break
01177                 line_type = line[:GENBANK_INDENT].strip()
01178                 data = line[GENBANK_INDENT:].strip()
01179 
01180                 if line_type == 'VERSION':
01181                     #Need to call consumer.version(), and maybe also consumer.gi() as well.
01182                     #e.g.
01183                     # VERSION     AC007323.5  GI:6587720
01184                     while data.find('  ')!=-1:
01185                         data = data.replace('  ',' ')
01186                     if data.find(' GI:')==-1:
01187                         consumer.version(data)
01188                     else:
01189                         if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
01190                         consumer.version(data.split(' GI:')[0])
01191                         consumer.gi(data.split(' GI:')[1])
01192                     #Read in the next line!
01193                     line = line_iter.next()
01194                 elif line_type == 'REFERENCE':
01195                     if self.debug >1 : print "Found reference [" + data + "]"
01196                     #Need to call consumer.reference_num() and consumer.reference_bases()
01197                     #e.g.
01198                     # REFERENCE   1  (bases 1 to 86436)
01199                     #
01200                     #Note that this can be multiline, see Bug 1968, e.g.
01201                     #
01202                     # REFERENCE   42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to
01203                     #             28259)
01204                     #
01205                     #For such cases we will call the consumer once only.
01206                     data = data.strip()
01207 
01208                     #Read in the next line, and see if its more of the reference:
01209                     while True:
01210                         line = line_iter.next()
01211                         if line[:GENBANK_INDENT] == GENBANK_SPACER:
01212                             #Add this continuation to the data string
01213                             data += " " + line[GENBANK_INDENT:]
01214                             if self.debug >1 : print "Extended reference text [" + data + "]"
01215                         else:
01216                             #End of the reference, leave this text in the variable "line"
01217                             break
01218 
01219                     #We now have all the reference line(s) stored in a string, data,
01220                     #which we pass to the consumer
01221                     while data.find('  ')!=-1:
01222                         data = data.replace('  ',' ')
01223                     if data.find(' ')==-1:
01224                         if self.debug >2 : print 'Reference number \"' + data + '\"'
01225                         consumer.reference_num(data)
01226                     else:
01227                         if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"'
01228                         consumer.reference_num(data[:data.find(' ')])
01229                         consumer.reference_bases(data[data.find(' ')+1:])
01230                 elif line_type == 'ORGANISM':
01231                     #Typically the first line is the organism, and subsequent lines
01232                     #are the taxonomy lineage.  However, given longer and longer
01233                     #species names (as more and more strains and sub strains get
01234                     #sequenced) the oragnism name can now get wrapped onto multiple
01235                     #lines.  The NCBI say we have to recognise the lineage line by
01236                     #the presense of semi-colon delimited entries.  In the long term,
01237                     #they are considering adding a new keyword (e.g. LINEAGE).
01238                     #See Bug 2591 for details.
01239                     organism_data = data
01240                     lineage_data = ""
01241                     while True:
01242                         line = line_iter.next()
01243                         if line[0:GENBANK_INDENT] == GENBANK_SPACER:
01244                             if lineage_data or ";" in line:
01245                                 lineage_data += " " + line[GENBANK_INDENT:]
01246                             else:
01247                                 organism_data += " " + line[GENBANK_INDENT:].strip()
01248                         else:
01249                             #End of organism and taxonomy
01250                             break
01251                     consumer.organism(organism_data)
01252                     if lineage_data.strip() == "" and self.debug > 1:
01253                         print "Taxonomy line(s) missing or blank"
01254                     consumer.taxonomy(lineage_data.strip())
01255                     del organism_data, lineage_data
01256                 elif line_type == 'COMMENT':
01257                     if self.debug > 1 : print "Found comment"
01258                     #This can be multiline, and should call consumer.comment() once
01259                     #with a list where each entry is a line.
01260                     comment_list=[]
01261                     comment_list.append(data)
01262                     while True:
01263                         line = line_iter.next()
01264                         if line[0:GENBANK_INDENT] == GENBANK_SPACER:
01265                             data = line[GENBANK_INDENT:]
01266                             comment_list.append(data)
01267                             if self.debug > 2 : print "Comment continuation [" + data + "]"
01268                         else:
01269                             #End of the comment
01270                             break
01271                     consumer.comment(comment_list)
01272                     del comment_list
01273                 elif line_type in consumer_dict:
01274                     #Its a semi-automatic entry!
01275                     #Now, this may be a multi line entry...
01276                     while True:
01277                         line = line_iter.next()
01278                         if line[0:GENBANK_INDENT] == GENBANK_SPACER:
01279                             data += ' ' + line[GENBANK_INDENT:]
01280                         else:
01281                             #We now have all the data for this entry:
01282                             getattr(consumer, consumer_dict[line_type])(data)
01283                             #End of continuation - return to top of loop!
01284                             break
01285                 else:
01286                     if self.debug:
01287                         print "Ignoring GenBank header line:\n" % line
01288                     #Read in next line
01289                     line = line_iter.next()
01290         except StopIteration:
01291             raise ValueError("Problem in header")
01292         
01293     def _feed_misc_lines(self, consumer, lines):
01294         #Deals with a few misc lines between the features and the sequence
01295         GENBANK_INDENT = self.HEADER_WIDTH
01296         GENBANK_SPACER = " "*GENBANK_INDENT
01297         lines.append("")
01298         line_iter = iter(lines)
01299         try:
01300             for line in line_iter:
01301                 if line.find('BASE COUNT')==0:
01302                     line = line[10:].strip()
01303                     if line:
01304                         if self.debug : print "base_count = " + line
01305                         consumer.base_count(line)
01306                 if line.find("ORIGIN")==0:
01307                     line = line[6:].strip()
01308                     if line:
01309                         if self.debug : print "origin_name = " + line
01310                         consumer.origin_name(line)
01311                 if line.find("WGS ")==0 :                        
01312                     line = line[3:].strip()
01313                     consumer.wgs(line)
01314                 if line.find("WGS_SCAFLD")==0 :                        
01315                     line = line[10:].strip()
01316                     consumer.add_wgs_scafld(line)
01317                 if line.find("CONTIG")==0:
01318                     line = line[6:].strip()
01319                     contig_location = line
01320                     while True:
01321                         line = line_iter.next()
01322                         if not line:
01323                             break
01324                         elif line[:GENBANK_INDENT]==GENBANK_SPACER:
01325                             #Don't need to preseve the whitespace here.
01326                             contig_location += line[GENBANK_INDENT:].rstrip()
01327                         else:
01328                             raise ValueError('Expected CONTIG continuation line, got:\n' + line)
01329                     consumer.contig_location(contig_location)
01330             return
01331         except StopIteration:
01332             raise ValueError("Problem in misc lines before sequence")
01333         
01334 if __name__ == "__main__":
01335     from StringIO import StringIO
01336 
01337     gbk_example = \
01338 """LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999
01339 DEFINITION  Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
01340             (AXL2) and Rev7p (REV7) genes, complete cds.
01341 ACCESSION   U49845
01342 VERSION     U49845.1  GI:1293613
01343 KEYWORDS    .
01344 SOURCE      Saccharomyces cerevisiae (baker's yeast)
01345   ORGANISM  Saccharomyces cerevisiae
01346             Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
01347             Saccharomycetales; Saccharomycetaceae; Saccharomyces.
01348 REFERENCE   1  (bases 1 to 5028)
01349   AUTHORS   Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
01350   TITLE     Cloning and sequence of REV7, a gene whose function is required for
01351             DNA damage-induced mutagenesis in Saccharomyces cerevisiae
01352   JOURNAL   Yeast 10 (11), 1503-1509 (1994)
01353   PUBMED    7871890
01354 REFERENCE   2  (bases 1 to 5028)
01355   AUTHORS   Roemer,T., Madden,K., Chang,J. and Snyder,M.
01356   TITLE     Selection of axial growth sites in yeast requires Axl2p, a novel
01357             plasma membrane glycoprotein
01358   JOURNAL   Genes Dev. 10 (7), 777-793 (1996)
01359   PUBMED    8846915
01360 REFERENCE   3  (bases 1 to 5028)
01361   AUTHORS   Roemer,T.
01362   TITLE     Direct Submission
01363   JOURNAL   Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
01364             Haven, CT, USA
01365 FEATURES             Location/Qualifiers
01366      source          1..5028
01367                      /organism="Saccharomyces cerevisiae"
01368                      /db_xref="taxon:4932"
01369                      /chromosome="IX"
01370                      /map="9"
01371      CDS             <1..206
01372                      /codon_start=3
01373                      /product="TCP1-beta"
01374                      /protein_id="AAA98665.1"
01375                      /db_xref="GI:1293614"
01376                      /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
01377                      AEVLLRVDNIIRARPRTANRQHM"
01378      gene            687..3158
01379                      /gene="AXL2"
01380      CDS             687..3158
01381                      /gene="AXL2"
01382                      /note="plasma membrane glycoprotein"
01383                      /codon_start=1
01384                      /function="required for axial budding pattern of S.
01385                      cerevisiae"
01386                      /product="Axl2p"
01387                      /protein_id="AAA98666.1"
01388                      /db_xref="GI:1293615"
01389                      /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
01390                      TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
01391                      VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
01392                      VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
01393                      TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
01394                      YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
01395                      DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
01396                      DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
01397                      NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
01398                      CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
01399                      NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
01400                      SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
01401                      YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
01402                      HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
01403                      VDFSNKSNVNVGQVKDIHGRIPEML"
01404      gene            complement(3300..4037)
01405                      /gene="REV7"
01406      CDS             complement(3300..4037)
01407                      /gene="REV7"
01408                      /codon_start=1
01409                      /product="Rev7p"
01410                      /protein_id="AAA98667.1"
01411                      /db_xref="GI:1293616"
01412                      /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
01413                      FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
01414                      KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
01415                      RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
01416                      LISGDDKILNGVYSQYEEGESIFGSLF"
01417 ORIGIN
01418         1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
01419        61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
01420       121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
01421       181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
01422       241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
01423       301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
01424       361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
01425       421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
01426       481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
01427       541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
01428       601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
01429       661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
01430       721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
01431       781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
01432       841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
01433       901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
01434       961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
01435      1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
01436      1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
01437      1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
01438      1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
01439      1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
01440      1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
01441      1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
01442      1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
01443      1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
01444      1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
01445      1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
01446      1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
01447      1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
01448      1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
01449      1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
01450      1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
01451      1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
01452      2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
01453      2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
01454      2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
01455      2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
01456      2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
01457      2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
01458      2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
01459      2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
01460      2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
01461      2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
01462      2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
01463      2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
01464      2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
01465      2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
01466      2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
01467      2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
01468      3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
01469      3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
01470      3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
01471      3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
01472      3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
01473      3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
01474      3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
01475      3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
01476      3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
01477      3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
01478      3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
01479      3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
01480      3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
01481      3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
01482      3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
01483      3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
01484      3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
01485      4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
01486      4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
01487      4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
01488      4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
01489      4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
01490      4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
01491      4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
01492      4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
01493      4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
01494      4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
01495      4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
01496      4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
01497      4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
01498      4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
01499      4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
01500      4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
01501      4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
01502 //"""
01503 
01504     # GenBank format protein (aka GenPept) file from:
01505     # http://www.molecularevolution.org/resources/fileformats/
01506     gbk_example2 = \
01507 """LOCUS       AAD51968                 143 aa            linear   BCT 21-AUG-2001
01508 DEFINITION  transcriptional regulator RovA [Yersinia enterocolitica].
01509 ACCESSION   AAD51968
01510 VERSION     AAD51968.1  GI:5805369
01511 DBSOURCE    locus AF171097 accession AF171097.1
01512 KEYWORDS    .
01513 SOURCE      Yersinia enterocolitica
01514   ORGANISM  Yersinia enterocolitica
01515             Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
01516             Enterobacteriaceae; Yersinia.
01517 REFERENCE   1  (residues 1 to 143)
01518   AUTHORS   Revell,P.A. and Miller,V.L.
01519   TITLE     A chromosomally encoded regulator is required for expression of the
01520             Yersinia enterocolitica inv gene and for virulence
01521   JOURNAL   Mol. Microbiol. 35 (3), 677-685 (2000)
01522   MEDLINE   20138369
01523    PUBMED   10672189
01524 REFERENCE   2  (residues 1 to 143)
01525   AUTHORS   Revell,P.A. and Miller,V.L.
01526   TITLE     Direct Submission
01527   JOURNAL   Submitted (22-JUL-1999) Molecular Microbiology, Washington
01528             University School of Medicine, Campus Box 8230, 660 South Euclid,
01529             St. Louis, MO 63110, USA
01530 COMMENT     Method: conceptual translation.
01531 FEATURES             Location/Qualifiers
01532      source          1..143
01533                      /organism="Yersinia enterocolitica"
01534                      /mol_type="unassigned DNA"
01535                      /strain="JB580v"
01536                      /serotype="O:8"
01537                      /db_xref="taxon:630"
01538      Protein         1..143
01539                      /product="transcriptional regulator RovA"
01540                      /name="regulates inv expression"
01541      CDS             1..143
01542                      /gene="rovA"
01543                      /coded_by="AF171097.1:380..811"
01544                      /note="regulator of virulence"
01545                      /transl_table=11
01546 ORIGIN      
01547         1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
01548        61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
01549       121 deiellsgli dklerniiql qsk
01550 //
01551 """
01552     
01553     embl_example="""ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
01554 XX
01555 AC   X56734; S46826;
01556 XX
01557 DT   12-SEP-1991 (Rel. 29, Created)
01558 DT   25-NOV-2005 (Rel. 85, Last updated, Version 11)
01559 XX
01560 DE   Trifolium repens mRNA for non-cyanogenic beta-glucosidase
01561 XX
01562 KW   beta-glucosidase.
01563 XX
01564 OS   Trifolium repens (white clover)
01565 OC   Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
01566 OC   Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
01567 OC   eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
01568 XX
01569 RN   [5]
01570 RP   1-1859
01571 RX   PUBMED; 1907511.
01572 RA   Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
01573 RT   "Nucleotide and derived amino acid sequence of the cyanogenic
01574 RT   beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
01575 RL   Plant Mol. Biol. 17(2):209-219(1991).
01576 XX
01577 RN   [6]
01578 RP   1-1859
01579 RA   Hughes M.A.;
01580 RT   ;
01581 RL   Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
01582 RL   Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
01583 RL   Upon Tyne, NE2 4HH, UK
01584 XX
01585 FH   Key             Location/Qualifiers
01586 FH
01587 FT   source          1..1859
01588 FT                   /organism="Trifolium repens"
01589 FT                   /mol_type="mRNA"
01590 FT                   /clone_lib="lambda gt10"
01591 FT                   /clone="TRE361"
01592 FT                   /tissue_type="leaves"
01593 FT                   /db_xref="taxon:3899"
01594 FT   CDS             14..1495
01595 FT                   /product="beta-glucosidase"
01596 FT                   /EC_number="3.2.1.21"
01597 FT                   /note="non-cyanogenic"
01598 FT                   /db_xref="GOA:P26204"
01599 FT                   /db_xref="InterPro:IPR001360"
01600 FT                   /db_xref="InterPro:IPR013781"
01601 FT                   /db_xref="UniProtKB/Swiss-Prot:P26204"
01602 FT                   /protein_id="CAA40058.1"
01603 FT                   /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
01604 FT                   FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
01605 FT                   DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
01606 FT                   VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
01607 FT                   CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
01608 FT                   DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
01609 FT                   IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
01610 FT                   EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
01611 FT                   IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
01612 FT   mRNA            1..1859
01613 FT                   /experiment="experimental evidence, no additional details
01614 FT                   recorded"
01615 XX
01616 SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
01617      aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt        60
01618      cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag       120
01619      tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga       180
01620      aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata       240
01621      tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta       300
01622      caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc       360
01623      ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa       420
01624      atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct       480
01625      ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg       540
01626      tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt       600
01627      gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg       660
01628      aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac       720
01629      aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta       780
01630      taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg       840
01631      gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga       900
01632      cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg       960
01633      gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg      1020
01634      ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc      1080
01635      acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa      1140
01636      acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat      1200
01637      gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct      1260
01638      gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga      1320
01639      agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg      1380
01640      ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg      1440
01641      taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga      1500
01642      tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa      1560
01643      ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt      1620
01644      tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg      1680
01645      aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc      1740
01646      agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac      1800
01647      tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa       1859
01648 //
01649 """
01650 
01651     print "GenBank CDS Iteration"
01652     print "====================="
01653 
01654     g = GenBankScanner()
01655     for record in g.parse_cds_features(StringIO(gbk_example)):
01656         print record
01657         
01658     g = GenBankScanner()
01659     for record in g.parse_cds_features(StringIO(gbk_example2),
01660                   tags2id=('gene','locus_tag','product')):
01661         print record
01662 
01663     g = GenBankScanner()
01664     for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
01665                                        tags2id=('gene','locus_tag','product')):
01666         print record
01667 
01668     print
01669     print "GenBank Iteration"
01670     print "================="
01671     g = GenBankScanner()
01672     for record in g.parse_records(StringIO(gbk_example),do_features=False):
01673         print record.id, record.name, record.description
01674         print record.seq
01675 
01676     g = GenBankScanner()
01677     for record in g.parse_records(StringIO(gbk_example),do_features=True):
01678         print record.id, record.name, record.description
01679         print record.seq
01680 
01681     g = GenBankScanner()
01682     for record in g.parse_records(StringIO(gbk_example2),do_features=False):
01683         print record.id, record.name, record.description
01684         print record.seq
01685 
01686     g = GenBankScanner()
01687     for record in g.parse_records(StringIO(gbk_example2),do_features=True):
01688         print record.id, record.name, record.description
01689         print record.seq
01690 
01691     print
01692     print "EMBL CDS Iteration"
01693     print "=================="
01694 
01695     e = EmblScanner()
01696     for record in e.parse_cds_features(StringIO(embl_example)):
01697         print record
01698         
01699     print
01700     print "EMBL Iteration"
01701     print "=============="
01702     e = EmblScanner()
01703     for record in e.parse_records(StringIO(embl_example),do_features=True):
01704         print record.id, record.name, record.description
01705         print record.seq