Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved.
00002 # Copyright 2006-2011 by Peter Cock.  All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 
00007 """Code to work with GenBank formatted files.
00008 
00009 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
00010 the "genbank" or "embl" format names to parse GenBank or EMBL files into
00011 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
00012 
Using Bio.GenBank directly to parse GenBank files is only useful if you want
to obtain GenBank-specific Record objects, which are a much closer
representation of the raw file contents than the SeqRecord alternative from
the FeatureParser (used in Bio.SeqIO).
00017 
00018 To use the Bio.GenBank parser, there are two helper functions:
00019 
00020 read                  Parse a handle containing a single GenBank record
00021                       as Bio.GenBank specific Record objects.
00022 parse                 Iterate over a handle containing multiple GenBank
00023                       records as Bio.GenBank specific Record objects.
00024 
00025 The following internal classes are not intended for direct use and may
00026 be deprecated in a future release.
00027 
00028 Classes:
00029 Iterator              Iterate through a file of GenBank entries
00030 ErrorFeatureParser    Catch errors caused during parsing.
00031 FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects.
00032 RecordParser          Parse GenBank data into a Record object.
00033 
00034 Exceptions:
00035 ParserFailureError    Exception indicating a failure in the parser (ie.
00036                       scanner or consumer)
LocationParserError   Exception indicating a problem with the spark based
                      location parser.
00039 
00040 """
00041 import re
00042 
00043 # other Biopython stuff
00044 from Bio import SeqFeature
00045 
00046 # other Bio.GenBank stuff
00047 from utils import FeatureValueCleaner
00048 from Scanner import GenBankScanner
00049 
00050 #Constants used to parse GenBank header lines
00051 GENBANK_INDENT = 12
00052 GENBANK_SPACER = " " * GENBANK_INDENT
00053 
00054 #Constants for parsing GenBank feature lines
00055 FEATURE_KEY_INDENT = 5
00056 FEATURE_QUALIFIER_INDENT = 21
00057 FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
00058 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
00059 
00060 #Regular expresions for location parsing
00061 _solo_location = r"[<>]?\d+"
00062 _pair_location = r"[<>]?\d+\.\.[<>]?\d+"
00063 _between_location = r"\d+\^\d+"
00064 
00065 _within_position = r"\(\d+\.\d+\)"
00066 _re_within_position = re.compile(_within_position)
00067 _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
00068                    % (_within_position,_within_position)
00069 assert _re_within_position.match("(3.9)")
00070 assert re.compile(_within_location).match("(3.9)..10")
00071 assert re.compile(_within_location).match("26..(30.33)")
00072 assert re.compile(_within_location).match("(13.19)..(20.28)")
00073 
00074 _oneof_position = r"one\-of\(\d+(,\d+)+\)"
00075 _re_oneof_position = re.compile(_oneof_position)
00076 _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
00077                    % (_oneof_position,_oneof_position)
00078 assert _re_oneof_position.match("one-of(6,9)")
00079 assert re.compile(_oneof_location).match("one-of(6,9)..101")
00080 assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)")
00081 assert re.compile(_oneof_location).match("6..one-of(101,104)")
00082 
00083 assert not _re_oneof_position.match("one-of(3)")
00084 assert _re_oneof_position.match("one-of(3,6)")
00085 assert _re_oneof_position.match("one-of(3,6,9)")
00086 
00087 
00088 _simple_location = r"\d+\.\.\d+"
00089 _re_simple_location = re.compile(_simple_location)
00090 _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \
00091                                  % (_simple_location, _simple_location))
00092 _complex_location = r"([a-zA-z][a-zA-Z0-9_]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \
00093                     % (_pair_location, _solo_location, _between_location,
00094                        _within_location, _oneof_location)
00095 _re_complex_location = re.compile(r"^%s$" % _complex_location)
00096 _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \
00097                                           % (_complex_location, _complex_location)
00098 _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \
00099                                  % (_possibly_complemented_complex_location,
00100                                     _possibly_complemented_complex_location))
00101 
00102 assert _re_simple_location.match("104..160")
00103 assert not _re_simple_location.match("<104..>160")
00104 assert not _re_simple_location.match("104")
00105 assert not _re_simple_location.match("<1")
00106 assert not _re_simple_location.match(">99999")
00107 assert not _re_simple_location.match("join(104..160,320..390,504..579)")
00108 assert not _re_simple_compound.match("bond(12,63)")
00109 assert _re_simple_compound.match("join(104..160,320..390,504..579)")
00110 assert _re_simple_compound.match("order(1..69,1308..1465)")
00111 assert not _re_simple_compound.match("order(1..69,1308..1465,1524)")
00112 assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)")
00113 assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)")
00114 assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)")
00115 assert not _re_simple_compound.match("test(1..69,1308..1465)")
00116 assert not _re_simple_compound.match("complement(1..69)")
00117 assert not _re_simple_compound.match("(1..69)")
00118 assert _re_complex_location.match("(3.9)..10")
00119 assert _re_complex_location.match("26..(30.33)")
00120 assert _re_complex_location.match("(13.19)..(20.28)")
00121 assert _re_complex_location.match("41^42") #between
00122 assert _re_complex_location.match("AL121804:41^42")
00123 assert _re_complex_location.match("AL121804:41..610")
00124 assert _re_complex_location.match("AL121804.2:41..610")
00125 assert _re_complex_location.match("one-of(3,6)..101")
00126 assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
00127 assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
00128 assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)")
00129 
00130 #Trans-spliced example from NC_016406, note underscore in reference name:
00131 assert _re_complex_location.match("NC_016402.1:6618..6676")
00132 assert _re_complex_location.match("181647..181905")
00133 assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
00134 assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
00135 assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
00136 assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
00137 assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
00138 
00139 
def _pos(pos_str, offset=0):
    """Build a Position object from a GenBank position string (PRIVATE).

    Supported forms are exact positions ("5"), fuzzy before/after
    positions ("<5", ">5"), within positions ("(8.10)") and one-of
    positions ("one-of(5,8,11)").

    The offset is added to every number parsed: use the default of zero
    for an end position, and -1 for a start position (converting from
    one based GenBank counting to Python counting).

    For within/one-of positions the integer "default" value of the
    returned object depends on the offset: with offset -1 (a start)
    the smallest candidate is used, otherwise (an end) the largest.
    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
    if pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
    if _re_within_position.match(pos_str):
        # e.g. "(8.10)" - a point somewhere between two bounds.
        left, right = pos_str[1:-1].split(".")
        left = int(left) + offset
        right = int(right) + offset
        if offset == -1:
            default = left
        else:
            default = right
        return SeqFeature.WithinPosition(default, left=left, right=right)
    if _re_oneof_position.match(pos_str):
        # e.g. "one-of(5,8,11)" - one of several exact positions.
        assert pos_str.startswith("one-of(")
        assert pos_str[-1] == ")"
        choices = [SeqFeature.ExactPosition(int(num) + offset)
                   for num in pos_str[7:-1].split(",")]
        if offset == -1:
            default = min(int(choice) for choice in choices)
        else:
            default = max(int(choice) for choice in choices)
        return SeqFeature.OneOfPosition(default, choices=choices)
    # Plain exact position, e.g. "5".
    return SeqFeature.ExactPosition(int(pos_str) + offset)
00225 
def _loc(loc_str, expected_seq_length, strand):
    """FeatureLocation from a non-compound, non-complement location (PRIVATE).

    Handles position pairs ("123..456", possibly fuzzy), solo positions
    ("123"), and zero length between positions ("123^124").

    The expected sequence length is needed for one special case: a between
    position N^1 at the origin of a circular genome of length N.  Apart
    from that special case a between position P^Q must satisfy P+1 == Q,
    otherwise ValueError is raised.
    """
    try:
        start_str, end_str = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" in loc_str:
            # A between location like "67^68" (one based counting) is a
            # special case with zero length - python slice 67:67.  See
            # Bug 2622.  Furthermore, on a circular genome of length N,
            # "N^1" means the junction at the origin - see Bug 3098.
            # (We could imagine "2^4", but that is just "3", and "2^5"
            # is just "3..4", so those are rejected.)
            start_str, end_str = loc_str.split("^")
            if int(start_str) + 1 == int(end_str):
                point = _pos(start_str)
            elif int(start_str) == expected_seq_length and end_str == "1":
                point = _pos(start_str)
            else:
                raise ValueError("Invalid between location %s" % repr(loc_str))
            return SeqFeature.FeatureLocation(point, point, strand)
        # A solo position like "123" - start and end use the same token.
        start_str = loc_str
        end_str = loc_str
    return SeqFeature.FeatureLocation(_pos(start_str, -1), _pos(end_str), strand)
00286 
00287 def _split_compound_loc(compound_loc):
00288     """Split a tricky compound location string (PRIVATE).
00289     
00290     >>> list(_split_compound_loc("123..145"))
00291     ['123..145']
00292     >>> list(_split_compound_loc("123..145,200..209"))
00293     ['123..145', '200..209']
00294     >>> list(_split_compound_loc("one-of(200,203)..300"))
00295     ['one-of(200,203)..300']
00296     >>> list(_split_compound_loc("complement(123..145),200..209"))
00297     ['complement(123..145)', '200..209']
00298     >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
00299     ['123..145', 'one-of(200,203)..209']
00300     >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
00301     ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
00302     >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
00303     ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
00304     >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
00305     ['123..145', '200..one-of(209,211)', '300']
00306     >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
00307     ['123..145', '200..one-of(209,211)']
00308     >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"))
00309     ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905']
00310     """
00311     if "one-of(" in compound_loc:
00312         #Hard case
00313         while "," in compound_loc:
00314             assert compound_loc[0] != ","
00315             assert compound_loc[0:2] != ".."
00316             i = compound_loc.find(",")
00317             part = compound_loc[:i]
00318             compound_loc = compound_loc[i:] #includes the comma
00319             while part.count("(") > part.count(")"):
00320                 assert "one-of(" in part, (part, compound_loc)
00321                 i = compound_loc.find(")")
00322                 part += compound_loc[:i+1]
00323                 compound_loc = compound_loc[i+1:]
00324             if compound_loc.startswith(".."):
00325                 i = compound_loc.find(",")
00326                 if i==-1:
00327                     part += compound_loc
00328                     compound_loc = ""
00329                 else:
00330                     part += compound_loc[:i]
00331                     compound_loc = compound_loc[i:] #includes the comma
00332             while part.count("(") > part.count(")"):
00333                 assert part.count("one-of(") == 2
00334                 i = compound_loc.find(")")
00335                 part += compound_loc[:i+1]
00336                 compound_loc = compound_loc[i+1:]
00337             if compound_loc.startswith(","):
00338                 compound_loc = compound_loc[1:]
00339             assert part
00340             yield part
00341         if compound_loc:
00342             yield compound_loc
00343     else:
00344         #Easy case
00345         for part in compound_loc.split(","):
00346             yield part
00347 
class Iterator(object):
    """Iterate over a file of GenBank entries one at a time (OBSOLETE).

    This class is likely to be deprecated in a future release of Biopython.
    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
    for SeqRecord and GenBank specific Record objects respectively instead.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            # Delegate to the parser; it signals end of input by raising
            # StopIteration, which we translate to None.
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser: collect raw lines up to and including the "//"
        # record terminator, returning the text unchanged.
        raw_lines = []
        while True:
            line = self.handle.readline()
            if not line:
                return None  # Premature end of file?
            raw_lines.append(line)
            if line.rstrip() == "//":
                return "".join(raw_lines)

    def __iter__(self):
        # The two-argument iter() form calls self.next repeatedly until
        # it returns the sentinel None.
        return iter(self.next, None)
00386 
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser (scanner or consumer)."""
00391 
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file."""
00396 
class FeatureParser(object):
    """Parse GenBank files into Seq + Feature objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
    """
    def __init__(self, debug_level = 0, use_fuzziness = 1, 
                 feature_cleaner = FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function 
        clean_value. GenBank.utils has a "standard" cleaner class, which
        is used by default.

        NOTE(review): the default cleaner instance is created once, when
        this method is defined, and is therefore shared by every
        FeatureParser that uses the default - harmless provided
        FeatureValueCleaner is effectively stateless; confirm.
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle, returning a SeqRecord.

        A fresh consumer is created per call, so a parser instance can be
        reused for several files.
        """
        self._consumer = _FeatureConsumer(self.use_fuzziness, 
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
00432 
class RecordParser(object):
    """Parse GenBank files into Record objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
    instead.
    """
    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank specific Record object.
        """
        self._consumer = _RecordConsumer()

        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
00460 
00461 class _BaseGenBankConsumer(object):
00462     """Abstract GenBank consumer providing useful general functions (PRIVATE).
00463 
00464     This just helps to eliminate some duplication in things that most
00465     GenBank consumers want to do.
00466     """
00467     # Special keys in GenBank records that we should remove spaces from
00468     # For instance, \translation keys have values which are proteins and
00469     # should have spaces and newlines removed from them. This class
00470     # attribute gives us more control over specific formatting problems.
00471     remove_space_keys = ["translation"]
00472 
00473     def __init__(self):
00474         pass
00475 
00476     def _unhandled(self, data):
00477         pass
00478 
00479     def __getattr__(self, attr):
00480         return self._unhandled
00481 
00482     def _split_keywords(self, keyword_string):
00483         """Split a string of keywords into a nice clean list.
00484         """
00485         # process the keywords into a python list
00486         if keyword_string == "" or keyword_string == ".":
00487             keywords = ""
00488         elif keyword_string[-1] == '.':
00489             keywords = keyword_string[:-1]
00490         else:
00491             keywords = keyword_string
00492         keyword_list = keywords.split(';')
00493         clean_keyword_list = [x.strip() for x in keyword_list]
00494         return clean_keyword_list
00495 
00496     def _split_accessions(self, accession_string):
00497         """Split a string of accession numbers into a list.
00498         """
00499         # first replace all line feeds with spaces
00500         # Also, EMBL style accessions are split with ';'
00501         accession = accession_string.replace("\n", " ").replace(";"," ")
00502 
00503         return [x.strip() for x in accession.split() if x.strip()]
00504 
00505     def _split_taxonomy(self, taxonomy_string):
00506         """Split a string with taxonomy info into a list.
00507         """
00508         if not taxonomy_string or taxonomy_string==".":
00509             #Missing data, no taxonomy
00510             return []
00511         
00512         if taxonomy_string[-1] == '.':
00513             tax_info = taxonomy_string[:-1]
00514         else:
00515             tax_info = taxonomy_string
00516         tax_list = tax_info.split(';')
00517         new_tax_list = []
00518         for tax_item in tax_list:
00519             new_items = tax_item.split("\n")
00520             new_tax_list.extend(new_items)
00521         while '' in new_tax_list:
00522             new_tax_list.remove('')
00523         clean_tax_list = [x.strip() for x in new_tax_list]
00524 
00525         return clean_tax_list
00526 
00527     def _clean_location(self, location_string):
00528         """Clean whitespace out of a location string.
00529 
00530         The location parser isn't a fan of whitespace, so we clean it out
00531         before feeding it into the parser.
00532         """
00533         #Originally this imported string.whitespace and did a replace
00534         #via a loop.  It's simpler to just split on whitespace and rejoin
00535         #the string - and this avoids importing string too.  See Bug 2684.
00536         return ''.join(location_string.split())
00537 
00538     def _remove_newlines(self, text):
00539         """Remove any newlines in the passed text, returning the new string.
00540         """
00541         # get rid of newlines in the qualifier value
00542         newlines = ["\n", "\r"]
00543         for ws in newlines:
00544             text = text.replace(ws, "")
00545 
00546         return text
00547 
00548     def _normalize_spaces(self, text):
00549         """Replace multiple spaces in the passed text with single spaces.
00550         """
00551         # get rid of excessive spaces
00552         text_parts = text.split(" ")
00553         text_parts = filter(None, text_parts)
00554         return ' '.join(text_parts)
00555 
00556     def _remove_spaces(self, text):
00557         """Remove all spaces from the passed text.
00558         """
00559         return text.replace(" ", "")
00560 
00561     def _convert_to_python_numbers(self, start, end):
00562         """Convert a start and end range to python notation.
00563 
00564         In GenBank, starts and ends are defined in "biological" coordinates,
00565         where 1 is the first base and [i, j] means to include both i and j.
00566 
00567         In python, 0 is the first base and [i, j] means to include i, but
00568         not j. 
00569 
00570         So, to convert "biological" to python coordinates, we need to 
00571         subtract 1 from the start, and leave the end and things should
00572         be converted happily.
00573         """
00574         new_start = start - 1
00575         new_end = end
00576 
00577         return new_start, new_end
00578 
00579 class _FeatureConsumer(_BaseGenBankConsumer):
00580     """Create a SeqRecord object with Features to return (PRIVATE).
00581 
00582     Attributes:
00583     o use_fuzziness - specify whether or not to parse with fuzziness in
00584     feature locations.
00585     o feature_cleaner - a class that will be used to provide specialized
00586     cleaning-up of feature values.
00587     """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        """Initialize the consumer with an empty SeqRecord to fill in.

        Arguments:
        o use_fuzziness - specify whether or not to parse with fuzziness in
        feature locations.
        o feature_cleaner - an optional object used to clean up feature
        values (must implement clean_value); None means no cleaning.
        """
        # Deferred import - presumably to avoid a circular import at
        # module load time; confirm before moving to the top of the file.
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        # Parsing state, filled in as scanner events arrive:
        self._seq_type = ''         # sequence type string from the header
        self._seq_data = []         # accumulated chunks of sequence text
        self._cur_reference = None  # reference currently being populated
        self._cur_feature = None    # feature currently being populated
        self._expected_size = None  # declared sequence length (LOCUS line)
00603 
    def locus(self, locus_name):
        """Use the LOCUS name as the name of the record."""
        self.data.name = locus_name

    def size(self, content):
        """Record the declared sequence length."""
        self._expected_size = int(content)

    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet.
        """
        self._seq_type = type

    def data_file_division(self, division):
        # Data file division code, stored verbatim as an annotation.
        self.data.annotations['data_file_division'] = division

    def date(self, submit_date):
        # Stored verbatim (a string, not parsed into a date object).
        self.data.annotations['date'] = submit_date 
00623 
00624     def definition(self, definition):
00625         """Set the definition as the description of the sequence.
00626         """
00627         if self.data.description:
00628             #Append to any existing description
00629             #e.g. EMBL files with two DE lines.
00630             self.data.description += " " + definition
00631         else:
00632             self.data.description = definition
00633 
00634     def accession(self, acc_num):
00635         """Set the accession number as the id of the sequence.
00636 
00637         If we have multiple accession numbers, the first one passed is
00638         used.
00639         """
00640         new_acc_nums = self._split_accessions(acc_num)
00641 
00642         #Also record them ALL in the annotations
00643         try:
00644             #On the off chance there was more than one accession line:
00645             for acc in new_acc_nums:
00646                 #Prevent repeat entries
00647                 if acc not in self.data.annotations['accessions']:
00648                     self.data.annotations['accessions'].append(acc)
00649         except KeyError:
00650             self.data.annotations['accessions'] = new_acc_nums
00651 
00652         # if we haven't set the id information yet, add the first acc num
00653         if self.data.id is None:
00654             if len(new_acc_nums) > 0:
00655                 #self.data.id = new_acc_nums[0]
00656                 #Use the FIRST accession as the ID, not the first on this line!
00657                 self.data.id = self.data.annotations['accessions'][0]
00658 
    def wgs(self, content):
        # WGS line value, split on "-" (e.g. an accession range).
        self.data.annotations['wgs'] = content.split('-')

    def add_wgs_scafld(self, content):
        # There can be several WGS_SCAFLD lines; collect them all.
        self.data.annotations.setdefault('wgs_scafld',[]).append(content.split('-'))

    def nid(self, content):
        # NID line (obsolete nucleotide identifier), stored verbatim.
        self.data.annotations['nid'] = content

    def pid(self, content):
        # PID line (obsolete protein identifier), stored verbatim.
        self.data.annotations['pid'] = content
00670 
00671     def version(self, version_id):
00672         #Want to use the versioned accession as the record.id
00673         #This comes from the VERSION line in GenBank files, or the
00674         #obsolete SV line in EMBL.  For the new EMBL files we need
00675         #both the version suffix from the ID line and the accession
00676         #from the AC line.
00677         if version_id.count(".")==1 and version_id.split(".")[1].isdigit():
00678             self.accession(version_id.split(".")[0])
00679             self.version_suffix(version_id.split(".")[1])
00680         else:
00681             #For backwards compatibility...
00682             self.data.id = version_id
00683 
00684     def project(self, content):
00685         """Handle the information from the PROJECT line as a list of projects.
00686 
00687         e.g.
00688         PROJECT     GenomeProject:28471
00689 
00690         or:
00691         PROJECT     GenomeProject:13543  GenomeProject:99999
00692 
00693         This is stored as dbxrefs in the SeqRecord to be consistent with the
00694         projected switch of this line to DBLINK in future GenBank versions.
00695         Note the NCBI plan to replace "GenomeProject:28471" with the shorter
00696         "Project:28471" as part of this transition.
00697         """
00698         content = content.replace("GenomeProject:", "Project:")
00699         self.data.dbxrefs.extend([p for p in content.split() if p])
00700 
00701     def dblink(self, content):
00702         """Store DBLINK cross references as dbxrefs in our record object.
00703 
00704         This line type is expected to replace the PROJECT line in 2009. e.g.
00705 
00706         During transition:
00707         
00708         PROJECT     GenomeProject:28471
00709         DBLINK      Project:28471
00710                     Trace Assembly Archive:123456
00711 
00712         Once the project line is dropped:
00713 
00714         DBLINK      Project:28471
00715                     Trace Assembly Archive:123456
00716 
00717         Note GenomeProject -> Project.
00718 
00719         We'll have to see some real examples to be sure, but based on the
00720         above example we can expect one reference per line.
00721 
00722         Note that at some point the NCBI have included an extra space, e.g.
00723 
00724         DBLINK      Project: 28471
00725         """
00726         #During the transition period with both PROJECT and DBLINK lines,
00727         #we don't want to add the same cross reference twice.
00728         while ": " in content:
00729             content = content.replace(": ", ":")
00730         if content.strip() not in self.data.dbxrefs:
00731             self.data.dbxrefs.append(content.strip())
00732 
00733     def version_suffix(self, version):
00734         """Set the version to overwrite the id.
00735 
00736         Since the verison provides the same information as the accession
00737         number, plus some extra info, we set this as the id if we have
00738         a version.
00739         """
00740         #e.g. GenBank line:
00741         #VERSION     U49845.1  GI:1293613
00742         #or the obsolete EMBL line:
00743         #SV   U49845.1
00744         #Scanner calls consumer.version("U49845.1")
00745         #which then calls consumer.version_suffix(1)
00746         #
00747         #e.g. EMBL new line:
00748         #ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
00749         #Scanner calls consumer.version_suffix(1)
00750         assert version.isdigit()
00751         self.data.annotations['sequence_version'] = int(version)
00752 
00753     def db_source(self, content):
00754         self.data.annotations['db_source'] = content.rstrip()
00755 
    def gi(self, content):
        # Store the GI number (as handed to us by the scanner) verbatim.
        self.data.annotations['gi'] = content
00758 
00759     def keywords(self, content):
00760         self.data.annotations['keywords'] = self._split_keywords(content)
00761 
    def segment(self, content):
        # Store the SEGMENT line content verbatim in the annotations.
        self.data.annotations['segment'] = content
00764 
00765     def source(self, content):
00766         #Note that some software (e.g. VectorNTI) may produce an empty
00767         #source (rather than using a dot/period as might be expected).
00768         if content == "":
00769             source_info = ""
00770         elif content[-1] == '.':
00771             source_info = content[:-1]
00772         else:
00773             source_info = content
00774         self.data.annotations['source'] = source_info
00775 
    def organism(self, content):
        # Store the ORGANISM line content verbatim in the annotations.
        self.data.annotations['organism'] = content
00778 
00779     def taxonomy(self, content):
00780         """Records (another line of) the taxonomy lineage.
00781         """
00782         lineage = self._split_taxonomy(content)
00783         try:
00784             self.data.annotations['taxonomy'].extend(lineage)
00785         except KeyError:
00786             self.data.annotations['taxonomy'] = lineage
00787         
00788     def reference_num(self, content):
00789         """Signal the beginning of a new reference object.
00790         """
00791         # if we have a current reference that hasn't been added to
00792         # the list of references, add it.
00793         if self._cur_reference is not None:
00794             self.data.annotations['references'].append(self._cur_reference)
00795         else:
00796             self.data.annotations['references'] = []
00797 
00798         self._cur_reference = SeqFeature.Reference()
00799 
00800     def reference_bases(self, content):
00801         """Attempt to determine the sequence region the reference entails.
00802 
00803         Possible types of information we may have to deal with:
00804         
00805         (bases 1 to 86436)
00806         (sites)
00807         (bases 1 to 105654; 110423 to 111122)
00808         1  (residues 1 to 182)
00809         """
00810         # first remove the parentheses or other junk
00811         ref_base_info = content[1:-1]
00812 
00813         all_locations = []
00814         # parse if we've got 'bases' and 'to'
00815         if ref_base_info.find('bases') != -1 and \
00816             ref_base_info.find('to') != -1:
00817             # get rid of the beginning 'bases'
00818             ref_base_info = ref_base_info[5:]
00819             locations = self._split_reference_locations(ref_base_info)
00820             all_locations.extend(locations)
00821         elif (ref_base_info.find("residues") >= 0 and
00822               ref_base_info.find("to") >= 0):
00823             residues_start = ref_base_info.find("residues")
00824             # get only the information after "residues"
00825             ref_base_info = ref_base_info[(residues_start + len("residues ")):]
00826             locations = self._split_reference_locations(ref_base_info)
00827             all_locations.extend(locations)
00828 
00829         # make sure if we are not finding information then we have
00830         # the string 'sites' or the string 'bases'
00831         elif (ref_base_info == 'sites' or
00832               ref_base_info.strip() == 'bases'):
00833             pass
00834         # otherwise raise an error
00835         else:
00836             raise ValueError("Could not parse base info %s in record %s" %
00837                              (ref_base_info, self.data.id))
00838 
00839         self._cur_reference.location = all_locations
00840 
00841     def _split_reference_locations(self, location_string):
00842         """Get reference locations out of a string of reference information
00843         
00844         The passed string should be of the form:
00845 
00846             1 to 20; 20 to 100
00847 
00848         This splits the information out and returns a list of location objects
00849         based on the reference locations.
00850         """
00851         # split possibly multiple locations using the ';'
00852         all_base_info = location_string.split(';')
00853 
00854         new_locations = []
00855         for base_info in all_base_info:
00856             start, end = base_info.split('to')
00857             new_start, new_end = \
00858               self._convert_to_python_numbers(int(start.strip()),
00859                                               int(end.strip()))
00860             this_location = SeqFeature.FeatureLocation(new_start, new_end)
00861             new_locations.append(this_location)
00862         return new_locations
00863 
00864     def authors(self, content):
00865         if self._cur_reference.authors:
00866             self._cur_reference.authors += ' ' + content
00867         else:
00868             self._cur_reference.authors = content
00869 
00870     def consrtm(self, content):
00871         if self._cur_reference.consrtm:
00872             self._cur_reference.consrtm += ' ' + content
00873         else:
00874             self._cur_reference.consrtm = content
00875 
00876     def title(self, content):
00877         if self._cur_reference is None:
00878             import warnings
00879             from Bio import BiopythonParserWarning
00880             warnings.warn("GenBank TITLE line without REFERENCE line.",
00881                           BiopythonParserWarning)
00882         elif self._cur_reference.title:
00883             self._cur_reference.title += ' ' + content
00884         else:
00885             self._cur_reference.title = content
00886 
00887     def journal(self, content):
00888         if self._cur_reference.journal:
00889             self._cur_reference.journal += ' ' + content
00890         else:
00891             self._cur_reference.journal = content
00892 
    def medline_id(self, content):
        # Store the MEDLINE identifier on the current reference verbatim.
        self._cur_reference.medline_id = content
00895 
    def pubmed_id(self, content):
        # Store the PUBMED identifier on the current reference verbatim.
        self._cur_reference.pubmed_id = content
00898 
00899     def remark(self, content):
00900         """Deal with a reference comment."""
00901         if self._cur_reference.comment:
00902             self._cur_reference.comment += ' ' + content
00903         else:
00904             self._cur_reference.comment = content
00905 
00906     def comment(self, content):
00907         try:
00908             self.data.annotations['comment'] += "\n" + "\n".join(content)
00909         except KeyError:
00910             self.data.annotations['comment'] = "\n".join(content)
00911 
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.

        The line's own content is ignored; it just marks the start of the
        feature table.
        """
        self.start_feature_table()
00916 
00917     def start_feature_table(self):
00918         """Indicate we've got to the start of the feature table.
00919         """
00920         # make sure we've added on our last reference object
00921         if self._cur_reference is not None:
00922             self.data.annotations['references'].append(self._cur_reference)
00923             self._cur_reference = None
00924 
00925     def feature_key(self, content):
00926         # start a new feature
00927         self._cur_feature = SeqFeature.SeqFeature()
00928         self._cur_feature.type = content
00929         self.data.features.append(self._cur_feature)
00930 
    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.

        Sets self._cur_feature.location (and, for compound locations, its
        sub_features); an unparseable location is stored as None with a
        warning rather than raising.
        """
        # clean up newlines and other whitespace inside the location before
        # parsing - locations should have no whitespace whatsoever
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if location_line.find('replace') != -1:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        cur_feature = self._cur_feature

        #Handle top level complement here for speed
        if location_line.startswith("complement("):
            assert location_line.endswith(")")
            location_line = location_line[11:-1]
            strand = -1
        elif self._seq_type.find("DNA") >= 0 \
        or self._seq_type.find("RNA") >= 0:
            #Nucleotide
            strand = 1
        else:
            #Protein
            strand = None

        #Special case handling of the most common cases for speed
        if _re_simple_location.match(location_line):
            #e.g. "123..456"
            #GenBank is one-based inclusive; FeatureLocation is zero-based
            #half-open, hence int(s)-1 for the start below.
            s, e = location_line.split("..")
            cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
                                                              int(e),
                                                              strand)
            return

        if _re_simple_compound.match(location_line):
            #e.g. join(<123..456,480..>500)
            i = location_line.find("(")
            cur_feature.location_operator = location_line[:i]
            #we can split on the comma because these are simple locations
            for part in location_line[i+1:-1].split(","):
                s, e = part.split("..")
                f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
                                                                     int(e),
                                                                     strand),
                        location_operator=cur_feature.location_operator,
                        type=cur_feature.type)
                cur_feature.sub_features.append(f)
            #Parent location spans from the first sub-feature's start to
            #the last sub-feature's end.
            s = cur_feature.sub_features[0].location.start
            e = cur_feature.sub_features[-1].location.end
            cur_feature.location = SeqFeature.FeatureLocation(s,e, strand)
            return

        #Handle the general case with more complex regular expressions
        if _re_complex_location.match(location_line):
            #e.g. "AL121804.2:41..610"
            if ":" in location_line:
                location_ref, location_line = location_line.split(":")
                cur_feature.location = _loc(location_line, self._expected_size, strand)
                cur_feature.location.ref = location_ref
            else:
                cur_feature.location = _loc(location_line, self._expected_size, strand)
            return

        if _re_complex_compound.match(location_line):
            i = location_line.find("(")
            cur_feature.location_operator = location_line[:i]
            #Can't split on the comma because of positions like one-of(1,2,3)
            for part in _split_compound_loc(location_line[i+1:-1]):
                if part.startswith("complement("):
                    assert part[-1]==")"
                    part = part[11:-1]
                    assert strand != -1, "Double complement?"
                    part_strand = -1
                else:
                    part_strand = strand
                if ":" in part:
                    ref, part = part.split(":")
                else:
                    ref = None
                try:
                    loc = _loc(part, self._expected_size, part_strand)
                except ValueError, err:
                    print location_line
                    print part
                    raise err
                f = SeqFeature.SeqFeature(location=loc, ref=ref,
                        location_operator=cur_feature.location_operator,
                        type=cur_feature.type)
                cur_feature.sub_features.append(f)
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            strands = set(sf.strand for sf in cur_feature.sub_features)
            if len(strands)==1:
                strand = cur_feature.sub_features[0].strand
            else:
                strand = None # i.e. mixed strands
            s = cur_feature.sub_features[0].location.start
            e = cur_feature.sub_features[-1].location.end
            cur_feature.location = SeqFeature.FeatureLocation(s, e, strand)
            return
        #Not recognised
        if "order" in location_line and "join" in location_line:
            #See Bug 3197
            msg = 'Combinations of "join" and "order" within the same ' + \
                  'location (nested operators) are illegal:\n' + location_line
            raise LocationParserError(msg)
        #This used to be an error....
        cur_feature.location = None
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r" \
                                             % (location_line)))
01056 
01057 
01058     def feature_qualifier(self, key, value):
01059         """When we get a qualifier key and its value.
01060         
01061         Can receive None, since you can have valueless keys such as /pseudo
01062         """
01063         # Hack to try to preserve historical behaviour of /pseudo etc
01064         if value is None:
01065             if key not in self._cur_feature.qualifiers:
01066                 self._cur_feature.qualifiers[key] = [""]
01067                 return
01068             
01069         value = value.replace('"', '')
01070         if self._feature_cleaner is not None:
01071             value = self._feature_cleaner.clean_value(key, value)
01072 
01073         # if the qualifier name exists, append the value
01074         if key in self._cur_feature.qualifiers:
01075             self._cur_feature.qualifiers[key].append(value)
01076         # otherwise start a new list of the key with its values
01077         else:
01078             self._cur_feature.qualifiers[key] = [value]
01079        
    def feature_qualifier_name(self, content_list):
        """Use feature_qualifier instead (OBSOLETE).

        Retained so that old callers fail loudly with a clear message.
        """
        raise NotImplementedError("Use the feature_qualifier method instead.")
01083 
    def feature_qualifier_description(self, content):
        """Use feature_qualifier instead (OBSOLETE).

        Retained so that old callers fail loudly with a clear message.
        """
        raise NotImplementedError("Use the feature_qualifier method instead.")
01087 
01088     def contig_location(self, content):
01089         """Deal with CONTIG information."""
01090         #Historically this was stored as a SeqFeature object, but it was
01091         #stored under record.annotations["contig"] and not under
01092         #record.features with the other SeqFeature objects.
01093         #
01094         #The CONTIG location line can include additional tokens like
01095         #Gap(), Gap(100) or Gap(unk100) which are not used in the feature
01096         #location lines, so storing it using SeqFeature based location
01097         #objects is difficult.
01098         #
01099         #We now store this a string, which means for BioSQL we are now in
01100         #much better agreement with how BioPerl records the CONTIG line
01101         #in the database.
01102         #
01103         #NOTE - This code assumes the scanner will return all the CONTIG
01104         #lines already combined into one long string!
01105         self.data.annotations["contig"] = content
01106 
    def origin_name(self, content):
        # Any free text on the ORIGIN line is deliberately discarded.
        pass
01109 
    def base_count(self, content):
        # The BASE COUNT line is deliberately ignored for SeqRecord output.
        pass
01112 
    def base_number(self, content):
        # Per-line sequence position numbers are deliberately ignored.
        pass
01115 
01116     def sequence(self, content):
01117         """Add up sequence information as we get it.
01118 
01119         To try and make things speedier, this puts all of the strings
01120         into a list of strings, and then uses string.join later to put
01121         them together. Supposedly, this is a big time savings
01122         """
01123         assert ' ' not in content
01124         self._seq_data.append(content.upper())
01125 
01126     def record_end(self, content):
01127         """Clean up when we've finished the record.
01128         """
01129         from Bio import Alphabet
01130         from Bio.Alphabet import IUPAC
01131         from Bio.Seq import Seq, UnknownSeq
01132 
01133         #Try and append the version number to the accession for the full id
01134         if self.data.id is None:
01135             assert 'accessions' not in self.data.annotations, \
01136                    self.data.annotations['accessions']
01137             self.data.id = self.data.name #Good fall back?
01138         elif self.data.id.count('.') == 0:
01139             try:
01140                 self.data.id+='.%i' % self.data.annotations['sequence_version']
01141             except KeyError:
01142                 pass
01143         
01144         # add the sequence information
01145         # first, determine the alphabet
01146         # we default to an generic alphabet if we don't have a
01147         # seq type or have strange sequence information.
01148         seq_alphabet = Alphabet.generic_alphabet
01149 
01150         # now set the sequence
01151         sequence = "".join(self._seq_data)
01152 
01153         if self._expected_size is not None \
01154         and len(sequence) != 0 \
01155         and self._expected_size != len(sequence):
01156             import warnings
01157             from Bio import BiopythonParserWarning
01158             warnings.warn("Expected sequence length %i, found %i (%s)." \
01159                           % (self._expected_size, len(sequence), self.data.id),
01160                           BiopythonParserWarning)
01161 
01162         if self._seq_type:
01163             # mRNA is really also DNA, since it is actually cDNA
01164             if self._seq_type.find('DNA') != -1 or \
01165                self._seq_type.find('mRNA') != -1:
01166                 seq_alphabet = IUPAC.ambiguous_dna
01167             # are there ever really RNA sequences in GenBank?
01168             elif self._seq_type.find('RNA') != -1:
01169                 #Even for data which was from RNA, the sequence string
01170                 #is usually given as DNA (T not U).  Bug 2408
01171                 if "T" in sequence and "U" not in sequence:
01172                     seq_alphabet = IUPAC.ambiguous_dna
01173                 else:
01174                     seq_alphabet = IUPAC.ambiguous_rna
01175             elif self._seq_type.upper().find('PROTEIN') != -1:
01176                 seq_alphabet = IUPAC.protein  # or extended protein?
01177             # work around ugly GenBank records which have circular or
01178             # linear but no indication of sequence type
01179             elif self._seq_type in ["circular", "linear", "unspecified"]:
01180                 pass
01181             # we have a bug if we get here
01182             else:
01183                 raise ValueError("Could not determine alphabet for seq_type %s"
01184                                  % self._seq_type)
01185 
01186         if not sequence and self.__expected_size:
01187             self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
01188         else:
01189             self.data.seq = Seq(sequence, seq_alphabet)
01190 
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information (PRIVATE).

    This consumer stays close to the raw flat file layout, storing most
    line content verbatim on a Bio.GenBank Record object rather than
    parsing it into SeqRecord/SeqFeature objects.
    """
    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        import Record
        self.data = Record.Record()

        # ORIGIN sequence chunks, joined into one string at record_end
        self._seq_data = []
        # the reference, feature and qualifier currently being built
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None

    def wgs(self, content):
        # WGS line content, split on the dash separator
        self.data.wgs = content.split('-')

    def add_wgs_scafld(self, content):
        # each WGS_SCAFLD call appends one dash-split list
        self.data.wgs_scafld.append(content.split('-'))

    def locus(self, content):
        # locus name from the LOCUS line, stored verbatim
        self.data.locus = content

    def size(self, content):
        # sequence length from the LOCUS line, kept as a string
        self.data.size = content

    def residue_type(self, content):
        # residue type / molecule description, stored verbatim
        self.data.residue_type = content

    def data_file_division(self, content):
        # database division code, stored verbatim
        self.data.data_file_division = content

    def date(self, content):
        # date from the LOCUS line, stored verbatim
        self.data.date = content

    def definition(self, content):
        # DEFINITION line text, stored verbatim
        self.data.definition = content

    def accession(self, content):
        # an ACCESSION line may list several accessions; store each only
        # once, preserving the order of first appearance
        for acc in self._split_accessions(content):
            if acc not in self.data.accession:
                self.data.accession.append(acc)

    def nid(self, content):
        # NID (nucleotide identifier) line, stored verbatim
        self.data.nid = content

    def pid(self, content):
        # PID line, stored verbatim
        self.data.pid = content

    def version(self, content):
        # VERSION line content, stored verbatim (not split into parts)
        self.data.version = content

    def db_source(self, content):
        # DBSOURCE line with trailing whitespace stripped
        self.data.db_source = content.rstrip()

    def gi(self, content):
        # GI number, stored verbatim
        self.data.gi = content

    def keywords(self, content):
        # KEYWORDS line split into a list by the shared helper
        self.data.keywords = self._split_keywords(content)

    def project(self, content):
        # PROJECT line may hold several whitespace separated identifiers
        self.data.projects.extend([p for p in content.split() if p])

    def dblink(self, content):
        # one DBLINK cross reference per call, stored verbatim
        self.data.dblinks.append(content)

    def segment(self, content):
        # SEGMENT line content, stored verbatim
        self.data.segment = content

    def source(self, content):
        # SOURCE line content, stored verbatim
        self.data.source = content

    def organism(self, content):
        # ORGANISM line content, stored verbatim
        self.data.organism = content

    def taxonomy(self, content):
        # taxonomy lineage split into a list by the shared helper
        self.data.taxonomy = self._split_taxonomy(content)

    def reference_num(self, content):
        """Grab the reference number and signal the start of a new reference.
        """
        # check if we have a reference to add
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

        import Record
        self._cur_reference = Record.Reference()
        self._cur_reference.number = content

    def reference_bases(self, content):
        # e.g. "(bases 1 to 86436)" -- stored verbatim here, unlike the
        # SeqRecord based consumer which parses it into locations
        self._cur_reference.bases = content

    def authors(self, content):
        self._cur_reference.authors = content

    def consrtm(self, content):
        self._cur_reference.consrtm = content

    def title(self, content):
        # a TITLE line before any REFERENCE line is ignored with a warning
        if self._cur_reference is None:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("GenBank TITLE line without REFERENCE line.",
                          BiopythonParserWarning)
            return
        self._cur_reference.title = content

    def journal(self, content):
        self._cur_reference.journal = content

    def medline_id(self, content):
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        self._cur_reference.remark = content

    def comment(self, content):
        # content is a list of lines; successive blocks are appended with
        # no extra separator beyond the joined newlines
        self.data.comment += "\n".join(content)

    def primary_ref_line(self,content):
        """Data for the PRIMARY line"""
        self.data.primary.append(content)

    def primary(self,content):
        # the PRIMARY header line itself carries no data we keep
        pass

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # we need to add on the last reference
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature.
        """
        # first add on feature information if we've got any
        self._add_feature()

        import Record
        self._cur_feature = Record.Feature()
        self._cur_feature.key = content

    def _add_feature(self):
        """Utility function to add a feature to the Record.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature is not None:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = None
            self.data.features.append(self._cur_feature)

    def location(self, content):
        # stored as a cleaned-up string, not parsed into location objects
        self._cur_feature.location = self._clean_location(content)

    def feature_qualifier(self, key, value):
        # delegate to the older two-step callbacks this consumer still uses
        self.feature_qualifier_name([key])
        if value is not None:
            self.feature_qualifier_description(value)

    def feature_qualifier_name(self, content_list):
        """Deal with qualifier names

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file)
        """
        import Record
        for content in content_list:
            # the record parser keeps the /s -- add them if we don't have 'em
            if content.find("/") != 0:
                content = "/%s" % content
            # add on a qualifier if we've got one
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = Record.Qualifier()
            self._cur_qualifier.key = content

    def feature_qualifier_description(self, content):
        # if we have info then the qualifier key should have a ='s
        if self._cur_qualifier.key.find("=") == -1:
            self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
        cur_content = self._remove_newlines(content)
        # remove all spaces from the value if it is a type where spaces
        # are not important
        for remove_space_key in self.__class__.remove_space_keys:
            if self._cur_qualifier.key.find(remove_space_key) >= 0:
                cur_content = self._remove_spaces(cur_content)
        self._cur_qualifier.value = self._normalize_spaces(cur_content)

    def base_count(self, content):
        # BASE COUNT line, stored verbatim
        self.data.base_counts = content

    def origin_name(self, content):
        # free text from the ORIGIN line, stored verbatim
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        self.data.contig = self._clean_location(content)

    def sequence(self, content):
        """Add sequence information to a list of sequence strings.

        This removes spaces in the data and uppercases the sequence, and
        then adds it to a list of sequences. Later on we'll join this
        list together to make the final sequence. This is faster than
        adding on the new string every time.
        """
        assert ' ' not in content
        self._seq_data.append(content.upper())

    def record_end(self, content):
        """Signal the end of the record and do any necessary clean-up.
        """
        # add together all of the sequence parts to create the
        # final sequence string
        self.data.sequence = "".join(self._seq_data)
        # add on the last feature
        self._add_feature()
01427 
01428 
def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> handle = open("GenBank/NC_000932.gb")
    >>> for record in GenBank.parse(handle):
    ...     print record.accession
    ['NC_000932']
    >>> handle.close()

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.
    """
    record_parser = RecordParser()
    return iter(Iterator(handle, record_parser))
01443 
def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> handle = open("GenBank/NC_000932.gb")
    >>> record = GenBank.read(handle)
    >>> print record.accession
    ['NC_000932']
    >>> handle.close()

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.

    Raises ValueError if the handle holds no records, or more than one.
    """
    records = parse(handle)
    try:
        first = records.next()
    except StopIteration:
        first = None
    if first is None:
        raise ValueError("No records found in handle")
    try:
        second = records.next()
    except StopIteration:
        second = None
    if second is not None:
        raise ValueError("More than one record found in handle")
    return first
01471 
01472 def _test():
01473     """Run the Bio.GenBank module's doctests."""
01474     import doctest
01475     import os
01476     if os.path.isdir(os.path.join("..","..","Tests")):
01477         print "Runing doctests..."
01478         cur_dir = os.path.abspath(os.curdir)
01479         os.chdir(os.path.join("..","..","Tests"))
01480         doctest.testmod()
01481         os.chdir(cur_dir)
01482         del cur_dir
01483         print "Done"
01484     elif os.path.isdir(os.path.join("Tests")):
01485         print "Runing doctests..."
01486         cur_dir = os.path.abspath(os.curdir)
01487         os.chdir(os.path.join("Tests"))
01488         doctest.testmod()
01489         os.chdir(cur_dir)
01490         del cur_dir
01491         print "Done"
01492 
if __name__ == "__main__":
    # Run the doctest based self-tests when executed as a script.
    _test()