Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2007 by Michiel de Hoon.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """
00006 This module provides code to work with the sprotXX.dat file from
00007 SwissProt.
00008 http://www.expasy.ch/sprot/sprot-top.html
00009 
00010 Tested with:
00011 Release 56.9, 03-March-2009.
00012 
00013 
00014 Classes:
00015 Record             Holds SwissProt data.
00016 Reference          Holds reference data from a SwissProt record.
00017 
00018 Functions:
00019 read               Read one SwissProt record
00020 parse              Read multiple SwissProt records
00021 
00022 """
00023 
00024 from Bio._py3k import _as_string
00025 
00026 class Record(object):
00027     """Holds information from a SwissProt record.
00028 
00029     Members:
00030     entry_name        Name of this entry, e.g. RL1_ECOLI.
00031     data_class        Either 'STANDARD' or 'PRELIMINARY'.
00032     molecule_type     Type of molecule, 'PRT',
00033     sequence_length   Number of residues.
00034 
00035     accessions        List of the accession numbers, e.g. ['P00321']
00036     created           A tuple of (date, release).
00037     sequence_update   A tuple of (date, release).
00038     annotation_update A tuple of (date, release).
00039 
00040     description       Free-format description.
00041     gene_name         Gene name.  See userman.txt for description.
00042     organism          The source of the sequence.
00043     organelle         The origin of the sequence.
00044     organism_classification  The taxonomy classification.  List of strings.
00045                              (http://www.ncbi.nlm.nih.gov/Taxonomy/)
00046     taxonomy_id       A list of NCBI taxonomy id's.
00047     host_organism     A list of names of the hosts of a virus, if any.
00048     host_taxonomy_id  A list of NCBI taxonomy id's of the hosts, if any.
00049     references        List of Reference objects.
00050     comments          List of strings.
00051     cross_references  List of tuples (db, id1[, id2][, id3]).  See the docs.
00052     keywords          List of the keywords.
00053     features          List of tuples (key name, from, to, description).
00054                       from and to can be either integers for the residue
00055                       numbers, '<', '>', or '?'
00056 
00057     seqinfo           tuple of (length, molecular weight, CRC32 value)
00058     sequence          The sequence.
00059     
00060     """
00061     def __init__(self):
00062         self.entry_name = None
00063         self.data_class = None
00064         self.molecule_type = None
00065         self.sequence_length = None
00066         
00067         self.accessions = []
00068         self.created = None
00069         self.sequence_update = None
00070         self.annotation_update = None
00071         
00072         self.description = []
00073         self.gene_name = ''
00074         self.organism = []
00075         self.organelle = ''
00076         self.organism_classification = []
00077         self.taxonomy_id = []
00078         self.host_organism = []
00079         self.host_taxonomy_id = []
00080         self.references = []
00081         self.comments = []
00082         self.cross_references = []
00083         self.keywords = []
00084         self.features = []
00085         
00086         self.seqinfo = None
00087         self.sequence = ''
00088 
00089 
00090 class Reference(object):
00091     """Holds information from one reference in a SwissProt entry.
00092 
00093     Members:
00094     number      Number of reference in an entry.
00095     positions   Describes extent of work.  list of strings.
00096     comments    Comments.  List of (token, text).
00097     references  References.  List of (dbname, identifier)
00098     authors     The authors of the work.
00099     title       Title of the work.
00100     location    A citation for the work.
00101     
00102     """
00103     def __init__(self):
00104         self.number = None
00105         self.positions = []
00106         self.comments = []
00107         self.references = []
00108         self.authors = []
00109         self.title = []
00110         self.location = []
00111 
00112 
00113 def parse(handle):
00114     while True:
00115         record = _read(handle)
00116         if not record:
00117             return
00118         yield record
00119 
00120 
00121 def read(handle):
00122     record = _read(handle)
00123     if not record:
00124         raise ValueError("No SwissProt record found")
00125     # We should have reached the end of the record by now
00126     remainder = handle.read()
00127     if remainder:
00128         raise ValueError("More than one SwissProt record found")
00129     return record
00130 
00131  
00132 # Everything below is considered private
00133 
00134 
00135 def _read(handle):
00136     record = None
00137     unread = ""
00138     for line in handle:
00139         #This is for Python 3 to cope with a binary handle (byte strings),
00140         #or a text handle (unicode strings):
00141         line = _as_string(line)
00142         key, value = line[:2], line[5:].rstrip()
00143         if unread:
00144             value = unread + " " + value
00145             unread = ""
00146         if key=='**':
00147             #See Bug 2353, some files from the EBI have extra lines
00148             #starting "**" (two asterisks/stars).  They appear
00149             #to be unofficial automated annotations. e.g.
00150             #**
00151             #**   #################    INTERNAL SECTION    ##################
00152             #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
00153             pass
00154         elif key=='ID':
00155             record = Record()
00156             _read_id(record, line)
00157             _sequence_lines = []
00158         elif key=='AC':
00159             accessions = [word for word in value.rstrip(";").split("; ")]
00160             record.accessions.extend(accessions)
00161         elif key=='DT':
00162             _read_dt(record, line)
00163         elif key=='DE':
00164             record.description.append(value.strip())
00165         elif key=='GN':
00166             if record.gene_name:
00167                 record.gene_name += " "
00168             record.gene_name += value
00169         elif key=='OS':
00170             record.organism.append(value)
00171         elif key=='OG':
00172             record.organelle += line[5:]
00173         elif key=='OC':
00174             cols = [col for col in value.rstrip(";.").split("; ")]
00175             record.organism_classification.extend(cols)
00176         elif key=='OX':
00177             _read_ox(record, line)
00178         elif key=='OH':
00179             _read_oh(record, line)
00180         elif key=='RN':
00181             reference = Reference()
00182             _read_rn(reference, value)
00183             record.references.append(reference)
00184         elif key=='RP':
00185             assert record.references, "RP: missing RN"
00186             record.references[-1].positions.append(value)
00187         elif key=='RC':
00188             assert record.references, "RC: missing RN"
00189             reference = record.references[-1]
00190             unread = _read_rc(reference, value)
00191         elif key=='RX':
00192             assert record.references, "RX: missing RN"
00193             reference = record.references[-1]
00194             _read_rx(reference, value)
00195         elif key=='RL':
00196             assert record.references, "RL: missing RN"
00197             reference = record.references[-1]
00198             reference.location.append(value)
00199         # In UniProt release 1.12 of 6/21/04, there is a new RG
00200         # (Reference Group) line, which references a group instead of
00201         # an author.  Each block must have at least 1 RA or RG line.
00202         elif key=='RA':
00203             assert record.references, "RA: missing RN"
00204             reference = record.references[-1]
00205             reference.authors.append(value)
00206         elif key=='RG':
00207             assert record.references, "RG: missing RN"
00208             reference = record.references[-1]
00209             reference.authors.append(value)
00210         elif key=="RT":
00211             assert record.references, "RT: missing RN"
00212             reference = record.references[-1]
00213             reference.title.append(value)
00214         elif key=='CC':
00215             _read_cc(record, line)
00216         elif key=='DR':
00217             _read_dr(record, value)
00218         elif key=='PE':
00219             #TODO - Record this information?
00220             pass
00221         elif key=='KW':
00222             cols = value.rstrip(";.").split('; ')
00223             record.keywords.extend(cols)
00224         elif key=='FT':
00225             _read_ft(record, line)
00226         elif key=='SQ':
00227             cols = value.split()
00228             assert len(cols) == 7, "I don't understand SQ line %s" % line
00229             # Do more checking here?
00230             record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
00231         elif key=='  ':
00232             _sequence_lines.append(value.replace(" ", "").rstrip())
00233         elif key=='//':
00234             # Join multiline data into one string
00235             record.description = " ".join(record.description)
00236             record.organism = " ".join(record.organism)
00237             record.organelle   = record.organelle.rstrip()
00238             for reference in record.references:
00239                 reference.authors = " ".join(reference.authors).rstrip(";")
00240                 reference.title = " ".join(reference.title).rstrip(";")
00241                 if reference.title.startswith('"') and reference.title.endswith('"'):
00242                     reference.title = reference.title[1:-1] #remove quotes
00243                 reference.location = " ".join(reference.location)
00244             record.sequence = "".join(_sequence_lines)
00245             return record
00246         else:
00247             raise ValueError("Unknown keyword '%s' found" % key)
00248     if record:
00249         raise ValueError("Unexpected end of stream.")
00250 
00251 
00252 def _read_id(record, line):
00253     cols = line[5:].split()
00254     #Prior to release 51, included with MoleculeType:
00255     #ID   EntryName DataClass; MoleculeType; SequenceLength AA.
00256     #
00257     #Newer files lack the MoleculeType:
00258     #ID   EntryName DataClass; SequenceLength AA.
00259     if len(cols) == 5:
00260         record.entry_name = cols[0]
00261         record.data_class = cols[1].rstrip(";")
00262         record.molecule_type = cols[2].rstrip(";")
00263         record.sequence_length = int(cols[3])
00264     elif len(cols) == 4:
00265         record.entry_name = cols[0]
00266         record.data_class = cols[1].rstrip(";")
00267         record.molecule_type = None
00268         record.sequence_length = int(cols[2])
00269     else:
00270         raise ValueError("ID line has unrecognised format:\n"+line)
00271     # check if the data class is one of the allowed values
00272     allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed')
00273     if record.data_class not in allowed:
00274         raise ValueError("Unrecognized data class %s in line\n%s" % \
00275               (record.data_class, line))
00276     # molecule_type should be 'PRT' for PRoTein
00277     # Note that has been removed in recent releases (set to None)
00278     if record.molecule_type not in (None, 'PRT'):
00279         raise ValueError("Unrecognized molecule type %s in line\n%s" % \
00280               (record.molecule_type, line))
00281 
00282 
00283 def _read_dt(record, line):
00284     value = line[5:]
00285     uprline = value.upper()
00286     cols = value.rstrip().split()
00287     if 'CREATED' in uprline \
00288     or 'LAST SEQUENCE UPDATE' in uprline \
00289     or 'LAST ANNOTATION UPDATE' in uprline:
00290         # Old style DT line
00291         # =================
00292         # e.g.
00293         # DT   01-FEB-1995 (Rel. 31, Created)
00294         # DT   01-FEB-1995 (Rel. 31, Last sequence update)
00295         # DT   01-OCT-2000 (Rel. 40, Last annotation update)
00296         #
00297         # or:
00298         # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
00299         # ...
00300 
00301         # find where the version information will be located
00302         # This is needed for when you have cases like IPI where
00303         # the release verison is in a different spot:
00304         # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
00305         uprcols = uprline.split()
00306         rel_index = -1
00307         for index in range(len(uprcols)):
00308             if uprcols[index].find("REL.") >= 0:
00309                 rel_index = index
00310         assert rel_index >= 0, \
00311                 "Could not find Rel. in DT line: %s" % line
00312         version_index = rel_index + 1
00313         # get the version information
00314         str_version = cols[version_index].rstrip(",")
00315         # no version number
00316         if str_version == '':
00317             version = 0
00318         # dot versioned
00319         elif str_version.find(".") >= 0:
00320             version = str_version
00321         # integer versioned
00322         else:
00323             version = int(str_version)
00324         date = cols[0]
00325 
00326         if 'CREATED' in uprline:
00327             record.created = date, version
00328         elif 'LAST SEQUENCE UPDATE' in uprline:
00329             record.sequence_update = date, version
00330         elif 'LAST ANNOTATION UPDATE' in uprline:
00331             record.annotation_update = date, version
00332         else:
00333             assert False, "Shouldn't reach this line!"
00334     elif 'INTEGRATED INTO' in uprline \
00335     or 'SEQUENCE VERSION' in uprline \
00336     or 'ENTRY VERSION' in uprline:
00337         # New style DT line
00338         # =================
00339         # As of UniProt Knowledgebase release 7.0 (including
00340         # Swiss-Prot release 49.0 and TrEMBL release 32.0) the
00341         # format of the DT lines and the version information
00342         # in them was changed - the release number was dropped.
00343         #
00344         # For more information see bug 1948 and
00345         # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0
00346         #
00347         # e.g.
00348         # DT   01-JAN-1998, integrated into UniProtKB/Swiss-Prot.
00349         # DT   15-OCT-2001, sequence version 3.
00350         # DT   01-APR-2004, entry version 14.
00351         #
00352         #This is a new style DT line...
00353 
00354         # The date should be in string cols[1]
00355         # Get the version number if there is one.
00356         # For the three DT lines above: 0, 3, 14
00357         try:
00358             version = int(cols[-1])
00359         except ValueError:
00360             version = 0
00361         date = cols[0].rstrip(",")
00362 
00363         # Re-use the historical property names, even though
00364         # the meaning has changed slighty:
00365         if "INTEGRATED"  in uprline:
00366             record.created = date, version
00367         elif 'SEQUENCE VERSION' in uprline:
00368             record.sequence_update = date, version
00369         elif 'ENTRY VERSION' in uprline:
00370             record.annotation_update = date, version
00371         else:
00372             assert False, "Shouldn't reach this line!"
00373     else:
00374         raise ValueError("I don't understand the date line %s" % line)
00375 
00376 
00377 def _read_ox(record, line):
00378     # The OX line is in the format:
00379     # OX   DESCRIPTION=ID[, ID]...;
00380     # If there are too many id's to fit onto a line, then the ID's
00381     # continue directly onto the next line, e.g.
00382     # OX   DESCRIPTION=ID[, ID]...
00383     # OX   ID[, ID]...;
00384     # Currently, the description is always "NCBI_TaxID".
00385     # To parse this, I need to check to see whether I'm at the
00386     # first line.  If I am, grab the description and make sure
00387     # it's an NCBI ID.  Then, grab all the id's.
00388     if record.taxonomy_id:
00389         ids = line[5:].rstrip().rstrip(";")
00390     else:
00391         descr, ids = line[5:].rstrip().rstrip(";").split("=")
00392         assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
00393     record.taxonomy_id.extend(ids.split(', '))
00394 
00395 
00396 def _read_oh(record, line):
00397     # Line type OH (Organism Host) for viral hosts
00398     assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line
00399     line = line[16:].rstrip()
00400     assert line[-1]=="." and line.count(";")==1, line
00401     taxid, name = line[:-1].split(";")
00402     record.host_taxonomy_id.append(taxid.strip())
00403     record.host_organism.append(name.strip())
00404 
00405 
00406 def _read_rn(reference, rn):
00407     assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
00408     reference.number = int(rn[1:-1])
00409 
00410 
00411 def _read_rc(reference, value):
00412     cols = value.split(';')
00413     if value[-1]==';':
00414         unread = ""
00415     else:
00416         cols, unread = cols[:-1], cols[-1]
00417     for col in cols:
00418         if not col:  # last column will be the empty string
00419             return
00420         # The token is everything before the first '=' character.
00421         i = col.find("=")
00422         if i>=0:
00423             token, text = col[:i], col[i+1:]
00424             comment = token.lstrip(), text
00425             reference.comments.append(comment)
00426         else:
00427             comment = reference.comments[-1]
00428             comment = "%s %s" % (comment, col)
00429             reference.comments[-1] = comment
00430     return unread
00431 
00432 
00433 def _read_rx(reference, value):
00434     # The basic (older?) RX line is of the form:
00435     # RX   MEDLINE; 85132727.
00436     # but there are variants of this that need to be dealt with (see below)
00437 
00438     # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33
00439     # have extraneous information in the RX line.  Check for
00440     # this and chop it out of the line.
00441     # (noticed by katel@worldpath.net)
00442     value = value.replace(' [NCBI, ExPASy, Israel, Japan]','')
00443 
00444     # RX lines can also be used of the form
00445     # RX   PubMed=9603189;
00446     # reported by edvard@farmasi.uit.no
00447     # and these can be more complicated like:
00448     # RX   MEDLINE=95385798; PubMed=7656980;
00449     # RX   PubMed=15060122; DOI=10.1136/jmg 2003.012781;
00450     # We look for these cases first and deal with them
00451     warn = False
00452     if "=" in value:
00453         cols = value.split("; ")
00454         cols = [x.strip() for x in cols]
00455         cols = [x for x in cols if x]
00456         for col in cols:
00457             x = col.split("=")
00458             if len(x) != 2 or x == ("DOI", "DOI"):
00459                 warn = True
00460                 break
00461             assert len(x) == 2, "I don't understand RX line %s" % value
00462             reference.references.append((x[0], x[1].rstrip(";")))
00463     # otherwise we assume we have the type 'RX   MEDLINE; 85132727.'
00464     else:
00465         cols = value.split("; ")
00466         # normally we split into the three parts
00467         if len(cols) != 2:
00468             warn = True
00469         else:
00470             reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
00471     if warn:
00472         import warnings
00473         from Bio import BiopythonParserWarning
00474         warnings.warn("Possibly corrupt RX line %r" % value,
00475                       BiopythonParserWarning)
00476 
00477 def _read_cc(record, line):
00478     key, value = line[5:8], line[9:].rstrip()
00479     if key=='-!-':   # Make a new comment
00480         record.comments.append(value)
00481     elif key=='   ': # add to the previous comment
00482         if not record.comments:
00483             # TCMO_STRGA in Release 37 has comment with no topic
00484             record.comments.append(value)
00485         else:
00486             record.comments[-1] += " " + value
00487 
00488 
00489 def _read_dr(record, value):
00490     # Remove the comments at the end of the line
00491     i = value.find(' [')
00492     if i >= 0:
00493         value = value[:i]
00494     cols = value.rstrip(".").split('; ')
00495     record.cross_references.append(tuple(cols))
00496 
00497 
00498 def _read_ft(record, line):
00499     line = line[5:]    # get rid of junk in front
00500     name = line[0:8].rstrip()
00501     try:
00502         from_res = int(line[9:15])
00503     except ValueError:
00504         from_res = line[9:15].lstrip()
00505     try:
00506         to_res = int(line[16:22])
00507     except ValueError:
00508         to_res = line[16:22].lstrip()
00509     #if there is a feature_id (FTId), store it away
00510     if line[29:35]==r"/FTId=":
00511         ft_id = line[35:70].rstrip()[:-1]
00512         description = ""
00513     else:
00514         ft_id =""
00515         description = line[29:70].rstrip()
00516     if not name:  # is continuation of last one
00517         assert not from_res and not to_res
00518         name, from_res, to_res, old_description,old_ft_id = record.features[-1]
00519         del record.features[-1]
00520         description = ("%s %s" % (old_description, description)).strip()
00521 
00522         # special case -- VARSPLIC, reported by edvard@farmasi.uit.no
00523         if name == "VARSPLIC":
00524             # Remove unwanted spaces in sequences.
00525             # During line carryover, the sequences in VARSPLIC can get mangled
00526             # with unwanted spaces like:
00527             # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
00528             # We want to check for this case and correct it as it happens.
00529             descr_cols = description.split(" -> ")
00530             if len(descr_cols) == 2:
00531                 first_seq, second_seq = descr_cols
00532                 extra_info = ''
00533                 # we might have more information at the end of the
00534                 # second sequence, which should be in parenthesis
00535                 extra_info_pos = second_seq.find(" (")
00536                 if extra_info_pos != -1:
00537                     extra_info = second_seq[extra_info_pos:]
00538                     second_seq = second_seq[:extra_info_pos]
00539                 # now clean spaces out of the first and second string
00540                 first_seq = first_seq.replace(" ", "")
00541                 second_seq = second_seq.replace(" ", "")
00542                 # reassemble the description
00543                 description = first_seq + " -> " + second_seq + extra_info
00544     record.features.append((name, from_res, to_res, description,ft_id))
00545 
00546 
00547 if __name__ == "__main__":
00548     print "Quick self test..."
00549 
00550     example_filename = "../../Tests/SwissProt/sp008"
00551 
00552     import os
00553     if not os.path.isfile(example_filename):
00554         print "Missing test file %s" % example_filename
00555     else:
00556         #Try parsing it!
00557         
00558         handle = open(example_filename)
00559         records = parse(handle)
00560         for record in records:
00561             print record.entry_name
00562             print ",".join(record.accessions)
00563             print record.keywords
00564             print repr(record.organism)
00565             print record.sequence[:20] + "..."
00566         handle.close()