Back to index

python-biopython  1.60
Namespaces | Classes | Functions | Variables
Bio.SwissProt Namespace Reference

Namespaces

namespace  KeyWList

Classes

class  Record
class  Reference

Functions

def parse
def read
def _read
def _read_id
def _read_dt
def _read_ox
def _read_oh
def _read_rn
def _read_rc
def _read_rx
def _read_cc
def _read_dr
def _read_ft

Variables

string example_filename = "../../Tests/SwissProt/sp008"
tuple handle = open(example_filename)
tuple records = parse(handle)

Function Documentation

def Bio.SwissProt._read (   handle) [private]

Definition at line 135 of file __init__.py.

00135 
def _read(handle):
    """Read one SwissProt record from ``handle`` and return it.

    Lines are taken from ``handle`` one at a time and dispatched on their
    two-letter line code until a ``//`` terminator is reached, at which
    point the multi-line fields are joined and the Record is returned.
    If the handle runs out before any record was started, the function
    falls off the end and returns None.

    Raises ValueError for an unrecognised line code, or when the handle
    ends after a record was started but before its ``//`` line.
    """
    # Reference line codes whose value is simply appended to a list
    # attribute of the most recently started Reference.
    # In UniProt release 1.12 of 6/21/04, there is a new RG
    # (Reference Group) line, which references a group instead of
    # an author.  Each block must have at least 1 RA or RG line.
    reference_lists = {
        'RP': 'positions',
        'RL': 'location',
        'RA': 'authors',
        'RG': 'authors',
        'RT': 'title',
    }
    record = None
    carry = ""  # unfinished RC text handed over to the following line
    for line in handle:
        #This is for Python 3 to cope with a binary handle (byte strings),
        #or a text handle (unicode strings):
        line = _as_string(line)
        key = line[:2]
        value = line[5:].rstrip()
        if carry:
            value = carry + " " + value
            carry = ""
        if key == '**':
            #See Bug 2353, some files from the EBI have extra lines
            #starting "**" (two asterisks/stars).  They appear
            #to be unofficial automated annotations. e.g.
            #**
            #**   #################    INTERNAL SECTION    ##################
            #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
            continue
        if key == 'ID':
            record = Record()
            _read_id(record, line)
            sequence_chunks = []
        elif key == 'AC':
            record.accessions.extend(value.rstrip(";").split("; "))
        elif key == 'DT':
            _read_dt(record, line)
        elif key == 'DE':
            record.description.append(value.strip())
        elif key == 'GN':
            if record.gene_name:
                record.gene_name += " "
            record.gene_name += value
        elif key == 'OS':
            record.organism.append(value)
        elif key == 'OG':
            record.organelle += line[5:]
        elif key == 'OC':
            record.organism_classification.extend(
                value.rstrip(";.").split("; "))
        elif key == 'OX':
            _read_ox(record, line)
        elif key == 'OH':
            _read_oh(record, line)
        elif key == 'RN':
            reference = Reference()
            _read_rn(reference, value)
            record.references.append(reference)
        elif key in reference_lists:
            assert record.references, "%s: missing RN" % key
            target = getattr(record.references[-1], reference_lists[key])
            target.append(value)
        elif key == 'RC':
            assert record.references, "RC: missing RN"
            carry = _read_rc(record.references[-1], value)
        elif key == 'RX':
            assert record.references, "RX: missing RN"
            _read_rx(record.references[-1], value)
        elif key == 'CC':
            _read_cc(record, line)
        elif key == 'DR':
            _read_dr(record, value)
        elif key == 'PE':
            #TODO - Record this information?
            pass
        elif key == 'KW':
            record.keywords.extend(value.rstrip(";.").split('; '))
        elif key == 'FT':
            _read_ft(record, line)
        elif key == 'SQ':
            cols = value.split()
            assert len(cols) == 7, "I don't understand SQ line %s" % line
            # Do more checking here?
            record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
        elif key == '  ':
            sequence_chunks.append(value.replace(" ", "").rstrip())
        elif key == '//':
            # End of record: collapse the multi-line fields into strings.
            record.description = " ".join(record.description)
            record.organism = " ".join(record.organism)
            record.organelle = record.organelle.rstrip()
            for ref in record.references:
                ref.authors = " ".join(ref.authors).rstrip(";")
                title = " ".join(ref.title).rstrip(";")
                if title.startswith('"') and title.endswith('"'):
                    title = title[1:-1]  # remove quotes
                ref.title = title
                ref.location = " ".join(ref.location)
            record.sequence = "".join(sequence_chunks)
            return record
        else:
            raise ValueError("Unknown keyword '%s' found" % key)
    if record:
        raise ValueError("Unexpected end of stream.")
00251 

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SwissProt._read_cc (   record,
  line 
) [private]

Definition at line 477 of file __init__.py.

00477 
00478 def _read_cc(record, line):
00479     key, value = line[5:8], line[9:].rstrip()
00480     if key=='-!-':   # Make a new comment
00481         record.comments.append(value)
00482     elif key=='   ': # add to the previous comment
00483         if not record.comments:
00484             # TCMO_STRGA in Release 37 has comment with no topic
00485             record.comments.append(value)
00486         else:
00487             record.comments[-1] += " " + value
00488 

Here is the caller graph for this function:

def Bio.SwissProt._read_dr (   record,
  value 
) [private]

Definition at line 489 of file __init__.py.

00489 
00490 def _read_dr(record, value):
00491     # Remove the comments at the end of the line
00492     i = value.find(' [')
00493     if i >= 0:
00494         value = value[:i]
00495     cols = value.rstrip(".").split('; ')
00496     record.cross_references.append(tuple(cols))
00497 

Here is the caller graph for this function:

def Bio.SwissProt._read_dt (   record,
  line 
) [private]

Definition at line 283 of file __init__.py.

00283 
00284 def _read_dt(record, line):
00285     value = line[5:]
00286     uprline = value.upper()
00287     cols = value.rstrip().split()
00288     if 'CREATED' in uprline \
00289     or 'LAST SEQUENCE UPDATE' in uprline \
00290     or 'LAST ANNOTATION UPDATE' in uprline:
00291         # Old style DT line
00292         # =================
00293         # e.g.
00294         # DT   01-FEB-1995 (Rel. 31, Created)
00295         # DT   01-FEB-1995 (Rel. 31, Last sequence update)
00296         # DT   01-OCT-2000 (Rel. 40, Last annotation update)
00297         #
00298         # or:
00299         # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
00300         # ...
00301 
00302         # find where the version information will be located
00303         # This is needed for when you have cases like IPI where
00304         # the release verison is in a different spot:
00305         # DT   08-JAN-2002 (IPI Human rel. 2.3, Created)
00306         uprcols = uprline.split()
00307         rel_index = -1
00308         for index in range(len(uprcols)):
00309             if uprcols[index].find("REL.") >= 0:
00310                 rel_index = index
00311         assert rel_index >= 0, \
00312                 "Could not find Rel. in DT line: %s" % line
00313         version_index = rel_index + 1
00314         # get the version information
00315         str_version = cols[version_index].rstrip(",")
00316         # no version number
00317         if str_version == '':
00318             version = 0
00319         # dot versioned
00320         elif str_version.find(".") >= 0:
00321             version = str_version
00322         # integer versioned
00323         else:
00324             version = int(str_version)
00325         date = cols[0]
00326 
00327         if 'CREATED' in uprline:
00328             record.created = date, version
00329         elif 'LAST SEQUENCE UPDATE' in uprline:
00330             record.sequence_update = date, version
00331         elif 'LAST ANNOTATION UPDATE' in uprline:
00332             record.annotation_update = date, version
00333         else:
00334             assert False, "Shouldn't reach this line!"
00335     elif 'INTEGRATED INTO' in uprline \
00336     or 'SEQUENCE VERSION' in uprline \
00337     or 'ENTRY VERSION' in uprline:
00338         # New style DT line
00339         # =================
00340         # As of UniProt Knowledgebase release 7.0 (including
00341         # Swiss-Prot release 49.0 and TrEMBL release 32.0) the
00342         # format of the DT lines and the version information
00343         # in them was changed - the release number was dropped.
00344         #
00345         # For more information see bug 1948 and
00346         # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0
00347         #
00348         # e.g.
00349         # DT   01-JAN-1998, integrated into UniProtKB/Swiss-Prot.
00350         # DT   15-OCT-2001, sequence version 3.
00351         # DT   01-APR-2004, entry version 14.
00352         #
00353         #This is a new style DT line...
00354 
00355         # The date should be in string cols[1]
00356         # Get the version number if there is one.
00357         # For the three DT lines above: 0, 3, 14
00358         try:
00359             version = int(cols[-1])
00360         except ValueError:
00361             version = 0
00362         date = cols[0].rstrip(",")
00363 
00364         # Re-use the historical property names, even though
00365         # the meaning has changed slighty:
00366         if "INTEGRATED"  in uprline:
00367             record.created = date, version
00368         elif 'SEQUENCE VERSION' in uprline:
00369             record.sequence_update = date, version
00370         elif 'ENTRY VERSION' in uprline:
00371             record.annotation_update = date, version
00372         else:
00373             assert False, "Shouldn't reach this line!"
00374     else:
00375         raise ValueError("I don't understand the date line %s" % line)
00376 

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SwissProt._read_ft (   record,
  line 
) [private]

Definition at line 498 of file __init__.py.

00498 
00499 def _read_ft(record, line):
00500     line = line[5:]    # get rid of junk in front
00501     name = line[0:8].rstrip()
00502     try:
00503         from_res = int(line[9:15])
00504     except ValueError:
00505         from_res = line[9:15].lstrip()
00506     try:
00507         to_res = int(line[16:22])
00508     except ValueError:
00509         to_res = line[16:22].lstrip()
00510     #if there is a feature_id (FTId), store it away
00511     if line[29:35]==r"/FTId=":
00512         ft_id = line[35:70].rstrip()[:-1]
00513         description = ""
00514     else:
00515         ft_id =""
00516         description = line[29:70].rstrip()
00517     if not name:  # is continuation of last one
00518         assert not from_res and not to_res
00519         name, from_res, to_res, old_description,old_ft_id = record.features[-1]
00520         del record.features[-1]
00521         description = ("%s %s" % (old_description, description)).strip()
00522 
00523         # special case -- VARSPLIC, reported by edvard@farmasi.uit.no
00524         if name == "VARSPLIC":
00525             # Remove unwanted spaces in sequences.
00526             # During line carryover, the sequences in VARSPLIC can get mangled
00527             # with unwanted spaces like:
00528             # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
00529             # We want to check for this case and correct it as it happens.
00530             descr_cols = description.split(" -> ")
00531             if len(descr_cols) == 2:
00532                 first_seq, second_seq = descr_cols
00533                 extra_info = ''
00534                 # we might have more information at the end of the
00535                 # second sequence, which should be in parenthesis
00536                 extra_info_pos = second_seq.find(" (")
00537                 if extra_info_pos != -1:
00538                     extra_info = second_seq[extra_info_pos:]
00539                     second_seq = second_seq[:extra_info_pos]
00540                 # now clean spaces out of the first and second string
00541                 first_seq = first_seq.replace(" ", "")
00542                 second_seq = second_seq.replace(" ", "")
00543                 # reassemble the description
00544                 description = first_seq + " -> " + second_seq + extra_info
00545     record.features.append((name, from_res, to_res, description,ft_id))
00546 

Here is the caller graph for this function:

def Bio.SwissProt._read_id (   record,
  line 
) [private]

Definition at line 252 of file __init__.py.

00252 
00253 def _read_id(record, line):
00254     cols = line[5:].split()
00255     #Prior to release 51, included with MoleculeType:
00256     #ID   EntryName DataClass; MoleculeType; SequenceLength AA.
00257     #
00258     #Newer files lack the MoleculeType:
00259     #ID   EntryName DataClass; SequenceLength AA.
00260     if len(cols) == 5:
00261         record.entry_name = cols[0]
00262         record.data_class = cols[1].rstrip(";")
00263         record.molecule_type = cols[2].rstrip(";")
00264         record.sequence_length = int(cols[3])
00265     elif len(cols) == 4:
00266         record.entry_name = cols[0]
00267         record.data_class = cols[1].rstrip(";")
00268         record.molecule_type = None
00269         record.sequence_length = int(cols[2])
00270     else:
00271         raise ValueError("ID line has unrecognised format:\n"+line)
00272     # check if the data class is one of the allowed values
00273     allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed')
00274     if record.data_class not in allowed:
00275         raise ValueError("Unrecognized data class %s in line\n%s" % \
00276               (record.data_class, line))
00277     # molecule_type should be 'PRT' for PRoTein
00278     # Note that has been removed in recent releases (set to None)
00279     if record.molecule_type not in (None, 'PRT'):
00280         raise ValueError("Unrecognized molecule type %s in line\n%s" % \
00281               (record.molecule_type, line))
00282 

Here is the caller graph for this function:

def Bio.SwissProt._read_oh (   record,
  line 
) [private]

Definition at line 396 of file __init__.py.

00396 
00397 def _read_oh(record, line):
00398     # Line type OH (Organism Host) for viral hosts
00399     assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line
00400     line = line[16:].rstrip()
00401     assert line[-1]=="." and line.count(";")==1, line
00402     taxid, name = line[:-1].split(";")
00403     record.host_taxonomy_id.append(taxid.strip())
00404     record.host_organism.append(name.strip())
00405 

Here is the caller graph for this function:

def Bio.SwissProt._read_ox (   record,
  line 
) [private]

Definition at line 377 of file __init__.py.

00377 
00378 def _read_ox(record, line):
00379     # The OX line is in the format:
00380     # OX   DESCRIPTION=ID[, ID]...;
00381     # If there are too many id's to fit onto a line, then the ID's
00382     # continue directly onto the next line, e.g.
00383     # OX   DESCRIPTION=ID[, ID]...
00384     # OX   ID[, ID]...;
00385     # Currently, the description is always "NCBI_TaxID".
00386     # To parse this, I need to check to see whether I'm at the
00387     # first line.  If I am, grab the description and make sure
00388     # it's an NCBI ID.  Then, grab all the id's.
00389     if record.taxonomy_id:
00390         ids = line[5:].rstrip().rstrip(";")
00391     else:
00392         descr, ids = line[5:].rstrip().rstrip(";").split("=")
00393         assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
00394     record.taxonomy_id.extend(ids.split(', '))
00395 

Here is the caller graph for this function:

def Bio.SwissProt._read_rc (   reference,
  value 
) [private]

Definition at line 411 of file __init__.py.

00411 
00412 def _read_rc(reference, value):
00413     cols = value.split(';')
00414     if value[-1]==';':
00415         unread = ""
00416     else:
00417         cols, unread = cols[:-1], cols[-1]
00418     for col in cols:
00419         if not col:  # last column will be the empty string
00420             return
00421         # The token is everything before the first '=' character.
00422         i = col.find("=")
00423         if i>=0:
00424             token, text = col[:i], col[i+1:]
00425             comment = token.lstrip(), text
00426             reference.comments.append(comment)
00427         else:
00428             comment = reference.comments[-1]
00429             comment = "%s %s" % (comment, col)
00430             reference.comments[-1] = comment
00431     return unread
00432 

Here is the caller graph for this function:

def Bio.SwissProt._read_rn (   reference,
  rn 
) [private]

Definition at line 406 of file __init__.py.

00406 
00407 def _read_rn(reference, rn):
00408     assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
00409     reference.number = int(rn[1:-1])
00410 

Here is the caller graph for this function:

def Bio.SwissProt._read_rx (   reference,
  value 
) [private]

Definition at line 433 of file __init__.py.

00433 
00434 def _read_rx(reference, value):
00435     # The basic (older?) RX line is of the form:
00436     # RX   MEDLINE; 85132727.
00437     # but there are variants of this that need to be dealt with (see below)
00438 
00439     # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33
00440     # have extraneous information in the RX line.  Check for
00441     # this and chop it out of the line.
00442     # (noticed by katel@worldpath.net)
00443     value = value.replace(' [NCBI, ExPASy, Israel, Japan]','')
00444 
00445     # RX lines can also be used of the form
00446     # RX   PubMed=9603189;
00447     # reported by edvard@farmasi.uit.no
00448     # and these can be more complicated like:
00449     # RX   MEDLINE=95385798; PubMed=7656980;
00450     # RX   PubMed=15060122; DOI=10.1136/jmg 2003.012781;
00451     # We look for these cases first and deal with them
00452     warn = False
00453     if "=" in value:
00454         cols = value.split("; ")
00455         cols = [x.strip() for x in cols]
00456         cols = [x for x in cols if x]
00457         for col in cols:
00458             x = col.split("=")
00459             if len(x) != 2 or x == ("DOI", "DOI"):
00460                 warn = True
00461                 break
00462             assert len(x) == 2, "I don't understand RX line %s" % value
00463             reference.references.append((x[0], x[1].rstrip(";")))
00464     # otherwise we assume we have the type 'RX   MEDLINE; 85132727.'
00465     else:
00466         cols = value.split("; ")
00467         # normally we split into the three parts
00468         if len(cols) != 2:
00469             warn = True
00470         else:
00471             reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
00472     if warn:
00473         import warnings
00474         from Bio import BiopythonParserWarning
00475         warnings.warn("Possibly corrupt RX line %r" % value,
00476                       BiopythonParserWarning)

Here is the caller graph for this function:

def Bio.SwissProt.parse (   handle)

Definition at line 113 of file __init__.py.

00113 
def parse(handle):
    """Iterate over the SwissProt records in ``handle``.

    This is a generator: it yields the result of each ``_read(handle)``
    call and stops at the first call that returns a false value.
    """
    record = _read(handle)
    while record:
        yield record
        record = _read(handle)
00120 

Here is the call graph for this function:

def Bio.SwissProt.read (   handle)

Definition at line 121 of file __init__.py.

00121 
def read(handle):
    """Read exactly one SwissProt record from ``handle`` and return it.

    Raises ValueError if no record is found, or if anything is left in
    ``handle`` after the first record.
    """
    record = _read(handle)
    if record:
        # We should have reached the end of the record by now, so the
        # rest of the handle must be empty.
        if handle.read():
            raise ValueError("More than one SwissProt record found")
        return record
    raise ValueError("No SwissProt record found")
00131 
00132  
00133 # Everything below is considered private
00134 

Here is the call graph for this function:


Variable Documentation

string Bio.SwissProt.example_filename = "../../Tests/SwissProt/sp008"

Definition at line 550 of file __init__.py.

tuple Bio.SwissProt.handle = open(example_filename)

Definition at line 558 of file __init__.py.

tuple Bio.SwissProt.records = parse(handle)

Definition at line 559 of file __init__.py.