Back to index

python-biopython  1.60
KeyWList.py
Go to the documentation of this file.
00001 # Copyright 1999 by Jeffrey Chang.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Code to parse the keywlist.txt file from SwissProt/UniProt
00007 
00008 See:
00009 http://www.expasy.ch/sprot/sprot-top.html
00010 ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt
00011 
00012 Classes:
00013 Record            Stores the information about one keyword or one category
00014                   in the keywlist.txt file.
00015 
00016 Functions:
00017 parse             Parses the keywlist.txt file and returns an iterator to
00018                   the records it contains.
00019 """
00020 
00021 
class Record(dict):
    """Store one keyword or category entry of keywlist.txt as a dictionary.

    The keys of the dictionary are the two-letter line codes that can
    appear in the keywlist.txt file:

    ---------  ---------------------------     ----------------------
    Line code  Content                         Occurrence in an entry
    ---------  ---------------------------     ----------------------
    ID         Identifier (keyword)            Once; starts a keyword entry
    IC         Identifier (category)           Once; starts a category entry
    AC         Accession (KW-xxxx)             Once
    DE         Definition                      Once or more
    SY         Synonyms                        Optional; once or more
    GO         Gene ontology (GO) mapping      Optional; once or more
    HI         Hierarchy                       Optional; once or more
    WW         Relevant WWW site               Optional; once or more
    CA         Category                        Once per keyword entry; absent
                                               in category entries

    A freshly created record already holds an empty list under each of
    the repeatable codes (DE, SY, GO, HI, WW); the single-valued codes
    (ID, IC, AC, CA) are absent until they are assigned.
    """

    def __init__(self):
        """Create a record with an empty list for every repeatable line code."""
        dict.__init__(self)
        # Each key gets its own list object so values are never shared
        # between codes or between records.
        for keyword in ("DE", "SY", "GO", "HI", "WW"):
            self[keyword] = []
00046     
def parse(handle):
    """Iterate over the entries of a keywlist.txt file as Record objects.

    Arguments:
     - handle - an iterator over the lines of the file, such as an open
       file object. It is consumed by three consecutive loops, so it must
       resume where the previous loop stopped; a re-iterable sequence
       such as a list would be restarted by each loop.

    Yields one Record per entry terminated by a "//" line. In a yielded
    record the ID, IC, AC and CA values are strings, the DE and SY lines
    have been joined with single spaces into one string each, and the GO,
    HI and WW values are lists with one item per line.

    Everything before the first "ID   " or "IC   " line is treated as a
    header and skipped. A line starting with a long run of dashes marks
    the footer: parsing stops there and the rest of the handle is read
    and discarded. An entry that is not closed by "//" before the footer
    or the end of the input is not yielded.

    Lines with an unrecognised line code are reported on standard output
    with an "Ignoring: ..." message; lines that do not have three spaces
    after the two-character code are skipped silently.
    """
    record = Record()
    # First, skip the header - look for the line that opens the first entry
    for line in handle:
        if line.startswith("ID   "):
            record["ID"] = line[5:].strip()
            break
        if line.startswith("IC   "):
            record["IC"] = line[5:].strip()
            break
    # Now parse the records
    for line in handle:
        if line.startswith("-------------------------------------"):
            # We have reached the footer
            break
        key = line[:2]
        if key == "//":
            # End of entry: collapse the multi-line free-text fields into
            # single strings before handing the record to the caller.
            record["DE"] = " ".join(record["DE"])
            record["SY"] = " ".join(record["SY"])
            yield record
            record = Record()
        elif line[2:5] == "   ":
            value = line[5:].strip()
            if key in ("ID", "IC", "AC", "CA"):
                record[key] = value
            elif key in ("DE", "SY", "GO", "HI", "WW"):
                record[key].append(value)
            else:
                # Parenthesised single-argument form prints the same text
                # on Python 2 and is valid syntax on Python 3.
                print("Ignoring: %s" % line.strip())
    # Read the footer and throw it away
    for line in handle:
        pass