Back to index

python-biopython  1.60
Prosite.py
Go to the documentation of this file.
00001 # Copyright 1999 by Jeffrey Chang.  All rights reserved.
00002 # Copyright 2000 by Jeffrey Chang.  All rights reserved.
00003 # Revisions Copyright 2007 by Peter Cock.  All rights reserved.
00004 # Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved.
00005 # This code is part of the Biopython distribution and governed by its
00006 # license.  Please see the LICENSE file that should have been included
00007 # as part of this package.
00008 """
00009 This module provides code to work with the prosite dat file from
00010 Prosite.
00011 http://www.expasy.ch/prosite/
00012 
00013 Tested with:
00014 Release 20.43, 10-Feb-2009
00015 
00016 
00017 Functions:
00018 read                  Reads a Prosite file containing one Prosite record
00019 parse                 Iterates over records in a Prosite file.
00020 
00021 Classes:
00022 Record                Holds Prosite data.
00023 """
00024 
def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    handle   - handle to the file.

    Records are produced lazily, one per call to __read, and iteration
    stops as soon as __read returns a false value."""
    entry = __read(handle)
    while entry:
        yield entry
        entry = __read(handle)
00037 
def read(handle):
    """Parse a Prosite file that holds exactly one record.

    handle   - handle to the file.

    Returns whatever __read produced for the first record.  Raises
    ValueError if any data is still left in the handle afterwards."""
    only_record = __read(handle)
    # Anything still unread means the file held more than a single record.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return only_record
00052 
class Record(object):
    """Holds information from a Prosite record.

    Every attribute listed here exists on a freshly created Record, so
    callers can read any of them without guarding against AttributeError.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions (from RU lines).  (strings)
    prorules       List of prorules (from PR lines). (strings)
    postprocessing List of the ';'-separated fields of the PP lines. (strings)

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot.
                   (int once parsed; '' until then)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments. (int)

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type Value of the /MATRIX_TYPE qualifier.
    cc_scaling_db  Value of the /SCALING_DB qualifier.
    cc_author      Value of the /AUTHOR qualifier.
    cc_ft_key      Value of the /FT_KEY qualifier.
    cc_ft_desc     Value of the /FT_DESC qualifier.
    cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    """
    def __init__(self):
        """Create an empty record with every documented attribute preset."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These qualifiers are optional in a record; give them the same
        # empty-string default as the other CC fields so that reading them
        # on a record that lacks the qualifier does not raise AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
00142 
00143 
00144 # Everything below is private to this module.
00145 
def _strip_date_tag(field, tag):
    """Return field with its trailing tag removed, or None if tag is absent.

    e.g. ('APR-1990 (CREATED)', '(CREATED)') gives 'APR-1990'.
    """
    if not field.endswith(tag):
        return None
    # Slice the suffix off.  str.rstrip(tag) would instead remove any
    # trailing run of the tag's individual characters and could eat into
    # the date text itself.
    return field[:-len(tag)].rstrip()


def _parse_dt_line(record, value, line):
    """Store the three dates of a DT line on record; ValueError if malformed."""
    tags = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')
    fields = value.rstrip('.').split("; ")
    if len(fields) < len(tags):
        # Report a short line as a parse error rather than an IndexError.
        raise ValueError("I don't understand date line\n%s" % line)
    # zip stops after the three expected fields; extras are ignored.
    dates = [_strip_date_tag(field, tag) for field, tag in zip(fields, tags)]
    if None in dates:
        raise ValueError("I don't understand date line\n%s" % line)
    record.created, record.data_update, record.info_update = dates


def _parse_nr_line(record, value, line):
    """Store the /QUALIFIER=data items of an NR line on record."""
    import re
    # Qualifiers whose data looks like 'hits(seqs)'.
    hit_attrs = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    for col in value.split(";"):
        if not col:
            continue
        # Split on the first '=' only, so the data part may contain '='.
        qual, data = [word.lstrip() for word in col.split("=", 1)]
        if qual == '/RELEASE':
            release, seqs = data.split(",")
            record.nr_sp_release = release
            record.nr_sp_seqs = int(seqs)
        elif qual == '/FALSE_NEG':
            record.nr_false_neg = int(data)
        elif qual == '/PARTIAL':
            record.nr_partial = int(data)
        elif qual in hit_attrs:
            m = re.match(r'(\d+)\((\d+)\)', data)
            if not m:
                # ValueError (a subclass of Exception) matches the other
                # parse errors raised by this module.
                raise ValueError("Broken data %s in comment line\n%s"
                                 % (repr(data), line))
            setattr(record, hit_attrs[qual], tuple(map(int, m.groups())))
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_cc_line(record, value, line):
    """Store the /QUALIFIER=data items of a CC line on record."""
    # Expect CC lines like this:
    # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
    # Can (normally) split on ";" and then on "="
    plain_attrs = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }
    for col in value.split(";"):
        if not col or col[:17] == 'Automatic scaling':
            # DNAJ_2 in Release 15 has a non-standard comment line:
            # CC   Automatic scaling using reversed database
            # Throw it away.
            continue
        if "=" not in col:
            # Missing qualifier; skip it.
            # For example, from Bug 2403, in PS50293 have:
            # CC /AUTHOR=K_Hofmann; N_Hulo
            continue
        # Split on the first '=' only, so free text such as /FT_DESC may
        # itself contain '='.
        qual, data = [word.lstrip() for word in col.split("=", 1)]
        if qual == '/SITE':
            # Only the first comma separates position from description.
            pos, desc = data.split(",", 1)
            record.cc_site.append((int(pos), desc))
        elif qual in plain_attrs:
            setattr(record, plain_attrs[qual], data)
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_dr_line(record, value):
    """Append the (accession, name) pairs of a DR line to the matching list."""
    buckets = {
        'T': record.dr_positive,
        'F': record.dr_false_pos,
        'N': record.dr_false_neg,
        'P': record.dr_potential,
        '?': record.dr_unknown,
    }
    for ref in value.split(";"):
        if not ref:
            continue
        acc, name, flag = [word.strip() for word in ref.split(",")]
        if flag not in buckets:
            raise ValueError("I don't understand type flag %s" % flag)
        buckets[flag].append((acc, name))


def __read(handle):
    """Read the next Prosite record from handle.

    handle   - iterable of text lines (e.g. an open file).

    Each line is split into a two-character keyword (columns 1-2) and a
    value (column 6 onwards, trailing whitespace removed).  Lines are
    consumed up to and including the '//' that terminates the record.

    Returns the populated Record, or None when the input ends before a
    terminating '//' is seen (including when there is nothing left to read).

    Raises ValueError for an unknown keyword or a malformed line.
    """
    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            _parse_dt_line(record, value, line)
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may be continued over several PA lines.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            _parse_nr_line(record, value, line)
        elif keyword == 'CC':
            if record is None:
                # Comment lines ahead of the first ID line belong to the
                # file header (copyright statement), not to a record.
                continue
            _parse_cc_line(record, value, line)
        elif keyword == 'DR':
            _parse_dr_line(record, value)
        elif keyword == '3D':
            record.pdb_structs.extend(
                [pdb_id.rstrip(';') for pdb_id in value.split()])
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if record is None:
                # Then this was the copyright statement
                continue
            return record
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    # Input exhausted without a terminating '//'.  As before, this yields
    # None, so a truncated trailing record is discarded rather than returned.
    return None