Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2001 by Gavin E. Crooks.  All rights reserved.
00002 # Modifications Copyright 2004/2005 James Casbon. All rights Reserved.
00003 # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved.
00004 #
00005 # This code is part of the Biopython distribution and governed by its
00006 # license.  Please see the LICENSE file that should have been included
00007 # as part of this package.
00008 #
00009 # Changes made by James Casbon:
00010 # - New Astral class
00011 # - SQL functionality for both Scop and Astral classes
00012 # - All sunids are int not strings
00013 #
00014 # Code written by Jeffrey Chang to access SCOP over the internet, which
00015 # was previously in Bio.WWW.SCOP, has now been merged into this module.
00016 
00017 """ SCOP: Structural Classification of Proteins.
00018 
00019 The SCOP database aims to provide a manually constructed classification of
00020 all know protein structures into a hierarchy, the main levels of which
00021 are family, superfamily and fold.
00022 
00023 * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/
00024 
00025 * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html
00026 
00027 * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
00028 
00029 The Scop object in this module represents the entire SCOP classification. It
00030 can be built from the three SCOP parsable files, modified is so desired, and
00031 converted back to the same file formats. A single SCOP domain (represented
00032 by the Domain class) can be obtained from Scop using the domain's SCOP
00033 identifier (sid).
00034 
00035 
00036 nodeCodeDict  -- A mapping between known 2 letter node codes and a longer
00037                   description. The known node types are 'cl' (class), 'cf'
00038                   (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
00039                   'sp' (species), 'px' (domain). Additional node types may
00040                   be added in the future.
00041 
00042 This module also provides code to access SCOP over the WWW.
00043 
00044 Functions:
00045 search        -- Access the main CGI script.
00046 _open         -- Internally used function.
00047 
00048 """
00049 
00050 from types import *
00051 import os
00052 
00053 import Des
00054 import Cla
00055 import Hie
00056 from Residues import *
00057 from Bio import SeqIO
00058 from Bio.Seq import Seq
00059 
00060 
00061 nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily',
00062                  'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'}
00063 
00064 
00065 _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf',
00066                      'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'}
00067 
00068 nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ] 
00069 
00070 astralBibIds = [10,20,25,30,35,40,50,70,90,95,100]
00071 
00072 astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15,
00073              1e-20, 1e-25, 1e-50]
00074 
00075 astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1',
00076                      0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3',
00077                      1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15',
00078                      1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' }
00079 
00080 astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1',
00081                      0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3',
00082                      1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15',
00083                      1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' }
00084 
00085 try:
00086     #See if the cmp function exists (will on Python 2)
00087     _cmp = cmp
00088 except NameError:
00089     def _cmp(a,b):
00090         """Implementation of cmp(x,y) for Python 3 (PRIVATE).
00091 
00092         Based on Python 3 docs which say if you really need the cmp()
00093         functionality, you could use the expression (a > b) -  (a < b)
00094         as the equivalent for cmp(a, b)
00095         """
00096         return (a > b) -  (a < b)
00097 
00098 def cmp_sccs(sccs1, sccs2):
00099     """Order SCOP concise classification strings (sccs).
00100 
00101     a.4.5.1 < a.4.5.11 < b.1.1.1
00102     
00103     A sccs (e.g. a.4.5.11) compactly represents a domain's classification.
00104     The letter represents the class, and the numbers are the fold,
00105     superfamily, and family, respectively.
00106 
00107     """
00108 
00109     s1 = sccs1.split(".")
00110     s2 = sccs2.split(".")
00111 
00112     if s1[0] != s2[0]: return _cmp(s1[0], s2[0])
00113 
00114     s1 = list(map(int, s1[1:]))
00115     s2 = list(map(int, s2[1:]))
00116 
00117     return _cmp(s1,s2)
00118 
00119 
00120 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)")
00121 
00122 def parse_domain(str):
00123     """Convert an ASTRAL header string into a Scop domain.
00124 
00125     An ASTRAL (http://astral.stanford.edu/) header contains a concise
00126     description of a SCOP domain. A very similar format is used when a
00127     Domain object is converted into a string.  The Domain returned by this
00128     method contains most of the SCOP information, but it will not be located
00129     within the SCOP hierarchy (i.e. The parent node will be None). The
00130     description is composed of the SCOP protein and species descriptions.
00131 
00132     A typical ASTRAL header looks like --
00133     >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli}
00134     """
00135 
00136     m = _domain_re.match(str)
00137     if (not m) : raise ValueError("Domain: "+ str)
00138 
00139     dom = Domain()
00140     dom.sid = m.group(1)
00141     dom.sccs = m.group(2)
00142     dom.residues = Residues(m.group(3))
00143     if not dom.residues.pdbid:
00144         dom.residues.pdbid= dom.sid[1:5]
00145     dom.description = m.group(4).strip()
00146 
00147     return dom
00148 
00149 def _open_scop_file(scop_dir_path, version, filetype):
00150     filename = "dir.%s.scop.txt_%s" % (filetype,version)
00151     handle = open(os.path.join( scop_dir_path, filename))
00152     return handle
00153 
00154 
00155 class Scop(object):
00156     """The entire SCOP hierarchy.
00157 
00158     root -- The root node of the hierarchy 
00159     """
00160     def __init__(self, cla_handle=None, des_handle=None, hie_handle=None,
00161                  dir_path=None, db_handle=None, version=None):
00162         """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend.
00163 
00164         If no file handles are given, then a Scop object with a single
00165         empty root node is returned.
00166 
00167         If a directory and version are given (with dir_path=.., version=...) or
00168         file handles for each file, the whole scop tree will be built in memory.
00169 
00170         If a MySQLdb database handle is given, the tree will be built as needed,
00171         minimising construction times.  To build the SQL database to the methods
00172         write_xxx_sql to create the tables.
00173         
00174         """
00175         self._sidDict = {}
00176         self._sunidDict = {}
00177 
00178         if cla_handle==des_handle==hie_handle==dir_path==db_handle==None: return 
00179 
00180         if dir_path is None and db_handle is None:
00181             if cla_handle == None or des_handle==None or hie_handle==None:
00182                 raise RuntimeError("Need CLA, DES and HIE files to build SCOP")
00183 
00184         sunidDict = {}
00185 
00186         self.db_handle = db_handle
00187         try:
00188             
00189             if db_handle:
00190                 # do nothing if we have a db handle, we'll do it all on the fly
00191                 pass
00192             
00193             else:
00194                 # open SCOP parseable files 
00195                 if dir_path:
00196                     if not version:
00197                         raise RuntimeError("Need SCOP version to find parsable files in directory")
00198                     if cla_handle or des_handle or hie_handle:
00199                         raise RuntimeError("Cannot specify SCOP directory and specific files")
00200                 
00201                     cla_handle = _open_scop_file( dir_path, version, 'cla')
00202                     des_handle = _open_scop_file( dir_path, version, 'des')
00203                     hie_handle = _open_scop_file( dir_path, version, 'hie')
00204                 
00205                 root = Node()
00206                 domains = []
00207                 root.sunid=0
00208                 root.type='ro'
00209                 sunidDict[root.sunid] = root
00210                 self.root = root
00211                 root.description = 'SCOP Root'
00212 
00213                 # Build the rest of the nodes using the DES file
00214                 records = Des.parse(des_handle)
00215                 for record in records:
00216                     if record.nodetype =='px':
00217                         n = Domain()
00218                         n.sid = record.name
00219                         domains.append(n)
00220                     else : 
00221                         n = Node()
00222                     n.sunid = record.sunid
00223                     n.type = record.nodetype
00224                     n.sccs = record.sccs
00225                     n.description = record.description
00226                     
00227                     sunidDict[n.sunid] = n
00228  
00229                 # Glue all of the Nodes together using the HIE file
00230                 records = Hie.parse(hie_handle)
00231                 for record in records:
00232                     if record.sunid not in sunidDict:
00233                         print record.sunid
00234                         
00235                     n = sunidDict[record.sunid]
00236     
00237                     if record.parent != '' : # Not root node
00238     
00239                         if record.parent not in sunidDict:
00240                             raise ValueError("Incomplete data?")
00241                                        
00242                         n.parent = sunidDict[record.parent]
00243                 
00244                     for c in record.children:
00245                         if c not in sunidDict:
00246                             raise ValueError("Incomplete data?")
00247                         n.children.append(sunidDict[c])
00248 
00249                         
00250                 # Fill in the gaps with information from the CLA file
00251                 sidDict = {}
00252                 records = Cla.parse(cla_handle)
00253                 for record in records:
00254                     n = sunidDict[record.sunid]
00255                     assert n.sccs == record.sccs
00256                     assert n.sid == record.sid
00257                     n.residues = record.residues
00258                     sidDict[n.sid] = n
00259 
00260                 # Clean up
00261                 self._sunidDict = sunidDict
00262                 self._sidDict = sidDict
00263                 self._domains = tuple(domains)
00264 
00265         finally:
00266             if dir_path:
00267                 # If we opened the files, we close the files
00268                 if cla_handle : cla_handle.close()
00269                 if des_handle : des_handle.close()
00270                 if hie_handle : hie_handle.close()
00271                                     
00272 
00273     def getRoot(self):
00274         return self.getNodeBySunid(0)
00275 
00276 
00277     def getDomainBySid(self, sid):
00278         """Return a domain from its sid"""
00279         if sid in self._sidDict:
00280             return self._sidDict[sid]
00281         if self.db_handle:
00282             self.getDomainFromSQL(sid=sid)
00283             if sid in self._sidDict:
00284                 return self._sidDict[sid]
00285         else:
00286             return None
00287 
00288 
00289     def getNodeBySunid(self, sunid):
00290         """Return a node from its sunid"""
00291         if sunid in self._sunidDict:
00292             return self._sunidDict[sunid]
00293         if self.db_handle:
00294             self.getDomainFromSQL(sunid=sunid)
00295             if sunid in self._sunidDict:
00296                 return self._sunidDict[sunid]
00297         else:
00298             return None
00299 
00300     
00301     def getDomains(self):
00302         """Returns an ordered tuple of all SCOP Domains"""
00303         if self.db_handle:
00304             return self.getRoot().getDescendents('px')
00305         else:
00306             return self._domains
00307 
00308 
00309 
00310     def write_hie(self, handle):
00311         """Build an HIE SCOP parsable file from this object"""
00312         nodes = self._sunidDict.values()
00313         # We order nodes to ease comparison with original file
00314         nodes.sort(key = lambda n: n.sunid)
00315         for n in nodes:
00316             handle.write(str(n.toHieRecord()))
00317 
00318 
00319     def write_des(self, handle):
00320         """Build a DES SCOP parsable file from this object""" 
00321         nodes = self._sunidDict.values()
00322         # Origional SCOP file is not ordered?
00323         nodes.sort(key = lambda n: n.sunid)
00324         for n in nodes:
00325             if n != self.root:
00326                 handle.write(str(n.toDesRecord()))
00327 
00328 
00329     def write_cla(self, handle):
00330         """Build a CLA SCOP parsable file from this object"""                
00331         nodes = self._sidDict.values()
00332         # We order nodes to ease comparison with original file
00333         nodes.sort(key = lambda n: n.sunid)
00334         for n in nodes:
00335             handle.write(str(n.toClaRecord()))
00336 
00337 
00338     def getDomainFromSQL(self, sunid=None, sid=None):
00339         """Load a node from the SQL backend using sunid or sid"""
00340         if sunid==sid==None: return None
00341 
00342         cur = self.db_handle.cursor()
00343         
00344         if sid:
00345             cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid)
00346             res = cur.fetchone()
00347             if res is None:
00348                 return None
00349             sunid = res[0]
00350 
00351         cur.execute("SELECT * FROM des WHERE sunid=%s", sunid)
00352         data = cur.fetchone()
00353 
00354         if data is not None:
00355             n = None
00356             
00357             #determine if Node or Domain
00358             if data[1] != "px":
00359                 n = Node(scop=self)
00360 
00361                 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid)
00362                 children = []
00363                 for c in cur.fetchall():
00364                     children.append(c[0])
00365                 n.children = children
00366                 
00367                 
00368             else:
00369                 n = Domain(scop=self)
00370                 cur.execute("select sid, residues, pdbid from cla where sunid=%s",
00371                                sunid)
00372                 
00373                 [n.sid,n.residues,pdbid] = cur.fetchone()
00374                 n.residues = Residues(n.residues)
00375                 n.residues.pdbid=pdbid
00376                 self._sidDict[n.sid] = n
00377                 
00378             [n.sunid,n.type,n.sccs,n.description] = data
00379 
00380             if data[1] != 'ro':
00381                 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid)
00382                 n.parent = cur.fetchone()[0]
00383 
00384             n.sunid = int(n.sunid)
00385 
00386             self._sunidDict[n.sunid] = n
00387         
00388 
00389     def getAscendentFromSQL(self, node, type):
00390         """Get ascendents using SQL backend"""
00391         if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): return None
00392 
00393         cur = self.db_handle.cursor()
00394         cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid))
00395         result = cur.fetchone()
00396         if result is not None:
00397             return self.getNodeBySunid(result[0])
00398         else:
00399             return None
00400 
00401                             
00402     def getDescendentsFromSQL(self, node, type):
00403         """Get descendents of a node using the database backend.  This avoids
00404         repeated iteration of SQL calls and is therefore much quicker than
00405         repeatedly calling node.getChildren().
00406         """
00407         if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): return []
00408 
00409         des_list = []
00410         
00411 
00412         # SQL cla table knows nothing about 'ro'
00413         if node.type == 'ro':
00414             for c in node.getChildren():
00415                 for d in self.getDescendentsFromSQL(c,type):
00416                     des_list.append(d)
00417             return des_list
00418         
00419         cur = self.db_handle.cursor()
00420         
00421         
00422         if type != 'px':
00423             cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \
00424             cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid))
00425             data = cur.fetchall()
00426             for d in data:
00427                 if int(d[0]) not in self._sunidDict:
00428                     n = Node(scop=self)
00429                     [n.sunid,n.type,n.sccs,n.description] = d
00430                     n.sunid=int(n.sunid)
00431                     self._sunidDict[n.sunid] = n
00432 
00433                     cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid)
00434                     n.parent = cur.fetchone()[0]
00435                                     
00436                     cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid)
00437                     children = []
00438                     for c in cur.fetchall():
00439                         children.append(c[0])
00440                     n.children = children
00441                     
00442                     
00443                 des_list.append( self._sunidDict[int(d[0])] )
00444 
00445         else:
00446             cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\
00447              FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s",
00448                         node.sunid)
00449 
00450             data = cur.fetchall()
00451             for d in data:
00452                 if int(d[0]) not in self._sunidDict:
00453                     n = Domain(scop=self)
00454                     #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type,
00455                     #n.description,n.parent] = data
00456                     [n.sunid,n.sid, pdbid,n.residues,n.sccs,n.type,n.description,
00457                      n.parent] = d[0:8]
00458                     n.residues = Residues(n.residues)
00459                     n.residues.pdbid = pdbid
00460                     n.sunid = int(n.sunid)
00461                     self._sunidDict[n.sunid] = n
00462                     self._sidDict[n.sid] = n
00463                     
00464                     
00465                 des_list.append( self._sunidDict[int(d[0])] )
00466 
00467         return des_list
00468         
00469                 
00470         
00471 
00472     def write_hie_sql(self, handle):
00473         """Write HIE data to SQL database"""
00474         cur = handle.cursor()
00475 
00476         cur.execute("DROP TABLE IF EXISTS hie")
00477         cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\
00478         INDEX (parent) )")
00479 
00480         for p in self._sunidDict.itervalues():
00481             for c in p.children:
00482                 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
00483 
00484 
00485     def write_cla_sql(self, handle):
00486         """Write CLA data to SQL database"""
00487         cur = handle.cursor()
00488 
00489         cur.execute("DROP TABLE IF EXISTS cla")
00490         cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\
00491         residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\
00492         dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )")
00493 
00494         for n in self._sidDict.itervalues():
00495             c = n.toClaRecord()
00496             cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
00497                          (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs,
00498                          n.getAscendent('cl').sunid, n.getAscendent('cf').sunid,
00499                          n.getAscendent('sf').sunid, n.getAscendent('fa').sunid,
00500                          n.getAscendent('dm').sunid, n.getAscendent('sp').sunid,
00501                          n.sunid ))                         
00502         
00503 
00504     def write_des_sql(self, handle):
00505         """Write DES data to SQL database"""
00506         cur = handle.cursor()
00507 
00508         cur.execute("DROP TABLE IF EXISTS des")
00509         cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\
00510         description VARCHAR(255),\
00511         PRIMARY KEY (sunid) )")
00512         
00513         for n in self._sunidDict.itervalues():
00514             cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)",
00515                          ( n.sunid, n.type, n.sccs, n.description ) )                        
00516         
00517 
00518   
00519 class Node(object):
00520     """ A node in the Scop hierarchy
00521 
00522     sunid  -- SCOP unique identifiers. e.g. '14986'
00523 
00524     parent -- The parent node
00525 
00526     children -- A list of child nodes
00527 
00528     sccs     -- SCOP concise classification string. e.g. 'a.1.1.2'
00529 
00530     type     -- A 2 letter node type code. e.g. 'px' for domains
00531 
00532     description -- 
00533         
00534     """
00535     def __init__(self, scop=None):
00536         """Create a Node in the scop hierarchy.  If a Scop instance is provided to the
00537         constructor, this will be used to lookup related references using the SQL
00538         methods.  If no instance is provided, it is assumed the whole tree exists
00539         and is connected."""
00540         self.sunid=''    
00541         self.parent = None
00542         self.children=[]
00543         self.sccs = ''   
00544         self.type =''    
00545         self.description =''
00546         self.scop=scop
00547 
00548     def __str__(self):
00549         s = []
00550         s.append(str(self.sunid))
00551         s.append(self.sccs)
00552         s.append(self.type)
00553         s.append(self.description)
00554 
00555         return " ".join(s)
00556 
00557     def toHieRecord(self):
00558         """Return an Hie.Record"""
00559         rec = Hie.Record()
00560         rec.sunid = str(self.sunid)
00561         if self.getParent() : #Not root node
00562             rec.parent = str(self.getParent().sunid)
00563         else:
00564             rec.parent = '-'
00565         for c in self.getChildren():
00566             rec.children.append(str(c.sunid))
00567         return rec
00568     
00569     def toDesRecord(self):
00570         """Return a Des.Record"""        
00571         rec = Des.Record()
00572         rec.sunid = str(self.sunid)
00573         rec.nodetype = self.type
00574         rec.sccs = self.sccs
00575         rec.description = self.description
00576         return rec
00577 
00578     def getChildren(self):
00579         """Return a list of children of this Node"""
00580         if self.scop is None:
00581             return self.children
00582         else:
00583             return map ( self.scop.getNodeBySunid, self.children )
00584 
00585     def getParent(self):
00586         """Return the parent of this Node"""
00587         if self.scop is None:
00588             return self.parent
00589         else:
00590             return self.scop.getNodeBySunid( self.parent )
00591 
00592     def getDescendents( self, node_type):
00593         """ Return a list of all decendent nodes of the given type. Node type can a
00594         two letter code or longer description. e.g. 'fa' or 'family'
00595         """
00596         if node_type in _nodetype_to_code:
00597             node_type = _nodetype_to_code[node_type]
00598             
00599         nodes = [self]
00600         if self.scop:
00601             return self.scop.getDescendentsFromSQL(self,node_type)
00602         while nodes[0].type != node_type:
00603             if nodes[0].type == 'px' : return [] # Fell of the bottom of the hierarchy
00604             child_list = []
00605             for n in nodes:
00606                 for child in n.getChildren():
00607                     child_list.append( child )
00608                 nodes = child_list
00609                 
00610         return nodes
00611                     
00612 
00613     def getAscendent( self, node_type):
00614         """ Return the ancenstor node of the given type, or None.Node type can a
00615         two letter code or longer description. e.g. 'fa' or 'family'"""
00616         if node_type in _nodetype_to_code:
00617             node_type = _nodetype_to_code[node_type]
00618 
00619         if self.scop:
00620             return self.scop.getAscendentFromSQL(self,node_type)
00621         else:
00622             n = self
00623             if n.type == node_type: return None
00624             while n.type != node_type:
00625                 if n.type == 'ro': return None # Fell of the top of the hierarchy
00626                 n = n.getParent()            
00627                 
00628             return n
00629                                                             
00630     
00631 
00632 class Domain(Node):
00633     """ A SCOP domain. A leaf node in the Scop hierarchy.
00634 
00635     sid      -- The SCOP domain identifier. e.g. 'd5hbib_'
00636 
00637     residues -- A Residue object. It defines the collection
00638                   of PDB atoms that make up this domain.
00639     """
00640     def __init__(self,scop=None):
00641         Node.__init__(self,scop=scop)
00642         self.sid = ''         
00643         self.residues = None
00644 
00645     def __str__(self):
00646         s = []
00647         s.append(self.sid)
00648         s.append(self.sccs)
00649         s.append("("+str(self.residues)+")")
00650 
00651         if not self.getParent():
00652             s.append(self.description)
00653         else:
00654             sp = self.getParent()
00655             dm = sp.getParent()
00656             s.append(dm.description)
00657             s.append("{"+sp.description+"}")
00658 
00659         return " ".join(s)
00660 
00661     def toDesRecord(self):
00662         """Return a Des.Record"""
00663         rec = Node.toDesRecord(self)
00664         rec.name = self.sid
00665         return rec
00666 
00667     def toClaRecord(self):
00668         """Return a Cla.Record"""        
00669         rec = Cla.Record()
00670         rec.sid = self.sid
00671         rec.residues = self.residues
00672         rec.sccs = self.sccs
00673         rec.sunid = self.sunid
00674         
00675         n = self
00676         while n.sunid != 0: #Not root node
00677             rec.hierarchy[n.type] = str(n.sunid)
00678             n = n.getParent()
00679 
00680         # Order does not matter in the hierarchy field. For more info, see
00681         # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html
00682         #rec.hierarchy.reverse()
00683        
00684         return rec
00685             
00686 class Astral(object):
00687     """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains,
00688     as well as clusterings by percent id or evalue.
00689     """
00690 
00691     def __init__( self, dir_path=None, version=None, scop=None,
00692                   astral_file=None, db_handle=None):
00693         """
00694         Initialise the astral database.
00695         
00696         You must provide either a directory of SCOP files:
00697                 
00698         dir_path - string, the path to location of the scopseq-x.xx directory
00699                    (not the directory itself), and
00700         version   -a version number.
00701         
00702         or, a FASTA file:
00703         
00704         astral_file - string, a path to a fasta file (which will be loaded in memory)
00705         
00706         or, a MYSQL database:
00707         
00708         db_handle - a database handle for a MYSQL database containing a table
00709                     'astral' with the astral data in it.  This can be created
00710                     using writeToSQL.
00711         """
00712 
00713         if astral_file==dir_path==db_handle==None:
00714             raise RuntimeError("Need either file handle, or (dir_path + "\
00715                        + "version) or database handle to construct Astral")
00716         if not scop:
00717             raise RuntimeError("Must provide a Scop instance to construct")
00718 
00719         self.scop = scop
00720         self.db_handle = db_handle 
00721 
00722         
00723         if not astral_file and not db_handle:
00724             if dir_path == None or version == None:
00725                 raise RuntimeError("must provide dir_path and version")
00726 
00727             self.version = version
00728             self.path = os.path.join( dir_path, "scopseq-%s" % version)
00729             astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version
00730             astral_file = os.path.join (self.path, astral_file)
00731 
00732         if astral_file:
00733             #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY
00734             self.fasta_dict = SeqIO.to_dict(SeqIO.parse(open(astral_file), "fasta"))
00735 
00736         self.astral_file = astral_file
00737         self.EvDatasets = {}
00738         self.EvDatahash = {}
00739         self.IdDatasets = {}
00740         self.IdDatahash = {}
00741                 
00742         
00743     def domainsClusteredByEv(self,id):
00744         """get domains clustered by evalue"""
00745         if id not in self.EvDatasets:
00746             if self.db_handle:
00747                 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id])
00748                 
00749             else:
00750                 if not self.path:
00751                     raise RuntimeError("No scopseq directory specified")
00752                 
00753                 file_prefix = "astral-scopdom-seqres-sel-gs"
00754                 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id] ,
00755                                                   self.version)
00756                 filename = os.path.join(self.path,filename)
00757                 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename)
00758         return self.EvDatasets[id]
00759 
00760 
00761     def domainsClusteredById(self,id):
00762         """get domains clustered by percent id"""
00763         if id not in self.IdDatasets:
00764             if self.db_handle:
00765                 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id))
00766                 
00767             else:
00768                 if not self.path:
00769                     raise RuntimeError("No scopseq directory specified")
00770                 
00771                 file_prefix = "astral-scopdom-seqres-sel-gs"
00772                 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version)
00773                 filename = os.path.join(self.path,filename)
00774                 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename)
00775         return self.IdDatasets[id]
00776 
00777 
00778     def getAstralDomainsFromFile(self,filename=None,file_handle=None):
00779         """Get the scop domains from a file containing a list of sids"""
00780         if file_handle == filename == None:
00781             raise RuntimeError("You must provide a filename or handle")
00782         if not file_handle:
00783             file_handle = open(filename)
00784         doms = []
00785         while 1:
00786             line = file_handle.readline()
00787             if not line:
00788                 break
00789             line = line.rstrip()
00790             doms.append(line)
00791         if filename:
00792             file_handle.close()
00793 
00794         doms = filter( lambda a: a[0]=='d', doms )
00795         doms = map( self.scop.getDomainBySid, doms )
00796         return doms
00797 
00798     def getAstralDomainsFromSQL(self, column):
00799         """Load a set of astral domains from a column in the astral table of a MYSQL
00800         database (which can be created with writeToSQL(...)"""
00801         cur = self.db_handle.cursor()
00802         cur.execute("SELECT sid FROM astral WHERE "+column+"=1")
00803         data = cur.fetchall()
00804         data = map( lambda x: self.scop.getDomainBySid(x[0]), data)
00805         
00806         return data
00807     
00808 
00809     def getSeqBySid(self,domain):
00810         """get the seq record of a given domain from its sid"""
00811         if self.db_handle is None:
00812             return self.fasta_dict[domain].seq
00813         
00814         else:
00815             cur = self.db_handle.cursor()
00816             cur.execute("SELECT seq FROM astral WHERE sid=%s", domain)
00817             return Seq(cur.fetchone()[0])
00818 
00819     def getSeq(self,domain):
00820         """Return seq associated with domain"""
00821         return self.getSeqBySid(domain.sid)
00822 
00823 
00824     def hashedDomainsById(self,id):
00825         """Get domains clustered by sequence identity in a dict"""
00826         if id not in self.IdDatahash:
00827             self.IdDatahash[id] = {}
00828             for d in self.domainsClusteredById(id):
00829                 self.IdDatahash[id][d] = 1
00830         return self.IdDatahash[id]
00831 
00832     def hashedDomainsByEv(self,id):
00833         """Get domains clustered by evalue in a dict"""
00834         if id not in self.EvDatahash:
00835             self.EvDatahash[id] = {}
00836             for d in self.domainsClusteredByEv(id):
00837                 self.EvDatahash[id][d] = 1
00838         return self.EvDatahash[id]
00839                                                         
00840 
00841     def isDomainInId(self,dom,id):
00842         """Returns true if the domain is in the astral clusters for percent ID"""
00843         return dom in self.hashedDomainsById(id)
00844 
00845     def isDomainInEv(self,dom,id):
00846         """Returns true if the domain is in the ASTRAL clusters for evalues"""
00847         return dom in self.hashedDomainsByEv(id)
00848             
00849 
00850     def writeToSQL(self, db_handle):
00851         """Write the ASTRAL database to a MYSQL database"""
00852         cur = db_handle.cursor()
00853 
00854         cur.execute("DROP TABLE IF EXISTS astral")
00855         cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))")
00856 
00857         for dom in self.fasta_dict:
00858             cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)",
00859                         (dom, self.fasta_dict[dom].seq.data))
00860         
00861         for i in astralBibIds:
00862             cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)")
00863             
00864             for d in self.domainsClusteredById(i):
00865                 cur.execute("UPDATE astral SET id"+str(i)+"=1  WHERE sid=%s",
00866                             d.sid)
00867 
00868         for ev in astralEvs:
00869             cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)")
00870 
00871             for d in self.domainsClusteredByEv(ev):
00872                 
00873                 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1  WHERE sid=%s",
00874                             d.sid)
00875 
00876 def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None,
00877            cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
00878     """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None,
00879     cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds)
00880 
00881     Access search.cgi and return a handle to the results.  See the
00882     online help file for an explanation of the parameters:
00883     http://scop.mrc-lmb.cam.ac.uk/scop/help.html
00884 
00885     Raises an IOError if there's a network error.
00886 
00887     """
00888     params = {'pdb' : pdb, 'key' : key, 'sid' : sid, 'disp' : disp,
00889               'dir' : dir, 'loc' : loc}
00890     variables = {}
00891     for k, v in params.iteritems():
00892         if v is not None:
00893             variables[k] = v
00894     variables.update(keywds)
00895     return _open(cgi, variables)
00896 
00897 def _open(cgi, params={}, get=1):
00898     """_open(cgi, params={}, get=1) -> UndoHandle
00899 
00900     Open a handle to SCOP.  cgi is the URL for the cgi script to access.
00901     params is a dictionary with the options to pass to it.  get is a boolean
00902     that describes whether a GET should be used.  Does some
00903     simple error checking, and will raise an IOError if it encounters one.
00904 
00905     """
00906     import urllib, urllib2
00907     # Open a handle to SCOP.
00908     options = urllib.urlencode(params)
00909     try:
00910         if get:  # do a GET
00911             if options:
00912                 cgi += "?" + options
00913             handle = urllib2.urlopen(cgi)
00914         else:    # do a POST
00915             handle = urllib2.urlopen(cgi, data=options)
00916     except urllib2.HTTPError, exception:
00917         raise exception
00918     return handle