Back to index

python-biopython  1.60
UniGene.py
Go to the documentation of this file.
00001 
00002 # Permission to use, copy, modify, and distribute this software and
00003 # its documentation with or without modifications and for any purpose
00004 # and without fee is hereby granted, provided that any copyright
00005 # notices appear in all copies and that both those copyright notices
00006 # and this permission notice appear in supporting documentation, and
00007 # that the names of the contributors or copyright holders not be used
00008 # in advertising or publicity pertaining to distribution of the software
00009 # without specific prior permission.
00010 #
00011 # THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
00012 # WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
00013 # WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
00014 # CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
00015 # OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
00016 # LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00017 # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00018 # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00019 
import warnings

# Importing this module is itself discouraged, so the notice is emitted at
# import time.  stacklevel=2 attributes the warning to the code that imports
# this module instead of to this file, which is the line a user has to change.
warnings.warn("The module Bio.UniGene.UniGene is now obsolete, "
              "and will be deprecated and removed in a future "
              "release of Biopython. To parse UniGene flat files, "
              "please use the parser in Bio.UniGene instead",
              PendingDeprecationWarning, stacklevel=2)
00026 
00027 import string
00028 import operator
00029 import urllib
00030 import sgmllib
00031 import UserDict
00032 import Bio.File
00033 
00034 
00035 
class UniGeneParser(sgmllib.SGMLParser):
    """Event-driven scraper that collects fields from a UniGene HTML page.

    The parser reacts to <a>, <b>, <table>, <tr> and <td> tags and to
    character data.  Results accumulate in ``self.queue``, a UserDict:

    * ``'UniGene Cluster'`` maps to the text that follows that phrase in a
      bold run.
    * Every other bold heading (reduced to its first two words) maps to a
      nested UserDict of ``label -> value``.  A value is a string, or a list
      of strings when several table rows were stored under the same label.

    State kept between callbacks:

    * ``text``            character data collected since it was last cleared
    * ``open_tag``        marker for the innermost tag being tracked
    * ``open_tag_stack``  markers of the enclosing tracked tags
    * ``key_waiting``     label that the next table row's text is stored under
    * ``master_key``      heading whose nested UserDict receives the rows
    * ``context``         'general_info', 'seq_info' or 'legend'
    """

    def reset(self):
        """Reset the SGML machinery and all scraping state."""
        sgmllib.SGMLParser.reset(self)
        self.text = ''
        self.queue = UserDict.UserDict()
        self.open_tag_stack = []
        self.open_tag = 'open_html'
        self.key_waiting = ''
        self.master_key = ''
        self.context = 'general_info'

    def _pop_open_tag(self):
        """Make the enclosing tag current again ('open_html' if none is left)."""
        try:
            self.open_tag = self.open_tag_stack.pop()
        except IndexError:
            # More closing tags than opening ones: fall back to document level.
            self.open_tag = 'open_html'

    def parse(self, handle):
        """Parse one record from *handle* and return the collected data.

        Resets the parser, feeds it one record (see ``feed``) and then drops
        every entry that compares equal to an empty dict.  If the key of such
        an entry starts with 'UniGene Cluster', the remainder of the key is
        first stored under 'UniGene Cluster'.

        Returns the UserDict held in ``self.queue``.
        """
        self.reset()
        self.feed(handle)
        # Work on a snapshot of the keys: entries are inserted and deleted in
        # the loop body, and a plain Python 2 UserDict.UserDict cannot be
        # iterated directly (only IterableUserDict defines __iter__).
        for key in list(self.queue.keys()):
            if self.queue[key] == {}:
                if key[:15] == 'UniGene Cluster':
                    self.queue['UniGene Cluster'] = key[16:]
                del self.queue[key]
        return self.queue

    def feed(self, handle):
        """Read one record from *handle* and run it through the SGML parser.

        Lines are read until the first one that is empty after stripping, so
        records are assumed to be separated by an empty line (an exhausted
        handle is expected to stop the loop the same way).  Each stripped
        line is prefixed with a single space and the pieces are concatenated
        before being handed to ``SGMLParser.feed``.

        *handle* may be a Bio.File.UndoHandle, which is used as is, or any
        other object accepted by the UndoHandle constructor.
        """
        if isinstance(handle, Bio.File.UndoHandle):
            uhandle = handle
        else:
            uhandle = Bio.File.UndoHandle(handle)
        pieces = []
        while True:
            line = uhandle.readline().strip()
            if not line:
                break
            pieces.append(' ' + line)
        # Join once instead of growing a string line by line.
        sgmllib.SGMLParser.feed(self, ''.join(pieces))

    def handle_data(self, newtext):
        """Append stripped character data to the pending text."""
        self.text = self.text + newtext.strip()

    def start_a(self, attrs):
        """In 'seq_info', a link outside bold text starts a fresh text run."""
        if self.context == 'seq_info' and self.open_tag != 'open_b':
            self.text = ''

    def end_a(self):
        """In 'seq_info', use the link text as the label if none is waiting."""
        if self.context != 'seq_info' or self.open_tag == 'open_b':
            return
        if self.key_waiting == '':
            self.key_waiting = self.text
            self.text = ''

    def start_b(self, attrs):
        """Enter bold text; discard pending text unless a label is waiting."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_b'
        if self.key_waiting == '':
            self.text = ''

    def end_b(self):
        """Leave bold text: record the cluster id or derive a key.

        NOTE(review): start_b always pushes onto the tag stack, but the
        matching pop happens only inside extract_key().  On the other two
        paths 'open_b' stays current.  Left unchanged because the other
        handlers may rely on it.
        """
        if self.text[:15] == 'UniGene Cluster':
            self.queue['UniGene Cluster'] = self.text[16:]
            self.text = ''
        elif self.key_waiting == '':
            self.extract_key()

    def extract_key(self):
        """Turn the pending text into a key and file it by tag and context.

        The key is the first two whitespace-separated words of the pending
        text.  Inside a table cell it becomes the waiting label
        ('general_info'), or, when the full text is 'Key to Symbols' in
        'seq_info', switches the context to 'legend'.  Outside a table cell
        it opens a new section in ``self.queue`` and becomes the master key.
        A section key containing 'SEQUENCE' switches the context from
        'general_info' to 'seq_info'.
        """
        text = self.text.strip()
        key = ' '.join(text.split()[:2])
        self.text = ''

        self._pop_open_tag()
        if self.open_tag == 'open_table_data':
            if self.context == 'general_info':
                if self.key_waiting == '':
                    self.key_waiting = key
                    self.text = ''
            elif self.context == 'seq_info':
                if text == 'Key to Symbols':
                    self.context = 'legend'
                    self.master_key = key
        elif self.context == 'general_info':
            self.master_key = key
            if 'SEQUENCE' in key:
                self.context = 'seq_info'
            self.queue[key] = UserDict.UserDict()
        elif self.context == 'seq_info':
            self.queue[key] = UserDict.UserDict()
            self.master_key = key

    def start_table(self, attrs):
        """Enter a table."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table'

    def end_table(self):
        """Leave a table and forget any label still waiting for a value."""
        self._pop_open_tag()
        self.key_waiting = ''

    def start_tr(self, attrs):
        """Enter a table row and start with empty pending text."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_row'
        self.text = ''

    def end_tr(self):
        """Leave a table row and store its text under the waiting label.

        Nothing is stored for an empty row.  Otherwise one leading ':' is
        removed and runs of whitespace are collapsed.  In the 'general_info'
        and 'seq_info' contexts the text is stored in the current section
        under ``key_waiting``; a label seen before is promoted to (or
        extended as) a list.  The waiting label is then cleared.

        Raises KeyError if no section exists yet for ``master_key``.
        """
        self._pop_open_tag()
        text = self.text
        if not text:
            return
        self.text = ''
        if text[0] == ':':
            text = text[1:]
        text = ' '.join(text.split())
        if self.context not in ('general_info', 'seq_info'):
            return
        section = self.queue[self.master_key]
        try:
            contents = section[self.key_waiting]
        except KeyError:
            # First value seen for this label.
            section[self.key_waiting] = text
        else:
            if isinstance(contents, list):
                contents.append(text)
            else:
                section[self.key_waiting] = [contents, text]
        self.key_waiting = ''

    def start_td(self, attrs):
        """Enter a table cell."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_data'

    def end_td(self):
        """Leave a table cell; in 'seq_info', separate cells with a space."""
        self._pop_open_tag()
        if self.context == 'seq_info':
            self.text = self.text + ' '

    def print_item(self, item, level=1):
        """Print *item* to stdout, indented four spaces per nesting level.

        Non-empty strings are printed with the indent.  Lists and UserDicts
        are printed recursively one level deeper, each UserDict key being
        announced with 'key is ...'.  Anything else is printed unindented.
        """
        indent = '    ' + '    ' * level
        if isinstance(item, str):
            if item != '':
                print('%s%s' % (indent, item))
        elif isinstance(item, list):
            for subitem in item:
                self.print_item(subitem, level + 1)
        elif isinstance(item, UserDict.UserDict):
            # .keys() because a plain UserDict is not directly iterable.
            for subitem in item.keys():
                print('%skey is %s' % (indent, subitem))
                self.print_item(item[subitem], level + 1)
        else:
            print(item)

    def print_tags(self):
        """Print every top-level key of ``self.queue`` with its contents."""
        for key in self.queue.keys():
            print('key %s' % key)
            self.print_item(self.queue[key])
00218 
00219 
00220 
if __name__ == '__main__':
    # Demo: parse a saved UniGene HTML page and dump everything extracted.
    handle = open('Hs13225.htm')
    try:
        # Hand the parser the UndoHandle built here; previously it was
        # created and then ignored in favour of the raw file handle.
        undo_handle = Bio.File.UndoHandle(handle)
        unigene_parser = UniGeneParser()
        unigene_parser.parse(undo_handle)
        unigene_parser.print_tags()
    finally:
        # Release the file even if parsing or printing fails.
        handle.close()
00227 
00228