Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2006 by Sean Davis.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 #
00006 # $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $
00007 # Sean Davis <sdavis2 at mail dot nih dot gov>
00008 # National Cancer Institute
00009 # National Institutes of Health
00010 # Bethesda, MD, USA
00011 #
00012 
00013 """Parse Unigene flat file format files such as the Hs.data file.
00014 
00015 Here is an overview of the flat file format that this parser deals with:
00016    Line types/qualifiers:
00017 
00018        ID           UniGene cluster ID
00019        TITLE        Title for the cluster
00020        GENE         Gene symbol
00021        CYTOBAND     Cytological band
00022        EXPRESS      Tissues of origin for ESTs in cluster
00023        RESTR_EXPR   Single tissue or development stage contributes 
00024                     more than half the total EST frequency for this gene.
00025        GNM_TERMINUS genomic confirmation of presence of a 3' terminus; 
00026                     T if a non-templated polyA tail is found among 
00027                     a cluster's sequences; else
00028                     I if templated As are found in genomic sequence or
00029                     S if a canonical polyA signal is found on 
00030                       the genomic sequence
00031        GENE_ID      Entrez gene identifier associated with at least one
00032                     sequence in this cluster; 
00033                     to be used instead of LocusLink.  
00034        LOCUSLINK    LocusLink identifier associated with at least one
00035                     sequence in this cluster;  
00036                     deprecated in favor of GENE_ID
00037        HOMOL        Homology;
00038        CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping
00039                     on the arabidopsis genome.
00040        STS          STS
00041             ACC=         GenBank/EMBL/DDBJ accession number of STS
00042                          [optional field]
00043             UNISTS=      identifier in NCBI's UNISTS database
00044        TXMAP        Transcript map interval
00045             MARKER=      Marker found on at least one sequence in this
00046                          cluster
00047             RHPANEL=     Radiation Hybrid panel used to place marker
00048        PROTSIM      Protein Similarity data for the sequence with
00049                     highest-scoring protein similarity in this cluster
00050             ORG=         Organism
00051             PROTGI=      Sequence GI of protein
00052             PROTID=      Sequence ID of protein
00053             PCT=         Percent alignment
00054             ALN=         length of aligned region (aa)
00055        SCOUNT       Number of sequences in the cluster
00056        SEQUENCE     Sequence
00057             ACC=         GenBank/EMBL/DDBJ accession number of sequence
00058             NID=         Unique nucleotide sequence identifier (gi)
00059             PID=         Unique protein sequence identifier (used for
00060                          non-ESTs)
00061             CLONE=       Clone identifier (used for ESTs only)
00062             END=         End (5'/3') of clone insert read (used for
00063                          ESTs only) 
00064             LID=         Library ID; see Hs.lib.info for library name
00065                          and tissue
00066             MGC=         5' CDS-completeness indicator; if present, the
00067                          clone associated with this sequence is believed
00068                          CDS-complete. A value greater than 511 is the gi
00069                          of the CDS-complete mRNA matched by the EST,
00070                          otherwise the value is an indicator of the
00071                          reliability of the test indicating CDS
00072                          completeness; higher values indicate more
00073                          reliable CDS-completeness predictions. 
00074            SEQTYPE=      Description of the nucleotide sequence.
00075                          Possible values are mRNA, EST and HTC.
00076            TRACE=        The Trace ID of the EST sequence, as provided by
00077                          NCBI Trace Archive
00078 """
00079 
00080 
class SequenceLine(object):
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive

    Besides the fields above, every instance also carries:
    text         the raw text the object was built from ('' if none)
    is_image     True when the CLONE value starts with 'IMAGE'
    image        the CLONE value with its first six characters removed
                 (only set when is_image is True, otherwise '')
    """

    def __init__(self, text=None):
        """Create an empty line object, or parse ``text`` if it is given."""
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        # Always define ``text`` so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Populate the attributes from '; '-separated KEY=value pairs.

        Raises ValueError if a field contains no '=' at all.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so values may themselves hold '='.
            key, val = part.split("=", 1)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                # Drop the 5-letter 'IMAGE' prefix plus the separator after it.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this line was built from."""
        return self.text
00135         
00136 
class ProtsimLine(object):
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    Every instance also carries ``text``, the raw text it was built from
    ('' if none was given).  All parsed values are kept as strings.
    """

    def __init__(self, text=None):
        """Create an empty line object, or parse ``text`` if it is given."""
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define ``text`` so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Populate the attributes from '; '-separated KEY=value pairs.

        Raises ValueError if a field contains no '=' at all.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so values may themselves hold '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this line was built from."""
        return self.text
00169         
00170 
class STSLine(object):
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    Every instance also carries ``text``, the raw text it was built from
    ('' if none was given).
    """

    def __init__(self, text=None):
        """Create an empty line object, or parse ``text`` if it is given."""
        self.acc = ''
        self.unists = ''
        # Always define ``text`` so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Populate the attributes from whitespace-separated KEY=value pairs.

        Raises ValueError if a field contains no '=' at all.
        """
        # split() without an argument collapses runs of whitespace, so
        # repeated spaces between fields do not produce empty fields.
        for part in text.split():
            # Split on the first '=' only, so values may themselves hold '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this line was built from."""
        return self.text
00198         
00199 
class Record(object):
    """Container for the data of one Unigene cluster record.

    A newly created record exposes the following attributes, each holding
    an empty string or an empty list until it is filled in:

        ID            ''   ID line
        species       ''   Hs, Bt, etc.
        title         ''   TITLE line
        symbol        ''   GENE line
        cytoband      ''   CYTOBAND line
        express       []   EXPRESS line, as an array of strings
        restr_expr    ''   RESTR_EXPR line
        gnm_terminus  ''   GNM_TERMINUS line
        gene_id       ''   GENE_ID line
        locuslink     ''   LOCUSLINK line
        homol         ''   HOMOL line
        chromosome    ''   CHROMOSOME line
        protsim       []   PROTSIM entries (type ProtsimLine)
        sequence      []   SEQUENCE entries (type SequenceLine)
        sts           []   STS entries (type STSLine)
        txmap         []   TXMAP entries
    """

    def __init__(self):
        # (attribute, factory) pairs, listed in the order the attributes
        # are created.  Calling the factory gives every instance its own
        # fresh '' or [] rather than a shared object.
        layout = (
            ("ID", str),            # ID line
            ("species", str),       # Hs, Bt, etc.
            ("title", str),         # TITLE line
            ("symbol", str),        # GENE line
            ("cytoband", str),      # CYTOBAND line
            ("express", list),      # EXPRESS line
            ("restr_expr", str),    # RESTR_EXPR line
            ("gnm_terminus", str),  # GNM_TERMINUS line
            ("gene_id", str),       # GENE_ID line
            ("locuslink", str),     # LOCUSLINK line
            ("homol", str),         # HOMOL line
            ("chromosome", str),    # CHROMOSOME line
            ("protsim", list),      # PROTSIM entries
            ("sequence", list),     # SEQUENCE entries
            ("sts", list),          # STS entries
            ("txmap", list),        # TXMAP entries
        )
        for attribute, make_default in layout:
            setattr(self, attribute, make_default())

    def __repr__(self):
        # First line: class name, cluster ID and gene symbol; second: title.
        class_name = self.__class__.__name__
        header = "<%s> %s %s" % (class_name, self.ID, self.symbol)
        return "%s\n%s" % (header, self.title)
00248 
def parse(handle):
    """Iterate over the Unigene records found in ``handle``.

    Records are produced one at a time, each obtained from ``_read``;
    iteration ends as soon as ``_read`` returns nothing.
    """
    current = _read(handle)
    while current:
        yield current
        # Only fetch the next record once the consumer asks for it.
        current = _read(handle)
00255 
00256 
def read(handle):
    """Read exactly one Unigene record from ``handle`` and return it.

    Raises ValueError if the handle contains no record, or if any data
    is left in the handle after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
00266 
00267 
00268 # Everything below is private
00269 
00270 
def _read(handle):
    """Read the next Unigene record from ``handle``.

    Each line is split into a tag (the first 12 columns) and a value (the
    rest of the line).  An ``ID`` line starts a new record and a ``//``
    line ends it.

    Returns the completed Record, or None if the handle is exhausted
    before any ``ID`` line is seen.

    Raises ValueError when:
      - a line appears before the first ``ID`` line;
      - a ``HOMOL`` value is neither ``YES`` nor ``NO``;
      - the record has no ``SCOUNT`` line, or its count disagrees with
        the number of ``SEQUENCE`` lines read;
      - a tag is not handled below (NOTE: this includes GNM_TERMINUS and
        TXMAP, which have no branch here);
      - the handle ends in the middle of a record.
    """
    UG_INDENT = 12  # width of the tag column
    record = None
    scount = None  # stays None until a SCOUNT line has been read
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split('.')[0]
        elif record is None:
            # Every branch below needs a record to store into; fail with a
            # parse error instead of an AttributeError on None.
            raise ValueError("Unexpected line before the ID line: %s" % line)
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line)
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == '//':
            found = len(record.sequence)
            if scount is None:
                raise ValueError("The record has no SCOUNT line, so the number of sequences found (%d) cannot be checked" % found)
            if found != scount:
                raise ValueError("The number of sequences specified in the record (%d) does not agree with the number of sequences found (%d)" % (scount, found))
            return record
        else:
            raise ValueError("Unknown tag %s" % tag)
    if record:
        # An ID line was seen but the closing '//' never arrived.
        raise ValueError("Unexpected end of stream.")