Back to index

python-biopython  1.60
PirIO.py
Go to the documentation of this file.
00001 # Copyright 2008-2009 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 #
00006 # This module is for reading and writing PIR or NBRF format files as
00007 # SeqRecord objects.  The code is based on Bio.SeqIO.FastaIO
00008 
00009 """Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
00010 
00011 You are expected to use this module via the Bio.SeqIO functions, or if
00012 the file contains a sequence alignment, optionally via Bio.AlignIO instead.
00013 
00014 This format was introduced for the Protein Information Resource (PIR), a
00015 project of the National Biomedical Research Foundation (NBRF).  The PIR
00016 database itself is now part of UniProt.
00017 
00018 The file format is described online at:
00019 http://www.ebi.ac.uk/help/pir_frame.html
00020 http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
00021 
00022 An example file in this format would be:
00023 
00024 >P1;CRAB_ANAPL
00025 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
00026   MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR 
00027   SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH 
00028   GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ 
00029   SDVPERSIPI TREEKPAIAG AQRK*
00030 
00031 >P1;CRAB_BOVIN
00032 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
00033   MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR 
00034   PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV 
00035   HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK 
00036   QASGPERTIP ITREEKPAVT AAPKK*
00037 
00038 Or, an example of a multiple sequence alignment:
00039 
00040 >P1;S27231
00041 rhodopsin - northern leopard frog
00042 MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
00043 VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
00044 GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
00045 EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
00046 ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
00047 YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
00048 
00049 >P1;I51200
00050 rhodopsin - African clawed frog
00051 MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
00052 VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
00053 GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
00054 EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
00055 LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
00056 YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
00057 
00058 >P1;JN0120
00059 rhodopsin - Japanese lamprey
00060 MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
00061 VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
00062 GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
00063 EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
00064 ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
00065 YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
00066 
00067 
00068 As with the FASTA format, each record starts with a line begining with ">"
00069 character.  There is then a two letter sequence type (P1, F1, DL, DC, RL,
00070 RC, or XX), a semi colon, and the identification code.  The second like is
00071 free text description.  The remaining lines contain the sequence itself,
00072 terminating in an asterisk.  Space separated blocks of ten letters as shown
00073 above are typical.
00074 
00075 Sequence codes and their meanings:
00076 
00077 P1 - Protein (complete)
00078 F1 - Protein (fragment)
00079 D1 - DNA (e.g. EMBOSS seqret output)
00080 DL - DNA (linear)
00081 DC - DNA (circular)
00082 RL - RNA (linear)
00083 RC - RNA (circular)
00084 N3 - tRNA
00085 N1 - Other functional RNA
00086 XX - Unknown
00087 """
00088 
00089 from Bio.Alphabet import single_letter_alphabet, generic_protein, \
00090                          generic_dna, generic_rna
00091 from Bio.Seq import Seq
00092 from Bio.SeqRecord import SeqRecord
00093 
00094 _pir_alphabets = {"P1" : generic_protein,
00095                   "F1" : generic_protein,
00096                   "D1" : generic_dna,
00097                   "DL" : generic_dna,
00098                   "DC" : generic_dna,
00099                   "RL" : generic_rna,
00100                   "RC" : generic_rna,
00101                   "N3" : generic_rna,
00102                   "XX" : single_letter_alphabet,
00103                   }
00104 
00105 #This is a generator function!
00106 def PirIterator(handle):
00107     """Generator function to iterate over Fasta records (as SeqRecord objects).
00108 
00109     handle - input file
00110     alphabet - optional alphabet
00111     title2ids - A function that, when given the title of the FASTA
00112     file (without the beginning >), will return the id, name and
00113     description (in that order) for the record as a tuple of strings.
00114 
00115     If this is not given, then the entire title line will be used
00116     as the description, and the first word as the id and name.
00117 
00118     Note that use of title2ids matches that of Bio.Fasta.SequenceParser
00119     but the defaults are slightly different.
00120     """
00121     #Skip any text before the first record (e.g. blank lines, comments)
00122     while True:
00123         line = handle.readline()
00124         if line == "":
00125             return #Premature end of file, or just empty?
00126         if line[0] == ">":
00127             break
00128 
00129     while True:
00130         if line[0] != ">":
00131             raise ValueError(\
00132                 "Records in PIR files should start with '>' character")
00133         pir_type = line[1:3]
00134         if pir_type not in _pir_alphabets or line[3] != ";":
00135             raise ValueError(\
00136                 "Records should start with '>XX;' "
00137                 "where XX is a valid sequence type")
00138         identifier = line[4:].strip()
00139         description = handle.readline().strip()
00140         
00141             
00142         lines = []
00143         line = handle.readline()
00144         while True:
00145             if not line:
00146                 break
00147             if line[0] == ">":
00148                 break
00149             #Remove trailing whitespace, and any internal spaces
00150             lines.append(line.rstrip().replace(" ",""))
00151             line = handle.readline()
00152         seq = "".join(lines)
00153         if seq[-1] != "*":
00154             #Note the * terminator is present on nucleotide sequences too,
00155             #it is not a stop codon!
00156             raise ValueError(\
00157                 "Sequences in PIR files should include a * terminator!")
00158             
00159         #Return the record and then continue...
00160         record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
00161                            id = identifier, name = identifier,
00162                            description = description)
00163         record.annotations["PIR-type"] = pir_type
00164         yield record
00165 
00166         if not line : return #StopIteration
00167     assert False, "Should not reach this line"
00168 
00169 if __name__ == "__main__":
00170     print "Running quick self test"
00171 
00172     import os
00173     
00174     for name in ["clustalw",  "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"]:
00175         print name
00176         filename = "../../Tests/NBRF/%s.pir" % name
00177         if not os.path.isfile(filename):
00178             print "Missing %s" % filename
00179             continue
00180 
00181         records = list(PirIterator(open(filename)))
00182         count = 0
00183         for record in records:
00184             count += 1
00185             parts = record.description.split()
00186             if "bases," in parts:
00187                 assert len(record) == int(parts[parts.index("bases,")-1])
00188         print "Could read %s (%i records)" % (name, count)
00189