Back to index

python-biopython  1.60
nextorf.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # Created: Thu Feb 15 14:22:12 2001
00003 # Last changed: Time-stamp: <01/02/18 11:16:42 thomas>
00004 # Copyright 2000 by Thomas Sicheritz-Ponten.  All rights reserved.
00005 # This code is part of the Biopython distribution and governed by its
00006 # license.  Please see the LICENSE file that should have been included
00007 # as part of this package.
00008 # Authors: Thomas Sicheritz-Ponten and Jan O. Andersson
00009 # thomas@cbs.dtu.dk, http://www.cbs.dtu.dk/thomas
00010 # Jan.O.Andersson@home.se
00011 # File: nextorf.py
00012 
00013 import re
00014 import os, sys, commands
00015 import getopt
00016 
00017 from Bio import SeqIO
00018 from Bio.Seq import Seq
00019 from Bio import Alphabet
00020 from Bio.Alphabet import IUPAC
00021 from Bio.Data import IUPACData, CodonTable
00022 
00023 class ProteinX(Alphabet.ProteinAlphabet):
00024    letters = IUPACData.extended_protein_letters + "X"
00025 
00026 proteinX = ProteinX()
00027 
00028 class MissingTable:
00029   def __init__(self, table):
00030     self._table = table
00031   def get(self, codon, stop_symbol):
00032     try:
00033       return self._table.get(codon, stop_symbol)
00034     except CodonTable.TranslationError:
00035       return 'X'
00036 
00037 # Make the codon table given an existing table
00038 def makeTableX(table):
00039   assert table.protein_alphabet == IUPAC.extended_protein
00040   return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX,
00041                                MissingTable(table.forward_table),
00042                                table.back_table, table.start_codons,
00043                                table.stop_codons)
00044 
00045 
00046 
00047 class NextOrf:
00048     def __init__(self, file, options):
00049         self.options = options
00050         self.file = file
00051         self.genetic_code = int(self.options['table'])
00052         self.table = makeTableX(CodonTable.ambiguous_dna_by_id[self.genetic_code])
00053         self.counter = 0
00054         self.ReadFile()
00055         
00056     def ReadFile(self):
00057         handle = open(self.file)
00058         for record in SeqIO.parse(handle, "fasta"):
00059             self.header = record.id
00060             frame_coordinates = ''
00061             dir = self.options['strand']
00062             plus = dir in ['both', 'plus']
00063             minus = dir in ['both', 'minus']
00064             start, stop = int(self.options['start']), int(self.options['stop'])
00065             s = str(record.seq).upper()
00066             if stop > 0:
00067                s = s[start:stop]
00068             else:
00069                s = s[start:]
00070             self.seq = Seq(s,IUPAC.ambiguous_dna)
00071             self.length = len(self.seq)
00072             self.rseq = None
00073             CDS = []
00074             if plus:
00075                 CDS.extend(self.GetCDS(self.seq))
00076             if minus:
00077                 self.rseq = self.seq.reverse_complement()
00078                 CDS.extend(self.GetCDS(self.rseq, strand = -1))
00079             self.Output(CDS)
00080 
00081     def ToFasta(self, header, seq):
00082        seq = re.sub('(............................................................)','\\1\n',seq)
00083        return '>%s\n%s' % (header, seq)
00084 
00085     def Gc(self, seq):
00086        d = {}
00087        for nt in 'ATGC':
00088           d[nt] = seq.count(nt)
00089        gc = d['G'] + d['C']
00090        if gc == 0: return 0
00091        return round(gc*100.0/(d['A'] +d['T'] + gc),1)
00092 
00093     def Gc2(self,seq):
00094        l = len(seq)
00095        d= {}
00096        for nt in ['A','T','G','C']:
00097           d[nt] = [0,0,0]
00098           
00099        for i in range(0,l,3):
00100           codon = seq[i:i+3]
00101           if len(codon) <3: codon = codon + '  '
00102           for pos in range(0,3):
00103              for nt in ['A','T','G','C']:
00104                 if codon[pos] == nt: d[nt][pos] = d[nt][pos] +1
00105 
00106        gc = {}
00107        gcall = 0
00108        nall = 0
00109        for i in range(0,3):
00110           try:
00111              n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
00112              gc[i] = (d['G'][i] + d['C'][i])*100.0/n
00113           except:
00114              gc[i] = 0
00115 
00116           gcall = gcall + d['G'][i] + d['C'][i]
00117           nall = nall + n
00118 
00119        gcall = 100.0*gcall/nall
00120        res = '%.1f%%, %.1f%%, %.1f%%, %.1f%%' % (gcall, gc[0], gc[1], gc[2])
00121        return res
00122 
00123     def GetOrfCoordinates(self, seq):
00124         s = seq.data
00125         letters = []
00126         table = self.table
00127         get = self.table.forward_table.get
00128         n = len(seq)
00129         start_codons = self.table.start_codons
00130         stop_codons = self.table.stop_codons
00131 #        print 'Start codons', start_codons
00132 #        print 'Stop codons', stop_codons
00133         frame_coordinates = []
00134         for frame in range(0,3):
00135             coordinates = []
00136             for i in range(0+frame, n-n%3, 3):
00137                 codon = s[i:i+3]
00138                 if codon in start_codons: coordinates.append((i+1,1,codon))
00139                 elif codon in stop_codons: coordinates.append((i+1,0,codon))
00140             frame_coordinates.append(coordinates)
00141         return frame_coordinates
00142 
00143     def GetCDS(self, seq, strand = 1):
00144         frame_coordinates = self.GetOrfCoordinates(seq)
00145         START, STOP = 1,0
00146         so = self.options
00147         nostart = so['nostart']
00148         minlength, maxlength = int(so['minlength']), int(so['maxlength'])
00149         CDS = []
00150         f = 0
00151         for frame in frame_coordinates:
00152             f+=1
00153             start_site = 0
00154             if nostart == '1': start_site = 1
00155             frame.append((self.length, 0, 'XXX'))
00156             for pos, codon_type, codon in frame:
00157                 if codon_type == START:
00158                     if start_site == 0: start_site = pos
00159                 elif codon_type == STOP:
00160                     if start_site == 0: continue
00161 #                    if codon == 'XXX': print 'do something'
00162                     stop = pos + 2
00163 #                    print stop
00164                     length = stop - start_site +1
00165                     if length >= minlength and length <= maxlength:
00166                         if nostart == '1' and start_site == 1:
00167                            start_site = start_site + f - 1
00168                         if codon == 'XXX': stop = start_site + 3*((int((stop-1)-start_site)/3))
00169                         s = seq[start_site -1 : stop]
00170                         CDS.append((start_site, stop, length, s, strand*f))
00171                         start_site = 0
00172                         if nostart == '1': start_site = stop + 1
00173                     elif length < minlength or length > maxlength:
00174                        start_site = 0
00175                        if nostart == '1': start_site = stop + 1
00176                     del stop   
00177         return CDS
00178 
00179     def Output(self, CDS):
00180         out = self.options['output']
00181         seqs = (self.seq, self.rseq)
00182         n = len(self.seq)
00183         for start, stop, length, subs, strand in CDS:
00184             self.counter += 1
00185             if strand > 0: head = 'orf_%s:%s:%d:%d:%d' % (self.counter, self.header, strand, start,stop)
00186             if strand < 0: head = 'orf_%s:%s:%d:%d:%d' % (self.counter, self.header, strand, n-stop+1,n-start+1)
00187             if self.options['gc']:
00188                 head = '%s:%s' % (head, self.Gc2(subs.data))
00189                 
00190             if out == 'aa':
00191                 orf = subs.translate(table=self.genetic_code)
00192                 print self.ToFasta(head, orf.data)
00193             elif out == 'nt':
00194                 print self.ToFasta(head, subs.data)
00195             elif out == 'pos':
00196                 print head
00197                 
00198         
00199     
00200 def help():
00201     global options
00202     print 'Usage:', sys.argv[0], '(<options>) <FASTA file>'
00203     
00204     print 'Options:                                                       default'
00205     print '--start       Start position in sequence                             0'
00206     print '--stop        Stop position in sequence            (end of seqence)'
00207     print '--minlength   Minimum length of orf in bp                          100'
00208     print '--maxlength   Maximum length of orf in bp, default           100000000'
00209     print '--strand      Strand to analyse [both, plus, minus]               both'
00210     print '--frame       Frame to analyse [1 2 3]                             all'
00211     print '--noframe     Ignore start codons [0 1]                              0'  
00212     print '--output      Output to generate [aa nt pos]                        aa'
00213     print '--gc          Creates GC statistics of ORF [0 1]                     0'
00214     print '--table       Genetic code to use (see below)                        1'
00215     
00216 #    for a,b in options.items(): print '\t', a,b
00217 #    print ''
00218     print "\nNCBI's Codon Tables:"
00219     for key, table in CodonTable.ambiguous_dna_by_id.items():
00220         print '\t',key, table._codon_table.names[0]
00221     print '\ne.g.\n./nextorf.py --minlength 5 --strand plus --output nt --gc 1 testjan.fas'
00222     sys.exit(0)
00223     
00224 
00225 options = {
00226     'start': 0,
00227     'stop': 0,
00228     'minlength': 100,
00229     'maxlength': 100000000,
00230     'strand': 'both',
00231     'output': 'aa',
00232     'frames': [1,2,3],
00233     'gc': 0,
00234     'nostart': 0,
00235     'table': 1,
00236     }
00237 
00238 if __name__ == '__main__':
00239     args = sys.argv[1:]
00240     show_help = len(sys.argv)<=1
00241 
00242     shorts = 'hv'
00243     longs = map(lambda x: x +'=', options.keys()) + ['help']
00244 
00245     optlist, args = getopt.getopt(args,shorts, longs)
00246     if show_help: help()
00247 
00248     for arg in optlist:
00249         if arg[0] == '-h' or arg[0] == '--help':
00250             help()
00251             sys.exit(0)
00252         for key in options.keys():
00253             if arg[1].lower() == 'no': arg[1] = 0
00254             elif arg[1].lower() == 'yes': arg[1] = 1
00255             if arg[0][2:] == key: options[key] = arg[1]
00256             
00257         if arg[0] == '-v':print 'OPTIONS', options
00258 
00259     file = args[0]
00260     nextorf = NextOrf(file, options)