Back to index

python-biopython  1.60
IUPACData.py
Go to the documentation of this file.
00001 # Information about the IUPAC alphabets
00002 
# IUPAC protein alphabet: one-letter codes of the 20 standard amino acids.
protein_letters = "ACDEFGHIKLMNPQRSTVWY"
# The standard codes followed by the extra codes B, X, Z, J, U and O
# (each one is explained in the comments that follow).
extended_protein_letters = protein_letters + "BXZJUO"
00005 #   B = "Asx";  aspartic acid or asparagine (D or N)
00006 #   X = "Xxx";  unknown or 'other' amino acid
00007 #   Z = "Glx";  glutamic acid or glutamine (E or Q)
00008 #   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
00009 #
00010 #   J = "Xle";  leucine or isoleucine (L or I, used in NMR)
00011 #   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
00012 #   Also the International Nucleotide Sequence Database Collaboration (INSDC)
00013 #   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
00014 #   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
00015 #
00016 #   Xle (J); Leucine or Isoleucine
00017 #   The residue abbreviations, Xle (the three-letter abbreviation) and J
00018 #   (the one-letter abbreviation), are reserved for cases in which leucine
00019 #   cannot be experimentally distinguished from isoleucine.
00020 #
00021 #   U = "Sec";  selenocysteine
00022 #   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
00023 #
00024 #   O = "Pyl";  pyrrolysine
00025 #   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35
# Unambiguous alphabets: the four bases of each nucleic acid.
unambiguous_dna_letters = "GATC"
unambiguous_rna_letters = "GAUC"

# Ambiguous alphabets: the four bases followed by the ambiguity codes.
ambiguous_dna_letters = unambiguous_dna_letters + "RYWSMKHBVDN"
ambiguous_rna_letters = unambiguous_rna_letters + "RYWSMKHBVDN"

# Extended DNA alphabet: the four bases plus these additional letters.
#   B == 5-bromouridine
#   D == 5,6-dihydrouridine
#   S == thiouridine
#   W == wyosine
extended_dna_letters = unambiguous_dna_letters + "BDSW"
00036 
00037 # are there extended forms?
00038 #extended_rna_letters = "GAUCBDSW"
00039 
# Maps each DNA letter to the string of unambiguous bases it can stand for.
ambiguous_dna_values = dict(
    # the four bases stand for themselves
    A="A", C="C", G="G", T="T",
    # two-base codes
    M="AC", R="AG", W="AT", S="CG", Y="CT", K="GT",
    # three-base codes
    V="ACG", H="ACT", D="AGT", B="CGT",
    # any base
    X="GATC", N="GATC",
)
# Maps each RNA letter to the string of unambiguous bases it can stand for.
ambiguous_rna_values = dict(
    # the four bases stand for themselves
    A="A", C="C", G="G", U="U",
    # two-base codes
    M="AC", R="AG", W="AU", S="CG", Y="CU", K="GU",
    # three-base codes
    V="ACG", H="ACU", D="AGU", B="CGU",
    # any base
    X="GAUC", N="GAUC",
)
00076 
# Complement of every DNA letter, ambiguity codes included.  The two strings
# are aligned position by position: letter on top, its complement below.
ambiguous_dna_complement = dict(
    zip(
        "ACGTMRWSYKVHDBXN",
        "TGCAKYWSRMBDHVXN",
    )
)
00095 
# Complement of every RNA letter, ambiguity codes included.  The two strings
# are aligned position by position: letter on top, its complement below.
ambiguous_rna_complement = dict(
    zip(
        "ACGUMRWSYKVHDBXN",
        "UGCAKYWSRMBDHVXN",
    )
)
00114 
00115 
00116 def _make_ranges(mydict):
00117     d = {}
00118     for key, value in mydict.iteritems():
00119         d[key] = (value, value)
00120     return d
00121 
# Weights of the four DNA bases, taken from bioperl's SeqStats.pm.
unambiguous_dna_weights = dict(A=347.0, C=323.0, G=363.0, T=322.0)
# The same table with every weight expressed as a (weight, weight) pair.
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
00130 
# A, C and G reuse the DNA weights plus 16 for the oxygen; U is entered
# directly because it has no counterpart in the DNA table.
unambiguous_rna_weights = {
    base: unambiguous_dna_weights[base] + 16.0 for base in "ACG"
}
unambiguous_rna_weights["U"] = 340.0
# The same table with every weight expressed as a (weight, weight) pair.
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
00138 
00139 def _make_ambiguous_ranges(mydict, weight_table):
00140     range_d = {}
00141     avg_d = {}
00142     for letter, values in mydict.iteritems():
00143         #Following line is a quick hack to skip undefined weights for U and O
00144         if len(values)==1 and values[0] not in weight_table : continue
00145         weights = map(weight_table.get, values)
00146         range_d[letter] = (min(weights), max(weights))
00147         total_w = 0.0
00148         for w in weights:
00149             total_w = total_w + w
00150         avg_d[letter] = total_w / len(weights)
00151     return range_d, avg_d
00152 
# (minimum, maximum) weight ranges and average weights for every DNA letter,
# ambiguity codes included.
(
    ambiguous_dna_weight_ranges,
    avg_ambiguous_dna_weights,
) = _make_ambiguous_ranges(ambiguous_dna_values, unambiguous_dna_weights)

# The same two tables for the RNA alphabet.
(
    ambiguous_rna_weight_ranges,
    avg_ambiguous_rna_weights,
) = _make_ambiguous_ranges(ambiguous_rna_values, unambiguous_rna_weights)
00160 
# Weight of each standard amino acid, keyed by its one-letter code.
# O and U have no recorded weight yet, so they are deliberately absent.
protein_weights = dict(
    A=89.09,
    C=121.16,
    D=133.10,
    E=147.13,
    F=165.19,
    G=75.07,
    H=155.16,
    I=131.18,
    K=146.19,
    L=131.18,
    M=149.21,
    N=132.12,
    # O=0.0,  # Needs to be recorded!
    P=115.13,
    Q=146.15,
    R=174.20,
    S=105.09,
    T=119.12,
    # U=168.05,  # To be confirmed
    V=117.15,
    W=204.23,
    Y=181.19,
)
00185 
# Maps each of the 26 letters to the string of residues it can stand for.
# Start with every letter standing for itself, in alphabetical order ...
extended_protein_values = {
    letter: letter for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
}
# ... then overwrite the ambiguity codes.  Updating existing keys leaves
# the alphabetical key order untouched.
extended_protein_values.update(
    B="ND",
    J="IL",
    X="ACDEFGHIKLMNPQRSTVWY",
    # TODO - Include U and O in the possible values of X?
    # This could alter the extended_protein_weight_ranges ...
    Z="QE",
)
00216     
# Every recorded amino-acid weight expressed as a (weight, weight) pair.
protein_weight_ranges = _make_ranges(protein_weights)

# (minimum, maximum) weight ranges and average weights for the extended
# protein letters; letters without a recorded weight (U, O) are skipped.
(
    extended_protein_weight_ranges,
    avg_extended_protein_weights,
) = _make_ambiguous_ranges(extended_protein_values, protein_weights)
00222 
00223 
# Atomic weights keyed by element symbol, used for centre-of-mass
# calculations.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
atom_weights = {
    "H": 1.00794,
    "He": 4.002602,
    "Li": 6.941,
    "Be": 9.012182,
    "B": 10.811,
    "C": 12.0107,
    "N": 14.0067,
    "O": 15.9994,
    "F": 18.9984032,
    "Ne": 20.1797,
    "Na": 22.989770,
    "Mg": 24.3050,
    "Al": 26.981538,
    "Si": 28.0855,
    "P": 30.973761,
    "S": 32.065,
    "Cl": 35.453,
    "Ar": 39.948,
    "K": 39.0983,
    "Ca": 40.078,
    "Sc": 44.955910,
    "Ti": 47.867,
    "V": 50.9415,
    "Cr": 51.9961,
    "Mn": 54.938049,
    "Fe": 55.845,
    "Co": 58.933200,
    "Ni": 58.6934,
    "Cu": 63.546,
    "Zn": 65.39,
    "Ga": 69.723,
    "Ge": 72.64,
    "As": 74.92160,
    "Se": 78.96,
    "Br": 79.904,
    "Kr": 83.80,
    "Rb": 85.4678,
    "Sr": 87.62,
    "Y": 88.90585,
    "Zr": 91.224,
    "Nb": 92.90638,
    "Mo": 95.94,
    "Tc": 98.0,
    "Ru": 101.07,
    "Rh": 102.90550,
    "Pd": 106.42,
    "Ag": 107.8682,
    "Cd": 112.411,
    "In": 114.818,
    "Sn": 118.710,
    "Sb": 121.760,
    "Te": 127.60,
    "I": 126.90447,
    "Xe": 131.293,
    "Cs": 132.90545,
    "Ba": 137.327,
    "La": 138.9055,
    "Ce": 140.116,
    "Pr": 140.90765,
    "Nd": 144.24,
    "Pm": 145.0,
    "Sm": 150.36,
    "Eu": 151.964,
    "Gd": 157.25,
    "Tb": 158.92534,
    "Dy": 162.50,
    "Ho": 164.93032,
    "Er": 167.259,
    "Tm": 168.93421,
    "Yb": 173.04,
    "Lu": 174.967,
    "Hf": 178.49,
    "Ta": 180.9479,
    "W": 183.84,
    "Re": 186.207,
    "Os": 190.23,
    "Ir": 192.217,
    "Pt": 195.078,
    "Au": 196.96655,
    "Hg": 200.59,
    "Tl": 204.3833,
    "Pb": 207.2,
    "Bi": 208.98038,
    "Po": 208.98,
    "At": 209.99,
    "Rn": 222.02,
    "Fr": 223.02,
    "Ra": 226.03,
    "Ac": 227.03,
    "Th": 232.0381,
    "Pa": 231.03588,
    "U": 238.02891,
    "Np": 237.05,
    "Pu": 244.06,
    "Am": 243.06,
    "Cm": 247.07,
    "Bk": 247.07,
    "Cf": 251.08,
    "Es": 252.08,
    "Fm": 257.10,
    "Md": 258.10,
    "No": 259.10,
    "Lr": 262.11,
    "Rf": 261.11,
    "Db": 262.11,
    "Sg": 266.12,
    "Bh": 264.12,
    "Hs": 269.13,
    "Mt": 268.14,
}