Back to index

python-biopython  1.60
CheckSum.py
Go to the documentation of this file.
00001 # Copyright 2002 by Yves Bastide and Brad Chapman.
00002 # Copyright 2007 by Sebastian Bassi
00003 # All rights reserved.
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 
00008 """Functions to calculate assorted sequence checksums."""
00009 
00010 # crc32, crc64, gcg, and seguid
00011 # crc64 is adapted from BioPerl
00012 
00013 from binascii import crc32 as _crc32
00014 from Bio._py3k import _as_bytes
00015 
def crc32(seq):
    """Returns the crc32 checksum for a sequence (string or Seq object).

    If the argument has a ``tostring()`` method (i.e. it is treated as a
    Seq object) that method is used to obtain the text; otherwise the
    argument is used as given.  The text is passed through ``_as_bytes``
    and then to ``binascii.crc32``, whose result is returned unchanged.
    """
    #NOTE - On Python 2 returns a signed int, on Python 3 it is unsigned
    #Docs suggest should use crc32(x) & 0xffffffff for consistency.
    #TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        #Assume it's a Seq object
        seq = seq.tostring()
    except AttributeError:
        #Assume it's a string/unicode
        pass
    #The checksum is computed outside the try block so that an
    #AttributeError raised during the bytes conversion is reported
    #instead of triggering a silent retry on the original object.
    return _crc32(_as_bytes(seq))
00027 
def _init_table_h():
    """Build the 256-entry lookup table used by crc64.

    For each byte value 0..255 the value is run through eight shift
    rounds; whenever the bit shifted out of the lower half is set, the
    constant 0xd8000000 is XORed into the upper half.  Only the upper
    half of each result is stored.

    Plain integer literals are used (no ``L`` suffix) so the module also
    parses on Python 3; Python 2 promotes ints to longs automatically, so
    the table values are the same.
    """
    table = []
    for byte in range(256):
        part_l = byte
        part_h = 0
        for _ in range(8):
            rflag = part_l & 1
            part_l >>= 1
            #Carry the lowest bit of the upper half into bit 31 of the
            #lower half before shifting the upper half.
            if part_h & 1:
                part_l |= (1 << 31)
            part_h >>= 1
            if rflag:
                part_h ^= 0xd8000000
        table.append(part_h)
    return table

# Initialisation
_table_h = _init_table_h()

def crc64(s):
    """Returns the crc64 checksum for a sequence (string or Seq object).

    The argument is iterated one character at a time and ``ord()`` is
    applied to each item.  The checksum is kept as two halves (high and
    low) and returned as a string of the form "CRC-" followed by the high
    half and the low half, each formatted as 8 uppercase hex digits.
    An empty sequence gives "CRC-0000000000000000".
    """
    crcl = 0
    crch = 0
    for c in s:
        #Shift the combined value right by 8 bits: the low byte of the
        #high half moves into the top byte of the low half.
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        #The table index mixes the current character with the low byte
        #of the (unshifted) low half.
        idx = (crcl ^ ord(c)) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return "CRC-%08X%08X" % (crch, crcl)
00058 
00059 
def gcg(seq):
    """Returns the GCG checksum (int) for a sequence (string or Seq object).

    Accepts a nucleotide or amino-acid sequence (or any string).  An
    object with a tostring() method is converted to a string first.
    Every character is uppercased and its ordinal is multiplied by a
    weight that counts 1, 2, ..., 57 and then starts again at 1; the
    weighted sum modulo 10000 is returned.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    """
    try:
        #Seq objects provide tostring(); plain strings do not
        seq = seq.tostring()
    except AttributeError:
        pass
    weighted = (
        (position % 57 + 1) * ord(residue.upper())
        for position, residue in enumerate(seq)
    )
    return sum(weighted) % 10000
00081 
def seguid(seq):
    """Returns the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier):
    the SHA-1 digest of the uppercased sequence, base64 encoded, with
    newlines and trailing "=" padding removed.

    An object with a tostring() method is converted to a string first.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    try:
        #Python 2.5 sha1 is in hashlib
        import hashlib
    except ImportError:
        #For older versions
        import sha
        m = sha.new()
    else:
        m = hashlib.sha1()
    import base64
    try:
        #Assume it's a Seq object
        seq = seq.tostring()
    except AttributeError:
        #Assume it's a string
        pass
    m.update(_as_bytes(seq.upper()))
    digest = m.digest()
    try:
        #For Python 3+
        return base64.encodebytes(digest).decode().replace("\n","").rstrip("=")
    except AttributeError:
        #base64.encodebytes is not available, try the older APIs below
        pass
    try:
        #For Python 2.5+
        return base64.b64encode(digest).rstrip("=")
    except AttributeError:
        #For older versions (no base64.b64encode)
        #Note: Using os.linesep doesn't work on Windows,
        #where os.linesep= "\r\n" but the encoded string
        #contains "\n" but not "\r\n"
        return base64.encodestring(digest).replace("\n","").rstrip("=")
00122 
if __name__ == "__main__":
    #Quick self test, run only when the module is executed as a script.
    #A parenthesised single-argument print works both as the Python 2
    #statement and as the Python 3 function.
    print("Quick self test")

    str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                    + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                    + "YCSSYAGSSTLVFGGGTKLTVL"

    str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                    + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                    + "YCCSYAGSSTWVFGGGTKLTVL"

    #The two sequences differ, yet they share the same crc64 value...
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    #...while their SEGUIDs are distinct.
    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")