Back to index

python-biopython  1.60
test_SeqUtils.py
Go to the documentation of this file.
00001 # Copyright 2007-2009 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 import os
00007 
00008 from Bio.SeqUtils import GC, quick_FASTA_reader
00009 from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
00010 from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
00011 from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
00012 from Bio.SeqRecord import SeqRecord
00013 from Bio.Seq import Seq, MutableSeq
00014 from Bio.Alphabet import single_letter_alphabet
00015 from Bio import SeqIO
00016 
00017 
00018 ######################
00019 # quick_FASTA_reader #
00020 ######################
00021 
00022 dna_fasta_filename = "Fasta/f002"
00023 
00024 tuple_records = quick_FASTA_reader(dna_fasta_filename)
00025 assert len(tuple_records)==3
00026 seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
00027 assert len(seq_records)==3
00028 for tuple_record, seq_record in zip(tuple_records, seq_records):
00029     assert tuple_record == (seq_record.description, seq_record.seq.tostring())
00030     print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))
00031 
00032 ##############
00033 # CodonUsage #
00034 ##############
00035 
00036 print
00037 print "Codon Adaption Index (CAI)"
00038 CAI = CodonAdaptationIndex()
00039 # Note - this needs a whole number of codons, and a DNA seq AS A STRING.
00040 print "Example CAI %0.5f using E. coli (default)" \
00041       % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
00042 
00043 #We need a FASTA file of CDS sequences to count the codon usage...
00044 dna_fasta_filename = "fasta.tmp"
00045 dna_genbank_filename = "GenBank/NC_005816.gb"
00046 record = SeqIO.read(open(dna_genbank_filename), "genbank")
00047 records = []
00048 for feature in record.features:
00049     if feature.type == "CDS" \
00050     and not feature.sub_features:
00051         start = feature.location.start.position
00052         end = feature.location.end.position
00053         table = int(feature.qualifiers["transl_table"][0])
00054         if feature.strand == -1:
00055             seq = record.seq[start:end].reverse_complement()
00056         else:
00057             seq = record.seq[start:end]
00058         #Double check we have the CDS sequence expected
00059         #TODO - Use any cds_start option if/when added to deal with the met
00060         a = "M" + str(seq[3:].translate(table))
00061         b = feature.qualifiers["translation"][0]+"*"
00062         assert a == b, "%r vs %r" % (a,b)
00063         records.append(SeqRecord(seq, id=feature.qualifiers["protein_id"][0],
00064                                  description=feature.qualifiers["product"][0]))
00065 del start, end, table, seq
00066 if os.path.isfile(dna_fasta_filename):
00067     os.remove(dna_fasta_filename)
00068 handle = open(dna_fasta_filename, "w")
00069 SeqIO.write(records, handle, "fasta")
00070 handle.close()
00071 
00072 CAI = CodonAdaptationIndex()
00073 # Note - this needs a FASTA file which containing non-ambiguous DNA coding
00074 # sequences - which should each be a whole number of codons.
00075 CAI.generate_index(dna_fasta_filename)
00076 print "Example CAI %0.5f using %s" \
00077       % (CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
00078          record.annotations["source"])
00079 
00080 os.remove(dna_fasta_filename)
00081 del record, records
00082 del dna_genbank_filename
00083 del dna_fasta_filename
00084 
00085 print
00086 
00087 ###################
00088 # crc64 collision #
00089 ###################
00090 
00091 # Example of crc64 collision from Sebastian Bassi using the
00092 # immunoglobulin lambda light chain variable region from Homo sapiens
00093 # Both sequences share the same CRC64 checksum: 44CAAD88706CC153
00094 str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
00095                 + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
00096                 + "YCSSYAGSSTLVFGGGTKLTVL"
00097 str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
00098                 + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
00099                 + "YCCSYAGSSTWVFGGGTKLTVL"
00100 
00101 #Explicit testing of crc64 collision:
00102 assert str_light_chain_one != str_light_chain_two
00103 assert crc32(str_light_chain_one) != crc32(str_light_chain_two)
00104 assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
00105 assert gcg(str_light_chain_one) != gcg(str_light_chain_two)
00106 assert seguid(str_light_chain_one) != seguid(str_light_chain_two)
00107 
00108 ###########################
00109 # main checksum/LCC tests #
00110 ###########################
00111 
00112 #Print some output, which the test harness will check
00113 examples = [str_light_chain_one, str_light_chain_two,
00114             "ATGCGTATCGATCGCGATACGATTAGGCGGAT"]
00115 
00116 def u_crc32(seq):
00117     #NOTE - On Python 2 crc32 could return a signed int, but on Python 3 it is
00118     #always unsigned
00119     #Docs suggest should use crc32(x) & 0xffffffff for consistency.
00120     return crc32(seq) & 0xffffffff 
00121 
00122 for i, seq_str in enumerate(examples):
00123     print "Example %i, length %i, %s..." % (i+1, len(seq_str), seq_str[:10])
00124 
00125     #Avoid cross platforms with printing floats by doing conversion explicitly
00126     def simple_LCC(s):
00127         return "%0.2f" % lcc_simp(s)
00128 
00129     def windowed_LCC(s):
00130         return ", ".join(["%0.2f" % v for v in lcc_mult(s,20)])
00131 
00132     for checksum in [u_crc32, crc64, gcg, seguid, simple_LCC, windowed_LCC]:
00133         #First using a string:
00134         value = checksum(seq_str)
00135         print " %s = %s" % (checksum.__name__, value)
00136         #Secondly check it works with a Seq object
00137         assert value == checksum(Seq(seq_str, single_letter_alphabet))
00138         #Finally check it works with a MutableSeq object
00139         assert value == checksum(MutableSeq(seq_str, single_letter_alphabet))
00140