Back to index

python-biopython  1.60
Proux_et_al_2002_Figure_6.py
Go to the documentation of this file.
00001 """GenomeDiagram script to mimic Proux et al 2002 Figure 6
00002 
00003 You can use the Entrez module to download the 3 required GenBank files
00004 
00005 This is an extended version of the example in the Biopython Tutorial
00006 which produces a GenomeDiagram figure close to Proux et al 2002 Figure 6.
00007 
00008 See http://dx.doi.org/10.1128/JB.184.21.6026-6036.2002
00009 """
00010 import os
00011 
00012 from reportlab.lib import colors
00013 from reportlab.lib.colors import red, grey, orange, green, brown, blue, lightblue, purple
00014 from reportlab.lib.units import cm
00015 
00016 from Bio.Graphics import GenomeDiagram
00017 from Bio.Graphics.GenomeDiagram import CrossLink
00018 
00019 from Bio import SeqIO
00020 from Bio.SeqFeature import SeqFeature, FeatureLocation
00021 
00022 name = "Proux Fig 6"
00023 
00024 #As explained in the Biopython Tutorial, these are three phage genomes. The first
00025 #two are self-containged GenBank files, but the third phage is integrated into a
00026 #bacterial genome, thus we slice the full record and also take the reverse
00027 #complement to match the strand orientation of the other two phage:
00028 
00029 A_rec = SeqIO.read("NC_002703.gbk", "gb")
00030 B_rec = SeqIO.read("AF323668.gbk", "gb")
00031 C_rec = SeqIO.read("NC_003212.gbk", "gb")[2587879:2625807].reverse_complement(name=True)
00032 records = dict((rec.name, rec) for rec in [A_rec, B_rec, C_rec])
00033 
00034 #Here we hard code the gene colors for simiplicity and to match the target image.
00035 #In practice you might have an automatic mapping based on the gene annotation
00036 #or some other classification:
00037 
00038 A_colors = [red]*5 + [grey]*7 + [orange]*2 + [grey]*2 + [orange] + [grey]*11 + [green]*4 \
00039          + [grey] + [green]*2 + [grey, green] + [brown]*5 + [blue]*4 + [lightblue]*5 \
00040          + [grey, lightblue] + [purple]*2 + [grey]
00041 B_colors = [red]*6 + [grey]*8 + [orange]*2 + [grey] + [orange] + [grey]*21 + [green]*5 \
00042          + [grey] + [brown]*4 + [blue]*3 + [lightblue]*3 + [grey]*5 + [purple]*2
00043 C_colors = [grey]*30 + [green]*5 + [brown]*4 + [blue]*2 + [grey, blue] + [lightblue]*2 \
00044          + [grey]*5
00045 
00046 #Here we hard code a list of cross-links with percentage identity scores, based
00047 #on a manual inspection of the target image (there could be mistakes here).
00048 #In practice you might generate the list of cross-mappings from a BLAST report
00049 #or similar computational analysis:
00050 
00051 #Tuc2009 (NC_002703) vs bIL285 (AF323668)
00052 A_vs_B = [
00053     (99, "Tuc2009_01", "int"),
00054     (33, "Tuc2009_03", "orf4"),
00055     (94, "Tuc2009_05", "orf6"),
00056     (100, "Tuc2009_06", "orf7"),
00057     (97, "Tuc2009_07", "orf8"),
00058     (98, "Tuc2009_08", "orf9"),
00059     (98, "Tuc2009_09", "orf10"),
00060     (100, "Tuc2009_10", "orf12"),
00061     (100, "Tuc2009_11", "orf13"),
00062     (94, "Tuc2009_12", "orf14"),
00063     (87, "Tuc2009_13", "orf15"),
00064     (94, "Tuc2009_14", "orf16"),
00065     (94, "Tuc2009_15", "orf17"),
00066     (88, "Tuc2009_17", "rusA"),
00067     (91, "Tuc2009_18", "orf20"),
00068     (93, "Tuc2009_19", "orf22"),
00069     (71, "Tuc2009_20", "orf23"),
00070     (51, "Tuc2009_22", "orf27"),
00071     (97, "Tuc2009_23", "orf28"),
00072     (88, "Tuc2009_24", "orf29"),
00073     (26, "Tuc2009_26", "orf38"),
00074     (19, "Tuc2009_46", "orf52"),
00075     (77, "Tuc2009_48", "orf54"),
00076     (91, "Tuc2009_49", "orf55"),
00077     (95, "Tuc2009_52", "orf60"), 
00078 ]
00079 
00080 #bIL285 (AF323668) vs Listeria innocua prophage 5 (in NC_003212)  
00081 B_vs_C = [
00082     (42, "orf39", "lin2581"),
00083     (31, "orf40", "lin2580"),
00084     (49, "orf41", "lin2579"), #terL
00085     (54, "orf42", "lin2578"), #portal
00086     (55, "orf43", "lin2577"), #protease
00087     (33, "orf44", "lin2576"), #mhp
00088     (51, "orf46", "lin2575"),
00089     (33, "orf47", "lin2574"),
00090     (40, "orf48", "lin2573"),
00091     (25, "orf49", "lin2572"),
00092     (50, "orf50", "lin2571"),
00093     (48, "orf51", "lin2570"),
00094     (24, "orf52", "lin2568"),
00095     (30, "orf53", "lin2567"),
00096     (28, "orf54", "lin2566"),
00097 ]
00098 
00099 def get_feature(features, id, tags=["locus_tag", "gene"]):
00100     """Search list of SeqFeature objects for an identifier under the given tags."""
00101     for f in features:
00102         for key in tags:
00103             #tag may not be present in this feature
00104             for x in f.qualifiers.get(key, []):
00105                 if x == id:
00106                      return f
00107     raise KeyError(id)
00108 
00109 gd_diagram = GenomeDiagram.Diagram(name)
00110 feature_sets = {}
00111 max_len = 0
00112 for i, record in enumerate([A_rec, B_rec, C_rec]):
00113     max_len = max(max_len, len(record))
00114     #Allocate tracks 5 (top), 3, 1 (bottom) for A, B, C
00115     #(empty tracks 2 and 4 add useful white space to emphasise the cross links
00116     #and also serve to make the tracks vertically more compressed)
00117     gd_track_for_features = gd_diagram.new_track(5-2*i,
00118                             name=record.name,
00119                             greytrack=True,
00120                             start=0, end=len(record))
00121     assert record.name not in feature_sets
00122     feature_sets[record.name] = gd_track_for_features.new_set()
00123 
00124 #We add dummy features to the tracks for each cross-link BEFORE we add the
00125 #arrow features for the genes. This ensures the genes appear on top:
00126 for X, Y, X_vs_Y in [("NC_002703", "AF323668", A_vs_B),
00127                      ("AF323668", "NC_003212", B_vs_C)]:
00128     features_X = records[X].features
00129     features_Y = records[Y].features
00130     set_X = feature_sets[X]
00131     set_Y = feature_sets[Y]
00132     for score, x, y in X_vs_Y:
00133         color = colors.linearlyInterpolatedColor(colors.white, colors.firebrick, 0, 100, score)
00134         border = colors.lightgrey
00135         f_x = get_feature(features_X, x)
00136         F_x = set_X.add_feature(SeqFeature(FeatureLocation(f_x.location.start, f_x.location.end, strand=0)),
00137                                 color=color, border=border)
00138         f_y = get_feature(features_Y, y)
00139         F_y = set_Y.add_feature(SeqFeature(FeatureLocation(f_y.location.start,f_y.location.end, strand=0)),
00140                                 color=color, border=border)
00141         gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border))
00142 
00143 
00144 for record, gene_colors in zip([A_rec, B_rec, C_rec], [A_colors, B_colors, C_colors]):
00145     gd_feature_set = feature_sets[record.name]
00146 
00147     i = 0
00148     for feature in record.features:
00149         if feature.type != "gene":
00150             #Exclude this feature
00151             continue
00152         gd_feature_set.add_feature(feature, sigil="ARROW",
00153                                    color=gene_colors[i], label=True,
00154                                    name = str(i+1),
00155                                    label_position="start",
00156                                    label_size = 6, label_angle=0)
00157         i+=1
00158 
00159 gd_diagram.draw(format="linear", pagesize='A4', fragments=1,
00160                 start=0, end=max_len)
00161 gd_diagram.write(name + ".pdf", "PDF")
00162 gd_diagram.write(name + ".eps", "EPS")
00163 gd_diagram.write(name + ".svg", "SVG")