Back to index

python-biopython  1.60
test_PAML_codeml.py
Go to the documentation of this file.
00001 # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 import unittest
00007 import os
00008 import os.path
00009 from Bio.Phylo.PAML import codeml
00010 from Bio.Phylo.PAML._paml import PamlError
00011 
00012 
00013 # Some constants to assist with testing:
00014 # This is the number of parameters that should be parsed for each 
00015 # NSsites site class model
00016 SITECLASS_PARAMS = {0: 6, 1: 4, 2: 4, 3: 4, 7: 5, 8: 9}
00017 # This is the default number of site classes per NSsites site
00018 # class model
00019 SITECLASSES = {0: None, 1: 2, 2: 3, 3: 3, 7: 10, 8: 11}
00020 
00021 
00022 class ModTest(unittest.TestCase):
00023     
00024     align_dir = os.path.join("PAML", "Alignments")
00025     tree_dir = os.path.join("PAML", "Trees")
00026     ctl_dir = os.path.join("PAML", "Control_files")
00027     results_dir = os.path.join("PAML", "Results")
00028     working_dir = os.path.join("PAML", "codeml_test")
00029 
00030     align_file = os.path.join(align_dir, "alignment.phylip")
00031     tree_file = os.path.join(tree_dir, "species.tree")
00032     bad_tree_file = os.path.join(tree_dir, "bad.tree")
00033     out_file = os.path.join(results_dir, "test.out")
00034     results_file = os.path.join(results_dir, "bad_results.out")
00035     bad_ctl_file1 = os.path.join(ctl_dir, "bad1.ctl")
00036     bad_ctl_file2 = os.path.join(ctl_dir, "bad2.ctl")
00037     bad_ctl_file3 = os.path.join(ctl_dir, "bad3.ctl")
00038     ctl_file = os.path.join(ctl_dir, "codeml", "codeml.ctl")
00039        
00040     def tearDown(self):
00041         """Just in case CODEML creates some junk files, do a clean-up."""
00042         del_files = [self.out_file, "2NG.dN", "2NG.dS", "2NG.t", "codeml.ctl", 
00043                 "lnf", "rst", "rst1", "rub"]
00044         for filename in del_files:
00045             if os.path.exists(filename):
00046                 os.remove(filename)
00047         if os.path.exists(self.working_dir):
00048             for filename in os.listdir(self.working_dir):
00049                 filepath = os.path.join(self.working_dir, filename)
00050                 os.remove(filepath)
00051             os.rmdir(self.working_dir)
00052     
00053     def setUp(self):
00054         self.cml = codeml.Codeml()
00055         
00056     def testAlignmentFileIsValid(self):
00057         self.assertRaises((AttributeError, TypeError),
00058             codeml.Codeml, alignment = 1)
00059         self.cml.alignment = 1
00060         self.cml.tree = self.tree_file
00061         self.cml.out_file = self.out_file
00062         self.assertRaises((AttributeError, TypeError),
00063             self.cml.run)
00064         
00065     def testAlignmentExists(self):
00066         self.assertRaises((EnvironmentError, IOError), codeml.Codeml, 
00067             alignment = "nonexistent")
00068         self.cml.alignment = "nonexistent"
00069         self.cml.tree = self.tree_file
00070         self.cml.out_file = self.out_file
00071         self.assertRaises(IOError, self.cml.run)
00072     
00073     def testTreeFileValid(self):
00074         self.assertRaises((AttributeError, TypeError),
00075             codeml.Codeml, tree = 1)
00076         self.cml.alignment = self.align_file
00077         self.cml.tree = 1
00078         self.cml.out_file = self.out_file
00079         self.assertRaises((AttributeError, TypeError),
00080             self.cml.run)
00081         
00082     def testTreeExists(self):
00083         self.assertRaises((EnvironmentError, IOError), codeml.Codeml, 
00084             tree = "nonexistent")
00085         self.cml.alignment = self.align_file
00086         self.cml.tree = "nonexistent"
00087         self.cml.out_file = self.out_file
00088         self.assertRaises(IOError, self.cml.run)
00089     
00090     def testWorkingDirValid(self):
00091         self.cml.tree = self.tree_file
00092         self.cml.alignment = self.align_file
00093         self.cml.out_file = self.out_file
00094         self.cml.working_dir = 1
00095         self.assertRaises((AttributeError, TypeError),
00096             self.cml.run)
00097     
00098     def testOutputFileValid(self):
00099         self.cml.tree = self.tree_file
00100         self.cml.alignment = self.align_file
00101         self.cml.out_file = 1
00102         self.assertRaises((AttributeError, TypeError),
00103             self.cml.run)
00104     
00105     def testOptionExists(self):
00106         self.assertRaises((AttributeError, KeyError),
00107                           self.cml.set_options, xxxx=1)
00108         self.assertRaises((AttributeError, KeyError),
00109                           self.cml.get_option, "xxxx")
00110     
00111     def testAlignmentSpecified(self):
00112         self.cml.tree = self.tree_file
00113         self.cml.out_file = self.out_file
00114         self.assertRaises((AttributeError, ValueError),
00115             self.cml.run)
00116         
00117     def testTreeSpecified(self):
00118         self.cml.alignment = self.align_file
00119         self.cml.out_file = self.out_file
00120         self.assertRaises((AttributeError, ValueError),
00121             self.cml.run)
00122         
00123     def testOutputFileSpecified(self):
00124         self.cml.alignment = self.align_file
00125         self.cml.tree = self.tree_file
00126         self.assertRaises((AttributeError, ValueError),
00127             self.cml.run)
00128         
00129     def testPamlErrorsCaught(self):
00130         self.cml.alignment = self.align_file
00131         self.cml.tree = self.bad_tree_file
00132         self.cml.out_file = self.out_file
00133         self.assertRaises((EnvironmentError, PamlError),
00134             self.cml.run)
00135         
00136     def testCtlFileValidOnRun(self):
00137         self.cml.alignment = self.align_file
00138         self.cml.tree = self.tree_file
00139         self.cml.out_file = self.out_file
00140         self.assertRaises((AttributeError, TypeError),
00141             self.cml.run, ctl_file = 1)
00142         
00143     def testCtlFileExistsOnRun(self):
00144         self.cml.alignment = self.align_file
00145         self.cml.tree = self.tree_file
00146         self.cml.out_file = self.out_file
00147         self.assertRaises((EnvironmentError, IOError),
00148             self.cml.run, ctl_file = "nonexistent")
00149             
00150     def testCtlFileValidOnRead(self):
00151         self.assertRaises((AttributeError, TypeError),
00152             self.cml.read_ctl_file, 1)
00153         self.assertRaises((AttributeError, KeyError), 
00154             self.cml.read_ctl_file, self.bad_ctl_file1)
00155         self.assertRaises(AttributeError, 
00156             self.cml.read_ctl_file, self.bad_ctl_file2)
00157         self.assertRaises(TypeError, 
00158             self.cml.read_ctl_file, self.bad_ctl_file3)
00159         target_options = {"noisy": 0,
00160                         "verbose": 0,
00161                         "runmode": 0,
00162                         "seqtype": 1, 
00163                         "CodonFreq": 2,
00164                         "ndata": None,
00165                         "clock": 0,
00166                         "aaDist": None,
00167                         "aaRatefile": None,
00168                         "model": 0,
00169                         "NSsites": [0],
00170                         "icode": 0,
00171                         "Mgene": 0,
00172                         "fix_kappa": 0,
00173                         "kappa": 4.54006,
00174                         "fix_omega": 0,
00175                         "omega": 1,
00176                         "fix_alpha": 1,
00177                         "alpha": 0,
00178                         "Malpha": 0,
00179                         "ncatG": None,
00180                         "getSE": 0,
00181                         "RateAncestor": 0,
00182                         "Small_Diff": None,
00183                         "cleandata": 1,
00184                         "fix_blength": 1,
00185                         "method": 0,
00186                         "rho": None,
00187                         "fix_rho": None}
00188         self.cml.read_ctl_file(self.ctl_file)
00189         self.assertEqual(sorted(self.cml._options.keys()), sorted(target_options.keys()))
00190         for key in target_options:
00191             self.assertEqual(self.cml._options[key], target_options[key], \
00192                              "%s: %r vs %r" \
00193                              % (key, self.cml._options[key], target_options[key]))
00194         
00195     def testCtlFileExistsOnRead(self):
00196         self.assertRaises((EnvironmentError, IOError),
00197             self.cml.read_ctl_file, ctl_file = "nonexistent")
00198         
00199     def testResultsValid(self):
00200         self.assertRaises((AttributeError, TypeError),
00201             codeml.read, 1)
00202     
00203     def testResultsExist(self):
00204         self.assertRaises((EnvironmentError, IOError),
00205             codeml.read, "nonexistent")
00206         
00207     def testResultsParsable(self):
00208         self.assertRaises(ValueError, codeml.read, self.results_file)
00209         
00210     def testParseSEs(self):
00211         res_dir = os.path.join(self.results_dir, "codeml", "SE")
00212         for results_file in os.listdir(res_dir):
00213             version = results_file.split('-')[1].split('.')[0]
00214             version_msg = "Improper parsing for version %s" \
00215                         % version.replace('_', '.')
00216             results_path = os.path.join(res_dir, results_file)
00217             results = codeml.read(results_path)
00218             self.assertEqual(len(results), 4, version_msg)
00219             self.assertTrue("NSsites" in results, version_msg)
00220             models = results["NSsites"]
00221             # Only site class model 0 was simulated
00222             self.assertEqual(len(models), 1, version_msg)
00223             self.assertTrue(0 in models, version_msg)
00224             model = models[0]
00225             self.assertEqual(len(model), 5, version_msg)
00226             self.assertTrue("parameters" in model, version_msg)
00227             params = model["parameters"]
00228             # There should be one new item in the parameters, "SEs" 
00229             self.assertEqual(len(params), SITECLASS_PARAMS[0] + 1, version_msg)
00230             self.assertTrue("SEs" in params, version_msg)
00231 
00232     def testParseAllNSsites(self):
00233         res_dir = os.path.join(self.results_dir, "codeml", "all_NSsites")
00234         for results_file in os.listdir(res_dir):
00235             version = results_file.split('-')[1].split('.')[0]
00236             version_msg = "Improper parsing for version %s" \
00237                         % version.replace('_', '.')                    
00238             results_path = os.path.join(res_dir, results_file)
00239             results = codeml.read(results_path)
00240             # There should be 4 top-level items: 'codon model', 'model', 
00241             # 'version', & 'NSsites'
00242             self.assertEqual(len(results), 4, version_msg)
00243             self.assertTrue("NSsites" in results, version_msg)
00244             # There should be 6 NSsites classes: 0, 1, 2, 3, 7 & 8
00245             self.assertEqual(len(results["NSsites"]), 6, version_msg)
00246             # Each site class model should have 5 sub-items: 'lnL', 'tree', 
00247             # 'description', 'parameters', & 'tree length'. It should
00248             # have the correct number of parameters also.
00249             for model_num in [0, 1, 2, 3, 7, 8]:
00250                 model = results["NSsites"][model_num]
00251                 self.assertEqual(len(model), 5, version_msg)
00252                 self.assertTrue("parameters" in model, version_msg)
00253                 params = model["parameters"]
00254                 self.assertEqual(len(params), SITECLASS_PARAMS[model_num],
00255                     version)
00256                 self.assertTrue("branches" in params, version_msg)
00257                 branches = params["branches"]
00258                 # There are 7 branches in the test case (specific to these
00259                 # test cases)
00260                 self.assertEqual(len(branches), 7, version_msg)
00261                 if "site classes" in params:
00262                     self.assertEqual(len(params["site classes"]),
00263                                  SITECLASSES[model_num], version_msg)
00264 
00265     def testParseNSsite3(self):
00266         res_dir = os.path.join(self.results_dir, "codeml", "NSsite3")
00267         for results_file in os.listdir(res_dir):
00268             version = results_file.split('-')[1].split('.')[0]
00269             version_msg = "Improper parsing for version %s" \
00270                         % version.replace('_', '.')                    
00271             results_path = os.path.join(res_dir, results_file)
00272             results = codeml.read(results_path)
00273             # There should be 5 top-level items: 'codon model', 'model', 
00274             # 'version', 'NSsites' & site-class model, the last of which
00275             # is only there when only one NSsites class is used
00276             self.assertEqual(len(results), 5, version_msg)
00277             self.assertTrue('site-class model' in results, version_msg)
00278             self.assertEqual(results['site-class model'], 'discrete', 
00279                     version_msg)
00280             self.assertTrue("NSsites" in results, version_msg)
00281             # There should be 1 NSsites classe: 3
00282             self.assertEqual(len(results["NSsites"]), 1, version_msg)
00283             # Each site class model should have 5 sub-items: 'lnL', 'tree', 
00284             # 'description', 'parameters', & 'tree length'. It should
00285             # have the correct number of parameters also.
00286             model = results["NSsites"][3]
00287             self.assertEqual(len(model), 5, version_msg)
00288             self.assertTrue("parameters" in model, version_msg)
00289             params = model["parameters"]
00290             self.assertEqual(len(params), SITECLASS_PARAMS[3],
00291                 version)
00292             self.assertTrue("site classes" in params, version_msg)
00293             site_classes = params["site classes"]
00294             self.assertEqual(len(site_classes), 4, version_msg)
00295         
00296     def testParseBranchSiteA(self):
00297         res_dir = os.path.join(self.results_dir, "codeml", "branchsiteA")
00298         for results_file in os.listdir(res_dir):
00299             version = results_file.split('-')[1].split('.')[0]    
00300             version_msg = "Improper parsing for version %s" \
00301                         % version.replace('_', '.')                
00302             results_path = os.path.join(res_dir, results_file)
00303             results = codeml.read(results_path)
00304             # There are 5 top-level items in this case:
00305             # 'codon model', 'model', 'version', 'NSsites' & 'site-class model'
00306             self.assertEqual(len(results), 5, version_msg)
00307             self.assertTrue("NSsites" in results, version_msg)
00308             models = results["NSsites"]
00309             # Only site class model 2 is simulated for Branch Site A
00310             self.assertEqual(len(models), 1, version_msg)
00311             self.assertTrue(2 in models, version_msg)
00312             model = models[2]
00313             self.assertEqual(len(model), 5, version_msg)
00314             self.assertTrue("parameters" in model, version_msg)
00315             params = model["parameters"]
00316             # Branch Site A results lack a "branches" parameter     
00317             self.assertEqual(len(params), SITECLASS_PARAMS[2]-1, version_msg)
00318             self.assertTrue("site classes" in params, version_msg)
00319             site_classes = params["site classes"]
00320             # Branch Site A adds another site class
00321             self.assertEqual(len(site_classes), SITECLASSES[2]+1,
00322                 version)        
00323             for class_num in [0, 1, 2, 3]:
00324                 self.assertTrue(class_num in site_classes, version_msg)
00325                 site_class = site_classes[class_num]
00326                 self.assertEqual(len(site_class), 2, version_msg)
00327                 self.assertTrue("branch types" in site_class, version_msg)
00328                 branches = site_class["branch types"]
00329                 self.assertEqual(len(branches), 2, version_msg)
00330 
00331     def testParseCladeModelC(self):
00332         cladeC_res_dir = os.path.join(self.results_dir, "codeml",
00333             "clademodelC")
00334         for results_file in os.listdir(cladeC_res_dir):
00335             version = results_file.split('-')[1].split('.')[0]
00336             version_msg = "Improper parsing for version %s" \
00337                         % version.replace('_', '.')
00338             results_path = os.path.join(cladeC_res_dir, results_file)
00339             results = codeml.read(results_path)
00340             # 5 top-level items again in this case
00341             self.assertEqual(len(results), 5, version_msg)
00342             self.assertTrue("NSsites" in results, version_msg)
00343             models = results["NSsites"]
00344             # Only site class model 2 is simulated for Clade Model C
00345             self.assertEqual(len(models), 1, version_msg)
00346             self.assertTrue(2 in models, version_msg)
00347             model = models[2]
00348             self.assertEqual(len(model), 5, version_msg)
00349             self.assertTrue("parameters" in model, version_msg)
00350             params = model["parameters"]
00351             # Clade Model C results lack a "branches" parameter     
00352             self.assertEqual(len(params), SITECLASS_PARAMS[2] - 1, version_msg)
00353             self.assertTrue("site classes" in params, version_msg)
00354             site_classes = params["site classes"]
00355             self.assertEqual(len(site_classes), SITECLASSES[2],
00356                 version)        
00357             for class_num in [0, 1, 2]:
00358                 self.assertTrue(class_num in site_classes, version_msg)
00359                 site_class = site_classes[class_num]
00360                 self.assertEqual(len(site_class), 2, version_msg)
00361                 self.assertTrue("branch types" in site_class, version_msg)
00362                 branches = site_class["branch types"]
00363                 self.assertEqual(len(branches), 2, version_msg)
00364 
00365     def testParseNgene2Mgene02(self):
00366         res_dir = os.path.join(self.results_dir, "codeml", "ngene2_mgene02")
00367         for results_file in os.listdir(res_dir):
00368             version = results_file.split('-')[1].split('.')[0]
00369             version_msg = "Improper parsing for version %s" \
00370                         % version.replace('_', '.')
00371             results_path = os.path.join(res_dir, results_file)
00372             results = codeml.read(results_path)
00373             self.assertEqual(len(results), 4, version_msg)
00374             self.assertTrue("NSsites" in results, version_msg)
00375             models = results["NSsites"]
00376             self.assertEqual(len(models), 1, version_msg)
00377             self.assertTrue(0 in models, version_msg)
00378             model = models[0]
00379             self.assertEqual(len(model), 5, version_msg)
00380             self.assertTrue("parameters" in model, version_msg)
00381             params = model["parameters"]
00382             # This type of model has fewer parameters for model 0
00383             self.assertEqual(len(params), 4, version_msg)
00384             self.assertTrue("rates" in params, version_msg)
00385             rates = params["rates"]
00386             self.assertEqual(len(rates), 2, version_msg)        
00387 
00388     def testParseNgene2Mgene1(self):
00389         res_dir = os.path.join(self.results_dir, "codeml", "ngene2_mgene1")
00390         for results_file in os.listdir(res_dir):
00391             version = results_file.split('-')[1].split('.')[0]
00392             version_msg = "Improper parsing for version %s" \
00393                         % version.replace('_', '.')
00394             results_path = os.path.join(res_dir, results_file)
00395             results = codeml.read(results_path)
00396             self.assertEqual(len(results), 4, version_msg)
00397             self.assertTrue("genes" in results, version_msg)
00398             genes = results["genes"]
00399             self.assertEqual(len(genes), 2, version_msg)
00400             model = genes[0]
00401             self.assertEqual(len(model), 5, version_msg)
00402             self.assertTrue("parameters" in model, version_msg)
00403             params = model["parameters"]
00404             self.assertEqual(len(params), SITECLASS_PARAMS[0], version_msg)
00405 
00406     def testParseNgene2Mgene34(self):
00407         res_dir = os.path.join(self.results_dir, "codeml", "ngene2_mgene34")
00408         for results_file in os.listdir(res_dir):
00409             version = results_file.split('-')[1].split('.')[0]    
00410             version_msg = "Improper parsing for version %s" \
00411                         % version.replace('_', '.')
00412             results_path = os.path.join(res_dir, results_file)
00413             results = codeml.read(results_path)
00414             self.assertEqual(len(results), 4, version_msg)
00415             self.assertTrue("NSsites" in results, version_msg)
00416             models = results["NSsites"]
00417             self.assertEqual(len(models), 1, version_msg)
00418             self.assertTrue(0 in models, version_msg)
00419             model = models[0]
00420             self.assertEqual(len(model), 5, version_msg)
00421             self.assertTrue("parameters" in model, version_msg)
00422             params = model["parameters"]
00423             # This type of model has fewer parameters for model 0
00424             self.assertEqual(len(params), 3, version_msg)
00425             self.assertTrue("rates" in params, version_msg)
00426             rates = params["rates"]
00427             self.assertEqual(len(rates), 2, version_msg)        
00428             self.assertTrue("genes" in params, version_msg)
00429             genes = params["genes"]
00430             self.assertEqual(len(genes), 2, version_msg)   
00431 
00432     def testParseFreeRatio(self):
00433         res_dir = os.path.join(self.results_dir, "codeml", "freeratio")
00434         for results_file in os.listdir(res_dir):
00435             version = results_file.split('-')[1].split('.')[0]    
00436             version_msg = "Improper parsing for version %s" \
00437                         % version.replace('_', '.')
00438             results_path = os.path.join(res_dir, results_file)
00439             results = codeml.read(results_path)
00440             self.assertEqual(len(results), 4, version_msg)
00441             self.assertTrue("NSsites" in results, version_msg)
00442             models = results["NSsites"]
00443             self.assertEqual(len(models), 1, version_msg)
00444             self.assertTrue(0 in models, version_msg)
00445             model = models[0]
00446             # With the free ratio model, you get 3 extra trees: dN tree,
00447             # dS tree and omega tree
00448             self.assertEqual(len(model), 8, version_msg)
00449             self.assertTrue("parameters" in model, version_msg)
00450             params = model["parameters"]
00451             self.assertEqual(len(params), SITECLASS_PARAMS[0], version_msg)
00452             self.assertTrue("branches" in params, version_msg)
00453             # There should be 7 branches
00454             branches = params["branches"]
00455             self.assertEqual(len(branches), 7, version_msg) 
00456             self.assertTrue("omega" in params, version_msg)
00457             omega = params["omega"]
00458             self.assertEqual(len(omega), 7, version_msg)
00459 
00460     def testParsePairwise(self):
00461         res_dir = os.path.join(self.results_dir, "codeml", "pairwise")
00462         for results_file in os.listdir(res_dir):
00463             version = results_file.split('-')[1].split('.')[0] 
00464             version_msg = "Improper parsing for version %s" \
00465                         % version.replace('_', '.')
00466             results_path = os.path.join(res_dir, results_file)
00467             results = codeml.read(results_path)
00468             # Pairwise models have an extra top-level item: pairwise
00469             self.assertEqual(len(results), 5, version_msg)
00470             self.assertTrue("pairwise" in results, version_msg)
00471             pairwise = results["pairwise"]
00472             self.assertEqual(len(pairwise), 5, version_msg) 
00473 
00474     def testParseAA(self):
00475         res_dir = os.path.join(self.results_dir, "codeml", "aa_model0")
00476         for results_file in os.listdir(res_dir):
00477             version = results_file.split('-')[1].split('.')[0]  
00478             version_msg = "Improper parsing for version %s" \
00479                         % version.replace('_', '.')
00480             results_path = os.path.join(res_dir, results_file)
00481             results = codeml.read(results_path)
00482             # Amino Acid analysis has different top-levels:
00483             # 'NSsites', 'model', 'version', 'lnL max', 'distances'
00484             # Version 4.1 doesn't seem to produce distances in the results
00485             if version == "4_1":
00486                 self.assertEqual(len(results), 4, version_msg)
00487                 self.assertTrue("lnL max" in results, version_msg)
00488             else:
00489                 self.assertEqual(len(results), 5, version_msg)
00490                 self.assertTrue("lnL max" in results, version_msg)
00491                 self.assertTrue("distances" in results, version_msg)
00492                 distances = results["distances"]
00493                 # non-pairwise AA analysis only gives raw distances
00494                 self.assertEqual(len(distances), 1, version_msg) 
00495 
00496     def testParseAAPairwise(self):
00497         res_dir = os.path.join(self.results_dir, "codeml", "aa_pairwise")
00498         for results_file in os.listdir(res_dir):
00499             version = results_file.split('-')[1].split('.')[0]    
00500             version_msg = "Improper parsing for version %s" \
00501                         % version.replace('_', '.')
00502             results_path = os.path.join(res_dir, results_file)
00503             results = codeml.read(results_path)
00504             # Pairwise AA analysis has one top-level fewer than non-pairwise
00505             self.assertEqual(len(results), 4, version_msg)
00506             self.assertTrue("lnL max" in results, version_msg)
00507             self.assertTrue("distances" in results, version_msg)
00508             distances = results["distances"]
00509             # Pairwise AA analysis has ML & raw distances
00510             self.assertEqual(len(distances), 2, version_msg) 
00511     
00512     
00513 if __name__ == "__main__":
00514     runner = unittest.TextTestRunner(verbosity = 2)
00515     unittest.main(testRunner=runner)