Back to index

python-biopython  1.60
NCBIWWW.py
Go to the documentation of this file.
00001 # Copyright 1999 by Jeffrey Chang.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 # Patched by Brad Chapman.
00007 # Chris Wroe added modifications for work in myGrid
00008 
00009 """
00010 This module provides code to work with the WWW version of BLAST
00011 provided by the NCBI.
00012 http://blast.ncbi.nlm.nih.gov/
00013 
00014 Functions:
00015 qblast        Do a BLAST search using the QBLAST API.
00016 """
00017 
00018 import sys
00019 try:
00020     from cStringIO import StringIO
00021 except ImportError:
00022     from StringIO import StringIO
00023 
00024 from Bio._py3k import _as_string, _as_bytes
00025 
00026 def qblast(program, database, sequence,
00027            auto_format=None,composition_based_statistics=None,
00028            db_genetic_code=None,endpoints=None,entrez_query='(none)',
00029            expect=10.0,filter=None,gapcosts=None,genetic_code=None,
00030            hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None,
00031            matrix_name=None,nucl_penalty=None,nucl_reward=None,
00032            other_advanced=None,perc_ident=None,phi_pattern=None,
00033            query_file=None,query_believe_defline=None,query_from=None,
00034            query_to=None,searchsp_eff=None,service=None,threshold=None,
00035            ungapped_alignment=None,word_size=None,
00036            alignments=500,alignment_view=None,descriptions=500,
00037            entrez_links_new_window=None,expect_low=None,expect_high=None,
00038            format_entrez_query=None,format_object=None,format_type='XML',
00039            ncbi_gi=None,results_file=None,show_overview=None, megablast=None,
00040            ):
00041     """Do a BLAST search using the QBLAST server at NCBI.
00042 
00043     Supports all parameters of the qblast API for Put and Get.
00044     Some useful parameters:
00045     program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
00046     database       Which database to search against (e.g. "nr").
00047     sequence       The sequence to search.
00048     ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
00049     descriptions   Number of descriptions to show.  Def 500.
00050     alignments     Number of alignments to show.  Def 500.
00051     expect         An expect value cutoff.  Def 10.0.
00052     matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
00053     filter         "none" turns off filtering.  Default no filtering
00054     format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
00055     entrez_query   Entrez query to limit Blast search
00056     hitlist_size   Number of hits to return. Default 50
00057     megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
00058     service        plain, psi, phi, rpsblast, megablast (lower case)
00059 
00060     This function does no checking of the validity of the parameters
00061     and passes the values to the server as is.  More help is available at:
00062     http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html
00063 
00064     """
00065     import urllib, urllib2
00066     import time
00067 
00068     assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
00069 
00070     # Format the "Put" command, which sends search requests to qblast.
00071     # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
00072     # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
00073     # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
00074     # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
00075     parameters = [
00076         ('AUTO_FORMAT',auto_format),
00077         ('COMPOSITION_BASED_STATISTICS',composition_based_statistics),
00078         ('DATABASE',database),
00079         ('DB_GENETIC_CODE',db_genetic_code),
00080         ('ENDPOINTS',endpoints),
00081         ('ENTREZ_QUERY',entrez_query),
00082         ('EXPECT',expect),
00083         ('FILTER',filter),
00084         ('GAPCOSTS',gapcosts),
00085         ('GENETIC_CODE',genetic_code),
00086         ('HITLIST_SIZE',hitlist_size),
00087         ('I_THRESH',i_thresh),
00088         ('LAYOUT',layout),
00089         ('LCASE_MASK',lcase_mask),
00090         ('MEGABLAST',megablast),
00091         ('MATRIX_NAME',matrix_name),
00092         ('NUCL_PENALTY',nucl_penalty),
00093         ('NUCL_REWARD',nucl_reward),
00094         ('OTHER_ADVANCED',other_advanced),
00095         ('PERC_IDENT',perc_ident),
00096         ('PHI_PATTERN',phi_pattern),
00097         ('PROGRAM',program),
00098         #('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
00099         ('QUERY',sequence),
00100         ('QUERY_FILE',query_file),
00101         ('QUERY_BELIEVE_DEFLINE',query_believe_defline),
00102         ('QUERY_FROM',query_from),
00103         ('QUERY_TO',query_to),
00104         #('RESULTS_FILE',...), - Can we use this parameter?
00105         ('SEARCHSP_EFF',searchsp_eff),
00106         ('SERVICE',service),
00107         ('THRESHOLD',threshold),
00108         ('UNGAPPED_ALIGNMENT',ungapped_alignment),
00109         ('WORD_SIZE',word_size),
00110         ('CMD', 'Put'),
00111         ]
00112     query = [x for x in parameters if x[1] is not None]
00113     message = _as_bytes(urllib.urlencode(query))
00114 
00115     # Send off the initial query to qblast.
00116     # Note the NCBI do not currently impose a rate limit here, other
00117     # than the request not to make say 50 queries at once using multiple
00118     # threads.
00119     request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
00120                               message,
00121                               {"User-Agent":"BiopythonClient"})
00122     handle = urllib2.urlopen(request)
00123 
00124     # Format the "Get" command, which gets the formatted results from qblast
00125     # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007       
00126     rid, rtoe = _parse_qblast_ref_page(handle)
00127     parameters = [
00128         ('ALIGNMENTS',alignments),
00129         ('ALIGNMENT_VIEW',alignment_view),
00130         ('DESCRIPTIONS',descriptions),
00131         ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window),
00132         ('EXPECT_LOW',expect_low),
00133         ('EXPECT_HIGH',expect_high),
00134         ('FORMAT_ENTREZ_QUERY',format_entrez_query),
00135         ('FORMAT_OBJECT',format_object),
00136         ('FORMAT_TYPE',format_type),
00137         ('NCBI_GI',ncbi_gi),
00138         ('RID',rid),
00139         ('RESULTS_FILE',results_file),
00140         ('SERVICE',service),
00141         ('SHOW_OVERVIEW',show_overview),
00142         ('CMD', 'Get'),
00143         ]
00144     query = [x for x in parameters if x[1] is not None]
00145     message = _as_bytes(urllib.urlencode(query))
00146 
00147     # Poll NCBI until the results are ready.  Use a 3 second wait
00148     delay = 3.0
00149     previous = time.time()
00150     while True:
00151         current = time.time()
00152         wait = previous + delay - current
00153         if wait > 0:
00154             time.sleep(wait)
00155             previous = current + wait
00156         else:
00157             previous = current
00158 
00159         request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
00160                                   message,
00161                                   {"User-Agent":"BiopythonClient"})
00162         handle = urllib2.urlopen(request)
00163         results = _as_string(handle.read())
00164 
00165         # Can see an "\n\n" page while results are in progress,
00166         # if so just wait a bit longer...
00167         if results=="\n\n":
00168             continue
00169         # XML results don't have the Status tag when finished
00170         if results.find("Status=") < 0:
00171             break
00172         i = results.index("Status=")
00173         j = results.index("\n", i)
00174         status = results[i+len("Status="):j].strip()
00175         if status.upper() == "READY":
00176             break
00177 
00178     return StringIO(results)
00179 
00180 def _parse_qblast_ref_page(handle):
00181     """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).
00182 
00183     The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is proably
00184     'Request Time of Execution' and RID would be 'Request Identifier'.
00185     """
00186     s = _as_string(handle.read())
00187     i = s.find("RID =")
00188     if i == -1:
00189         rid = None
00190     else:
00191         j = s.find("\n", i)
00192         rid = s[i+len("RID ="):j].strip()
00193 
00194     i = s.find("RTOE =")
00195     if i == -1:
00196         rtoe = None
00197     else:
00198         j = s.find("\n", i)
00199         rtoe = s[i+len("RTOE ="):j].strip()
00200 
00201     if not rid and not rtoe:
00202         #Can we reliably extract the error message from the HTML page?
00203         #e.g.  "Message ID#24 Error: Failed to read the Blast query:
00204         #       Nucleotide FASTA provided for protein sequence"
00205         #or    "Message ID#32 Error: Query contains no data: Query
00206         #       contains no sequence data"
00207         #
00208         #This used to occur inside a <div class="error msInf"> entry:
00209         i = s.find('<div class="error msInf">')
00210         if i != -1:
00211             msg = s[i+len('<div class="error msInf">'):].strip()
00212             msg = msg.split("</div>",1)[0].split("\n",1)[0].strip()
00213             if msg:
00214                 raise ValueError("Error message from NCBI: %s" % msg)
00215         #In spring 2010 the markup was like this:
00216         i = s.find('<p class="error">')
00217         if i != -1:
00218             msg = s[i+len('<p class="error">'):].strip()
00219             msg = msg.split("</p>",1)[0].split("\n",1)[0].strip()
00220             if msg:
00221                 raise ValueError("Error message from NCBI: %s" % msg)
00222         #Generic search based on the way the error messages start:
00223         i = s.find('Message ID#')
00224         if i != -1:
00225             #Break the message at the first HTML tag
00226             msg = s[i:].split("<",1)[0].split("\n",1)[0].strip()
00227             raise ValueError("Error message from NCBI: %s" % msg)
00228         #We didn't recognise the error layout :(
00229         #print s
00230         raise ValueError("No RID and no RTOE found in the 'please wait' page, "
00231                          "there was probably an error in your request but we "
00232                          "could not extract a helpful error message.")
00233     elif not rid:
00234         #Can this happen?
00235         raise ValueError("No RID found in the 'please wait' page."
00236                          " (although RTOE = %s)" % repr(rtoe))
00237     elif not rtoe:
00238         #Can this happen?
00239         raise ValueError("No RTOE found in the 'please wait' page."
00240                          " (although RID = %s)" % repr(rid))
00241 
00242     try:
00243         return rid, int(rtoe)
00244     except ValueError:
00245         raise ValueError("A non-integer RTOE found in " \
00246                          +"the 'please wait' page, %s" % repr(rtoe))
00247 
00248