Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
00002 # Copyright 2008 by Michiel de Hoon.  All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 
00007 """Provides code to access NCBI over the WWW.
00008 
00009 The main Entrez web page is available at:
00010 http://www.ncbi.nlm.nih.gov/Entrez/
00011 
00012 A list of the Entrez utilities is available at:
00013 http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
00014 
00015 Variables:
00016 email        Set the Entrez email parameter (default is not set).
00017 tool         Set the Entrez tool parameter (default is "biopython").
00018 
00019 Functions:
00020 efetch       Retrieves records in the requested format from a list of one or
00021              more primary IDs or from the user's environment
00022 epost        Posts a file containing a list of primary IDs for future use in
00023              the user's environment to use with subsequent search strategies
00024 esearch      Searches and retrieves primary IDs (for use in EFetch, ELink,
00025              and ESummary) and term translations and optionally retains
00026              results for future use in the user's environment.
00027 elink        Checks for the existence of an external or Related Articles link
00028              from a list of one or more primary IDs.  Retrieves primary IDs
00029              and relevancy scores for links to Entrez databases or Related
00030              Articles;  creates a hyperlink to the primary LinkOut provider
00031              for a specific ID and database, or lists LinkOut URLs
00032              and Attributes for multiple IDs.
00033 einfo        Provides field index term counts, last update, and available
00034              links for each database.
00035 esummary     Retrieves document summaries from a list of primary IDs or from
00036              the user's environment.
00037 egquery      Provides Entrez database counts in XML for a single search
00038              using Global Query.
00039 espell       Retrieves spelling suggestions.
00040 
00041 read         Parses the XML results returned by any of the above functions.
00042              Typical usage is:
00043 
00044              >>> from Bio import Entrez
00045              >>> Entrez.email = "Your.Name.Here@example.org"
00046              >>> handle = Entrez.einfo() # or esearch, efetch, ...
00047              >>> record = Entrez.read(handle)
00048              >>> handle.close()
00049 
00050              where record is now a Python dictionary or list.
00051 
00052 parse        Parses the XML results returned by those of the above functions
00053              which can return multiple records - such as efetch, esummary
00054              and elink. Typical usage is:
00055 
00056              >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml")
00057              >>> records = Entrez.parse(handle)
00058              >>> for record in records:
00059              ...     # each record is a Python dictionary or list.
00060              ...     print record['MedlineCitation']['Article']['ArticleTitle']
00061              Biopython: freely available Python tools for computational molecular biology and bioinformatics.
00062              PDB file parser and structure class implemented in Python.
00063              >>> handle.close()
00064 
00065              This function is appropriate only if the XML file contains
00066              multiple records, and is particularly useful for large files.
00067 
00068 _open        Internally used function.
00069 
00070 """
00071 import urllib, urllib2, time, warnings
00072 import os.path
00073 
00074 from Bio._py3k import _binary_to_string_handle
00075 
# Address sent to NCBI as the "email" parameter of a request whenever the
# caller does not pass one explicitly.  While it is left as None, _open()
# issues a UserWarning for every request that lacks an "email" parameter.
email = None
# Value sent as the "tool" parameter of a request whenever the caller does
# not pass one explicitly.
tool = "biopython"
00078 
00079 
00080 # XXX retmode?
def epost(db, **keywds):
    """Upload a list of identifiers to the Entrez history server.

    EPost stores a set of UIs in the user's environment so that later
    E-utility calls (searches, fetches, ...) can refer to them.

    db is the Entrez database the identifiers belong to; any further
    keyword arguments are passed on unchanged as EPost parameters.  They
    are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    The request is sent with HTTP POST.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # db is a named parameter, so it can never also appear in keywds.
    options = dict(keywds, db=db)
    return _open(url, options, post=True)
00098 
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    db is the Entrez database to fetch from; all other keyword arguments
    are passed on as EFetch parameters.  The id parameter may be a single
    string (optionally comma separated) or a list, tuple or set of
    identifiers (strings or integers), which is sent to the NCBI as one
    comma separated value.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print handle.readline().strip()
    LOCUS       AY851612                 892 bp    DNA     linear   PLN 10-APR-2007
    >>> handle.close()

    Warning: The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    # Work on our own dictionary so the keyword arguments stay untouched.
    variables = {'db': db}
    variables.update(keywds)
    ids = variables.get("id")
    if isinstance(ids, (list, tuple, set, frozenset)):
        # Fix for NCBI change (probably part of EFetch 2.0, Feb 2012) where
        # a sequence of IDs now gives HTTP Error 500: Internal server error.
        # urlencode(..., doseq=True) would turn a sequence into
        # ...&id=22307645&id=22303114&... which used to work, while now the
        # NCBI appear to insist on ...&id=22301129,22299544,...
        # str() lets integer identifiers be joined as well.
        variables["id"] = ",".join(str(uid) for uid in ids)
    return _open(cgi, variables)
00136 
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch looks up primary IDs (which can then be given to EFetch, ELink
    and ESummary) together with term translations, and can optionally keep
    the results in the user's environment for later use.

    db is the Entrez database to search and term is the query; any further
    keyword arguments are passed on unchanged as ESearch parameters.  They
    are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # db and term are named parameters, so keywds cannot contain them.
    options = dict(keywds, db=db, term=term)
    return _open(url, options)
00171 
def elink(**keywds):
    """Look up links from Entrez records and return a handle to the results.

    ELink checks whether an external or Related Articles link exists for
    one or more primary IDs;  retrieves IDs and relevancy scores for links
    to Entrez databases or Related Articles; creates a hyperlink to the
    primary LinkOut provider for a specific ID and database, or lists
    LinkOut URLs and attributes for multiple IDs.

    All keyword arguments are passed on unchanged as ELink parameters.
    They are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example finds articles related to the Biopython application
    note's entry in the PubMed database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> pmid = "19304878"
    >>> handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["LinkSetDb"][0]["LinkName"]
    pubmed_pubmed
    >>> linked = [link["Id"] for link in record[0]["LinkSetDb"][0]["Link"]]
    >>> "17121776" in linked
    True

    This is explained in much more detail in the Biopython Tutorial.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    return _open(url, dict(keywds))
00209 
def einfo(**keywds):
    """Return a handle to a summary of the Entrez databases.

    EInfo reports field names, index term counts, the last update and the
    available links for each Entrez database.

    All keyword arguments are passed on unchanged as EInfo parameters.
    They are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    return _open(url, dict(keywds))
00236 
def esummary(**keywds):
    """Return a handle to the document summaries of Entrez records.

    ESummary fetches document summaries for a list of primary IDs or for
    records held in the user's environment.

    All keyword arguments are passed on unchanged as ESummary parameters.
    They are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["Id"]
    30367
    >>> print record[0]["Title"]
    Computational biology and chemistry

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    return _open(url, dict(keywds))
00267 
def egquery(**keywds):
    """Return a handle to per-database hit counts for a global search.

    EGQuery runs a single search through Global Query and reports the
    number of matches in each Entrez database as XML.

    All keyword arguments are passed on unchanged as EGQuery parameters.
    They are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print row["Count"] > 60
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    return _open(url, dict(keywds))
00300 
def espell(**keywds):
    """Return a handle to spelling suggestions for a search term.

    ESpell retrieves spelling suggestions, if available.

    All keyword arguments are passed on unchanged as ESpell parameters.
    They are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez 
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print record["Query"] 
    biopythooon
    >>> print record["CorrectedQuery"] 
    biopython

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    return _open(url, dict(keywds))
00328 
def read(handle, validate=True):
    """Parse a complete Entrez XML result into Python objects.

    handle is a handle to XML produced by NCBI's Entrez Utilities.  The
    whole document is handed to a Parser.DataHandler and the record it
    builds is returned: a nested structure of Python lists and
    dictionaries.  Most XML files returned by the Entrez Utilities can be
    parsed this way, provided the DTD is available; Biopython includes the
    DTDs for the most commonly used Entrez Utilities.

    validate is passed to the DataHandler.  If it is True (default), the
    parser validates the XML file against the DTD and raises an error for
    tags that are not represented in the DTD.  If it is False, the parser
    simply skips such tags.

    Although the result looks like it is made of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type.  This makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).read(handle)
00353 
def parse(handle, validate=True):
    """Parse an Entrez XML result record by record into Python objects.

    handle is a handle to XML produced by NCBI's Entrez Utilities.  This
    function is meant for XML files that (in Python) can be represented as
    a list of individual records.  Whereas 'read' reads the complete file
    and returns a single Python list, 'parse' returns the result of
    Parser.DataHandler.parse, which hands out the records one by one.
    That makes it particularly useful for large files.

    Most XML files returned by the Entrez Utilities can be parsed this
    way, provided the DTD is available; Biopython includes the DTDs for
    the most commonly used Entrez Utilities.

    validate is passed to the DataHandler.  If it is True (default), the
    parser validates the XML file against the DTD and raises an error for
    tags that are not represented in the DTD.  If it is False, the parser
    simply skips such tags.

    Although each record looks like it is made of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type.  This makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).parse(handle)
00384 
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.

    Arguments:
     - cgi is the URL for the cgi script to access.
     - params is a dictionary with the options to pass to it (default is
       no options).  Entries whose value is None are left out of the
       request.  The dictionary itself is never modified.
     - post selects HTTP POST (options sent as the request body) instead
       of HTTP GET (options appended to the URL after a "?").

    Unless the caller supplied them in params, the module level tool and
    email settings are added to the request.  A UserWarning is issued if
    no email address is available from either source.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers, sleeping if the previous call was
    less than a third of a second ago.

    Returns the urllib2 response wrapped by _binary_to_string_handle.

    Errors raised by urllib2.urlopen (urllib2.HTTPError and
    urllib2.URLError, both IOError subclasses) are passed on unchanged.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Copy the parameters, leaving out None values.  Working on a copy keeps
    # the caller's dictionary intact and avoids deleting from a dictionary
    # while iterating over it.
    query = dict((key, value) for key, value in (params or {}).items()
                 if value is not None)
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in query:
        query["tool"] = tool
    # Tell Entrez who we are
    if "email" not in query:
        if email is not None:
            query["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.  doseq=True expands sequence values into
    # repeated key=value pairs.
    options = urllib.urlencode(query, doseq=True)
    if post:
        #HTTP POST
        handle = urllib2.urlopen(cgi, data=options)
    else:
        #HTTP GET
        handle = urllib2.urlopen(cgi + "?" + options)
    return _binary_to_string_handle(handle)


# Time (as returned by time.time()) at which the last request was allowed
# through; zero means no request has been made yet.
_open.previous = 0
00447 
00448 
def _test():
    """Run the doctests found in this module's docstrings (PRIVATE).

    A short progress message is printed before and after the run.  The
    examples call the Entrez functions, so they contact the NCBI servers.
    """
    import doctest
    # Parenthesised single-argument prints behave the same as the print
    # statement on Python 2.
    print("Runing doctests...")
    doctest.testmod()
    print("Done")
00455 
# When this file is executed directly as a script, run the doctests.
if __name__ == "__main__":
    _test()