Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2010-2011 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Provides code to access the TogoWS integrated webservices of DBCLS, Japan.
00007 
00008 This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See:
00009 http://togows.dbcls.jp/
00010 
00011 The TogoWS REST service provides simple access to a range of databases, acting
00012 as a proxy to shield you from all the different provider APIs. This works using
00013 simple URLs (which this module will construct for you). For more details, see
00014 http://togows.dbcls.jp/site/en/rest.html
00015 
00016 The functionality is somewhat similar to Biopython's Bio.Entrez module which
00017 provides access to the NCBI's Entrez Utilities (E-Utils) which also covers a
00018 wide range of databases.
00019 
00020 Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose
00021 requirements are reasonably clear). To avoid risking overloading the service,
00022 Biopython will only allow three calls per second.
00023 
00024 The TogoWS SOAP service offers a more complex API for calling web services
00025 (essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For
00026 example, this allows you to run a remote BLAST search at the DDBJ. This is
00027 not yet covered by this module, however there are lots of Python examples
00028 on the TogoWS website using the SOAPpy python library. See:
00029 http://togows.dbcls.jp/site/en/soap.html
00030 http://soapy.sourceforge.net/
00031 """
00032 
00033 import urllib
00034 import urllib2
00035 import time
00036 from Bio._py3k import _binary_to_string_handle, _as_bytes
00037 
#Root of the TogoWS REST service, all the request URLs are built from this:
_BASE_URL = "http://togows.dbcls.jp"

#Caches of what TogoWS reports it supports, filled in on first use:
_search_db_names = None #database names for search (None until fetched)
_entry_db_names = None #database names for entry (None until fetched)
_entry_db_fields = {} #maps database name to its list of field names
_entry_db_formats = {} #maps database name to its list of format names
_convert_formats = [] #list of [input format, output format] pairs
00047 
def _get_fields(url):
    """Queries a TogoWS URL for a plain text list of values (PRIVATE).

    url - complete URL (string) to request.

    Returns a list of strings, obtained by splitting the text of the
    response on whitespace. Any exception raised while opening or
    reading the URL is passed on to the caller.
    """
    handle = _open(url)
    try:
        text = handle.read()
    finally:
        #Release the connection even if the read fails part way through
        handle.close()
    return text.strip().split()
00054 
def _get_entry_dbs():
    """Fetches the list of values TogoWS gives at its /entry URL (PRIVATE)."""
    url = _BASE_URL + "/entry"
    return _get_fields(url)
00057 
def _get_entry_fields(db):
    """Fetches the list of values TogoWS gives for /entry/<db>?fields (PRIVATE).

    db - database name (string), inserted into the URL as is.
    """
    path = "/entry/%s?fields" % db
    return _get_fields(_BASE_URL + path)
00060 
def _get_entry_formats(db):
    """Fetches the list of values TogoWS gives for /entry/<db>?formats (PRIVATE).

    db - database name (string), inserted into the URL as is.
    """
    path = "/entry/%s?formats" % db
    return _get_fields(_BASE_URL + path)
00063 
def _get_convert_formats():
    """Fetches the values listed at the TogoWS /convert/ URL (PRIVATE).

    Returns a list of lists, each value having been split on the "."
    character (so a value of the form "in.out" becomes ["in", "out"]).
    """
    conversions = []
    for name in _get_fields(_BASE_URL + "/convert/"):
        conversions.append(name.split("."))
    return conversions
00067 
def entry(db, id, format=None, field=None):
    """TogoWS fetch entry (returns a handle).

    db - database (string), see list below.
    id - identifier (string) or a list of identifiers (either as a list or
         tuple of strings, or a single string with comma separators).
    format - return data file format (string), options depend on the database
             e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle)
    field - specific field from within the database record (string)
            e.g. "au" or "authors" for pubmed.

    At the time of writing, this includes the following:

    KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction,
          module, pathway
    DDBJ: ddbj, dad, pdb
    NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, omim,
          homologue, snp, mesh, pubmed
    EBI:  embl, uniprot, uniparc, uniref100, uniref90, uniref50

    For the current list, please see http://togows.dbcls.jp/entry/

    The database, field and format are checked against the lists which
    TogoWS itself reports (these lists are cached after the first request),
    and a ValueError is raised for any value which is not listed.

    This function is essentially equivalent to the NCBI Entrez service
    EFetch, available in Biopython as Bio.Entrez.efetch(...), but that
    does not offer field extraction.
    """
    #Only _entry_db_names is rebound here, the two dictionaries are just
    #updated in place, but all three caches are named for clarity.
    global _entry_db_names, _entry_db_fields, _entry_db_formats
    if _entry_db_names is None:
        _entry_db_names = _get_entry_dbs()
    if db not in _entry_db_names:
        raise ValueError("TogoWS entry fetch does not officially support "
                         "database '%s'." % db)
    if field:
        try:
            fields = _entry_db_fields[db]
        except KeyError:
            #First use of this database, ask TogoWS and remember the answer
            fields = _get_entry_fields(db)
            _entry_db_fields[db] = fields
        if field not in fields:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "field '%s' for database '%s'. Only: %s"
                             % (field, db, ", ".join(sorted(fields))))
    if format:
        try:
            formats = _entry_db_formats[db]
        except KeyError:
            #First use of this database, ask TogoWS and remember the answer
            formats = _get_entry_formats(db)
            _entry_db_formats[db] = formats
        if format not in formats:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "format '%s' for database '%s'. Only: %s"
                             % (format, db, ", ".join(sorted(formats))))

    if isinstance(id, (list, tuple)):
        #Several identifiers are sent as one comma separated string
        id = ",".join(id)
    url = _BASE_URL + "/entry/%s/%s" % (db, urllib.quote(id))
    if field:
        url += "/" + field
    if format:
        url += "." + format
    return _open(url)
00129 
def search_count(db, query):
    """TogoWS search count (returns an integer).

    db - database (string), see http://togows.dbcls.jp/search
    query - search term (string)

    If the database is not in the list reported by TogoWS (which is cached
    after the first request) a warning is issued, but the search is still
    attempted. A ValueError is raised if the response is not an integer.

    You could then use the count to download a large set of search results in
    batches using the offset and limit options to Bio.TogoWS.search(). In
    general however the Bio.TogoWS.search_iter() function is simpler to use.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        #TODO - Make this a ValueError? Right now despite the HTML website
        #claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not officially support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    handle = _open(_BASE_URL + "/search/%s/%s/count"
                   % (db, urllib.quote(query)))
    try:
        #Close the handle even if the response is not a plain integer
        count = int(handle.read().strip())
    finally:
        handle.close()
    return count
00154 
def search_iter(db, query, limit=None, batch=100):
    """TogoWS search iterating over the results (generator function).

    db - database (string), see http://togows.dbcls.jp/search
    query - search term (string)
    limit - optional upper bound on number of search results
    batch - number of search results to pull back each time talk to
            TogoWS (currently limited to 100).

    You would use this function within a for loop, e.g.

    >>> for id in search_iter("pubmed", "lung+cancer+drug", limit=10):
    ...     print id #maybe fetch data with entry?

    Internally this first calls the Bio.TogoWS.search_count() and then
    uses Bio.TogoWS.search() to get the results in batches.

    A RuntimeError is raised if a batch repeats the previous batch, or
    includes an identifier which was in the previous batch.
    """
    count = search_count(db, query)
    if not count:
        #A plain return is how a generator finishes early. Raising
        #StopIteration inside the generator body would be turned into
        #a RuntimeError under PEP 479 (the default from Python 3.7).
        return
    #NOTE - We leave it to TogoWS to enforce any upper bound on each
    #batch, they currently return an HTTP 400 Bad Request if above 100.
    remain = count
    if limit is not None:
        remain = min(remain, limit)
    offset = 1 #They don't use zero based counting
    prev_ids = [] #Just cache the last batch for error checking
    while remain:
        batch = min(batch, remain)
        handle = search(db, query, offset, batch)
        try:
            ids = handle.read().strip().split()
        finally:
            #Do not leave one open connection behind for every batch
            handle.close()
        assert len(ids)==batch, "Got %i, expected %i" % (len(ids), batch)
        if ids == prev_ids:
            raise RuntimeError("Same search results for previous offset")
        for identifier in ids:
            if identifier in prev_ids:
                raise RuntimeError("Result %s was in previous batch"
                                   % identifier)
            yield identifier
        offset += batch
        remain -= batch
        prev_ids = ids
def search(db, query, offset=None, limit=None, format=None):
    """TogoWS search (returns a handle).

    This is a low level wrapper for the TogoWS search function, which
    can return results in a several formats. In general, the search_iter
    function is more suitable for end users.

    db - database (string), see http://togows.dbcls.jp/search/
    query - search term (string)
    offset, limit - optional integers specifying which result to start from
            (1 based) and the number of results to return.
    format - return data file format (string), e.g. "json", "ttl" (RDF)
             By default plain text is returned, one result per line.

    A ValueError is raised if only one of offset and limit is given, or if
    either cannot be turned into an integer of at least one. If the database
    is not in the list reported by TogoWS (which is cached after the first
    request) a warning is issued, but the search is still attempted.

    At the time of writing, TogoWS applies a default count limit of 100
    search results, and this is an upper bound. To access more results,
    use the offset argument or the search_iter(...) function.

    TogoWS supports a long list of databases, including many from the NCBI
    (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and
    "ncbi-taxonomy"), EBI (e.g. "ebi-ebml" or "embl", "ebi-uniprot" or
    "uniprot", "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").
    For the current list, see http://togows.dbcls.jp/search/

    The NCBI provide the Entrez Search service (ESearch) which is similar,
    available in Biopython as the Bio.Entrez.esearch() function.

    See also the function Bio.TogoWS.search_count() which returns the number
    of matches found, and the Bio.TogoWS.search_iter() function which allows
    you to iterate over the search results (taking care of batching for you).
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        #TODO - Make this a ValueError? Right now despite the HTML website
        #claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not explicitly support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s" % (db, urllib.quote(query))
    if offset is not None and limit is not None:
        #Only catch the errors int(...) raises for bad values, a bare
        #except would also swallow KeyboardInterrupt and SystemExit.
        #The value goes in a one element tuple so that the message can
        #be built even when the bad value is itself a tuple.
        try:
            offset = int(offset)
        except (TypeError, ValueError, OverflowError):
            raise ValueError("Offset should be an integer (at least one), not %r" % (offset,))
        try:
            limit = int(limit)
        except (TypeError, ValueError, OverflowError):
            raise ValueError("Limit should be an integer (at least one), not %r" % (limit,))
        if offset <= 0:
            raise ValueError("Offset should be at least one, not %i" % offset)
        if limit <= 0:
            raise ValueError("Count should be at least one, not %i" % limit)
        url += "/%i,%i" % (offset, limit)
    elif offset is not None or limit is not None:
        raise ValueError("Expect BOTH offset AND limit to be provided (or neither)")
    if format:
        url += "." + format
    return _open(url)
00260 
def convert(data, in_format, out_format):
    """TogoWS convert (returns a handle).

    data - string or handle containing input record(s)
    in_format - string describing the input file format (e.g. "genbank")
    out_format - string describing the requested output format (e.g. "fasta")

    The pair of formats is checked against the list of conversions which
    TogoWS reports (cached after the first request), and a ValueError
    listing the available conversions is raised if it is not included.

    For a list of supported conversions (e.g. "genbank" to "fasta"), see
    http://togows.dbcls.jp/convert/

    Note that Biopython has built in support for conversion of sequence and
    alignment file formats (functions Bio.SeqIO.convert and Bio.AlignIO.convert)
    """
    global _convert_formats
    if not _convert_formats:
        _convert_formats = _get_convert_formats()
    if [in_format, out_format] not in _convert_formats:
        choices = ["%s -> %s" % tuple(pair) for pair in _convert_formats]
        msg = "\n".join(choices)
        raise ValueError("Unsupported conversion. Choose from:\n%s" % msg)
    url = _BASE_URL + "/convert/%s.%s" % (in_format, out_format)
    #TODO - Should we just accept a string not a handle? What about a filename?
    #Anything with a read method is taken to be a handle, otherwise the
    #data is assumed to be a string already.
    if hasattr(data, "read"):
        payload = data.read()
    else:
        payload = data
    return _open(url, post={"data":payload})
00288 
def _open(url, post=None):
    """Helper function to open a handle to a TogoWS URL (PRIVATE).

    url - complete URL (string) to request.
    post - optional dictionary of values. If given (and not empty) it is
           URL encoded and passed to urllib2.urlopen as the request data,
           otherwise the URL is opened without any data.

    Returns the handle from urllib2.urlopen after it has been wrapped by
    the function _binary_to_string_handle.

    Open a handle to TogoWS, will raise an IOError if it encounters an error.
    Any exception from urllib2 (e.g. an HTTPError) is passed on unchanged.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333 #one third of a second
    current = time.time()
    #How much longer until a full delay has passed since the last request?
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    #We trust TogoWS to have set an HTTP error code if there is a problem
    #(which urllib2 turns into an exception), that suffices for my current
    #unit tests. Previously we would examine the start of the data
    #returned back.
    if post:
        handle = urllib2.urlopen(url, _as_bytes(urllib.urlencode(post)))
    else:
        handle = urllib2.urlopen(url)
    return _binary_to_string_handle(handle)

#Time of the previous request (as given by time.time), starting at zero
#means the first request never has to wait.
_open.previous = 0