Back to index

python-biopython  1.60
_convert.py
Go to the documentation of this file.
00001 # Copyright 2009 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """Optimised sequence conversion code (PRIVATE).
00006 
00007 You are not expected to access this module, or any of its code, directly. This
00008 is all handled internally by the Bio.SeqIO.convert(...) function which is the
00009 public interface for this.
00010 
00011 The idea here is rather while doing this will work:
00012 
00013 from Bio import SeqIO
00014 records = SeqIO.parse(in_handle, in_format)
00015 count = SeqIO.write(records, out_handle, out_format)
00016 
00017 it is shorter to write:
00018 
00019 from Bio import SeqIO
00020 count = SeqIO.convert(in_handle, in_format, out_handle, out_format)
00021 
00022 Also, the convert function can take a number of special case optimisations. This
00023 means that using Bio.SeqIO.convert() may be faster, as well as more convenient.
00024 All these file format specific optimisations are handled by this (private) module.
00025 """
00026 
00027 from Bio import SeqIO
00028 #NOTE - Lots of lazy imports further on...
00029 
00030 def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
00031     """Fast GenBank to FASTA (PRIVATE)."""
00032     #We don't need to parse the features...
00033     from Bio.GenBank.Scanner import GenBankScanner
00034     records = GenBankScanner().parse_records(in_handle, do_features=False)
00035     #For FASTA output we can ignore the alphabet too
00036     return SeqIO.write(records, out_handle, "fasta")
00037 
00038 
00039 def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
00040     """Fast EMBL to FASTA (PRIVATE)."""
00041     #We don't need to parse the features...
00042     from Bio.GenBank.Scanner import EmblScanner
00043     records = EmblScanner().parse_records(in_handle, do_features=False)
00044     #For FASTA output we can ignore the alphabet too
00045     return SeqIO.write(records, out_handle, "fasta")
00046 
00047 
00048 def _fastq_generic(in_handle, out_handle, mapping):
00049     """FASTQ helper function where can't have data loss by truncation (PRIVATE)."""
00050     from Bio.SeqIO.QualityIO import FastqGeneralIterator
00051     #For real speed, don't even make SeqRecord and Seq objects!
00052     count = 0
00053     null = chr(0)
00054     for title, seq, old_qual in FastqGeneralIterator(in_handle):
00055         count += 1
00056         #map the qual...
00057         qual = old_qual.translate(mapping)
00058         if null in qual:
00059             raise ValueError("Invalid character in quality string")
00060         out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
00061     return count
00062 
00063     
00064 def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
00065     """FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
00066     from Bio.SeqIO.QualityIO import FastqGeneralIterator
00067     #For real speed, don't even make SeqRecord and Seq objects!
00068     count = 0
00069     null = chr(0)
00070     for title, seq, old_qual in FastqGeneralIterator(in_handle):
00071         count += 1
00072         #map the qual...
00073         qual = old_qual.translate(mapping)
00074         if null in qual:
00075             raise ValueError("Invalid character in quality string")
00076         if truncate_char in qual:
00077             qual = qual.replace(truncate_char, chr(126))
00078             import warnings
00079             warnings.warn(truncate_msg)
00080         out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
00081     return count
00082 
00083 
00084 def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
00085     """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
00086 
00087     Useful for removing line wrapping and the redundant second identifier
00088     on the plus lines. Will check also check the quality string is valid.
00089 
00090     Avoids creating SeqRecord and Seq objects in order to speed up this
00091     conversion.
00092     """
00093     #Map unexpected chars to null
00094     mapping = "".join([chr(0) for ascii in range(0, 33)] \
00095                      +[chr(ascii) for ascii in range(33, 127)] \
00096                      +[chr(0) for ascii in range(127, 256)])
00097     assert len(mapping)==256
00098     return _fastq_generic(in_handle, out_handle, mapping)
00099 
00100 
00101 def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
00102     """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
00103 
00104     Useful for removing line wrapping and the redundant second identifier
00105     on the plus lines. Will check also check the quality string is valid.
00106     Avoids creating SeqRecord and Seq objects in order to speed up this
00107     conversion.
00108     """
00109     #Map unexpected chars to null
00110     mapping = "".join([chr(0) for ascii in range(0, 59)] \
00111                      +[chr(ascii) for ascii in range(59, 127)] \
00112                      +[chr(0) for ascii in range(127, 256)])
00113     assert len(mapping)==256
00114     return _fastq_generic(in_handle, out_handle, mapping)
00115 
00116 
00117 def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
00118     """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
00119 
00120     Useful for removing line wrapping and the redundant second identifier
00121     on the plus lines. Will check also check the quality string is valid.
00122     Avoids creating SeqRecord and Seq objects in order to speed up this
00123     conversion.
00124     """
00125     #Map unexpected chars to null
00126     mapping = "".join([chr(0) for ascii in range(0, 64)] \
00127                      +[chr(ascii) for ascii in range(64,127)] \
00128                      +[chr(0) for ascii in range(127,256)])
00129     assert len(mapping)==256
00130     return _fastq_generic(in_handle, out_handle, mapping)
00131 
00132 
00133 def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
00134     """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
00135 
00136     Avoids creating SeqRecord and Seq objects in order to speed up this
00137     conversion.
00138     """
00139     #Map unexpected chars to null
00140     mapping = "".join([chr(0) for ascii in range(0, 64)] \
00141                      +[chr(33+q) for q in range(0, 62+1)] \
00142                      +[chr(0) for ascii in range(127, 256)])
00143     assert len(mapping)==256
00144     return _fastq_generic(in_handle, out_handle, mapping)
00145 
00146 
00147 def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
00148     """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
00149 
00150     Avoids creating SeqRecord and Seq objects in order to speed up this
00151     conversion. Will issue a warning if the scores had to be truncated at 62
00152     (maximum possible in the Illumina 1.3+ FASTQ format)
00153     """
00154     #Map unexpected chars to null
00155     trunc_char = chr(1)
00156     mapping = "".join([chr(0) for ascii in range(0, 33)] \
00157                      +[chr(64+q) for q in range(0, 62+1) ] \
00158                      +[trunc_char for ascii in range(96,127)] \
00159                      +[chr(0) for ascii in range(127, 256)])
00160     assert len(mapping)==256
00161     return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
00162                           "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
00163 
00164 
00165 def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
00166     """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE).
00167 
00168     Avoids creating SeqRecord and Seq objects in order to speed up this
00169     conversion.
00170     """
00171     #Map unexpected chars to null
00172     from Bio.SeqIO.QualityIO import phred_quality_from_solexa
00173     mapping = "".join([chr(0) for ascii in range(0, 59)] \
00174                      +[chr(33+int(round(phred_quality_from_solexa(q)))) \
00175                        for q in range(-5, 62+1)]\
00176                       +[chr(0) for ascii in range(127, 256)])
00177     assert len(mapping)==256
00178     return _fastq_generic(in_handle, out_handle, mapping)
00179 
00180 def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
00181     """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).
00182 
00183     Avoids creating SeqRecord and Seq objects in order to speed up this
00184     conversion. Will issue a warning if the scores had to be truncated at 62
00185     (maximum possible in the Solexa FASTQ format)
00186     """
00187     #Map unexpected chars to null
00188     from Bio.SeqIO.QualityIO import solexa_quality_from_phred
00189     trunc_char = chr(1)
00190     mapping = "".join([chr(0) for ascii in range(0, 33)] \
00191                      +[chr(64+int(round(solexa_quality_from_phred(q)))) \
00192                        for q in range(0, 62+1)] \
00193                      +[trunc_char for ascii in range(96, 127)] \
00194                      +[chr(0) for ascii in range(127, 256)])
00195     assert len(mapping)==256
00196     return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
00197                           "Data loss - max Solexa quality 62 in Solexa FASTQ")
00198 
00199 
00200 def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
00201     """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
00202 
00203     Avoids creating SeqRecord and Seq objects in order to speed up this
00204     conversion.
00205     """
00206     #Map unexpected chars to null
00207     from Bio.SeqIO.QualityIO import phred_quality_from_solexa
00208     mapping = "".join([chr(0) for ascii in range(0, 59)] \
00209                      +[chr(64+int(round(phred_quality_from_solexa(q)))) \
00210                        for q in range(-5, 62+1)]\
00211                       +[chr(0) for ascii in range(127, 256)])
00212     assert len(mapping)==256
00213     return _fastq_generic(in_handle, out_handle, mapping)
00214 
00215 
00216 def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
00217     """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE).
00218 
00219     Avoids creating SeqRecord and Seq objects in order to speed up this
00220     conversion.
00221     """
00222     #Map unexpected chars to null
00223     from Bio.SeqIO.QualityIO import solexa_quality_from_phred
00224     trunc_char = chr(1)
00225     mapping = "".join([chr(0) for ascii in range(0, 64)] \
00226                      +[chr(64+int(round(solexa_quality_from_phred(q)))) \
00227                        for q in range(0, 62+1)] \
00228                      +[chr(0) for ascii in range(127, 256)])
00229     assert len(mapping)==256
00230     return _fastq_generic(in_handle, out_handle, mapping)
00231 
00232 
00233 def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
00234     """Fast FASTQ to FASTA conversion (PRIVATE).
00235 
00236     Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
00237     Seq objects in order to speed up this conversion.
00238 
00239     NOTE - This does NOT check the characters used in the FASTQ quality string
00240     are valid!
00241     """
00242     from Bio.SeqIO.QualityIO import FastqGeneralIterator
00243     #For real speed, don't even make SeqRecord and Seq objects!
00244     count = 0
00245     for title, seq, qual in FastqGeneralIterator(in_handle):
00246         count += 1
00247         out_handle.write(">%s\n" % title)
00248         #Do line wrapping
00249         for i in range(0, len(seq), 60):
00250             out_handle.write(seq[i:i+60] + "\n")
00251     return count
00252 
00253 def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
00254     """Fast FASTQ to simple tabbed conversion (PRIVATE).
00255 
00256     Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
00257     Seq objects in order to speed up this conversion.
00258 
00259     NOTE - This does NOT check the characters used in the FASTQ quality string
00260     are valid!
00261     """
00262     from Bio.SeqIO.QualityIO import FastqGeneralIterator
00263     #For real speed, don't even make SeqRecord and Seq objects!
00264     count = 0
00265     for title, seq, qual in FastqGeneralIterator(in_handle):
00266         count += 1
00267         out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
00268     return count
00269 
00270 def _fastq_convert_qual(in_handle, out_handle, mapping):
00271     """FASTQ helper function for QUAL output (PRIVATE).
00272 
00273     Mapping should be a dictionary mapping expected ASCII characters from the
00274     FASTQ quality string to PHRED quality scores (as strings).
00275     """
00276     from Bio.SeqIO.QualityIO import FastqGeneralIterator
00277     #For real speed, don't even make SeqRecord and Seq objects!
00278     count = 0
00279     for title, seq, qual in FastqGeneralIterator(in_handle):
00280         count += 1
00281         out_handle.write(">%s\n" % title)
00282         #map the qual...
00283         try:
00284             qualities_strs = [mapping[ascii] for ascii in qual]
00285         except KeyError:
00286             raise ValueError("Invalid character in quality string")
00287         data = " ".join(qualities_strs)
00288         while True:
00289             if len(data) <= 60:
00290                 out_handle.write(data + "\n")
00291                 break
00292             else:
00293                 #By construction there must be spaces in the first 60 chars
00294                 #(unless we have 60 digit or higher quality scores!)
00295                 i = data.rfind(" ", 0, 60)
00296                 out_handle.write(data[:i] + "\n")
00297                 data = data[i+1:]
00298     return count
00299 
00300     
00301 def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
00302     """Fast Sanger FASTQ to QUAL conversion (PRIVATE)."""
00303     mapping = dict((chr(q+33), str(q)) for q in range(0,93+1))
00304     return _fastq_convert_qual(in_handle, out_handle, mapping)
00305 
00306 
00307 def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
00308     """Fast Solexa FASTQ to QUAL conversion (PRIVATE)."""
00309     from Bio.SeqIO.QualityIO import phred_quality_from_solexa
00310     mapping = dict((chr(q+64), str(int(round(phred_quality_from_solexa(q))))) \
00311                    for q in range(-5,62+1))
00312     return _fastq_convert_qual(in_handle, out_handle, mapping)
00313 
00314 
00315 def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
00316     """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE)."""
00317     mapping = dict((chr(q+64), str(q)) for q in range(0,62+1))
00318     return _fastq_convert_qual(in_handle, out_handle, mapping)
00319 
00320 
00321 #TODO? - Handling aliases explicitly would let us shorten this list:
00322 _converter = {
00323     ("genbank", "fasta") : _genbank_convert_fasta,
00324     ("gb", "fasta") : _genbank_convert_fasta,
00325     ("embl", "fasta") : _embl_convert_fasta,
00326     ("fastq", "fasta") : _fastq_convert_fasta,
00327     ("fastq-sanger", "fasta") : _fastq_convert_fasta,
00328     ("fastq-solexa", "fasta") : _fastq_convert_fasta,
00329     ("fastq-illumina", "fasta") : _fastq_convert_fasta,
00330     ("fastq", "tab") : _fastq_convert_tab,
00331     ("fastq-sanger", "tab") : _fastq_convert_tab,
00332     ("fastq-solexa", "tab") : _fastq_convert_tab,
00333     ("fastq-illumina", "tab") : _fastq_convert_tab,
00334     ("fastq", "fastq") : _fastq_sanger_convert_fastq_sanger,
00335     ("fastq-sanger", "fastq") : _fastq_sanger_convert_fastq_sanger,
00336     ("fastq-solexa", "fastq") : _fastq_solexa_convert_fastq_sanger,
00337     ("fastq-illumina", "fastq") : _fastq_illumina_convert_fastq_sanger,
00338     ("fastq", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
00339     ("fastq-sanger", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
00340     ("fastq-solexa", "fastq-sanger") : _fastq_solexa_convert_fastq_sanger,
00341     ("fastq-illumina", "fastq-sanger") : _fastq_illumina_convert_fastq_sanger,
00342     ("fastq", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
00343     ("fastq-sanger", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
00344     ("fastq-solexa", "fastq-solexa") : _fastq_solexa_convert_fastq_solexa,
00345     ("fastq-illumina", "fastq-solexa") : _fastq_illumina_convert_fastq_solexa,
00346     ("fastq", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
00347     ("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
00348     ("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina,
00349     ("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina,
00350     ("fastq", "qual") : _fastq_sanger_convert_qual,
00351     ("fastq-sanger", "qual") : _fastq_sanger_convert_qual,
00352     ("fastq-solexa", "qual") : _fastq_solexa_convert_qual,
00353     ("fastq-illumina", "qual") : _fastq_illumina_convert_qual,
00354     }
00355 
00356 def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
00357     """SeqIO conversion function (PRIVATE)."""
00358     try:
00359         f = _converter[(in_format, out_format)]
00360     except KeyError:
00361         f = None
00362     if f:
00363         return f(in_handle, out_handle, alphabet)
00364     else:
00365         records = SeqIO.parse(in_handle, in_format, alphabet)
00366         return SeqIO.write(records, out_handle, out_format)