Back to index

python-biopython  1.60
FastaIO.py
Go to the documentation of this file.
00001 # Copyright 2006-2009 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 #
00006 # This module is for reading and writing FASTA format files as SeqRecord
00007 # objects.  The code is partly inspired  by earlier Biopython modules,
00008 # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA
00009 
00010 """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
00011 
00012 You are expected to use this module via the Bio.SeqIO functions."""
00013 
00014 from Bio.Alphabet import single_letter_alphabet
00015 from Bio.Seq import Seq
00016 from Bio.SeqRecord import SeqRecord
00017 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
00018 
00019 #This is a generator function!
00020 def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None):
00021     """Generator function to iterate over Fasta records (as SeqRecord objects).
00022 
00023     handle - input file
00024     alphabet - optional alphabet
00025     title2ids - A function that, when given the title of the FASTA
00026     file (without the beginning >), will return the id, name and
00027     description (in that order) for the record as a tuple of strings.
00028 
00029     If this is not given, then the entire title line will be used
00030     as the description, and the first word as the id and name.
00031 
00032     Note that use of title2ids matches that of Bio.Fasta.SequenceParser
00033     but the defaults are slightly different.
00034     """
00035     #Skip any text before the first record (e.g. blank lines, comments)
00036     while True:
00037         line = handle.readline()
00038         if line == "" : return #Premature end of file, or just empty?
00039         if line[0] == ">":
00040             break
00041 
00042     while True:
00043         if line[0]!=">":
00044             raise ValueError("Records in Fasta files should start with '>' character")
00045         if title2ids:
00046             id, name, descr = title2ids(line[1:].rstrip())
00047         else:
00048             descr = line[1:].rstrip()
00049             try:
00050                 id = descr.split()[0]
00051             except IndexError:
00052                 assert not descr, repr(line)
00053                 #Should we use SeqRecord default for no ID?
00054                 id = ""
00055             name = id
00056 
00057         lines = []
00058         line = handle.readline()
00059         while True:
00060             if not line : break
00061             if line[0] == ">": break
00062             lines.append(line.rstrip())
00063             line = handle.readline()
00064 
00065         #Remove trailing whitespace, and any internal spaces
00066         #(and any embedded \r which are possible in mangled files
00067         #when not opened in universal read lines mode)
00068         result = "".join(lines).replace(" ", "").replace("\r", "")
00069 
00070         #Return the record and then continue...
00071         yield SeqRecord(Seq(result, alphabet),
00072                          id = id, name = name, description = descr)
00073 
00074         if not line : return #StopIteration
00075     assert False, "Should not reach this line"
00076 
00077 class FastaWriter(SequentialSequenceWriter):
00078     """Class to write Fasta format files."""
00079     def __init__(self, handle, wrap=60, record2title=None):
00080         """Create a Fasta writer.
00081 
00082         handle - Handle to an output file, e.g. as returned
00083                  by open(filename, "w")
00084         wrap -   Optional line length used to wrap sequence lines.
00085                  Defaults to wrapping the sequence at 60 characters
00086                  Use zero (or None) for no wrapping, giving a single
00087                  long line for the sequence.
00088         record2title - Optional function to return the text to be
00089                  used for the title line of each record.  By default the
00090                  a combination of the record.id and record.description
00091                  is used.  If the record.description starts with the
00092                  record.id, then just the record.description is used.
00093 
00094         You can either use:
00095 
00096         myWriter = FastaWriter(open(filename,"w"))
00097         writer.write_file(myRecords)
00098 
00099         Or, follow the sequential file writer system, for example:
00100 
00101         myWriter = FastaWriter(open(filename,"w"))
00102         writer.write_header() # does nothing for Fasta files
00103         ...
00104         Multiple calls to writer.write_record() and/or writer.write_records()
00105         ...
00106         writer.write_footer() # does nothing for Fasta files
00107         writer.close()
00108         """
00109         SequentialSequenceWriter.__init__(self, handle)
00110         #self.handle = handle
00111         self.wrap = None
00112         if wrap:
00113             if wrap < 1:
00114                 raise ValueError
00115         self.wrap = wrap
00116         self.record2title = record2title
00117 
00118     def write_record(self, record):
00119         """Write a single Fasta record to the file."""
00120         assert self._header_written
00121         assert not self._footer_written
00122         self._record_written = True
00123 
00124         if self.record2title:
00125             title=self.clean(self.record2title(record))
00126         else:
00127             id = self.clean(record.id)
00128             description = self.clean(record.description)
00129 
00130             #if description[:len(id)]==id:
00131             if description and description.split(None,1)[0]==id:
00132                 #The description includes the id at the start
00133                 title = description
00134             elif description:
00135                 title = "%s %s" % (id, description)
00136             else:
00137                 title = id
00138 
00139         assert "\n" not in title
00140         assert "\r" not in title
00141         self.handle.write(">%s\n" % title)
00142 
00143         data = self._get_seq_string(record) #Catches sequence being None
00144 
00145         assert "\n" not in data
00146         assert "\r" not in data
00147 
00148         if self.wrap:
00149             for i in range(0, len(data), self.wrap):
00150                 self.handle.write(data[i:i+self.wrap] + "\n")
00151         else:
00152             self.handle.write(data + "\n")
00153 
00154 if __name__ == "__main__":
00155     print "Running quick self test"
00156 
00157     import os
00158     from Bio.Alphabet import generic_protein, generic_nucleotide
00159 
00160     #Download the files from here:
00161     #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
00162     fna_filename = "NC_005213.fna"
00163     faa_filename = "NC_005213.faa"
00164 
00165     def genbank_name_function(text):
00166         text, descr = text.split(None,1)
00167         id = text.split("|")[3]
00168         name = id.split(".",1)[0]
00169         return id, name, descr
00170 
00171     def print_record(record):
00172         #See also bug 2057
00173         #http://bugzilla.open-bio.org/show_bug.cgi?id=2057
00174         print "ID:" + record.id
00175         print "Name:" + record.name
00176         print "Descr:" + record.description
00177         print record.seq
00178         for feature in record.annotations:
00179             print '/%s=%s' % (feature, record.annotations[feature])
00180         if record.dbxrefs:
00181             print "Database cross references:"
00182             for x in record.dbxrefs : print " - %s" % x
00183 
00184     if os.path.isfile(fna_filename):
00185         print "--------"
00186         print "FastaIterator (single sequence)"
00187         iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
00188         count=0
00189         for record in iterator:
00190             count=count+1
00191             print_record(record)
00192         assert count == 1
00193         print str(record.__class__)
00194 
00195     if os.path.isfile(faa_filename):
00196         print "--------"
00197         print "FastaIterator (multiple sequences)"
00198         iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
00199         count=0
00200         for record in iterator:
00201             count=count+1
00202             print_record(record)
00203             break
00204         assert count>0
00205         print str(record.__class__)
00206 
00207     from cStringIO import StringIO
00208     print "--------"
00209     print "FastaIterator (empty input file)"
00210     #Just to make sure no errors happen
00211     iterator = FastaIterator(StringIO(""))
00212     count = 0
00213     for record in iterator:
00214         count = count+1
00215     assert count==0
00216 
00217     print "Done"