Back to index

python-biopython  1.60
TabIO.py
Go to the documentation of this file.
00001 # Copyright 2008-2010 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Bio.SeqIO support for the "tab" (simple tab separated) file format.
00007 
00008 You are expected to use this module via the Bio.SeqIO functions.
00009 
00010 The "tab" format is an ad-hoc plain text file format where each sequence is
00011 on one (long) line.  Each line contains the identifier/description, followed
00012 by a tab, followed by the sequence.  For example, consider the following
00013 short FASTA format file:
00014 
00015 >ID123456 possible binding site?
00016 CATCNAGATGACACTACGACTACGACTCAGACTAC
00017 >ID123457 random sequence
00018 ACACTACGACTACGACTCAGACTACAAN
00019 
00020 Apart from the descriptions, this can be represented in the simple two column
00021 tab separated format as follows:
00022 
00023 ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC
00024 ID123457(tab)ACACTACGACTACGACTCAGACTACAAN
00025 
00026 When reading this file, "ID123456" or "ID123457" will be taken as the record's
00027 .id and .name property.  There is no other information to record.
00028 
00029 Similarly, when writing to this format, Biopython will ONLY record the record's
00030 .id and .seq (and not the description or any other information) as in the
00031 example above.
00032 """
00033 
00034 from Bio.Alphabet import single_letter_alphabet
00035 from Bio.Seq import Seq
00036 from Bio.SeqRecord import SeqRecord
00037 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
00038 
00039 #This is a generator function!
def TabIterator(handle, alphabet = single_letter_alphabet):
    """Iterate over tab separated lines, yielding SeqRecord objects.

    Each line of the file should contain exactly one tab, dividing the line
    into an identifier and the full sequence.

    Arguments:
     - handle - input file (anything that can be iterated line by line)
     - alphabet - optional alphabet, passed to the Seq of every record

    The first field is taken as the record's .id and .name (regardless of
    any spaces within the text) and the second field is the sequence.
    Leading and trailing whitespace is stripped from both fields, and the
    record's description is always the empty string.

    Any blank lines are ignored.

    Raises ValueError for any non-blank line which does not contain exactly
    one tab.
    """
    for line in handle:
        try:
            #Unpacking raises ValueError unless there is exactly one tab
            title, seq = line.split("\t")
        except ValueError:
            #Catch only the unpacking failure.  A bare except here would
            #also trap KeyboardInterrupt/SystemExit and report unrelated
            #problems as a misleading "wrong number of tabs" error.
            if line.strip() == "":
                #It's a blank line, ignore it
                continue
            raise ValueError("Each line should have one tab separating the"
                             " title and sequence, this line has %i tabs: %s"
                             % (line.count("\t"), repr(line)))
        title = title.strip()
        seq = seq.strip() #removes the trailing new line
        yield SeqRecord(Seq(seq, alphabet),
                        id=title, name=title,
                        description="")
00069 
class TabWriter(SequentialSequenceWriter):
    """Writer for the simple two column tab separated format.

    Every record becomes one line of the form "id(tab)sequence".

    The description, name and any other annotation are not written.
    """
    def write_record(self, record):
        """Write one record to the handle as a single tab separated line."""
        assert self._header_written
        assert not self._footer_written
        self._record_written = True

        identifier = self.clean(record.id)
        sequence = self._get_seq_string(record) #Catches sequence being None
        #Neither column may hold a tab or a line break, since either would
        #break the one-record-per-line, two column layout.
        for field in (identifier, sequence):
            for forbidden in ("\t", "\n", "\r"):
                assert forbidden not in field
        line = "%s\t%s\n" % (identifier, sequence)
        self.handle.write(line)
00092 
00093 
if __name__ == "__main__":
    #Quick self test of TabIterator.
    #A single parenthesised argument keeps print valid both as a Python 2
    #statement and as a Python 3 function call.
    print("Running quick self test")
    try:
        from StringIO import StringIO #Python 2
    except ImportError:
        from io import StringIO #Python 3

    #This example has a trailing blank line which should be ignored
    handle = StringIO("Alpha\tAAAAAAA\nBeta\tCCCCCCC\n\n")
    records = list(TabIterator(handle))
    assert len(records) == 2
    #Check what was parsed as well, not just how many records came back
    assert [r.id for r in records] == ["Alpha", "Beta"]
    assert [r.name for r in records] == ["Alpha", "Beta"]
    assert [str(r.seq) for r in records] == ["AAAAAAA", "CCCCCCC"]

    #The first line here has two tabs, so parsing must fail
    handle = StringIO("Alpha\tAAAAAAA\tExtra\nBeta\tCCCCCCC\n")
    try:
        records = list(TabIterator(handle))
    except ValueError:
        #Good!
        pass
    else:
        assert False, "Should have reject this invalid example!"

    print("Done")
00111     print "Done"