Back to index

python-biopython  1.60
IgIO.py
Go to the documentation of this file.
00001 # Copyright 2008-2010 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 #
00006 # This module is for reading and writing IntelliGenetics format files as
00007 # SeqRecord objects.  This file format appears to be the same as the MASE
00008 # multiple sequence alignment format.
00009 
00010 """Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
00011 
00012 You are expected to use this module via the Bio.SeqIO functions."""
00013 
00014 from Bio.Alphabet import single_letter_alphabet
00015 from Bio.Seq import Seq
00016 from Bio.SeqRecord import SeqRecord
00017 
00018 #This is a generator function!
def IgIterator(handle, alphabet = single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The line following the commentary is used (with trailing whitespace
    removed) as both the id and the name of the SeqRecord.  The following
    lines, up to the next line starting with a semi-colon or the end of the
    file, are joined to give the sequence, after removing trailing
    whitespace and any spaces.  A single trailing digit one (the optional
    terminator) is removed from the sequence.

    Raises ValueError if the first line after the file header does not
    start with a semi-colon, if the file ends after a record's commentary
    lines without a title line, or if a digit one is found inside the
    sequence.
    """
    #Skip any file header text before the first record (;; lines).
    #An empty string from readline means end of file, which also stops
    #this loop since "" does not start with ";;".
    line = handle.readline()
    while line.startswith(";;"):
        line = handle.readline()

    #Only the first record needs this check; every later record starts on
    #the ';' line which ended the previous record's sequence.
    if line and line[0] != ";":
        raise ValueError(
              "Records should start with ';' and not:\n%s" % repr(line))

    while line:
        #Now iterate over the records

        #Try and agree with SeqRecord convention from the GenBank parser,
        #(and followed in the SwissProt parser) which stores the comments
        #as a long string with newlines under annotations key 'comment'.

        #Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()
        if not line:
            #The file stopped after the commentary, so there is no title
            #line or sequence.  Fail loudly rather than yield a record
            #with an empty identifier and an empty sequence.
            raise ValueError(
                "Premature end of file, missing title line after "
                "record comment lines.")
        title = line.rstrip()

        #Collect sequence lines until the next record (';') or end of file
        seq_lines = []
        line = handle.readline()
        while line and line[0] != ";":
            #Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ",""))
            line = handle.readline()
        seq_str = "".join(seq_lines)
        if seq_str.endswith("1"):
            #Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        #Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet),
                           id = title, name = title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    #The loop above only exits once readline has returned an empty string,
    #so we are at the end of the file now.
00082 
if __name__ == "__main__":
    #Quick self test: parse every *.txt file in the IntelliGenetics test
    #directory (relative to the current working directory) and print each
    #record's id and length.
    #
    #Each print is given a single string in parentheses, so the output is
    #identical whether print is a statement (Python 2) or a function
    #(Python 3).
    print("Running quick self test")

    import os
    path = "../../Tests/IntelliGenetics/"
    if os.path.isdir(path):
        for filename in os.listdir(path):
            if os.path.splitext(filename)[-1] == ".txt":
                print("")
                print(filename)
                print("-"*len(filename))
                handle = open(os.path.join(path, filename))
                try:
                    for record in IgIterator(handle):
                        print("%s %s" % (record.id, len(record)))
                finally:
                    #Close the file even if parsing raises an exception
                    handle.close()
        print("Done")
    else:
        print("Could not find input files")