Back to index

python-biopython  1.60
PhylipIO.py
Go to the documentation of this file.
00001 # Copyright 2006-2011 by Peter Cock.  All rights reserved.
00002 # Revisions copyright 2011 Brandon Invergo. All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 """
00007 AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools.
00008 
00009 You are expected to use this module via the Bio.AlignIO functions (or the
00010 Bio.SeqIO functions if you want to work directly with the gapped sequences).
00011 
00012 Support for "relaxed phylip" format is also provided. Relaxed phylip differs
00013 from standard phylip format in the following ways:
00014 
00015  * No whitespace is allowed in the sequence ID.
00016  * No truncation is performed. Instead, sequence IDs are padded to the longest
00017    ID length, rather than 10 characters. A space separates the sequence
00018    identifier from the sequence.
00019 
00020 Relaxed phylip is supported by RAxML and PHYML.
00021 
00022 Note
00023 ====
00024 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
00025 a dot/period (".") in a sequence is interpreted as meaning the same
00026 character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69
00027 http://evolution.genetics.washington.edu/phylip/doc/sequence.html says:
00028 
00029    "a period was also previously allowed but it is no longer allowed,
00030    because it sometimes is used in different senses in other programs"
00031 
00032 Biopython 1.58 or later treats dots/periods in the sequence as invalid, both
00033 for reading and writing. Older versions did nothing special with a dot/period.
00034 """
00035 import string
00036 
00037 from Bio.Seq import Seq
00038 from Bio.SeqRecord import SeqRecord
00039 from Bio.Align import MultipleSeqAlignment
00040 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
00041 
try:
    any
except NameError:
    # Very old interpreters (Python 2.4) lack the any() built-in, so
    # provide a pure Python stand-in with the same truthiness semantics.
    def any(iterable):
        """Return True if at least one element of iterable is true."""
        for item in iterable:
            if not item:
                continue
            return True
        return False

# Width (in characters) of the name column in strict PHYLIP files.
_PHYLIP_ID_WIDTH = 10
00053 
00054 
class PhylipWriter(SequentialAlignmentWriter):
    """Phylip alignment writer."""

    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Use this to write (another) single alignment to an open file.

        This code will write interlaced alignments (when the sequences are
        longer than 50 characters).

        Note that record identifiers are strictly truncated to id_width,
        defaulting to the value required to comply with the PHYLIP standard.

        Arguments:
         - alignment - the alignment to write; it must hold at least one
           record and all sequences must share the same non-zero length.
         - id_width - number of columns used for the (truncated and space
           padded) record name at the start of each line of the first block.

        Raises ValueError if the alignment is empty, if the sequences are
        empty or differ in length, if two cleaned-up names collide, or if a
        sequence contains a dot.  All of these checks are made before
        anything is written to the handle.

        For more information on the file format, please see:
        http://evolution.genetics.washington.edu/phylip/doc/sequence.html
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        #
        # Quoting the PHYLIP version 3.6 documentation:
        #
        # The name should be ten characters in length, filled out to
        # the full ten characters by blanks if shorter. Any printable
        # ASCII/ISO character is allowed in the name, except for
        # parentheses ("(" and ")"), square brackets ("[" and "]"),
        # colon (":"), semicolon (";") and comma (","). If you forget
        # to extend the names to ten characters in length by blanks,
        # the program [i.e. PHYLIP] will get out of synchronization
        # with the contents of the data file, and an error message will
        # result.
        #
        # Note that Tab characters count as only one character in the
        # species names. Their inclusion can cause trouble.
        names = []
        seen_names = set()
        for record in alignment:
            name = record.id.strip()
            #Either remove the banned characters, or map them to something
            #else like an underscore "_" or pipe "|" character...
            for char in "[](),":
                name = name.replace(char, "")
            for char in ":;":
                name = name.replace(char, "|")
            name = name[:id_width]
            if name in seen_names:
                raise ValueError("Repeated name %r (originally %r), "
                                 "possibly due to truncation"
                                 % (name, record.id))
            seen_names.add(name)
            names.append(name)

        # Validate the sequence data up front so that a bad record cannot
        # leave a half written alignment in the output handle.
        sequences = []
        for record in alignment:
            sequence = str(record.seq)
            if "." in sequence:
                raise ValueError("PHYLIP format no longer allows dots in "
                                 "sequence")
            sequences.append(sequence)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite.  The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks".  We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
        block = 0
        while True:
            for name, sequence in zip(names, sequences):
                if block == 0:
                    #Write name (already truncated) padded to id_width
                    handle.write(name.ljust(id_width))
                else:
                    #write indent
                    handle.write(" " * id_width)
                #Write five chunks of ten letters per line...
                for chunk in range(0, 5):
                    i = block * 50 + chunk * 10
                    #TODO - Force any gaps to be '-' character?  Look at the
                    #alphabet...
                    #TODO - How to cope with '?' in the sequence?
                    handle.write(" %s" % sequence[i:i + 10])
                    # ">=" (not ">") so that no empty trailing chunk is
                    # written when the length is a multiple of ten.
                    if i + 10 >= length_of_seqs:
                        break
                handle.write("\n")
            block += 1
            # ">=" (not ">") so that no whitespace-only block is written
            # when the length is an exact multiple of fifty.
            if block * 50 >= length_of_seqs:
                break
            handle.write("\n")
00149 
class PhylipIterator(AlignmentIterator):
    """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.

    Record identifiers are limited to at most 10 characters.

    It only copes with interlaced phylip files!  Sequential files won't work
    where the sequences are split over multiple lines.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """

    # Default truncation length
    id_width = _PHYLIP_ID_WIDTH

    def _is_header(self, line):
        """Return True if the line consists of exactly two integers."""
        # str.split() with no argument never yields empty strings, so no
        # extra filtering of the fields is needed.
        parts = line.split()
        if len(parts) != 2:
            return False  # First line should have two integers
        try:
            int(parts[0])
            int(parts[1])
        except ValueError:
            return False  # First line should have two integers
        return True

    def _split_id(self, line):
        """
        Extracts the sequence ID from a Phylip line, returning a tuple
        containing:

            (sequence_id, sequence_residues)

        The first id_width characters (10 by default) in the line are the
        sequence id, the remainder are sequence data.
        """
        seq_id = line[:self.id_width].strip()
        seq = line[self.id_width:].strip().replace(' ', '')
        return seq_id, seq

    def next(self):
        """Parse and return the next alignment in the file.

        Raises StopIteration at the end of the file, and ValueError if the
        header line is not two integers, if the number of records differs
        from self.records_per_alignment (when that is not None), if a
        sequence contains a dot, or if the file ends part way through a
        continuation block.
        """
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            # The declared sequence length must be an integer, but its
            # value is not otherwise used by this parser.
            int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            if "." in s:
                raise ValueError("PHYLIP format no longer allows dots in sequence")
            seqs.append([s])

        #Look for further blocks
        line = ""
        while True:
            #Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  #end of file
            if not line:
                break  #end of file

            if self._is_header(line):
                #Looks like the start of a concatenated alignment
                self._header = line
                break

            #Continuation block: one line per record, in the same order
            for i in range(number_of_seqs):
                s = line.strip().replace(" ", "")
                if "." in s:
                    raise ValueError("PHYLIP format no longer allows dots in sequence")
                seqs[i].append(s)
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  #end of file

        records = (SeqRecord(Seq("".join(chunks), self.alphabet),
                             id=seq_id, name=seq_id, description=seq_id)
                   for (seq_id, chunks) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
00264 
00265 # Relaxed Phylip
class RelaxedPhylipWriter(PhylipWriter):
    """
    Writer for the relaxed Phylip format
    """

    def write_alignment(self, alignment):
        """
        Write one alignment in relaxed phylip layout

        Raises ValueError if a (stripped) record identifier contains any
        whitespace character.
        """
        # Reject identifiers with embedded whitespace
        for record in alignment:
            name = record.id.strip()
            for blank in string.whitespace:
                if blank in name:
                    raise ValueError("Whitespace not allowed in identifier: %s"
                            % name)

        # The name column is as wide as the longest identifier plus one
        # character of padding.  With no records at all use a width of 1;
        # the parent implementation will then raise a ValueError.
        if len(alignment) == 0:
            id_width = 1
        else:
            longest = max(len(record.id.strip()) for record in alignment)
            id_width = longest + 1
        super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
00290 
00291 
class RelaxedPhylipIterator(PhylipIterator):
    """
    Relaxed Phylip format Iterator
    """

    def _split_id(self, line):
        """Returns the ID, sequence data from a line

        Extracts the sequence ID from a Phylip line, returning a tuple
        containing:

            (sequence_id, sequence_residues)

        For relaxed format - split at the first whitespace character

        Raises ValueError if the line cannot be split into an identifier
        followed by sequence data (for example an empty line, or a line
        holding a single word).
        """
        parts = line.split(None, 1)
        if len(parts) != 2:
            # Report the offending line rather than letting a tuple
            # unpacking error escape with an unhelpful message.
            raise ValueError("Could not split line into identifier "
                             "and sequence: %r" % line)
        seq_id, sequence = parts
        sequence = sequence.strip().replace(" ", "")
        return seq_id, sequence
00309 
00310 
class SequentialPhylipWriter(SequentialAlignmentWriter):
    """
    Sequential Phylip format Writer
    """
    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Write a single alignment in sequential Phylip layout.

        After the header line, each record is written on one line: its
        name truncated and space padded to id_width characters, directly
        followed by the whole sequence.

        Raises ValueError if the alignment is empty, if the sequences are
        empty or differ in length, if two cleaned-up names collide, or if a
        sequence contains a dot.  All of these checks are made before
        anything is written to the handle.
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        names = []
        seen_names = set()
        for record in alignment:
            name = record.id.strip()
            #Either remove the banned characters, or map them to something
            #else like an underscore "_" or pipe "|" character...
            for char in "[](),":
                name = name.replace(char, "")
            for char in ":;":
                name = name.replace(char, "|")
            name = name[:id_width]
            if name in seen_names:
                raise ValueError("Repeated name %r (originally %r), "
                                 "possibly due to truncation"
                                 % (name, record.id))
            seen_names.add(name)
            names.append(name)

        # Validate the sequence data up front so that a bad record cannot
        # leave a half written alignment in the output handle.
        sequences = []
        for record in alignment:
            sequence = str(record.seq)
            if "." in sequence:
                raise ValueError("PHYLIP format no longer allows dots in "
                                 "sequence")
            sequences.append(sequence)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite.  The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks".  We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
        for name, sequence in zip(names, sequences):
            handle.write(name.ljust(id_width))
            # Write the entire sequence to one line (see the sequential
            # format notes in the SequentialPhylipIterator docstring)
            handle.write(sequence)
            handle.write("\n")
00361         
00362 
class SequentialPhylipIterator(PhylipIterator):
    """
    Sequential Phylip format Iterator

    The sequential format carries the same restrictions as the normal
    interleaved one, with the difference being that the sequences are listed
    sequentially, each sequence written in its entirety before the start of
    the next. According to the PHYLIP documentation for input file formatting,
    newlines and spaces may optionally be entered at any point in the sequences.
    """
    def next(self):
        """Parse and return the next alignment in the file.

        Raises StopIteration at the end of the file, and ValueError if the
        header line is not two integers, if the number of records differs
        from self.records_per_alignment (when that is not None), if
        continuation lines make a sequence longer than the length declared
        in the header, or if a sequence contains a dot.
        """
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        # str.split() with no argument never yields empty strings, so no
        # extra filtering of the fields is needed.
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                raw_line = handle.readline()
                if not raw_line:
                    # End of file.  This must be tested on the unstripped
                    # line, otherwise a blank line looks like end of file.
                    break
                line = raw_line.strip()
                if not line:
                    # Blank lines may appear anywhere within a sequence
                    continue
                s = "".join([s, line.replace(" ", "")])
                if len(s) > length_of_seqs:
                    raise ValueError("Found a record of length %i, should be %i" \
                            % (len(s), length_of_seqs))
            if "." in s:
                raise ValueError("PHYLIP format no longer allows dots in sequence")
            seqs.append(s)
        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (SeqRecord(Seq(residues, self.alphabet),
                             id=seq_id, name=seq_id, description=seq_id)
                   for (seq_id, residues) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
00439         
00440 
if __name__=="__main__":
    # Self-test, run only when this file is executed as a script.
    print("Running short mini-test")

    phylip_text="""     8    286
V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG

           MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
           ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
           TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
           TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
           LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
           TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
           MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
           TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS

           LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
           LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
           LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
           LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
           LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
           LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
           LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
           LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE

           DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
           DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
           DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
           DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
           DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
           DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
           DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
           SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA

           FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
           YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
           TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
           TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
           ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
           VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
           AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
           AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE

           LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
           LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
           VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
           LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
           LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
           LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
           LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
           LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
"""

    # Prefer the C implementation where it exists, otherwise fall back to
    # the io module.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    handle = StringIO(phylip_text)
    count=0
    for alignment in PhylipIterator(handle):
        for record in alignment:
            count=count+1
            print(record.id)
            #print(str(record.seq))
    assert count == 8

    expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
    fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
    aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
    dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
    alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
    # "record" is still bound to the last record of the loop above.
    assert str(record.seq).replace("-","") == expected

    #From here:
    #http://atgc.lirmm.fr/phyml/usersguide.html
    phylip_text2="""5 60
Tax1        CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
Tax2        CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
Tax3        CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
Tax4        TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
Tax5        CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG

GAAATGGTCAATATTACAAGGT
GAAATGGTCAACATTAAAAGAT
GAAATCGTCAATATTAAAAGGT
GAAATGGTCAATCTTAAAAGGT
GAAATGGTCAATATTAAAAGGT"""

    phylip_text3="""5 60
Tax1        CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
Tax2        CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
Tax3        CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
Tax4        TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
Tax5        CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""

    handle = StringIO(phylip_text2)
    list2 = list(PhylipIterator(handle))
    handle.close()
    assert len(list2)==1
    assert len(list2[0])==5

    handle = StringIO(phylip_text3)
    list3 = list(PhylipIterator(handle))
    handle.close()
    assert len(list3)==1
    assert len(list3[0])==5

    # The interlaced and single-block versions must parse identically.
    for i in range(0,5):
        assert list2[0][i].id == list3[0][i].id
        assert str(list2[0][i].seq) == str(list3[0][i].seq)

    #From here:
    #http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    #Note the lack of any white space between names 2 and 3 and their seqs.
    phylip_text4="""  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT
Salmo gairAAGCCTTGGC AGTGCAGGGT
H. SapiensACCGGTTGGC CGTTCAGGGT
Chimp     AAACCCTTGC CGTTACGCTT
Gorilla   AAACCCTTGC CGGTACGCTT

GAGCCCGGGC AATACAGGGT AT
GAGCCGTGGC CGGGCACGGT AT
ACAGGTTGGC CGTTCAGGGT AA
AAACCGAGGC CGGGACACTC AT
AAACCATTGC CGGTACGCTT AA"""

    #From here:
    #http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    phylip_text5="""  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT
GAGCCCGGGC AATACAGGGT AT
Salmo gairAAGCCTTGGC AGTGCAGGGT
GAGCCGTGGC CGGGCACGGT AT
H. SapiensACCGGTTGGC CGTTCAGGGT
ACAGGTTGGC CGTTCAGGGT AA
Chimp     AAACCCTTGC CGTTACGCTT
AAACCGAGGC CGGGACACTC AT
Gorilla   AAACCCTTGC CGGTACGCTT
AAACCATTGC CGGTACGCTT AA"""

    phylip_text5a="""  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
Chimp     AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
Gorilla   AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""

    handle = StringIO(phylip_text4)
    list4 = list(PhylipIterator(handle))
    handle.close()
    assert len(list4)==1
    assert len(list4[0])==5

    handle = StringIO(phylip_text5)
    try:
        list5 = list(PhylipIterator(handle))
        assert len(list5)==1
        assert len(list5[0])==5
        print("That should have failed...")
    except ValueError:
        print("Evil multiline non-interlaced example failed as expected")
    handle.close()

    handle = StringIO(phylip_text5a)
    list5 = list(PhylipIterator(handle))
    handle.close()
    assert len(list5)==1
    assert len(list5[0])==5

    print("Concatenation")
    handle = StringIO(phylip_text4 + "\n" + phylip_text4)
    assert len(list(PhylipIterator(handle))) == 2

    handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text)
    assert len(list(PhylipIterator(handle))) == 3

    print("OK")

    print("Checking write/read")
    handle = StringIO()
    PhylipWriter(handle).write_file(list5)
    handle.seek(0)
    list6 = list(PhylipIterator(handle))
    assert len(list5) == len(list6)
    for a1,a2 in zip(list5, list6):
        assert len(a1) == len(a2)
        for r1, r2 in zip(a1, a2):
            assert r1.id == r2.id
            assert str(r1.seq) == str(r2.seq)
    print("Done")