Back to index

python-biopython  1.60
Nexus.py
Go to the documentation of this file.
00001 # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 #
00006 # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla.
00007 """Nexus class. Parse the contents of a NEXUS file.
00008 
00009 Based upon 'NEXUS: An extensible file format for systematic information'
00010 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
00011 """
00012 # For with in Python/Jython 2.5
00013 from __future__ import with_statement
00014 
00015 import os,sys, math, random, copy
00016 
00017 from Bio import File
00018 from Bio.Alphabet import IUPAC
00019 from Bio.Data import IUPACData
00020 from Bio.Seq import Seq
00021 
00022 from Trees import Tree
00023 
00024 INTERLEAVE=70
00025 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
00026         'matrix','tree', 'utree','translate','codonposset','title']
00027 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons']
00028 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
00029 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
00030 WHITESPACE=' \t\n'
00031 #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments
00032 SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored
00033 CHARSET='chars'
00034 TAXSET='taxa'
00035 CODONPOSITIONS='codonpositions'
00036 DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; '
00037 class NexusError(Exception): pass
00038 
00039 class CharBuffer(object):
00040     """Helps reading NEXUS-words and characters from a buffer."""
00041     def __init__(self,string):
00042         if string:
00043             self.buffer=list(string)
00044         else:
00045             self.buffer=[]
00046     
00047     def peek(self):
00048         if self.buffer:
00049             return self.buffer[0]
00050         else:
00051             return None
00052 
00053     def peek_nonwhitespace(self):
00054         b=''.join(self.buffer).strip()
00055         if b:
00056             return b[0]
00057         else:
00058             return None
00059                 
00060     def next(self):
00061         if self.buffer:
00062             return self.buffer.pop(0)
00063         else:
00064             return None
00065 
00066     def next_nonwhitespace(self):
00067         while True:
00068             p=self.next()
00069             if p is None:
00070                 break
00071             if p not in WHITESPACE: 
00072                 return p
00073         return None
00074 
00075     def skip_whitespace(self):
00076         while self.buffer[0] in WHITESPACE:
00077             self.buffer=self.buffer[1:]
00078     
00079     def next_until(self,target):
00080         for t in target:
00081             try:
00082                 pos=self.buffer.index(t)
00083             except ValueError:
00084                 pass
00085             else:
00086                 found=''.join(self.buffer[:pos])
00087                 self.buffer=self.buffer[pos:]
00088                 return found
00089         else:
00090             return None
00091 
00092     def peek_word(self,word):
00093         return ''.join(self.buffer[:len(word)])==word
00094     
00095     def next_word(self):
00096         """Return the next NEXUS word from a string.
00097 
00098         This deals with single and double quotes, whitespace and punctuation.
00099         """
00100 
00101         word=[]
00102         quoted=False
00103         first=self.next_nonwhitespace()                                   # get first character
00104         if not first:                                       # return empty if only whitespace left
00105             return None
00106         word.append(first)
00107         if first=="'":                                      # word starts with a quote
00108             quoted="'"
00109         elif first=='"':
00110             quoted='"'
00111         elif first in PUNCTUATION:                          # if it's punctuation, return immediately
00112             return first
00113         while True:             
00114             c=self.peek()
00115             if c==quoted:                                      # a quote?
00116                 word.append(self.next())                    # store quote 
00117                 if self.peek()==quoted:                        # double quote
00118                     skip=self.next()                        # skip second quote 
00119                 elif quoted:                                # second single quote ends word
00120                     break
00121             elif quoted:
00122                 word.append(self.next())                              # if quoted, then add anything
00123             elif not c or c in PUNCTUATION or c in WHITESPACE:        # if not quoted and special character, stop 
00124                 break
00125             else:
00126                 word.append(self.next())                    # standard character
00127         return ''.join(word)
00128                 
00129     def rest(self):
00130         """Return the rest of the string without parsing."""
00131         return ''.join(self.buffer)
00132 
00133 class StepMatrix(object):
00134     """Calculate a stepmatrix for weighted parsimony.
00135 
00136     See Wheeler (1990), Cladistics 6:269-275.
00137     """
00138     
00139     def __init__(self,symbols,gap):
00140         self.data={}
00141         self.symbols=[s for s in symbols]
00142         self.symbols.sort()
00143         if gap:
00144             self.symbols.append(gap)
00145         for x in self.symbols:
00146             for y in [s for s in self.symbols if s!=x]:
00147                 self.set(x,y,0)
00148 
00149     def set(self,x,y,value):
00150         if x>y:
00151             x,y=y,x
00152         self.data[x+y]=value
00153 
00154     def add(self,x,y,value):
00155         if x>y:
00156             x,y=y,x
00157         self.data[x+y]+=value
00158 
00159     def sum(self):
00160         return reduce(lambda x,y:x+y,self.data.values()) 
00161 
00162     def transformation(self):
00163         total=self.sum()
00164         if total!=0:
00165             for k in self.data:
00166                 self.data[k]=self.data[k]/float(total)
00167         return self
00168     
00169     def weighting(self):
00170         for k in self.data:
00171             if self.data[k]!=0:
00172                 self.data[k]=-math.log(self.data[k])
00173         return self
00174 
00175     def smprint(self,name='your_name_here'):
00176         matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
00177         matrix+='        %s\n' % '        '.join(self.symbols)
00178         for x in self.symbols:
00179             matrix+='[%s]'.ljust(8) % x
00180             for y in self.symbols:
00181                 if x==y:
00182                     matrix+=' .       '
00183                 else:
00184                     if x>y:
00185                         x1,y1=y,x
00186                     else:
00187                         x1,y1=x,y
00188                     if self.data[x1+y1]==0:
00189                         matrix+='inf.     '
00190                     else:
00191                         matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
00192             matrix+='\n'
00193         matrix+=';\n'
00194         return matrix
00195     
00196 def safename(name,mrbayes=False):
00197     """Return a taxon identifier according to NEXUS standard.
00198 
00199     Wrap quotes around names with punctuation or whitespace, and double
00200     single quotes.
00201 
00202     mrbayes=True: write names without quotes, whitespace or punctuation
00203     for the mrbayes software package.
00204     """
00205     if mrbayes:
00206         safe=name.replace(' ','_')
00207         safe=''.join([c for c in safe if c in MRBAYESSAFE])
00208     else:
00209         safe=name.replace("'","''")
00210         if set(safe).intersection(set(WHITESPACE+PUNCTUATION)):
00211             safe="'"+safe+"'"
00212     return safe
00213 
00214 def quotestrip(word):
00215     """Remove quotes and/or double quotes around identifiers."""
00216     if not word:
00217         return None
00218     while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
00219         word=word[1:-1]
00220     return word
00221 
00222 def get_start_end(sequence, skiplist=['-','?']):
00223     """Return position of first and last character which is not in skiplist.
00224 
00225     Skiplist defaults to ['-','?'])."""
00226 
00227     length=len(sequence)
00228     if length==0:
00229         return None,None
00230     end=length-1
00231     while end>=0 and (sequence[end] in skiplist):
00232         end-=1
00233     start=0
00234     while start<length and (sequence[start] in skiplist):
00235         start+=1
00236     if start==length and end==-1: # empty sequence
00237         return -1,-1
00238     else:
00239         return start,end 
00240     
00241 def _sort_keys_by_values(p):
00242     """Returns a sorted list of keys of p sorted by values of p."""     
00243     startpos=[(p[pn],pn) for pn in p if p[pn]]
00244     startpos.sort()
00245     # parenthisis added because of py3k
00246     return (zip(*startpos))[1]
00247     
00248 def _make_unique(l):
00249     """Check that all values in list are unique and return a pruned and sorted list."""
00250     l=list(set(l))
00251     l.sort()
00252     return l
00253 
00254 def _unique_label(previous_labels,label):
00255     """Returns a unique name if label is already in previous_labels."""
00256     while label in previous_labels:
00257         if label.split('.')[-1].startswith('copy'):
00258             label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1)
00259         else:
00260             label+='.copy'
00261     return label
00262 
00263 def _seqmatrix2strmatrix(matrix):
00264     """Converts a Seq-object matrix to a plain sequence-string matrix."""
00265     return dict([(t,matrix[t].tostring()) for t in matrix])
00266 
00267 def _compact4nexus(orig_list):
00268     """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
00269     into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
00270     """
00271     
00272     if not orig_list:
00273         return ''
00274     orig_list=list(set(orig_list))
00275     orig_list.sort()
00276     shortlist=[]
00277     clist=orig_list[:]
00278     clist.append(clist[-1]+.5) # dummy value makes it easier 
00279     while len(clist)>1:
00280         step=1
00281         for i,x in enumerate(clist):
00282             if x==clist[0]+i*step:   # are we still in the right step?
00283                 continue
00284             elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
00285                 # second element, and possibly at least 3 elements to link,
00286                 # and the next one is in the right step 
00287                 step=x-clist[0]
00288             else:   # pattern broke, add all values before current position to new list
00289                 sub=clist[:i]
00290                 if len(sub)==1:
00291                     shortlist.append(str(sub[0]+1))
00292                 else:
00293                     if step==1:
00294                         shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
00295                     else:
00296                         shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
00297                 clist=clist[i:]
00298                 break
00299     return ' '.join(shortlist)
00300 
00301 def combine(matrices):
00302     """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
00303 
00304     combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
00305     Character sets, character partitions and taxon sets are prefixed, readjusted
00306     and present in the combined matrix. 
00307     """
00308     
00309     if not matrices:
00310         return None
00311     name=matrices[0][0]
00312     combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix
00313     mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1)
00314     if mixed_datatypes:
00315         combined.datatype='None'    # dealing with mixed matrices is application specific. You take care of that yourself!
00316     #    raise NexusError('Matrices must be of same datatype')
00317     combined.charlabels=None
00318     combined.statelabels=None
00319     combined.interleave=False
00320     combined.translate=None
00321 
00322     # rename taxon sets and character sets and name them with prefix
00323     for cn,cs in combined.charsets.iteritems():
00324         combined.charsets['%s.%s' % (name,cn)]=cs
00325         del combined.charsets[cn]
00326     for tn,ts in combined.taxsets.iteritems():
00327         combined.taxsets['%s.%s' % (name,tn)]=ts
00328         del combined.taxsets[tn]
00329     # previous partitions usually don't make much sense in combined matrix
00330     # just initiate one new partition parted by single matrices
00331     combined.charpartitions={'combined':{name:range(combined.nchar)}}
00332     for n,m in matrices[1:]:    # add all other matrices
00333         both=[t for t in combined.taxlabels if t in m.taxlabels]
00334         combined_only=[t for t in combined.taxlabels if t not in both]
00335         m_only=[t for t in m.taxlabels if t not in both]
00336         for t in both:
00337             # concatenate sequences and unify gap and missing character symbols
00338             combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
00339         # replace date of missing taxa with symbol for missing data
00340         for t in combined_only:
00341             combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
00342         for t in m_only:
00343             combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
00344                 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
00345         combined.taxlabels.extend(m_only)    # new taxon list
00346         for cn,cs in m.charsets.iteritems(): # adjust character sets for new matrix
00347             combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
00348         if m.taxsets:
00349             if not combined.taxsets:
00350                 combined.taxsets={}
00351             # update taxon sets
00352             combined.taxsets.update(dict(('%s.%s' % (n,tn),ts) \
00353                                          for tn,ts in m.taxsets.iteritems()))
00354         # update new charpartition
00355         combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
00356         # update charlabels
00357         if m.charlabels:
00358             if not combined.charlabels:
00359                 combined.charlabels={}
00360             combined.charlabels.update(dict((combined.nchar+i,label) \
00361                                             for (i,label) in m.charlabels.iteritems()))
00362         combined.nchar+=m.nchar # update nchar and ntax
00363         combined.ntax+=len(m_only)
00364         
00365     # some prefer partitions, some charsets:
00366     # make separate charset for ecah initial dataset
00367     for c in combined.charpartitions['combined']:
00368         combined.charsets[c]=combined.charpartitions['combined'][c]
00369 
00370     return combined
00371 
00372 def _kill_comments_and_break_lines(text):
00373     """Delete []-delimited comments out of a file and break into lines separated by ';'.
00374     
00375     stripped_text=_kill_comments_and_break_lines(text):
00376     Nested and multiline comments are allowed. [ and ] symbols within single
00377     or double quotes are ignored, newline ends a quote, all symbols with quotes are
00378     treated the same (thus not quoting inside comments like [this character ']' ends a comment])
00379     Special [&...] and [\...] comments remain untouched, if not inside standard comment.
00380     Quotes inside special [& and [\ are treated as normal characters,
00381     but no nesting inside these special comments allowed (like [&   [\   ]]).
00382     ';' ist deleted from end of line.
00383    
00384     NOTE: this function is very slow for large files, and obsolete when using C extension cnexus
00385     """
00386     contents=iter(text)
00387     newtext=[] 
00388     newline=[]
00389     quotelevel=''
00390     speciallevel=False
00391     commlevel=0
00392     #Parse with one character look ahead (for special comments)
00393     t2 = contents.next()
00394     while True:
00395         t = t2
00396         try:
00397             t2 = contents.next()
00398         except StopIteration:
00399             t2 = None
00400         if t is None:
00401             break
00402         if t==quotelevel and not (commlevel or speciallevel):            # matching quote ends quotation
00403             quotelevel=''
00404         elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation
00405             quotelevel=t
00406         elif not quotelevel and t=='[':                             # opening bracket outside a quote
00407             if t2 in SPECIALCOMMENTS and commlevel==0 and not speciallevel:
00408                 speciallevel=True
00409             else:
00410                 commlevel+=1
00411         elif not quotelevel and t==']':                             # closing bracket ioutside a quote 
00412             if speciallevel:
00413                 speciallevel=False
00414             else:
00415                 commlevel-=1
00416                 if commlevel<0:
00417                     raise NexusError('Nexus formatting error: unmatched ]')
00418                 continue 
00419         if commlevel==0:                        # copy if we're not in comment
00420             if t==';' and not quotelevel:
00421                 newtext.append(''.join(newline))
00422                 newline=[]
00423             else:
00424                 newline.append(t)
00425     #level of comments should be 0 at the end of the file
00426     if newline:
00427         newtext.append('\n'.join(newline))
00428     if commlevel>0:
00429         raise NexusError('Nexus formatting error: unmatched [')
00430     return newtext
00431 
00432 
00433 def _adjust_lines(lines):
00434     """Adjust linebreaks to match ';', strip leading/trailing whitespace.
00435 
00436     list_of_commandlines=_adjust_lines(input_text)
00437     Lines are adjusted so that no linebreaks occur within a commandline 
00438     (except matrix command line)
00439     """
00440     formatted_lines=[] 
00441     for l in lines:
00442         #Convert line endings
00443         l=l.replace('\r\n','\n').replace('\r','\n').strip()
00444         if l.lower().startswith('matrix'):
00445             formatted_lines.append(l)
00446         else:
00447             l=l.replace('\n',' ')
00448             if l:
00449                 formatted_lines.append(l)
00450     return formatted_lines
00451     
00452 def _replace_parenthesized_ambigs(seq,rev_ambig_values):
00453     """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
00454 
00455     opening=seq.find('(')
00456     while opening>-1:
00457         closing=seq.find(')')
00458         if closing<0:
00459             raise NexusError('Missing closing parenthesis in: '+seq)
00460         elif closing<opening:
00461             raise NexusError('Missing opening parenthesis in: '+seq)
00462         ambig=[x for x in seq[opening+1:closing]]
00463         ambig.sort()
00464         ambig=''.join(ambig)
00465         ambig_code=rev_ambig_values[ambig.upper()]
00466         if ambig!=ambig.upper():
00467             ambig_code=ambig_code.lower()
00468         seq=seq[:opening]+ambig_code+seq[closing+1:]        
00469         opening=seq.find('(')
00470     return seq
00471 
00472 class Commandline(object):
00473     """Represent a commandline as command and options."""
00474     
00475     def __init__(self, line, title):
00476         self.options={}
00477         options=[]
00478         self.command=None
00479         try:
00480             #Assume matrix (all other command lines have been stripped of \n)
00481             self.command, options = line.strip().split('\n', 1)
00482         except ValueError: #Not matrix
00483             #self.command,options=line.split(' ',1)  #no: could be tab or spaces (translate...)
00484             self.command=line.split()[0]
00485             options=' '.join(line.split()[1:])
00486         self.command = self.command.strip().lower()
00487         if self.command in SPECIAL_COMMANDS:   # special command that need newlines and order of options preserved
00488             self.options=options.strip()
00489         else:    
00490             if len(options) > 0:
00491                 try: 
00492                     options = options.replace('=', ' = ').split()
00493                     valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
00494                     indices = []
00495                     for sl in valued_indices:
00496                         indices.extend(sl)
00497                     token_indices = [n for n in range(len(options)) if n not in indices]
00498                     for opt in valued_indices:
00499                         #self.options[options[opt[0]].lower()] = options[opt[2]].lower()
00500                         self.options[options[opt[0]].lower()] = options[opt[2]]
00501                     for token in token_indices:
00502                         self.options[options[token].lower()] = None
00503                 except ValueError:
00504                     raise NexusError('Incorrect formatting in line: %s' % line)
00505                 
00506 class Block(object):
00507     """Represent a NEXUS block with block name and list of commandlines."""
00508     def __init__(self,title=None):
00509         self.title=title
00510         self.commandlines=[]
00511 
00512 class Nexus(object):
00513 
00514     __slots__=['original_taxon_order','__dict__']
00515 
00516     def __init__(self, input=None):
00517         self.ntax=0                     # number of taxa
00518         self.nchar=0                    # number of characters
00519         self.unaltered_taxlabels=[]          # taxlabels as the appear in the input file (incl. duplicates, etc.)
00520         self.taxlabels=[]               # labels for taxa, ordered by their id
00521         self.charlabels=None            # ... and for characters
00522         self.statelabels=None           # ... and for states
00523         self.datatype='dna'             # (standard), dna, rna, nucleotide, protein
00524         self.respectcase=False          # case sensitivity
00525         self.missing='?'                # symbol for missing characters
00526         self.gap='-'                    # symbol for gap
00527         self.symbols=None               # set of symbols
00528         self.equate=None                # set of symbol synonyms
00529         self.matchchar=None             # matching char for matrix representation
00530         self.labels=None                # left, right, no
00531         self.transpose=False            # whether matrix is transposed
00532         self.interleave=False           # whether matrix is interleaved
00533         self.tokens=False               # unsupported          
00534         self.eliminate=None             # unsupported 
00535         self.matrix=None                # ...
00536         self.unknown_blocks=[]          # blocks we don't care about
00537         self.taxsets={}
00538         self.charsets={}
00539         self.charpartitions={}
00540         self.taxpartitions={}
00541         self.trees=[]                   # list of Trees (instances of Tree class)
00542         self.translate=None             # Dict to translate taxon <-> taxon numbers
00543         self.structured=[]              # structured input representation
00544         self.set={}                     # dict of the set command to set various options 
00545         self.options={}                 # dict of the options command in the data block
00546         self.codonposset=None           # name of the charpartition that defines codon positions
00547 
00548         # some defaults
00549         self.options['gapmode']='missing'
00550         
00551         if input:
00552             self.read(input)
00553         else:
00554             self.read(DEFAULTNEXUS)
00555 
00556     def get_original_taxon_order(self):
00557         """Included for backwards compatibility (DEPRECATED)."""
00558         return self.taxlabels
00559     def set_original_taxon_order(self,value):
00560         """Included for backwards compatibility (DEPRECATED)."""
00561         self.taxlabels=value
00562     original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
00563 
00564     def read(self,input):
00565         """Read and parse NEXUS imput (a filename, file-handle, or string)."""
00566 
00567         # 1. Assume we have the name of a file in the execution dir or a
00568         # file-like object.
00569         # Note we need to add parsing of the path to dir/filename
00570         try:
00571             with File.as_handle(input, 'rU') as fp:
00572                 file_contents = fp.read()
00573                 self.filename = getattr(fp, 'name', 'Unknown_nexus_file')
00574         except (TypeError,IOError,AttributeError):
00575             #2 Assume we have a string from a fh.read()
00576             if isinstance(input, basestring):
00577                 file_contents = input
00578                 self.filename='input_string'
00579             else:
00580                 print input.strip()[:50]
00581                 raise NexusError('Unrecognized input: %s ...' % input[:100])
00582         file_contents=file_contents.strip()
00583         if file_contents.startswith('#NEXUS'):
00584             file_contents=file_contents[6:]
00585         commandlines=_get_command_lines(file_contents)
00586         # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times'
00587         for i,cl in enumerate(commandlines):
00588             try:
00589                 if cl[:6].upper()=='#NEXUS':
00590                     commandlines[i]=cl[6:].strip()
00591             except:
00592                 pass
00593         # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands
00594         nexus_block_gen = self._get_nexus_block(commandlines)
00595         while 1:
00596             try:
00597                 title, contents = nexus_block_gen.next()
00598             except StopIteration:
00599                 break
00600             if title in KNOWN_NEXUS_BLOCKS:
00601                 self._parse_nexus_block(title, contents)
00602             else:
00603                 self._unknown_nexus_block(title, contents)
00604 
00605     def _get_nexus_block(self,file_contents):
00606         """Generator for looping through Nexus blocks."""
00607         inblock=False
00608         blocklines=[]
00609         while file_contents:
00610             cl=file_contents.pop(0)
00611             if cl.lower().startswith('begin'):
00612                 if not inblock:
00613                     inblock=True
00614                     title=cl.split()[1].lower()
00615                 else:
00616                     raise NexusError('Illegal block nesting in block %s' % title)
00617             elif cl.lower().startswith('end'):
00618                 if inblock:
00619                     inblock=False
00620                     yield title,blocklines
00621                     blocklines=[]
00622                 else:
00623                     raise NexusError('Unmatched \'end\'.')
00624             elif inblock:
00625                 blocklines.append(cl)
00626 
00627     def _unknown_nexus_block(self,title, contents):
00628         block = Block()
00629         block.commandlines.append(contents)
00630         block.title = title
00631         self.unknown_blocks.append(block)        
00632 
00633     def _parse_nexus_block(self,title, contents):
00634         """Parse a known Nexus Block (PRIVATE)."""
00635         # attached the structered block representation
00636         self._apply_block_structure(title, contents)
00637         #now check for taxa,characters,data blocks. If this stuff is defined more than once
00638         #the later occurences will override the previous ones.
00639         block=self.structured[-1] 
00640         for line in block.commandlines:
00641             try:
00642                 getattr(self,'_'+line.command)(line.options)
00643             except AttributeError:
00644                 raise
00645                 raise NexusError('Unknown command: %s ' % line.command)
00646     
00647     def _title(self,options):
00648         pass
00649 
00650     def _dimensions(self,options):
00651         if 'ntax' in options:
00652             self.ntax=eval(options['ntax'])
00653         if 'nchar' in options:
00654             self.nchar=eval(options['nchar'])
00655 
00656     def _format(self,options):
00657         # print options
00658         # we first need to test respectcase, then symbols (which depends on respectcase)
00659         # then datatype (which, if standard, depends on symbols and respectcase in order to generate
00660         # dicts for ambiguous values and alphabet
00661         if 'respectcase' in options:
00662             self.respectcase=True
00663         # adjust symbols to for respectcase
00664         if 'symbols' in options:
00665             self.symbols=options['symbols']
00666             if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\
00667             (self.symbold.startswith("'") and self.symbols.endswith("'")):
00668                 self.symbols=self.symbols[1:-1].replace(' ','')
00669             if not self.respectcase:
00670                 self.symbols=self.symbols.lower()+self.symbols.upper()
00671                 self.symbols=list(set(self.symbols))
00672         if 'datatype' in options:
00673             self.datatype=options['datatype'].lower()
00674             if self.datatype=='dna' or self.datatype=='nucleotide':
00675                 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna)
00676                 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values)
00677                 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters)
00678             elif self.datatype=='rna':
00679                 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna)
00680                 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values)
00681                 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters)
00682             elif self.datatype=='protein':
00683                 self.alphabet=copy.deepcopy(IUPAC.protein)
00684                 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it
00685                 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon
00686             elif self.datatype=='standard':
00687                 raise NexusError('Datatype standard is not yet supported.')
00688                 #self.alphabet=None
00689                 #self.ambiguous_values={}
00690                 #if not self.symbols:
00691                 #    self.symbols='01' # if nothing else defined, then 0 and 1 are the default states
00692                 #self.unambiguous_letters=self.symbols
00693             else:
00694                 raise NexusError('Unsupported datatype: '+self.datatype)
00695             self.valid_characters=''.join(self.ambiguous_values)+self.unambiguous_letters
00696             if not self.respectcase:
00697                 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper()
00698             #we have to sort the reverse ambig coding dict key characters:
00699             #to be sure that it's 'ACGT':'N' and not 'GTCA':'N'
00700             rev=dict((i[1],i[0]) for i in self.ambiguous_values.iteritems() if i[0]!='X')
00701             self.rev_ambiguous_values={}
00702             for (k,v) in rev.iteritems():
00703                 key=[c for c in k]
00704                 key.sort()
00705                 self.rev_ambiguous_values[''.join(key)]=v
00706         #overwrite symbols for datype rna,dna,nucleotide
00707         if self.datatype in ['dna','rna','nucleotide']:
00708             self.symbols=self.alphabet.letters
00709             if self.missing not in self.ambiguous_values:
00710                 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap
00711             self.ambiguous_values[self.gap]=self.gap
00712         elif self.datatype=='standard':
00713             if not self.symbols:
00714                 self.symbols=['1','0']
00715         if 'missing' in options:
00716             self.missing=options['missing'][0]
00717         if 'gap' in options:
00718             self.gap=options['gap'][0]
00719         if 'equate' in options:
00720             self.equate=options['equate']
00721         if 'matchchar' in options:
00722             self.matchchar=options['matchchar'][0]
00723         if 'labels' in options:
00724             self.labels=options['labels']
00725         if 'transpose' in options:
00726             raise NexusError('TRANSPOSE is not supported!')
00727             self.transpose=True
00728         if 'interleave' in options:
00729             if options['interleave']==None or options['interleave'].lower()=='yes':
00730                 self.interleave=True
00731         if 'tokens' in options:
00732             self.tokens=True
00733         if 'notokens' in options:
00734             self.tokens=False
00735 
00736 
00737     def _set(self,options):
00738         self.set=options;
00739 
00740     def _options(self,options):
00741         self.options=options;
00742 
00743     def _eliminate(self,options):
00744         self.eliminate=options
00745 
00746     def _taxlabels(self,options):
00747         """Get taxon labels (PRIVATE).
00748 
00749         As the taxon names are already in the matrix, this is superfluous
00750         except for transpose matrices, which are currently unsupported anyway.
00751         Thus, we ignore the taxlabels command to make handling of duplicate
00752         taxon names easier.
00753         """
00754         pass
00755         #self.taxlabels=[]
00756         #opts=CharBuffer(options)
00757         #while True:
00758         #    taxon=quotestrip(opts.next_word())
00759         #    if not taxon:
00760         #        break
00761         #    self.taxlabels.append(taxon)
00762 
00763     def _check_taxlabels(self,taxon): 
00764         """Check for presence of taxon in self.taxlabels."""
00765         # According to NEXUS standard, underscores shall be treated as spaces...,
00766         # so checking for identity is more difficult
00767         nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
00768         nexid=taxon.replace(' ','_')
00769         return nextaxa.get(nexid)
00770 
00771     def _charlabels(self,options):
00772         self.charlabels={}
00773         opts=CharBuffer(options)
00774         while True:
00775             try:
00776                 # get id and state
00777                 w=opts.next_word()
00778                 if w is None: # McClade saves and reads charlabel-lists with terminal comma?!
00779                     break
00780                 identifier=self._resolve(w,set_type=CHARSET) 
00781                 state=quotestrip(opts.next_word())
00782                 self.charlabels[identifier]=state
00783                 # check for comma or end of command
00784                 c=opts.next_nonwhitespace()
00785                 if c is None:
00786                     break
00787                 elif c!=',':
00788                     raise NexusError('Missing \',\' in line %s.' % options)
00789             except NexusError:
00790                 raise
00791             except:
00792                 raise NexusError('Format error in line %s.' % options)
00793 
00794     def _charstatelabels(self,options):
00795         # warning: charstatelabels supports only charlabels-syntax!
00796         self._charlabels(options)
00797 
00798     def _statelabels(self,options):
00799         #self.charlabels=options
00800         #print 'Command statelabels is not supported and will be ignored.'
00801         pass
00802 
00803     def _matrix(self,options):
00804         if not self.ntax or not self.nchar:
00805             raise NexusError('Dimensions must be specified before matrix!')
00806         self.matrix={}
00807         taxcount=0
00808         first_matrix_block=True
00809     
00810         #eliminate empty lines and leading/trailing whitespace 
00811         lines=[l.strip() for l in options.split('\n') if l.strip()!='']
00812         lineiter=iter(lines)
00813         while 1:
00814             try:
00815                 l=lineiter.next()
00816             except StopIteration:
00817                 if taxcount<self.ntax:
00818                     raise NexusError('Not enough taxa in matrix.')
00819                 elif taxcount>self.ntax:
00820                     raise NexusError('Too many taxa in matrix.')
00821                 else:
00822                     break
00823             # count the taxa and check for interleaved matrix
00824             taxcount+=1
00825             ##print taxcount
00826             if taxcount>self.ntax:
00827                 if not self.interleave:
00828                     raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
00829                 else:
00830                     taxcount=1
00831                     first_matrix_block=False
00832             #get taxon name and sequence
00833             linechars=CharBuffer(l)
00834             id=quotestrip(linechars.next_word())
00835             l=linechars.rest().strip()
00836             chars=''
00837             if self.interleave:
00838                 #interleaved matrix
00839                 #print 'In interleave'
00840                 if l:
00841                     chars=''.join(l.split())
00842                 else:
00843                     chars=''.join(lineiter.next().split())
00844             else:
00845                 #non-interleaved matrix
00846                 chars=''.join(l.split())
00847                 while len(chars)<self.nchar:
00848                     l=lineiter.next()
00849                     chars+=''.join(l.split())
00850             iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
00851             #first taxon has the reference sequence if matchhar is used
00852             if taxcount==1:
00853                 refseq=iupac_seq
00854             else:
00855                 if self.matchchar:
00856                     while 1:
00857                         p=iupac_seq.tostring().find(self.matchchar)
00858                         if p==-1:
00859                             break
00860                         iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
00861             #check for invalid characters
00862             for i,c in enumerate(iupac_seq.tostring()):
00863                 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
00864                     raise NexusError( \
00865                         ('Taxon %s: Illegal character %s in sequence %s ' + \
00866                          '(check dimensions/interleaving)') % (id,c, iupac_seq))
00867             #add sequence to matrix
00868             if first_matrix_block:
00869                 self.unaltered_taxlabels.append(id)
00870                 id=_unique_label(self.matrix.keys(),id)
00871                 self.matrix[id]=iupac_seq
00872                 self.taxlabels.append(id)
00873             else:
00874                 # taxon names need to be in the same order in each interleaved block
00875                 id=_unique_label(self.taxlabels[:taxcount-1],id)
00876                 taxon_present=self._check_taxlabels(id)
00877                 if taxon_present:
00878                     self.matrix[taxon_present]+=iupac_seq
00879                 else:
00880                     raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id)
00881         #check all sequences for length according to nchar
00882         for taxon in self.matrix:
00883             if len(self.matrix[taxon])!=self.nchar:
00884                 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \
00885                                  % (self.nchar, len(self.matrix[taxon]),taxon))
00886         #check that taxlabels is identical with matrix.keys. If not, it's a problem
00887         matrixkeys=sorted(self.matrix)
00888         taxlabelssort=self.taxlabels[:]
00889         taxlabelssort.sort()
00890         assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
00891 
00892     def _translate(self,options):
00893         self.translate={}
00894         opts=CharBuffer(options)
00895         while True:
00896             try:
00897                 # get id and state
00898                 identifier=int(opts.next_word()) 
00899                 label=quotestrip(opts.next_word())
00900                 self.translate[identifier]=label
00901                 # check for comma or end of command
00902                 c=opts.next_nonwhitespace()
00903                 if c is None:
00904                     break
00905                 elif c!=',':
00906                     raise NexusError('Missing \',\' in line %s.' % options)
00907             except NexusError:
00908                 raise
00909             except:
00910                 raise NexusError('Format error in line %s.' % options)
00911 
00912     def _utree(self,options):
00913         """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
00914         self._tree(options)
00915         
00916     def _tree(self,options):
00917         opts=CharBuffer(options)
00918         if opts.peek_nonwhitespace()=='*': # a star can be used to make it the default tree in some software packages
00919             dummy=opts.next_nonwhitespace()
00920         name=opts.next_word()
00921         if opts.next_nonwhitespace()!='=':
00922             raise NexusError('Syntax error in tree description: %s' \
00923                              % options[:50])
00924         rooted=False
00925         weight=1.0
00926         while opts.peek_nonwhitespace()=='[':
00927             open=opts.next_nonwhitespace()
00928             symbol=opts.next()
00929             if symbol!='&':
00930                 raise NexusError('Illegal special comment [%s...] in tree description: %s' \
00931                                  % (symbol, options[:50]))
00932             special=opts.next()
00933             value=opts.next_until(']')
00934             closing=opts.next()
00935             if special=='R':
00936                 rooted=True
00937             elif special=='U':
00938                 rooted=False
00939             elif special=='W':
00940                 weight=float(value)
00941         tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip())
00942         # if there's an active translation table, translate
00943         if self.translate:
00944             for n in tree.get_terminals():
00945                 try:
00946                     tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)])
00947                 except (ValueError,KeyError):
00948                     raise NexusError('Unable to substitue %s using \'translate\' data.' \
00949                                      % tree.node(n).data.taxon)
00950         self.trees.append(tree)
00951 
00952     def _apply_block_structure(self,title,lines):
00953         block=Block('')
00954         block.title = title            
00955         for line in lines:
00956             block.commandlines.append(Commandline(line, title))
00957         self.structured.append(block)
00958        
00959     def _taxset(self, options):
00960         name,taxa=self._get_indices(options,set_type=TAXSET)
00961         self.taxsets[name]=_make_unique(taxa)
00962                 
00963     def _charset(self, options):
00964         name,sites=self._get_indices(options,set_type=CHARSET)
00965         self.charsets[name]=_make_unique(sites)
00966         
00967     def _taxpartition(self, options):
00968         taxpartition={}
00969         quotelevel=False
00970         opts=CharBuffer(options)
00971         name=self._name_n_vector(opts)
00972         if not name:
00973             raise NexusError('Formatting error in taxpartition: %s ' % options)
00974         # now collect thesubbpartitions and parse them
00975         # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
00976         # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words
00977         sub=''
00978         while True:
00979             w=opts.next()
00980             if w is None or (w==',' and not quotelevel):
00981                 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
00982                 taxpartition[subname]=_make_unique(subindices)
00983                 sub=''
00984                 if w is None:
00985                     break
00986             else:
00987                 if w=="'":
00988                     quotelevel=not quotelevel
00989                 sub+=w
00990         self.taxpartitions[name]=taxpartition
00991 
00992     def _codonposset(self,options):
00993         """Read codon positions from a codons block as written from McClade.
00994 
00995         Here codonposset is just a fancy name for a character partition with
00996         the name CodonPositions and the partitions N,1,2,3
00997         """
00998 
00999         prev_partitions=self.charpartitions.keys()
01000         self._charpartition(options)
01001         # mcclade calls it CodonPositions, but you never know...
01002         codonname=[n for n in self.charpartitions if n not in prev_partitions]
01003         if codonname==[] or len(codonname)>1:
01004             raise NexusError('Formatting Error in codonposset: %s ' % options)
01005         else:
01006             self.codonposset=codonname[0]
01007   
01008     def _codeset(self,options):
01009         pass
01010 
01011     def _charpartition(self, options):
01012         charpartition={}
01013         quotelevel=False
01014         opts=CharBuffer(options)
01015         name=self._name_n_vector(opts)
01016         if not name:
01017             raise NexusError('Formatting error in charpartition: %s ' % options)
01018         # now collect thesubbpartitions and parse them
01019         # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
01020         sub=''
01021         while True:
01022             w=opts.next()
01023             if w is None or (w==',' and not quotelevel):
01024                 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
01025                 charpartition[subname]=_make_unique(subindices)
01026                 sub=''
01027                 if w is None:
01028                     break
01029             else:
01030                 if w=="'":
01031                     quotelevel=not quotelevel
01032                 sub+=w
01033         self.charpartitions[name]=charpartition
01034 
01035     def _get_indices(self,options,set_type=CHARSET,separator='='):
01036         """Parse the taxset/charset specification (PRIVATE).
01037 
01038         e.g. '1 2   3 - 5 dog cat   10 - 20 \\ 3'
01039         --> [0,1,2,3,4,'dog','cat',9,12,15,18]
01040         """
01041         opts=CharBuffer(options)
01042         name=self._name_n_vector(opts,separator=separator)
01043         indices=self._parse_list(opts,set_type=set_type)
01044         if indices is None:
01045             raise NexusError('Formatting error in line: %s ' % options)
01046         return name,indices
01047 
01048     def _name_n_vector(self,opts,separator='='):
01049         """Extract name and check that it's not in vector format."""
01050         rest=opts.rest()
01051         name=opts.next_word()
01052         # we ignore * before names
01053         if name=='*':
01054             name=opts.next_word()
01055         if not name:
01056             raise NexusError('Formatting error in line: %s ' % rest)
01057         name=quotestrip(name)
01058         if opts.peek_nonwhitespace=='(':
01059             open=opts.next_nonwhitespace()
01060             qualifier=open.next_word()
01061             close=opts.next_nonwhitespace()
01062             if  qualifier.lower()=='vector':
01063                 raise NexusError('Unsupported VECTOR format in line %s' \
01064                                  % (opts))
01065             elif qualifier.lower()!='standard':
01066                 raise NexusError('Unknown qualifier %s in line %s' \
01067                                  % (qualifier, opts))
01068         if opts.next_nonwhitespace()!=separator:
01069             raise NexusError('Formatting error in line: %s ' % rest)
01070         return name
01071     
01072     def _parse_list(self,options_buffer,set_type):
01073         """Parse a NEXUS list (PRIVATE).
01074 
01075         e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
01076         (assuming dog is taxon no. 17 and cat is taxon no. 21).
01077         """
01078         plain_list=[]
01079         if options_buffer.peek_nonwhitespace():
01080             try:    # capture all possible exceptions and treat them as formatting erros, if they are not NexusError
01081                 while True:
01082                     identifier=options_buffer.next_word()                                     # next list element
01083                     if not identifier:                                              # end of list?
01084                         break
01085                     start=self._resolve(identifier,set_type=set_type)
01086                     if options_buffer.peek_nonwhitespace()=='-':                            # followd by -
01087                         end=start
01088                         step=1
01089                         # get hyphen and end of range
01090                         hyphen=options_buffer.next_nonwhitespace()
01091                         end=self._resolve(options_buffer.next_word(),set_type=set_type)
01092                         if set_type==CHARSET:
01093                             if options_buffer.peek_nonwhitespace()=='\\':                           # followd by \
01094                                 backslash=options_buffer.next_nonwhitespace()
01095                                 step=int(options_buffer.next_word())         # get backslash and step
01096                             plain_list.extend(range(start,end+1,step)) 
01097                         else:
01098                             if type(start)==list or type(end)==list:
01099                                 raise NexusError('Name if character sets not allowed in range definition: %s'\
01100                                                  % identifier)
01101                             start=self.taxlabels.index(start)
01102                             end=self.taxlabels.index(end)
01103                             taxrange=self.taxlabels[start:end+1]
01104                             plain_list.extend(taxrange)
01105                     else:
01106                         if type(start)==list:           # start was the name of charset or taxset
01107                             plain_list.extend(start)
01108                         else:                           # start was an ordinary identifier
01109                             plain_list.append(start)
01110             except NexusError:
01111                 raise
01112             except:
01113                 return None
01114         return plain_list
01115         
01116     def _resolve(self,identifier,set_type=None):
01117         """Translate identifier in list into character/taxon index.
01118 
01119         Characters (which are referred to by their index in Nexus.py):
01120             Plain numbers are returned minus 1 (Nexus indices to python indices)
01121             Text identifiers are translaterd into their indices (if plain character indentifiers),
01122             the first hit in charlabels is returned (charlabels don't need to be unique)
01123             or the range of indices is returned (if names of character sets).
01124         Taxa (which are referred to by their unique name in Nexus.py):
01125             Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
01126             Names are returned unchanged (if plain taxon identifiers), or the names in
01127             the corresponding taxon set is returned
01128         """
01129         identifier=quotestrip(identifier)
01130         if not set_type:
01131             raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
01132         if set_type==CHARSET:
01133             try:
01134                 n=int(identifier)
01135             except ValueError:
01136                 if self.charlabels and identifier in self.charlabels.itervalues():
01137                     for k in self.charlabels:
01138                         if self.charlabels[k]==identifier:
01139                             return k
01140                 elif self.charsets and identifier in self.charsets:
01141                     return self.charsets[identifier]
01142                 else:
01143                     raise NexusError('Unknown character identifier: %s' \
01144                                      % identifier)
01145             else:
01146                 if n<=self.nchar:
01147                     return n-1
01148                 else:
01149                     raise NexusError('Illegal character identifier: %d>nchar (=%d).' \
01150                                      % (identifier,self.nchar))
01151         elif set_type==TAXSET:
01152             try:
01153                 n=int(identifier)
01154             except ValueError:
01155                 taxlabels_id=self._check_taxlabels(identifier)
01156                 if taxlabels_id:
01157                     return taxlabels_id
01158                 elif self.taxsets and identifier in self.taxsets:
01159                     return self.taxsets[identifier]
01160                 else:
01161                     raise NexusError('Unknown taxon identifier: %s' \
01162                                      % identifier)
01163             else:
01164                 if n>0 and n<=self.ntax:
01165                     return self.taxlabels[n-1]
01166                 else:
01167                     raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \
01168                                      % (identifier,self.ntax))
01169         else:
01170             raise NexusError('Unknown set specification: %s.'% set_type)
01171 
01172     def _stateset(self, options):
01173         #Not implemented
01174         pass
01175 
01176     def _changeset(self, options):
01177         #Not implemented
01178         pass
01179 
01180     def _treeset(self, options):
01181         #Not implemented
01182         pass
01183 
01184     def _treepartition(self, options):
01185         #Not implemented
01186         pass
01187 
01188     def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
01189             exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
01190         """Writes a nexus file for each partition in charpartition.
01191 
01192         Only non-excluded characters and non-deleted taxa are included,
01193         just the data block is written.
01194         """
01195 
01196         if not matrix:
01197             matrix=self.matrix
01198         if not matrix:
01199             return
01200         if not filename:
01201             filename=self.filename
01202         if charpartition:
01203             pfilenames={}
01204             for p in charpartition:
01205                 total_exclude=[]+exclude
01206                 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
01207                 total_exclude=_make_unique(total_exclude)
01208                 pcomment=comment+'\nPartition: '+p+'\n'
01209                 dot=filename.rfind('.')
01210                 if dot>0:
01211                     pfilename=filename[:dot]+'_'+p+'.data'
01212                 else:
01213                     pfilename=filename+'_'+p
01214                 pfilenames[p]=pfilename
01215                 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
01216                         interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
01217                         mrbayes=mrbayes)
01218             return pfilenames
01219         else:
01220             fn=self.filename+'.data'
01221             self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
01222                     exclude=exclude,delete=delete,comment=comment,append_sets=False,
01223                     mrbayes=mrbayes)
01224             return fn
01225     
01226     def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
01227             blocksize=None, interleave=False, interleave_by_partition=False,\
01228             comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
01229             codons_block=True):
01230         """Writes a nexus file with data and sets block to a file or handle.
01231 
01232         Character sets and partitions are appended by default, and are
01233         adjusted according to excluded characters (i.e. character sets
01234         still point to the same sites (not necessarily same positions),
01235         without including the deleted characters.
01236 
01237         filename - Either a filename as a string (which will be opened,
01238                    written to and closed), or a handle object (which will
01239                    be written to but NOT closed).
01240         interleave_by_partition - Optional name of partition (string)
01241         omit_NEXUS - Boolean.  If true, the '#NEXUS' line normally at the
01242                    start of the file is ommited.
01243 
01244         Returns the filename/handle used to write the data.
01245         """
01246         if not matrix:
01247             matrix=self.matrix
01248         if not matrix:
01249             return
01250         if not filename:
01251             filename=self.filename
01252         if [t for t in delete if not self._check_taxlabels(t)]:
01253             raise NexusError('Unknown taxa: %s' \
01254                              % ', '.join(set(delete).difference(set(self.taxlabels))))
01255         if interleave_by_partition:
01256             if not interleave_by_partition in self.charpartitions:
01257                 raise NexusError('Unknown partition: %r' % interleave_by_partition)
01258             else:
01259                 partition=self.charpartitions[interleave_by_partition]
01260                 # we need to sort the partition names by starting position before we exclude characters
01261                 names=_sort_keys_by_values(partition)
01262                 newpartition={}
01263                 for p in partition:
01264                     newpartition[p]=[c for c in partition[p] if c not in exclude]
01265         # how many taxa and how many characters are left?
01266         undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
01267         cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
01268         ntax_adjusted=len(undelete)
01269         nchar_adjusted=len(cropped_matrix[undelete[0]])
01270         if not undelete or (undelete and undelete[0]==''):
01271             return
01272 
01273         with File.as_handle(filename, mode='w') as fh:
01274             if not omit_NEXUS:
01275                 fh.write('#NEXUS\n')
01276             if comment:
01277                 fh.write('['+comment+']\n')
01278             fh.write('begin data;\n')
01279             fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
01280             fh.write('\tformat datatype='+self.datatype)
01281             if self.respectcase:
01282                 fh.write(' respectcase')
01283             if self.missing:
01284                 fh.write(' missing='+self.missing)
01285             if self.gap:
01286                 fh.write(' gap='+self.gap)
01287             if self.matchchar:
01288                 fh.write(' matchchar='+self.matchchar)
01289             if self.labels:
01290                 fh.write(' labels='+self.labels)
01291             if self.equate:
01292                 fh.write(' equate='+self.equate)
01293             if interleave or interleave_by_partition:
01294                 fh.write(' interleave')
01295             fh.write(';\n')
01296             #if self.taxlabels:
01297             #    fh.write('taxlabels '+' '.join(self.taxlabels)+';\n')
01298             if self.charlabels:
01299                 newcharlabels=self._adjust_charlabels(exclude=exclude)
01300                 clkeys=sorted(newcharlabels)
01301                 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
01302             fh.write('matrix\n')
01303             if not blocksize:
01304                 if interleave:
01305                     blocksize=70
01306                 else:
01307                     blocksize=self.nchar
01308             # delete deleted taxa and ecxclude excluded characters...
01309             namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
01310             if interleave_by_partition:
01311                 # interleave by partitions, but adjust partitions with regard to excluded characters
01312                 seek=0
01313                 for p in names:
01314                     fh.write('[%s: %s]\n' % (interleave_by_partition,p))
01315                     if len(newpartition[p])>0:
01316                         for taxon in undelete:
01317                             fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
01318                             fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
01319                         fh.write('\n')
01320                     else:
01321                         fh.write('[empty]\n\n')
01322                     seek+=len(newpartition[p])
01323             elif interleave:
01324                 for seek in range(0,nchar_adjusted,blocksize):
01325                     for taxon in undelete:
01326                         fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
01327                         fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
01328                     fh.write('\n')
01329             else:
01330                 for taxon in undelete:
01331                     if blocksize<nchar_adjusted:
01332                         fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
01333                     else:
01334                         fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
01335                     taxon_seq = cropped_matrix[taxon]
01336                     for seek in range(0,nchar_adjusted,blocksize):
01337                         fh.write(taxon_seq[seek:seek+blocksize]+'\n')
01338                     del taxon_seq
01339             fh.write(';\nend;\n')
01340             if append_sets:
01341                 if codons_block:
01342                     fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False))
01343                     fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True))
01344                 else:
01345                     fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
01346         return filename
01347 
01348 
01349     def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
01350         """Returns a sets block."""
01351         if not self.charsets and not self.taxsets and not self.charpartitions:
01352             return ''
01353         if codons_only:
01354             setsb=['\nbegin codons']
01355         else:
01356             setsb=['\nbegin sets']
01357         # - now if characters have been excluded, the character sets need to be adjusted,
01358         #   so that they still point to the right character positions
01359         # calculate a list of offsets: for each deleted character, the following character position
01360         # in the new file will have an additional offset of -1
01361         offset=0
01362         offlist=[]
01363         for c in range(self.nchar):
01364             if c in exclude:
01365                 offset+=1
01366                 offlist.append(-1)  # dummy value as these character positions are excluded
01367             else:
01368                 offlist.append(c-offset)
01369         # now adjust each of the character sets
01370         if not codons_only:
01371             for n,ns in self.charsets.iteritems():
01372                 cset=[offlist[c] for c in ns if c not in exclude]
01373                 if cset: 
01374                     setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 
01375             for n,s in self.taxsets.iteritems():
01376                 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
01377                 if tset:
01378                     setsb.append('taxset %s = %s' % (safename(n),' '.join(tset))) 
01379         for n,p in self.charpartitions.iteritems():
01380             if not include_codons and n==CODONPOSITIONS:
01381                 continue
01382             elif codons_only and n!=CODONPOSITIONS:
01383                 continue
01384             # as characters have been excluded, the partitions must be adjusted
01385             # if a partition is empty, it will be omitted from the charpartition command
01386             # (although paup allows charpartition part=t1:,t2:,t3:1-100)
01387             names=_sort_keys_by_values(p)
01388             newpartition={}
01389             for sn in names:
01390                 nsp=[offlist[c] for c in p[sn] if c not in exclude]
01391                 if nsp:
01392                     newpartition[sn]=nsp
01393             if newpartition:
01394                 if include_codons and n==CODONPOSITIONS:
01395                     command='codonposset'
01396                 else:
01397                     command='charpartition'
01398                 setsb.append('%s %s = %s' % (command,safename(n),\
01399                 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
01400         # now write charpartititions, much easier than charpartitions
01401         for n,p in self.taxpartitions.iteritems():
01402             names=_sort_keys_by_values(p)
01403             newpartition={}
01404             for sn in names:
01405                 nsp=[t for t in p[sn] if t not in delete]
01406                 if nsp:
01407                     newpartition[sn]=nsp
01408             if newpartition:
01409                 setsb.append('taxpartition %s = %s' % (safename(n),\
01410                 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
01411         # add 'end' and return everything
01412         setsb.append('end;\n')
01413         if len(setsb)==2: # begin and end only
01414                 return ''
01415         else:
01416             return ';\n'.join(setsb)
01417     
01418     def export_fasta(self, filename=None, width=70):
01419         """Writes matrix into a fasta file: (self, filename=None, width=70)."""       
01420         if not filename:
01421             if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
01422                 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
01423             else:
01424                 filename=self.filename+'.fas'
01425         fh=open(filename,'w')
01426         for taxon in self.taxlabels:
01427             fh.write('>'+safename(taxon)+'\n')
01428             for i in range(0, len(self.matrix[taxon].tostring()), width):
01429                 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')    
01430         fh.close()
01431         return filename
01432 
01433     def export_phylip(self, filename=None):
01434         """Writes matrix into a PHYLIP file: (self, filename=None).
01435 
01436         Note that this writes a relaxed PHYLIP format file, where the names
01437         are not truncated, nor checked for invalid characters."""
01438         if not filename:
01439             if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
01440                 filename='.'.join(self.filename.split('.')[:-1])+'.phy'
01441             else:
01442                 filename=self.filename+'.phy'
01443         fh=open(filename,'w')
01444         fh.write('%d %d\n' % (self.ntax,self.nchar))
01445         for taxon in self.taxlabels:
01446             fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring()))
01447         fh.close()
01448         return filename
01449     
01450     def constant(self,matrix=None,delete=[],exclude=[]):
01451         """Return a list with all constant characters."""
01452         if not matrix:
01453             matrix=self.matrix
01454         undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
01455         if not undelete:
01456             return None
01457         elif len(undelete)==1:
01458             return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
01459         # get the first sequence and expand all ambiguous values
01460         constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 
01461                 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
01462         for taxon in undelete[1:]:
01463             newconstant=[]
01464             for site in constant:
01465                 #print '%d (paup=%d)' % (site[0],site[0]+1),
01466                 seqsite=matrix[taxon][site[0]].upper()
01467                 #print seqsite,'checked against',site[1],'\t',
01468                 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 
01469                     # missing or same as before  -> ok
01470                     newconstant.append(site)
01471                 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
01472                     # subset of an ambig or only missing in previous -> take subset
01473                     newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
01474                 elif seqsite in self.ambiguous_values:  # is it an ambig: check the intersection with prev. values
01475                     intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1]))
01476                     if intersect:
01477                         newconstant.append((site[0],''.join(intersect)))
01478                     #    print 'ok'
01479                     #else:
01480                     #    print 'failed'
01481                 #else:
01482                 #    print 'failed'
01483             constant=newconstant
01484         cpos=[s[0] for s in constant]
01485         return cpos
01486 
01487     def cstatus(self,site,delete=[],narrow=True):
01488         """Summarize character.
01489 
01490         narrow=True:  paup-mode (a c ? --> ac; ? ? ? --> ?)
01491         narrow=false:           (a c ? --> a c g t -; ? ? ? --> a c g t -)
01492         """
01493         undelete=[t for t in self.taxlabels if t not in delete]
01494         if not undelete:
01495             return None
01496         cstatus=[]
01497         for t in undelete:
01498             c=self.matrix[t][site].upper()
01499             if self.options.get('gapmode')=='missing' and c==self.gap:
01500                 c=self.missing
01501             if narrow and c==self.missing:
01502                 if c not in cstatus:
01503                     cstatus.append(c)
01504             else:
01505                 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
01506         if self.missing in cstatus and narrow and len(cstatus)>1:
01507             cstatus=[c for c in cstatus if c!=self.missing]
01508         cstatus.sort()
01509         return cstatus
01510 
01511     def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
01512         """Calculates a stepmatrix for weighted parsimony.
01513 
01514         See Wheeler (1990), Cladistics 6:269-275 and
01515         Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
01516         """    
01517         m=StepMatrix(self.unambiguous_letters,self.gap)
01518         for site in [s for s in range(self.nchar) if s not in exclude]:
01519             cstatus=self.cstatus(site,delete)
01520             for i,b1 in enumerate(cstatus[:-1]):
01521                 for b2 in cstatus[i+1:]:
01522                     m.add(b1.upper(),b2.upper(),1)
01523         return m.transformation().weighting().smprint(name=name)
01524 
01525     def crop_matrix(self,matrix=None, delete=[], exclude=[]):
01526         """Return a matrix without deleted taxa and excluded characters."""
01527         if not matrix:
01528             matrix=self.matrix
01529         if [t for t in delete if not self._check_taxlabels(t)]:
01530             raise NexusError('Unknown taxa: %s' \
01531                              % ', '.join(set(delete).difference(self.taxlabels)))
01532         if exclude!=[]:
01533             undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
01534             if not undelete:
01535                 return {}
01536             m=[matrix[k].tostring() for k in undelete]
01537             zipped_m=zip(*m)
01538             sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
01539             if sitesm==[]:
01540                 return dict([(t,Seq('',self.alphabet)) for t in undelete])
01541             else:
01542                 zipped_sitesm=zip(*sitesm)
01543                 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
01544                 return dict(zip(undelete,m))
01545         else:
01546             return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
01547            
01548     def bootstrap(self,matrix=None,delete=[],exclude=[]):
01549         """Return a bootstrapped matrix."""
01550         if not matrix:
01551             matrix=self.matrix
01552         seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)         # remember if Seq objects
01553         cm=self.crop_matrix(delete=delete,exclude=exclude)          # crop data out
01554         if not cm:                                                  # everything deleted?
01555             return {}
01556         elif len(cm[cm.keys()[0]])==0:                              # everything excluded?
01557             return cm
01558         undelete=[t for t in self.taxlabels if t in cm]  
01559         if seqobjects:
01560             sitesm=zip(*[cm[t].tostring() for t in undelete])
01561             alphabet=matrix[matrix.keys()[0]].alphabet
01562         else:
01563             sitesm=zip(*[cm[t] for t in undelete])
01564         bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
01565         bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
01566         if seqobjects:
01567             bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
01568         return dict(zip(undelete,bootstrapseqs)) 
01569 
01570     def add_sequence(self,name,sequence):
01571         """Adds a sequence (string) to the matrix."""
01572         
01573         if not name:
01574             raise NexusError('New sequence must have a name')
01575 
01576         diff=self.nchar-len(sequence)
01577         if diff<0:
01578             self.insert_gap(self.nchar,-diff)
01579         elif diff>0:
01580             sequence+=self.missing*diff
01581 
01582         if name in self.taxlabels:
01583             unique_name=_unique_label(self.taxlabels,name)
01584             #print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name)
01585         else:
01586             unique_name=name
01587 
01588         assert unique_name not in self.matrix, "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug."
01589 
01590         self.matrix[unique_name]=Seq(sequence,self.alphabet)
01591         self.ntax+=1
01592         self.taxlabels.append(unique_name)
01593         self.unaltered_taxlabels.append(name)
01594 
01595     def insert_gap(self,pos,n=1,leftgreedy=False):
01596         """Add a gap into the matrix and adjust charsets and partitions.
01597         
01598         pos=0: first position
01599         pos=nchar: last position
01600         """
01601 
01602         def _adjust(set,x,d,leftgreedy=False):
01603             """Adjusts chartacter sets if gaps are inserted, taking care of
01604             new gaps within a coherent character set.""" 
01605             # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3  8 9 10 11 13 14 15
01606             # then the adjusted set will be 1 2 3  8 9 10 11 12 13 14 15 16 17 18 
01607             # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18
01608             set.sort()
01609             addpos=0
01610             for i,c in enumerate(set):
01611                 if c>=x:
01612                     set[i]=c+d
01613                 # if we add gaps within a group of characters, we want the gap position included in this group
01614                 if c==x:
01615                     if leftgreedy or (i>0 and set[i-1]==c-1):  
01616                         addpos=i
01617             if addpos>0:
01618                 set[addpos:addpos]=range(x,x+d)
01619             return set
01620 
01621         if pos<0 or pos>self.nchar:
01622             raise NexusError('Illegal gap position: %d' % pos)
01623         if n==0:
01624             return
01625         if self.taxlabels:
01626             #python 2.3 does not support zip(*[])
01627             sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
01628         else:
01629             sitesm=[]
01630         sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
01631         # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\
01632         #        i,taxon in enumerate(self.taxlabels)])
01633         zipped=zip(*sitesm)
01634         mapped=map(''.join,zipped)
01635         listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
01636         self.matrix=dict(listed) 
01637         self.nchar+=n
01638         # now adjust character sets
01639         for i,s in self.charsets.iteritems():
01640             self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
01641         for p in self.charpartitions:
01642             for sp,s in self.charpartitions[p].iteritems():
01643                 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
01644         # now adjust character state labels
01645         self.charlabels=self._adjust_charlabels(insert=[pos]*n)
01646         return self.charlabels
01647       
01648     def _adjust_charlabels(self,exclude=None,insert=None):
01649         """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
01650         if exclude and insert:
01651             raise NexusError('Can\'t exclude and insert at the same time')
01652         if not self.charlabels:
01653             return None
01654         labels=sorted(self.charlabels)
01655         newcharlabels={}
01656         if exclude:
01657             exclude.sort()
01658             exclude.append(sys.maxint)
01659             excount=0
01660             for c in labels:
01661                 if not c in exclude:
01662                     while c>exclude[excount]:
01663                         excount+=1
01664                     newcharlabels[c-excount]=self.charlabels[c]
01665         elif insert:
01666             insert.sort()
01667             insert.append(sys.maxint)
01668             icount=0
01669             for c in labels:
01670                 while c>=insert[icount]:
01671                     icount+=1
01672                 newcharlabels[c+icount]=self.charlabels[c]
01673         else:
01674             return self.charlabels
01675         return newcharlabels
01676 
01677     def invert(self,charlist):
01678         """Returns all character indices that are not in charlist."""
01679         return [c for c in range(self.nchar) if c not in charlist]
01680 
01681     def gaponly(self,include_missing=False):
01682         """Return gap-only sites."""
01683         gap=set(self.gap)
01684         if include_missing:
01685             gap.add(self.missing)
01686         sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
01687         gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)]
01688         return gaponly 
01689         
01690     def terminal_gap_to_missing(self,missing=None,skip_n=True):
01691         """Replaces all terminal gaps with missing character.
01692         
01693         Mixtures like ???------??------- are properly resolved."""
01694         
01695         if not missing:
01696             missing=self.missing
01697         replace=[self.missing,self.gap]
01698         if not skip_n:
01699             replace.extend(['n','N'])
01700         for taxon in self.taxlabels:
01701             sequence=self.matrix[taxon].tostring()
01702             length=len(sequence)
01703             start,end=get_start_end(sequence,skiplist=replace)
01704             if start==-1 and end==-1:
01705                 sequence=missing*length
01706             else:
01707                 sequence=sequence[:end+1]+missing*(length-end-1)
01708                 sequence=start*missing+sequence[start:]
01709             assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon
01710             self.matrix[taxon]=Seq(sequence,self.alphabet)
01711 
01712 
01713 try:
01714     import cnexus
01715 except ImportError:
01716     def _get_command_lines(file_contents):
01717         lines=_kill_comments_and_break_lines(file_contents)
01718         commandlines=_adjust_lines(lines)
01719         return commandlines
01720 else:
01721     def _get_command_lines(file_contents):
01722         decommented=cnexus.scanfile(file_contents)
01723         #check for unmatched parentheses
01724         if decommented=='[' or decommented==']':
01725             raise NexusError('Unmatched %s' % decommented)
01726         # cnexus can't return lists, so in analogy we separate
01727         # commandlines with chr(7) (a character that shoudn't be part of a
01728         # nexus file under normal circumstances)
01729         commandlines=_adjust_lines(decommented.split(chr(7)))
01730         return commandlines