Back to index

python-biopython  1.60
Ace.py
Go to the documentation of this file.
00001 # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """
00006 Parser for ACE files output by PHRAP.
00007 
00008 Written by Frank Kauff (fkauff@duke.edu) and
00009 Cymon J. Cox (cymon@duke.edu)
00010 
00011 Usage:
00012 
00013 There are two ways of reading an ace file:
00014 1) The function 'read' reads the whole file at once;
00015 2) The function 'parse' reads the file contig after contig.
00016     
00017 1) Parse whole ace file at once:
00018 
00019         from Bio.Sequencing import Ace
00020         acefilerecord=Ace.read(open('my_ace_file.ace'))
00021 
00022 This gives you:
00023         acefilerecord.ncontigs (the number of contigs in the ace file)
00024         acefilerecord.nreads (the number of reads in the ace file)
00025         acefilerecord.contigs[] (one instance of the Contig class for each contig)
00026 
00027 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
00028 for this contig in a list of instances of the Read class, e.g.:
00029 
00030         contig3=acefilerecord.contigs[2]
00031         read4=contig3.reads[3]
00032         RD_of_read4=read4.rd
00033         DS_of_read4=read4.ds
00034 
00035 CT, WA, RT tags from the end of the file can appear anywhere are automatically
00036 sorted into the right place.
00037 
00038 see _RecordConsumer for details.
00039 
00040 2) Or you can iterate over the contigs of an ace file one by one in the ususal way:
00041 
00042         from Bio.Sequencing import Ace
00043         contigs=Ace.parse(open('my_ace_file.ace'))
00044         for contig in contigs:
00045             print contig.name
00046             ...
00047 
00048 Please note that for memory efficiency, when using the iterator approach, only one
00049 contig is kept in memory at once.  However, there can be a footer to the ACE file
00050 containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
00051 Because the parser doesn't see this data until the final record, it cannot be added to
00052 the appropriate records.  Instead these tags will be returned with the last contig record.
00053 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
00054 are needed, the 'read' function rather than the 'parse' function might be more appropriate.
00055 """
00056 
00057 
00058 class rd(object):
00059     """RD (reads), store a read with its name, sequence etc.
00060     
00061     The location and strand each read is mapped to is held in the AF lines.
00062     """
00063     def __init__(self):
00064         self.name=''
00065         self.padded_bases=None
00066         self.info_items=None
00067         self.read_tags=None
00068         self.sequence=''
00069 
00070 class qa(object):
00071     """QA (read quality), including which part if any was used as the consensus."""
00072     def __init__(self, line=None):
00073         self.qual_clipping_start=None
00074         self.qual_clipping_end=None
00075         self.align_clipping_start=None
00076         self.align_clipping_end=None
00077         if line:
00078             header=map(eval,line.split()[1:])
00079             self.qual_clipping_start=header[0]
00080             self.qual_clipping_end=header[1]
00081             self.align_clipping_start=header[2]
00082             self.align_clipping_end=header[3]
00083 
00084 class ds(object):
00085     """DS lines, include file name of a read's chromatogram file."""
00086     def __init__(self, line=None):
00087         self.chromat_file=''
00088         self.phd_file=''
00089         self.time=''
00090         self.chem=''
00091         self.dye=''
00092         self.template=''
00093         self.direction=''
00094         if line:
00095             tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
00096             poss=map(line.find,tags)
00097             tagpos=dict(zip(poss,tags))
00098             if -1 in tagpos:
00099                 del tagpos[-1]
00100             ps=tagpos.keys()
00101             ps.sort()
00102             for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
00103                 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())   
00104 
00105     
00106 class af(object):
00107     """AF lines, define the location of the read within the contig.
00108     
00109     Note attribute coru is short for complemented (C) or uncomplemented (U),
00110     since the strand information is stored in an ACE file using either the
00111     C or U character.
00112     """
00113     def __init__(self, line=None):
00114         self.name=''
00115         self.coru=None
00116         self.padded_start=None
00117         if line:
00118             header = line.split()
00119             self.name = header[1]
00120             self.coru = header[2]
00121             self.padded_start = int(header[3])
00122 
00123 class bs(object):
00124     """"BS (base segment), which read was chosen as the consensus at each position."""
00125     def __init__(self, line=None):
00126         self.name=''
00127         self.padded_start=None
00128         self.padded_end=None
00129         if line:
00130             header = line.split()
00131             self.padded_start = int(header[1])
00132             self.padded_end = int(header[2])
00133             self.name = header[3]
00134 
00135 class rt(object):   
00136     """RT (transient read tags), generated by crossmatch and phrap."""
00137     def __init__(self, line=None):
00138         self.name=''
00139         self.tag_type=''
00140         self.program=''
00141         self.padded_start=None
00142         self.padded_end=None
00143         self.date=''
00144         self.comment=[]
00145         if line:
00146             header=line.split()
00147             self.name=header[0]
00148             self.tag_type=header[1]
00149             self.program=header[2]
00150             self.padded_start=int(header[3])
00151             self.padded_end=int(header[4])
00152             self.date=header[5]
00153 
00154 class ct(object):
00155     """CT (consensus tags)."""
00156     def __init__(self, line=None):
00157         self.name=''
00158         self.tag_type=''
00159         self.program=''
00160         self.padded_start=None
00161         self.padded_end=None
00162         self.date=''
00163         self.notrans=''
00164         self.info=[]
00165         self.comment=[]
00166         if line:
00167             header=line.split()
00168             self.name = header[0]
00169             self.tag_type = header[1]
00170             self.program = header[2]
00171             self.padded_start = int(header[3])
00172             self.padded_end = int(header[4])
00173             self.date = header[5]
00174             if len(header)==7:
00175                 self.notrans = header[6]
00176 
00177 class wa(object):
00178     """WA (whole assembly tag), holds the assembly program name, version, etc."""
00179     def __init__(self, line=None):
00180         self.tag_type=''
00181         self.program=''
00182         self.date=''
00183         self.info=[]
00184         if line:
00185             header = line.split()
00186             self.tag_type = header[0]
00187             self.program = header[1]
00188             self.date = header[2]
00189 
00190 class wr(object):
00191     """WR lines."""
00192     def __init__(self, line=None):
00193         self.name=''
00194         self.aligned=''
00195         self.program=''
00196         self.date=[]
00197         if line:
00198             header = line.split()
00199             self.name = header[0]
00200             self.aligned = header[1]
00201             self.program = header[2]
00202             self.date = header[3]
00203 
00204 class Reads(object):
00205     """Holds information about a read supporting an ACE contig."""
00206     def __init__(self, line=None):
00207         self.rd=None    # one per read
00208         self.qa=None    # one per read
00209         self.ds=None    # none or one per read
00210         self.rt=None    # none or many per read
00211         self.wr=None    # none or many per read
00212         if line:
00213             self.rd = rd()
00214             header = line.split()
00215             self.rd.name = header[1]
00216             self.rd.padded_bases = int(header[2])
00217             self.rd.info_items = int(header[3])
00218             self.rd.read_tags = int(header[4])
00219         
00220 class Contig(object):
00221     """Holds information about a contig from an ACE record."""
00222     def __init__(self, line=None):
00223         self.name = ''
00224         self.nbases=None
00225         self.nreads=None
00226         self.nsegments=None
00227         self.uorc=None
00228         self.sequence=""
00229         self.quality=[]
00230         self.af=[]
00231         self.bs=[]
00232         self.reads=[]
00233         self.ct=None    # none or many
00234         self.wa=None    # none or many
00235         if line:
00236             header = line.split()
00237             self.name = header[1]
00238             self.nbases = int(header[2])
00239             self.nreads = int(header[3])
00240             self.nsegments = int(header[4])
00241             self.uorc = header[5]
00242 
00243 def parse(handle):
00244     """parse(handle)
00245         
00246     where handle is a file-like object.
00247     
00248     This function returns an iterator that allows you to iterate
00249     over the ACE file record by record:
00250 
00251         records = parse(handle)
00252         for record in records:
00253             # do something with the record
00254 
00255     where each record is a Contig object.
00256     """
00257 
00258     handle = iter(handle)
00259 
00260     line = ""
00261     while True:
00262         # at beginning, skip the AS and look for first CO command
00263         try:
00264             while True:
00265                 if line.startswith('CO'):
00266                     break
00267                 line = handle.next()
00268         except StopIteration:
00269             return
00270 
00271         record = Contig(line)
00272 
00273         for line in handle:
00274             line = line.strip()
00275             if not line:
00276                 break
00277             record.sequence+=line
00278 
00279         for line in handle:
00280             if line.strip():
00281                 break
00282         if not line.startswith("BQ"):
00283             raise ValueError("Failed to find BQ line")
00284 
00285         for line in handle:
00286             if not line.strip():
00287                 break
00288             record.quality.extend(map(int,line.split()))
00289 
00290         for line in handle:
00291             if line.strip():
00292                 break
00293 
00294         while True:
00295             if not line.startswith("AF "):
00296                 break
00297             record.af.append(af(line))
00298             try:
00299                 line = handle.next()
00300             except StopIteration:
00301                 raise ValueError("Unexpected end of AF block")
00302 
00303         while True:
00304             if line.strip():
00305                 break
00306             try:
00307                 line = handle.next()
00308             except StopIteration:
00309                 raise ValueError("Unexpected end of file")
00310 
00311         while True:
00312             if not line.startswith("BS "):
00313                 break
00314             record.bs.append(bs(line))
00315             try:
00316                 line = handle.next()
00317             except StopIteration:
00318                 raise ValueError("Failed to find end of BS block")
00319 
00320         # now read all the read data
00321         # it starts with a 'RD', and then a mandatory QA
00322         # then follows an optional DS
00323         # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig,
00324         # or, if encountered at the end of file, to any previous read or contig. the sort() method deals
00325         # with that later.
00326         while True:
00327 
00328             # each read must have a rd and qa
00329             try:
00330                 while True:
00331                     # If I've met the condition, then stop reading the line.
00332                     if line.startswith("RD "):
00333                         break
00334                     line = handle.next()
00335             except StopIteration:
00336                 raise ValueError("Failed to find RD line")
00337 
00338             record.reads.append(Reads(line))
00339 
00340             for line in handle:
00341                 line = line.strip()
00342                 if not line:
00343                     break
00344                 record.reads[-1].rd.sequence+=line
00345 
00346             for line in handle:
00347                 if line.strip():
00348                     break
00349             if not line.startswith("QA "):
00350                 raise ValueError("Failed to find QA line")
00351             record.reads[-1].qa = qa(line)
00352 
00353             # now one ds can follow
00354             for line in handle:
00355                 if line.strip():
00356                     break
00357             else:
00358                 break
00359 
00360             if line.startswith("DS "):
00361                 record.reads[-1].ds = ds(line)
00362                 line = ""
00363             # the file could just end, or there's some more stuff. In ace files, anything can happen.
00364             # the following tags are interspersed between reads and can appear multiple times. 
00365             while True:
00366                 # something left 
00367                 try:
00368                     while True:
00369                         if line.strip():
00370                             break
00371                         line = handle.next()
00372                 except StopIteration:
00373                     # file ends here
00374                     break
00375                 if line.startswith("RT{"):
00376                     # now if we're at the end of the file, this rt could
00377                     # belong to a previous read, not the actual one.
00378                     # we store it here were it appears, the user can sort later.
00379                     if record.reads[-1].rt is None:
00380                         record.reads[-1].rt=[]
00381                     for line in handle:
00382                         line=line.strip()
00383                         #if line=="COMMENT{":
00384                         if line.startswith("COMMENT{"):
00385                             if line[8:].strip():
00386                                 #MIRA 3.0.5 would miss the new line out :(
00387                                 record.reads[-1].rt[-1].comment.append(line[8:])
00388                             for line in handle:
00389                                 line = line.strip()
00390                                 if line.endswith("C}"):
00391                                     break
00392                                 record.reads[-1].rt[-1].comment.append(line)
00393                         elif line=='}':
00394                             break
00395                         else:
00396                             record.reads[-1].rt.append(rt(line))
00397                     line = ""
00398                 elif line.startswith("WR{"):
00399                     if record.reads[-1].wr is None:
00400                         record.reads[-1].wr=[]
00401                     for line in handle:
00402                         line=line.strip()
00403                         if line=='}': break
00404                         record.reads[-1].wr.append(wr(line))
00405                     line = ""
00406                 elif line.startswith("WA{"):
00407                     if record.wa is None:
00408                         record.wa=[]
00409                     try:
00410                         line = handle.next()
00411                     except StopIteration:
00412                         raise ValueError("Failed to read WA block")
00413                     record.wa.append(wa(line))
00414                     for line in handle:
00415                         line=line.strip()
00416                         if line=='}': break
00417                         record.wa[-1].info.append(line)
00418                     line = ""
00419                 elif line.startswith("CT{"):
00420                     if record.ct is None:
00421                         record.ct=[]
00422                     try:
00423                         line = handle.next()
00424                     except StopIteration:
00425                         raise ValueError("Failed to read CT block")
00426                     record.ct.append(ct(line))
00427                     for line in handle:
00428                         line=line.strip()
00429                         if line=="COMMENT{":
00430                             for line in handle:
00431                                 line = line.strip()
00432                                 if line.endswith("C}"):
00433                                     break
00434                                 record.ct[-1].comment.append(line)
00435                         elif line=='}':
00436                             break
00437                         else:
00438                             record.ct[-1].info.append(line)
00439                     line = ""
00440                 else:
00441                     break
00442 
00443             if not line.startswith('RD'): # another read?
00444                 break    
00445 
00446         yield record
00447 
00448 class ACEFileRecord(object):
00449     """Holds data of an ACE file.
00450     """
00451     def __init__(self):
00452         self.ncontigs=None
00453         self.nreads=None
00454         self.contigs=[]
00455         self.wa=None    # none or many
00456 
00457     def sort(self):
00458         """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible.  """
00459        
00460         ct=[]
00461         rt=[]
00462         wr=[]
00463         # search for tags that aren't in the right position
00464         for i in range(len(self.contigs)):
00465             c = self.contigs[i]
00466             if c.wa:
00467                 if not self.wa:
00468                     self.wa=[]
00469                 self.wa.extend(c.wa)
00470             if c.ct:
00471                 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
00472                 for x in newcts:
00473                     self.contigs[i].ct.remove(x)
00474                 ct.extend(newcts)
00475             for j in range(len(c.reads)):
00476                 r = c.reads[j]
00477                 if r.rt:
00478                     newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
00479                     for x in newrts:
00480                         self.contigs[i].reads[j].rt.remove(x)
00481                     rt.extend(newrts)
00482                 if r.wr:
00483                     newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
00484                     for x in newwrs:
00485                         self.contigs[i].reads[j].wr.remove(x)
00486                     wr.extend(newwrs)
00487         # now sort them into their proper place
00488         for i in range(len(self.contigs)):
00489             c = self.contigs[i]
00490             for ct_tag in ct:
00491                 if ct_tag.name==c.name:
00492                     if self.contigs[i].ct is None:
00493                         self.contigs[i].ct=[]
00494                     self.contigs[i].ct.append(ct_tag)
00495             if rt or wr:
00496                 for j in range(len(c.reads)):
00497                     r = c.reads[j]
00498                     for rt_tag in rt:
00499                         if rt_tag.name==r.rd.name:
00500                             if self.contigs[i].reads[j].rt is None:
00501                                 self.contigs[i].reads[j].rt=[]
00502                             self.contigs[i].reads[j].rt.append(rt_tag)
00503                     for wr_tag in wr:
00504                         if wr_tag.name==r.rd.name:
00505                             if self.contigs[i].reads[j].wr is None:
00506                                 self.contigs[i].reads[j].wr=[]
00507                             self.contigs[i].reads[j].wr.append(wr_tag)
00508        
00509 def read(handle):
00510     """Parses the full ACE file in list of contigs.
00511 
00512     """
00513 
00514     handle = iter(handle)
00515 
00516     record=ACEFileRecord()
00517 
00518     try:
00519         line = handle.next()
00520     except StopIteration:
00521         raise ValueError("Premature end of file")
00522 
00523     # check if the file starts correctly
00524     if not line.startswith('AS'):
00525         raise ValueError("File does not start with 'AS'.")
00526 
00527     words = line.split()
00528     record.ncontigs, record.nreads = map(int, words[1:3])
00529 
00530     # now read all the records
00531     record.contigs = list(parse(handle))
00532     # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?).
00533     # If the iterator is used, the tags are returned with the contig or the read after which they appear,
00534     # if all tags are at the end, they are read with the last contig. The concept of an
00535     # iterator leaves no other choice. But if the user uses the ACEParser, we can check
00536     # them and put them into the appropriate contig/read instance.
00537     # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable...
00538     record.sort()
00539     return record