Back to index

python-biopython  1.60
FastaIO.py
Go to the documentation of this file.
00001 # Copyright 2008-2011 by Peter Cock.  All rights reserved.
00002 #
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 """
00007 Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools.
00008 
00009 You are expected to use this module via the Bio.AlignIO functions (or the
00010 Bio.SeqIO functions if you want to work directly with the gapped sequences).
00011 
00012 This module contains a parser for the pairwise alignments produced by Bill
00013 Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is
00014 refered to as the "fasta-m10" file format (as we only support the machine
00015 readable output format selected with the -m 10 command line option).
00016 
00017 This module does NOT cover the generic "fasta" file format originally
00018 developed as an input format to the FASTA tools.  The Bio.AlignIO and
00019 Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files,
00020 which can also be used to store a multiple sequence alignments.
00021 """
00022 
00023 from Bio.Seq import Seq
00024 from Bio.SeqRecord import SeqRecord
00025 from Bio.Align import MultipleSeqAlignment
00026 from Interfaces import AlignmentIterator
00027 from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein
00028 from Bio.Alphabet import Gapped
00029 
00030 
00031 def _extract_alignment_region(alignment_seq_with_flanking, annotation):
00032     """Helper function for the main parsing code (PRIVATE).
00033 
00034     To get the actual pairwise alignment sequences, we must first
00035     translate the un-gapped sequence based coordinates into positions
00036     in the gapped sequence (which may have a flanking region shown
00037     using leading - characters).  To date, I have never seen any
00038     trailing flanking region shown in the m10 file, but the
00039     following code should also cope with that.
00040 
00041     Note that this code seems to work fine even when the "sq_offset"
00042     entries are prsent as a result of using the -X command line option.
00043     """
00044     align_stripped = alignment_seq_with_flanking.strip("-")
00045     display_start = int(annotation['al_display_start'])
00046     if int(annotation['al_start']) <= int(annotation['al_stop']):
00047         start = int(annotation['al_start']) \
00048               - display_start
00049         end   = int(annotation['al_stop']) \
00050               - display_start + 1
00051     else:
00052         #FASTA has flipped this sequence...
00053         start = display_start \
00054               - int(annotation['al_start'])
00055         end   = display_start \
00056               - int(annotation['al_stop']) + 1
00057     end += align_stripped.count("-")
00058     assert 0 <= start and start < end and end <= len(align_stripped), \
00059            "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \
00060            % (alignment_seq_with_flanking, start, end, annotation)
00061     return align_stripped[start:end]
00062 
00063 def FastaM10Iterator(handle, alphabet = single_letter_alphabet):
00064     """Alignment iterator for the FASTA tool's pairwise alignment output.
00065 
00066     This is for reading the pairwise alignments output by Bill Pearson's
00067     FASTA program when called with the -m 10 command line option for machine
00068     readable output.  For more details about the FASTA tools, see the website
00069     http://fasta.bioch.virginia.edu/ and the paper:
00070 
00071          W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
00072 
00073     This class is intended to be used via the Bio.AlignIO.parse() function
00074     by specifying the format as "fasta-m10" as shown in the following code:
00075 
00076         from Bio import AlignIO
00077         handle = ...
00078         for a in AlignIO.parse(handle, "fasta-m10"):
00079             assert len(a) == 2, "Should be pairwise!"
00080             print "Alignment length %i" % a.get_alignment_length()
00081             for record in a:
00082                 print record.seq, record.name, record.id
00083 
00084     Note that this is not a full blown parser for all the information
00085     in the FASTA output - for example, most of the header and all of the
00086     footer is ignored.  Also, the alignments are not batched according to
00087     the input queries.
00088 
00089     Also note that there can be up to about 30 letters of flanking region
00090     included in the raw FASTA output as contextual information.  This is NOT
00091     part of the alignment itself, and is not included in the resulting
00092     MultipleSeqAlignment objects returned.
00093     """
00094     if alphabet is None:
00095         alphabet = single_letter_alphabet
00096     
00097     state_PREAMBLE = -1
00098     state_NONE = 0
00099     state_QUERY_HEADER = 1
00100     state_ALIGN_HEADER = 2
00101     state_ALIGN_QUERY = 3
00102     state_ALIGN_MATCH = 4
00103     state_ALIGN_CONS = 5
00104 
00105     def build_hsp():
00106         if not query_tags and not match_tags:
00107             raise ValueError("No data for query %r, match %r" \
00108                              % (query_id, match_id))
00109         assert query_tags, query_tags
00110         assert match_tags, match_tags
00111         evalue = align_tags.get("fa_expect", None)
00112         q = "?" #Just for printing len(q) in debug below
00113         m = "?" #Just for printing len(m) in debug below
00114         tool = global_tags.get("tool", "").upper()
00115         try:
00116             q = _extract_alignment_region(query_seq, query_tags)
00117             if tool in ["TFASTX"] and len(match_seq) == len(q):
00118                 m = match_seq
00119                 #Quick hack until I can work out how -, * and / characters
00120                 #and the apparent mix of aa and bp coordindates works.
00121             else:
00122                 m = _extract_alignment_region(match_seq, match_tags)
00123             assert len(q) == len(m)
00124         except AssertionError, err:
00125             print "Darn... amino acids vs nucleotide coordinates?"
00126             print tool
00127             print query_seq
00128             print query_tags
00129             print q, len(q)
00130             print match_seq
00131             print match_tags
00132             print m, len(m)
00133             print handle.name
00134             raise err
00135 
00136         assert alphabet is not None
00137         alignment = MultipleSeqAlignment([], alphabet)
00138 
00139         #TODO - Introduce an annotated alignment class?
00140         #For now, store the annotation a new private property:
00141         alignment._annotations = {}
00142         
00143         #Want to record both the query header tags, and the alignment tags.
00144         for key, value in header_tags.iteritems():
00145             alignment._annotations[key] = value
00146         for key, value in align_tags.iteritems():
00147             alignment._annotations[key] = value
00148         
00149         #Query
00150         #=====
00151         record = SeqRecord(Seq(q, alphabet),
00152                            id = query_id,
00153                            name = "query",
00154                            description = query_descr,
00155                            annotations = {"original_length" : int(query_tags["sq_len"])})
00156         #TODO - handle start/end coordinates properly. Short term hack for now:
00157         record._al_start = int(query_tags["al_start"])
00158         record._al_stop = int(query_tags["al_stop"])
00159         alignment.append(record)
00160 
00161         #TODO - What if a specific alphabet has been requested?
00162         #TODO - Use an IUPAC alphabet?
00163         #TODO - Can FASTA output RNA?
00164         if alphabet == single_letter_alphabet and "sq_type" in query_tags:
00165             if query_tags["sq_type"] == "D":
00166                 record.seq.alphabet = generic_dna
00167             elif query_tags["sq_type"] == "p":
00168                 record.seq.alphabet = generic_protein
00169         if "-" in q:
00170             if not hasattr(record.seq.alphabet,"gap_char"):
00171                 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
00172 
00173         #Match
00174         #=====
00175         record = SeqRecord(Seq(m, alphabet),
00176                            id = match_id,
00177                            name = "match",
00178                            description = match_descr,
00179                            annotations = {"original_length" : int(match_tags["sq_len"])})
00180         #TODO - handle start/end coordinates properly. Short term hack for now:
00181         record._al_start = int(match_tags["al_start"])
00182         record._al_stop = int(match_tags["al_stop"])
00183         alignment.append(record)
00184 
00185         #This is still a very crude way of dealing with the alphabet:
00186         if alphabet == single_letter_alphabet and "sq_type" in match_tags:
00187             if match_tags["sq_type"] == "D":
00188                 record.seq.alphabet = generic_dna
00189             elif match_tags["sq_type"] == "p":
00190                 record.seq.alphabet = generic_protein
00191         if "-" in m:
00192             if not hasattr(record.seq.alphabet,"gap_char"):
00193                 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
00194 
00195         return alignment
00196 
00197     state = state_PREAMBLE
00198     query_id = None
00199     match_id = None
00200     query_descr = ""
00201     match_descr = ""
00202     global_tags = {}
00203     header_tags = {}
00204     align_tags = {}
00205     query_tags = {}
00206     match_tags = {}
00207     query_seq = ""
00208     match_seq = ""
00209     cons_seq = ""
00210     for line in handle:
00211         if ">>>" in line and not line.startswith(">>>"):
00212             if query_id and match_id:
00213                 #This happens on old FASTA output which lacked an end of
00214                 #query >>><<< marker line.
00215                 yield build_hsp()
00216             state = state_NONE
00217             query_descr = line[line.find(">>>")+3:].strip()
00218             query_id = query_descr.split(None,1)[0]
00219             match_id = None
00220             header_tags = {}
00221             align_tags = {}
00222             query_tags = {}
00223             match_tags = {}
00224             query_seq = ""
00225             match_seq = ""
00226             cons_seq = ""
00227         elif line.startswith("!! No "):
00228             #e.g.
00229             #!! No library sequences with E() < 0.5
00230             #or on more recent versions,
00231             #No sequences with E() < 0.05
00232             assert state == state_NONE
00233             assert not header_tags
00234             assert not align_tags
00235             assert not match_tags
00236             assert not query_tags
00237             assert match_id is None
00238             assert not query_seq
00239             assert not match_seq
00240             assert not cons_seq
00241             query_id = None
00242         elif line.strip() in [">>><<<", ">>>///"]:
00243             #End of query, possible end of all queries
00244             if query_id and match_id:
00245                 yield build_hsp()
00246             state = state_NONE
00247             query_id = None
00248             match_id = None
00249             header_tags = {}
00250             align_tags = {}
00251             query_tags = {}
00252             match_tags = {}
00253             query_seq = ""
00254             match_seq = ""
00255             cons_seq = ""
00256         elif line.startswith(">>>"):
00257             #Should be start of a match!
00258             assert query_id is not None
00259             assert line[3:].split(", ",1)[0] == query_id, line
00260             assert match_id is None
00261             assert not header_tags
00262             assert not align_tags
00263             assert not query_tags
00264             assert not match_tags
00265             assert not match_seq
00266             assert not query_seq
00267             assert not cons_seq
00268             state = state_QUERY_HEADER
00269         elif line.startswith(">>"):
00270             #Should now be at start of a match alignment!
00271             if query_id and match_id:
00272                 yield build_hsp()
00273             align_tags = {}
00274             query_tags = {}
00275             match_tags = {}
00276             query_seq = ""
00277             match_seq = ""
00278             cons_seq = ""
00279             match_descr = line[2:].strip()
00280             match_id = match_descr.split(None,1)[0]
00281             state = state_ALIGN_HEADER
00282         elif line.startswith(">--"):
00283             #End of one HSP
00284             assert query_id and match_id, line
00285             yield build_hsp()
00286             #Clean up read for next HSP
00287             #but reuse header_tags
00288             align_tags = {}
00289             query_tags = {}
00290             match_tags = {}
00291             query_seq = ""
00292             match_seq = ""
00293             cons_seq = ""
00294             state = state_ALIGN_HEADER
00295         elif line.startswith(">"):
00296             if state == state_ALIGN_HEADER:
00297                 #Should be start of query alignment seq...
00298                 assert query_id is not None, line
00299                 assert match_id is not None, line
00300                 assert query_id.startswith(line[1:].split(None,1)[0]), line
00301                 state = state_ALIGN_QUERY
00302             elif state == state_ALIGN_QUERY:
00303                 #Should be start of match alignment seq
00304                 assert query_id is not None, line
00305                 assert match_id is not None, line
00306                 assert match_id.startswith(line[1:].split(None,1)[0]), line
00307                 state = state_ALIGN_MATCH
00308             elif state == state_NONE:
00309                 #Can get > as the last line of a histogram
00310                 pass
00311             else:
00312                 assert False, "state %i got %r" % (state, line)
00313         elif line.startswith("; al_cons"):
00314             assert state == state_ALIGN_MATCH, line
00315             state = state_ALIGN_CONS
00316             #Next line(s) should be consensus seq...
00317         elif line.startswith("; "):
00318             if ": " in line:
00319                 key, value = [s.strip() for s in line[2:].split(": ",1)]
00320             else:
00321                 import warnings
00322                 #Seen in lalign36, specifically version 36.3.4 Apr, 2011
00323                 #Fixed in version 36.3.5b Oct, 2011(preload8)
00324                 warnings.warn("Missing colon in line: %r" % line)
00325                 try:
00326                     key, value = [s.strip() for s in line[2:].split(" ",1)]
00327                 except ValueError:
00328                     raise ValueError("Bad line: %r" % line)
00329             if state == state_QUERY_HEADER:
00330                 header_tags[key] = value
00331             elif state == state_ALIGN_HEADER:
00332                 align_tags[key] = value
00333             elif state == state_ALIGN_QUERY:
00334                 query_tags[key] = value
00335             elif state == state_ALIGN_MATCH:
00336                 match_tags[key] = value
00337             else:
00338                 assert False, "Unexpected state %r, %r" % (state, line)
00339         elif state == state_ALIGN_QUERY:
00340             query_seq += line.strip()
00341         elif state == state_ALIGN_MATCH:
00342             match_seq += line.strip()
00343         elif state == state_ALIGN_CONS:
00344             cons_seq += line.strip("\n")
00345         elif state == state_PREAMBLE:
00346             if line.startswith("#"):
00347                 global_tags["command"] = line[1:].strip()
00348             elif line.startswith(" version "):
00349                 global_tags["version"] = line[9:].strip()
00350             elif " compares a " in line:
00351                 global_tags["tool"] = line[:line.find(" compares a ")].strip()
00352             elif " searches a " in line:
00353                 global_tags["tool"] = line[:line.find(" searches a ")].strip()
00354         else:
00355             pass
00356 
00357 
00358 if __name__ == "__main__":
00359     print "Running a quick self-test"
00360 
00361     #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple
00362     simple_example = \
00363 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
00364 FASTA searches a protein or DNA sequence data bank
00365  version 34.26 January 12, 2007
00366 Please cite:
00367  W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
00368 
00369 Query library NC_002127.faa vs NC_009649.faa library
00370 searching NC_009649.faa library
00371 
00372   1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa
00373  vs  NC_009649.faa library
00374 
00375   45119 residues in   180 sequences
00376   Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273
00377  mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1  B-trim: 9 in 1/25
00378  Lambda= 0.175043
00379 
00380 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
00381  join: 36, opt: 24, open/ext: -10/-2, width:  16
00382  Scan time:  0.000
00383 The best scores are:                                      opt bits E(180)
00384 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931)   71 24.9    0.58
00385 gi|152973588|ref|YP_001338639.1| F pilus assembly  ( 459)   63 23.1    0.99
00386 
00387 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library
00388 ; pg_name: /opt/fasta/fasta34
00389 ; pg_ver: 34.26
00390 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
00391 ; pg_name: FASTA
00392 ; pg_ver: 3.5 Sept 2006
00393 ; pg_matrix: BL50 (15:-5)
00394 ; pg_open-ext: -10 -2
00395 ; pg_ktup: 2
00396 ; pg_optcut: 24
00397 ; pg_cgap: 36
00398 ; mp_extrap: 60000 180
00399 ; mp_stats:  Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273  mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1  B-trim: 9 in 1/25  Lambda= 0.175043
00400 ; mp_KS: -0.0000 (N=0) at 8159228
00401 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
00402 ; fa_frame: f
00403 ; fa_initn:  65
00404 ; fa_init1:  43
00405 ; fa_opt:  71
00406 ; fa_z-score: 90.3
00407 ; fa_bits: 24.9
00408 ; fa_expect:   0.58
00409 ; sw_score: 71
00410 ; sw_ident: 0.250
00411 ; sw_sim: 0.574
00412 ; sw_overlap: 108
00413 >gi|10955263| ..
00414 ; sq_len: 107
00415 ; sq_offset: 1
00416 ; sq_type: p
00417 ; al_start: 5
00418 ; al_stop: 103
00419 ; al_display_start: 1
00420 --------------------------MTKRSGSNT-RRRAISRPVRLTAE
00421 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM-----
00422 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD
00423 >gi|152973457|ref|YP_001338508.1| ..
00424 ; sq_len: 931
00425 ; sq_type: p
00426 ; al_start: 96
00427 ; al_stop: 195
00428 ; al_display_start: 66
00429 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ
00430 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI
00431 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY
00432 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ
00433 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
00434 ; fa_frame: f
00435 ; fa_initn:  33
00436 ; fa_init1:  33
00437 ; fa_opt:  63
00438 ; fa_z-score: 86.1
00439 ; fa_bits: 23.1
00440 ; fa_expect:   0.99
00441 ; sw_score: 63
00442 ; sw_ident: 0.266
00443 ; sw_sim: 0.656
00444 ; sw_overlap: 64
00445 >gi|10955263| ..
00446 ; sq_len: 107
00447 ; sq_offset: 1
00448 ; sq_type: p
00449 ; al_start: 32
00450 ; al_stop: 94
00451 ; al_display_start: 2
00452 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK
00453 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL
00454 LSRLMAD
00455 >gi|152973588|ref|YP_001338639.1| ..
00456 ; sq_len: 459
00457 ; sq_type: p
00458 ; al_start: 191
00459 ; al_stop: 248
00460 ; al_display_start: 161
00461 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK
00462 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG
00463 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT
00464 SNKALKSQISALLSSIQNKAVADEKLTDQE
00465   2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
00466  vs  NC_009649.faa library
00467 
00468   45119 residues in   180 sequences
00469   Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313
00470  mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1  B-trim: 8 in 1/25
00471  Lambda= 0.179384
00472 
00473 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
00474  join: 36, opt: 24, open/ext: -10/-2, width:  16
00475  Scan time:  0.000
00476 The best scores are:                                      opt bits E(180)
00477 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101)   58 22.9    0.29
00478 
00479 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
00480 ; pg_name: /opt/fasta/fasta34
00481 ; pg_ver: 34.26
00482 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
00483 ; pg_name: FASTA
00484 ; pg_ver: 3.5 Sept 2006
00485 ; pg_matrix: BL50 (15:-5)
00486 ; pg_open-ext: -10 -2
00487 ; pg_ktup: 2
00488 ; pg_optcut: 24
00489 ; pg_cgap: 36
00490 ; mp_extrap: 60000 180
00491 ; mp_stats:  Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313  mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1  B-trim: 8 in 1/25  Lambda= 0.179384
00492 ; mp_KS: -0.0000 (N=0) at 8159228
00493 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
00494 ; fa_frame: f
00495 ; fa_initn:  50
00496 ; fa_init1:  50
00497 ; fa_opt:  58
00498 ; fa_z-score: 95.8
00499 ; fa_bits: 22.9
00500 ; fa_expect:   0.29
00501 ; sw_score: 58
00502 ; sw_ident: 0.289
00503 ; sw_sim: 0.632
00504 ; sw_overlap: 38
00505 >gi|10955264| ..
00506 ; sq_len: 126
00507 ; sq_offset: 1
00508 ; sq_type: p
00509 ; al_start: 1
00510 ; al_stop: 38
00511 ; al_display_start: 1
00512 ------------------------------MKKDKKYQIEAIKNKDKTLF
00513 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF
00514 NGEKFSSYTLNKVTKTDEYN
00515 >gi|152973462|ref|YP_001338513.1| ..
00516 ; sq_len: 101
00517 ; sq_type: p
00518 ; al_start: 44
00519 ; al_stop: 81
00520 ; al_display_start: 14
00521 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI
00522 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA
00523   3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa
00524  vs  NC_009649.faa library
00525 
00526   45119 residues in   180 sequences
00527   Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461
00528  mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1  B-trim: 14 in 1/25
00529  Lambda= 0.210386
00530 
00531 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
00532  join: 37, opt: 25, open/ext: -10/-2, width:  16
00533  Scan time:  0.020
00534 The best scores are:                                      opt bits E(180)
00535 gi|152973545|ref|YP_001338596.1| putative plasmid  ( 242)   70 27.5   0.082
00536 
00537 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library
00538 ; pg_name: /opt/fasta/fasta34
00539 ; pg_ver: 34.26
00540 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
00541 ; pg_name: FASTA
00542 ; pg_ver: 3.5 Sept 2006
00543 ; pg_matrix: BL50 (15:-5)
00544 ; pg_open-ext: -10 -2
00545 ; pg_ktup: 2
00546 ; pg_optcut: 25
00547 ; pg_cgap: 37
00548 ; mp_extrap: 60000 180
00549 ; mp_stats:  Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461  mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1  B-trim: 14 in 1/25  Lambda= 0.210386
00550 ; mp_KS: -0.0000 (N=0) at 8159228
00551 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
00552 ; fa_frame: f
00553 ; fa_initn:  52
00554 ; fa_init1:  52
00555 ; fa_opt:  70
00556 ; fa_z-score: 105.5
00557 ; fa_bits: 27.5
00558 ; fa_expect:  0.082
00559 ; sw_score: 70
00560 ; sw_ident: 0.279
00561 ; sw_sim: 0.651
00562 ; sw_overlap: 43
00563 >gi|10955265| ..
00564 ; sq_len: 346
00565 ; sq_offset: 1
00566 ; sq_type: p
00567 ; al_start: 197
00568 ; al_stop: 238
00569 ; al_display_start: 167
00570 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
00571 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
00572 GEYFTENKPKYIIREIHQET
00573 >gi|152973545|ref|YP_001338596.1| ..
00574 ; sq_len: 242
00575 ; sq_type: p
00576 ; al_start: 52
00577 ; al_stop: 94
00578 ; al_display_start: 22
00579 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
00580 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
00581 QDFAFTRKMRREARQVEQSW
00582 >>><<<
00583 
00584 
00585 579 residues in 3 query   sequences
00586 45119 residues in 180 library sequences
00587  Scomplib [34.26]
00588  start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
00589  Total Scan time:  0.020 Total Display time:  0.010
00590 
00591 Function used was FASTA [version 34.26 January 12, 2007]
00592 
00593 """                 
00594 
00595 
00596     from StringIO import StringIO
00597 
00598     alignments = list(FastaM10Iterator(StringIO(simple_example)))
00599     assert len(alignments) == 4, len(alignments)
00600     assert len(alignments[0]) == 2
00601     for a in alignments:
00602         print "Alignment %i sequences of length %i" \
00603               % (len(a), a.get_alignment_length())
00604         for r in a:
00605             print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"])
00606         #print a.annotations
00607     print "Done"
00608 
00609     import os
00610     path = "../../Tests/Fasta/"
00611     files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"]
00612     files.sort()
00613     for filename in files:
00614         if os.path.splitext(filename)[-1] == ".m10":
00615             print
00616             print filename
00617             print "="*len(filename)
00618             for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))):
00619                 print "#%i, %s" % (i+1,a)
00620                 for r in a:
00621                     if "-" in r.seq:
00622                         assert r.seq.alphabet.gap_char == "-"
00623                     else:
00624                         assert not hasattr(r.seq.alphabet, "gap_char")