Back to index

python-biopython  1.60
fasta_iterator.py
Go to the documentation of this file.
00001 # The New Way
00002 # ===========
00003 # This next bit of code uses Bio.SeqIO to parse a FASTA file
00004 
00005 from Bio import SeqIO
00006 
def extract_organisms(file_to_parse, format):
    """Return the distinct species names found in a sequence file.

    Every record is read with Bio.SeqIO, and the second
    whitespace-separated word of its description is taken as the
    species name.

    Args:
        file_to_parse: Path of the sequence file to open.
        format: Format name handed to SeqIO.parse (e.g. "fasta").

    Returns:
        A list of species names without duplicates, in order of first
        appearance.

    Raises:
        IndexError: If a record description has fewer than two words.
    """
    all_species = []
    # Set used only for the duplicate check; the list keeps first-seen order.
    seen = set()

    handle = open(file_to_parse)
    # Close the handle even if parsing fails.  try/finally is used instead
    # of `with` so the example stays valid on older Python 2 interpreters.
    try:
        for cur_record in SeqIO.parse(handle, format):
            # extract the info from the description
            new_species = cur_record.description.split()[1]

            # record the species only the first time it is seen
            if new_species not in seen:
                seen.add(new_species)
                all_species.append(new_species)
    finally:
        handle.close()

    return all_species
00018 
if __name__ == "__main__":
    # Single-argument print(...) calls with %-formatting emit the same text
    # under the Python 2 print statement and the Python 3 print function.
    print("Using Bio.SeqIO on a FASTA file")
    all_species = extract_organisms("ls_orchid.fasta", "fasta")
    print("number of species: %d" % len(all_species))
    # The one-element tuple keeps the list from being unpacked as format args.
    print("species names: %s" % (all_species,))
00024 
00025 
00026 # The Old Way
00027 # ===========
00028 # This next bit of code still works fine; it uses Bio.Fasta instead
00029 
00030 from Bio import Fasta
00031 
def extract_organisms(file_to_parse):
    """Return the distinct species names found in a FASTA file.

    Records are read with the Bio.Fasta RecordParser/Iterator pair, and
    the second whitespace-separated word of each record title is taken
    as the species name.

    NOTE(review): Bio.Fasta is the older interface; confirm it is still
    shipped by the Biopython version in use.

    Args:
        file_to_parse: Path of the FASTA file to open.

    Returns:
        A list of species names without duplicates, in order of first
        appearance.

    Raises:
        IndexError: If a record title has fewer than two words.
    """
    # set up the parser
    parser = Fasta.RecordParser()

    all_species = []
    # Set used only for the duplicate check; the list keeps first-seen order.
    seen = set()

    handle = open(file_to_parse, 'r')
    # Close the handle even if parsing fails.  try/finally is used instead
    # of `with` so the example stays valid on older Python 2 interpreters.
    try:
        iterator = Fasta.Iterator(handle, parser)

        while True:
            cur_record = iterator.next()

            # a None record marks the end of the input
            if cur_record is None:
                break

            # extract the info from the title
            new_species = cur_record.title.split()[1]

            # record the species only the first time it is seen
            if new_species not in seen:
                seen.add(new_species)
                all_species.append(new_species)
    finally:
        handle.close()

    return all_species
00054 
if __name__ == "__main__":
    # Single-argument print(...) calls with %-formatting emit the same text
    # under the Python 2 print statement and the Python 3 print function.
    print("Using Bio.Fasta")
    all_species = extract_organisms("ls_orchid.fasta")
    print("number of species: %d" % len(all_species))
    # The one-element tuple keeps the list from being unpacked as format args.
    print("species names: %s" % (all_species,))
00060