Back to index

python-biopython  1.60
fasta_dictionary.py
Go to the documentation of this file.
00001 # In Memory
00002 # =========
00003 # This next bit of code uses Bio.SeqIO.parse() to load a FASTA file,
00004 # and then turns it into an in-memory python dictionary.
00005 # This is *not* suitable for FASTA files with millions of entries.
00006 
00007 from Bio.Alphabet import generic_dna
00008 from Bio import SeqIO
00009 
def get_accession_num(seq_record):
    """Return the accession number of *seq_record* without its version.

    The record's ``id`` is split on ``|`` and the fourth field is taken as
    the versioned accession, e.g. ``gi|2765658|emb|Z78533.1|CIAXRRNA``
    yields ``Z78533``.

    Raises IndexError if the id has fewer than four ``|``-separated fields.
    """
    accession_atoms = seq_record.id.split('|')
    gb_name = accession_atoms[3]
    # Strip the version info before returning.  Cut at the last '.' instead
    # of dropping a fixed two characters, so multi-digit versions
    # ("Z78533.12") and accessions with no version suffix are handled too.
    return gb_name.rsplit('.', 1)[0]
00015 
# Parse every record and build a plain dict keyed by accession number.
# All records are held in memory at once.
rec_iterator = SeqIO.parse("ls_orchid.fasta", "fasta", generic_dna)
orchid_dict = SeqIO.to_dict(rec_iterator, get_accession_num)

# Single-argument print(...) with %-formatting emits the same text under
# both Python 2 and Python 3, unlike the Python-2-only print statement.
for id_num, record in orchid_dict.items():
    print('id number: %s' % id_num)
    print('description: %s' % record.description)
    print('sequence: %s' % record.seq)
00023 
00024 
00025 # Indexed
00026 # =======
00027 # This next version uses the Bio.SeqIO.index() function which will index
00028 # the FASTA file without loading all the records into memory at once.
00029 # This is suitable for FASTA files with millions of entries.
00030 
00031 from Bio.Alphabet import generic_dna
00032 from Bio import SeqIO
00033 
def get_accession_num(record_id):
    """Return the accession number in *record_id* without its version.

    *record_id* is a ``|``-delimited identifier string; the fourth field is
    taken as the versioned accession, e.g.
    ``gi|2765658|emb|Z78533.1|CIAXRRNA`` yields ``Z78533``.

    Raises IndexError if the id has fewer than four ``|``-separated fields.
    """
    accession_atoms = record_id.split('|')
    gb_name = accession_atoms[3]
    # Strip the version info before returning.  Cut at the last '.' instead
    # of dropping a fixed two characters, so multi-digit versions
    # ("Z78533.12") and accessions with no version suffix are handled too.
    return gb_name.rsplit('.', 1)[0]
00039 
# Index the file by accession number.  key_function receives each record's
# id string and returns the key to use; without it the index would be keyed
# by the raw ids and get_accession_num would never be called.
orchid_dict = SeqIO.index("ls_orchid.fasta", "fasta", generic_dna,
                          key_function=get_accession_num)

for id_num in orchid_dict:
    # Look the record up once per key: an indexed lookup goes back to the
    # file to load the record, so repeating it for each field is wasteful.
    record = orchid_dict[id_num]
    # Single-argument print(...) with %-formatting emits the same text under
    # both Python 2 and Python 3, unlike the Python-2-only print statement.
    print('id number: %s' % id_num)
    print('description: %s' % record.description)
    print('sequence: %s' % record.seq)