Back to index

python-biopython  1.60
LargeFileParser.py
Go to the documentation of this file.
00001 # Copyright 2010 by Tiago Antao.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """
00007 Large file parsing of Genepop files
00008 
00009 The standard parser loads the whole file into memory. This parser
00010 provides an iterator over data.
00011 
00012 Classes:
00013 Record                Holds GenePop data.
00014 
00015 Functions:
00016 read             Parses a GenePop record (file) into a Record object.
00017 
00018 """
00019 
00020 from copy import deepcopy
00021 
00022 
def get_indiv(line):
    """Parse one GenePop individual line.

    The line is split on its comma into the individual name and the
    genotype text. Genotypes are separated by spaces and/or tabs.

    Returns a tuple (indiv_name, allele_list, marker_len) where:
    indiv_name    Text before the comma, returned as-is (not stripped).
    allele_list   One tuple of ints per genotype: two alleles per tuple,
                  or a single allele per tuple when the two-allele parse
                  fails (haploid data).
    marker_len    Digits per allele: 2 if the first genotype has 2 or 4
                  characters, otherwise 3.

    Raises ValueError if the line does not contain exactly one comma or
    a genotype is not numeric, and IndexError if there are no genotypes.
    """
    indiv_name, genotype_text = line.split(',')
    # Tabs are treated like spaces; runs of separators leave empty
    # pieces behind, which are discarded.
    pieces = genotype_text.replace('\t', ' ').split(' ')
    genotypes = [piece for piece in pieces if piece != '']
    # The width of the first genotype decides the allele encoding.
    marker_len = 2 if len(genotypes[0]) in (2, 4) else 3
    try:
        allele_list = [(int(geno[:marker_len]), int(geno[marker_len:]))
                       for geno in genotypes]
    except ValueError:
        # Haploid: there is nothing after the first allele to convert.
        allele_list = [(int(geno[:marker_len]),) for geno in genotypes]
    return indiv_name, allele_list, marker_len
00039 
def read(handle):
    """Parses a handle containing a GenePop file.

       handle is a file-like object that contains a GenePop record.

       Only the header (comment line and loci names) and the first
       individual line are consumed here. The "POP" marker and that
       first individual line are saved on record.stack, which
       Record.data_generator iterates before the handle itself.
       Returns the Record.
    """
    record = Record(handle)
    record.comment_line = str(handle.readline()).rstrip()
    # Loci may come one per line, or all on a single line separated by
    # either space or comma+space. Commas are simply dropped from the
    # first loci line before it is split on spaces.
    first_loci_line = str(handle.readline()).rstrip()
    record.loci_list.extend(first_loci_line.replace(',', '').split(' '))
    # Every following line is one more locus name until the first POP
    # marker; readline() returning "" means end of file.
    for raw_line in iter(handle.readline, ""):
        entry = raw_line.rstrip()
        if entry.upper() == "POP":
            record.stack.append("POP")
            break
        record.loci_list.append(entry)
    # Look at the first individual to learn the marker length, then
    # keep the line on the stack since it has already been consumed.
    first_indiv_line = handle.readline().rstrip()
    _name, _alleles, record.marker_len = get_indiv(first_indiv_line)
    record.stack.append(first_indiv_line)
    return record
00065 
00066 
class Record(object):
    """Holds information from a GenePop record.

    Members:
    marker_len         The marker length (2 or 3 digit code per allele).

    comment_line       Comment line.

    loci_list          List of loci names.

    data_generator     Method returning an iterator over population data.

    The generator will only work once. If you want to read a handle
    twice you have to re-open it!

    Each item produced by data_generator is either () - an empty tuple -
    marking a new population, or an individual. An individual is
    something like
    ('Ind1', [(1,1), (3,None), (200,201)])
    In the case above the individual is called Ind1 and
    has three diploid loci. For the second locus, one of the alleles
    is unknown.
    """
    def __init__(self, handle):
        self.handle = handle
        self.marker_len = 0
        self.comment_line = ""
        self.loci_list = []
        self.populations = []
        # Lines that data_generator must yield from before it continues
        # with the handle itself.
        self.stack = []
        # Deliberately no "self.data_generator = None" here: an instance
        # attribute of that name would shadow the data_generator method
        # and make record.data_generator() fail with a TypeError.

    def data_generator(self):
        """Yield population markers and individuals, in order.

        Lines are taken first from self.stack and then from self.handle.
        A line equal to "POP" (case-insensitive, trailing whitespace
        ignored) yields (). Any other line is parsed with get_indiv and
        yields (indiv_name, allele_list), where every allele coded as 0
        is replaced by None.

        Any exception raised by get_indiv for a malformed line
        propagates to the caller.
        """
        for source in (self.stack, self.handle):
            for line in source:
                line = line.rstrip()
                if line.upper() == 'POP':
                    yield ()
                    continue
                indiv_name, allele_list, _marker_len = get_indiv(line)
                # Allele code 0 is reported as None.
                clean_list = [tuple(None if al == 0 else al for al in locus)
                              for locus in allele_list]
                yield indiv_name, clean_list
        # The generator ends by falling off the end. An explicit
        # "raise StopIteration()" here would be converted into a
        # RuntimeError on Python 3.7+ (PEP 479).
00119