Back to index

python-biopython  1.60
FileParser.py
Go to the documentation of this file.
00001 # Copyright 2010 by Tiago Antao.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """
00007 This class provides code to parse BIG GenePop files.
00008 
00009 The difference between this class and the standard Bio.PopGen.GenePop.Record
00010 class is that this one does not read the whole file to memory.
00011 It provides an iterator interface, slower but consuming much less memory.
00012 Should be used with big files (Thousands of markers and individuals).
00013 
00014 See http://wbiomed.curtin.edu.au/genepop/ , the format is documented
00015 here: http://wbiomed.curtin.edu.au/genepop/help_input.html .
00016 
00017 Classes:
00018 FileRecord           Holds GenePop data.
00019 
00020 Functions:
00021 read                 Parses a GenePop file, returning a FileRecord.
00022 
00023 """
00024 from copy import deepcopy
00025 from Bio.PopGen.GenePop import get_indiv
00026 
def read(fname):
    """Builds a FileRecord for the GenePop file stored at fname.

       fname is the name of a file that contains a GenePop record.
       The FileRecord constructed from it is handed back to the caller.
    """
    return FileRecord(fname)
00034 
00035 
class FileRecord(object):
    """Holds information from a GenePop record, read incrementally from disk.

    Members:
    comment_line       Comment line (first line of the file).

    loci_list          List of loci names.

    fname              Name of the file being parsed.

    current_pop        Index of the population being read (0 is first).

    current_ind        Number of individuals already read from the
                       current population.

    Functions:
    get_individual     Returns the next individual of the current population.

    skip_population    Skips the current population.

    close              Closes the underlying file.

    skip_population skips the individuals of the current population, returns
    True if there are more populations and False otherwise.

    get_individual returns an individual of the current population, True
    if the population ended and another one follows, or False at the end
    of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples
            ('Ind1', [(1,2),    (3,3), (200,201)])
            ('Ind2', [(2,None), (3,3), (None,None)])
            ('Other1', [(1,1),  (4,3), (200,200)])

    Every method that writes GenePop text uses a 3 digit code per allele.
    """
    def __init__(self, fname):
        self.comment_line    = ""
        self.loci_list       = []
        self.fname           = fname
        self.start_read()

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

           This might take a lot of memory.
           Marker length will be 3.
           The read position held before the call is restored afterwards.
        """
        marker_len = 3
        rep  = [self.comment_line + '\n']
        rep.append('\n'.join(self.loci_list) + '\n')
        # Remember where the caller was so iteration can resume there.
        current_pop = self.current_pop
        current_ind = self.current_ind
        try:
            self._handle.seek(0)
            self.skip_header()
            rep.append('Pop\n')
            while True:
                res = self.get_individual()
                if res is False:
                    break
                if res is True:
                    rep.append('Pop\n')
                else:
                    name, markers = res
                    rep.append(self._format_individual(name, markers,
                                                       marker_len=marker_len))
        finally:
            self.seek_position(current_pop, current_ind)
        return "".join(rep)

    @staticmethod
    def _format_individual(name, markers, skip=(), marker_len=3):
        """Formats one individual as a GenePop data line (with newline).

           name       - individual name
           markers    - list of allele tuples, one tuple per marker
           skip       - container of marker positions to leave out
           marker_len - digits per allele; shorter codes are zero padded

           A missing allele (None) is written as zeros.
        """
        parts = [name, ","]
        for pos, marker in enumerate(markers):
            if pos in skip:
                continue
            parts.append(' ')
            for al in marker:
                if al is None:
                    al = '0'
                parts.append(str(al).rjust(marker_len, '0'))
        parts.append('\n')
        return "".join(parts)

    def close(self):
        """Closes the underlying file. Safe to call more than once."""
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

           Reads the comment line and the loci names, leaving the file
           positioned just after the first POP line.
           Raises ValueError if no POP line is found.
        """
        self._handle = open(self.fname)
        self.comment_line = self._handle.readline().rstrip()
        #We can now have one loci per line or all loci in a single line
        #separated by either space or comma+space...
        #We will remove all commas on loci... that should not be a problem
        sample_loci_line = self._handle.readline().rstrip().replace(',', '')
        #split() with no argument collapses runs of blanks, so a line like
        #"Loc1,  Loc2" does not produce an empty locus name.
        self.loci_list.extend(sample_loci_line.split())
        for line in self._handle:
            line = line.rstrip()
            if line.upper()=='POP':
                break
            self.loci_list.append(line)
        else:
            #Do not leave the file open behind a failed construction.
            self._handle.close()
            raise ValueError('No population data found, file probably not GenePop related')
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open."""
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper()=="POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

           pop   - pop position (0 is first)
           indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        while pop>0:
            self.skip_population()
            pop -= 1
        while indiv>0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

           Returns True if there is another pop, False at end of file.
        """
        for line in self._handle:
            if line.rstrip().upper()=='POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        return False

    def get_individual(self):
        """Gets the next individual.

           Returns individual information if there are more individuals
           in the current population.
           Returns True if there are no more individuals in the current
           population, but there are more populations. Next read will
           be of the following pop.
           Returns False if at end of file.
        """
        for line in self._handle:
            line = line.rstrip()
            if line.upper()=='POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
            self.current_ind += 1
            indiv_name, allele_list, ignore = get_indiv(line)
            return (indiv_name, allele_list)
        return False

    def remove_population(self, pos, fname):
        """Removes a population (by position).

           pos - position
           fname - file to be created with population removed
        """
        #A second, independent record is used so that the read position
        #of this object is left untouched.
        old_rec = FileRecord(self.fname)
        try:
            f = open(fname, "w")
            try:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if curr_pop == pos:
                        #l_parser is the first individual of the unwanted
                        #population: drop it and the rest. If it is True
                        #instead, that population was empty and the POP
                        #line of the next one has already been consumed.
                        if l_parser is not True:
                            old_rec.skip_population()
                        curr_pop += 1
                    elif l_parser is True:
                        curr_pop += 1
                        start_pop = True
                    else:
                        #POP is written lazily so that the removed
                        #population leaves no empty section behind.
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
            finally:
                f.close()
        finally:
            old_rec.close()

    def _write_without_loci(self, positions, fname):
        """Writes a copy of the file to fname, leaving out some loci.

           positions - iterable of locus positions to drop; negative
                       values count from the end, duplicates are ignored
           fname     - file to be created

           Raises IndexError (before fname is created) if a position is
           out of range.
        """
        old_rec = FileRecord(self.fname)
        try:
            loci_list = old_rec.loci_list
            n_loci = len(loci_list)
            #Normalise to a set of non-negative indices so that the loci
            #names and the marker columns are filtered consistently.
            skip = set()
            for pos in positions:
                index = pos
                if index < 0:
                    index += n_loci
                if index < 0 or index >= n_loci:
                    raise IndexError("locus position out of range: %r"
                                     % (pos,))
                skip.add(index)
            f = open(fname, "w")
            try:
                f.write(self.comment_line + "\n")
                for index, locus in enumerate(loci_list):
                    if index not in skip:
                        f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers, skip))
                    l_parser = old_rec.get_individual()
            finally:
                f.close()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

           pos - position
           fname - file to be created with locus removed

           Raises IndexError if pos is out of range.
        """
        self._write_without_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

           positions - positions (the list is not modified)
           fname - file to be created with locus removed

           Raises IndexError if any position is out of range.
        """
        self._write_without_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

           name - name
           fname - file to be created with locus removed

           If no locus has that name nothing is written.
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fname)
                return
        #If here than locus not existent... Maybe raise exception?
        #   Although it should be Ok... Just a boolean return, maybe?

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

           names - names
           fname - file to be created with loci removed

           Names that match no locus are ignored; the file is written
           even when nothing matches.
        """
        positions = [i for i, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)
00340