Back to index

python-biopython  1.60
Record.py
Go to the documentation of this file.
00001 """Hold GenBank data in a straightforward format.
00002 
00003 classes:
00004 o Record - All of the information in a GenBank record.
00005 o Reference - hold reference data for a record.
00006 o Feature - Hold the information in a Feature Table.
00007 o Qualifier - Qualifiers on a Feature.
00008 17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg
00009 """
00010 # local stuff
00011 import Bio.GenBank
00012 
def _wrapped_genbank(information, indent, wrap_space = 1, split_char = " "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

    o information - The string holding the information we want
    wrapped in GenBank method.

    o indent - The indentation on the continuation lines we are writing.
    The first line is returned without indent (the caller is expected to
    have written the keyword column already).

    o wrap_space - If true, the information is split on split_char and
    lines are broken only between the resulting pieces. If false, the
    information is chopped into fixed-width chunks regardless of content.

    o split_char - A specific character to split the lines on. By default
    spaces are used.

    Returns the wrapped text, always terminated by a newline. Empty
    information, or information holding nothing but separators, is written
    as "." (the GenBank marker for missing data).
    """
    info_length = Record.GB_LINE_LENGTH - indent

    if not information:
        #GenBank files use "." for missing data
        return ".\n"

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        # hard wrap: fixed-width slices, no attention paid to the content
        info_parts = [information[cur_pos: cur_pos + info_length]
                      for cur_pos in range(0, len(information), info_length)]

    # first get the information string split up by line
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        if len(cur_part) + 1 + len(info_part) > info_length:
            # the piece does not fit on the current line: flush the line
            if cur_part:
                if split_char != " ":
                    # keep a non-space separator (e.g. ",") at the line end
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        else:
            if cur_part == "":
                cur_part = info_part
            else:
                cur_part += split_char + info_part

    # add the last bit of information to the output
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # The information held only separators (for instance a single
        # space), so there is nothing to print. Previously this case raised
        # an IndexError; treat it like missing data instead.
        return ".\n"

    # now format the information string for return: the first line is
    # unindented, every following line starts with the indent
    return ("\n" + " " * indent).join(output_parts) + "\n"
00074         
00075 def _indent_genbank(information, indent):
00076     """Write out information with the specified indent.
00077 
00078     Unlike _wrapped_genbank, this function makes no attempt to wrap
00079     lines -- it assumes that the information already has newlines in the
00080     appropriate places, and will add the specified indent to the start of
00081     each line.
00082     """
00083     # split the info into lines based on line breaks
00084     info_parts = information.split("\n")
00085 
00086     # the first line will have no indent
00087     output_info = info_parts[0] + "\n"
00088     for info_part in info_parts[1:]:
00089         output_info += " " * indent + info_part + "\n"
00090 
00091     return output_info
00092 
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o definition - The text written on the DEFINITION line.
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (ie. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o projects - The genome sequencing project numbers
                (will be replaced by the dblink cross-references in 2009).
    o dblinks - The genome sequencing project number(s) and other links.
                (will replace the project information in 2009).
    o primary - A list, empty by default. It is not written by __str__.
    o wgs - Information written on the WGS line.
    o wgs_scafld - A list of entries written on WGS_SCAFLD lines.
    """
    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-style templates: a left justified keyword padded to the column
    # where the data starts
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    # right justified base number that starts each sequence row
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        """Create an empty record; every field starts out empty."""
        self.locus = ''
        self.size = ''
        self.residue_type = ''
        self.data_file_division = ''
        self.date = ''
        self.definition = ''
        self.accession = []
        self.nid = ''
        self.pid = ''
        self.version = ''
        self.projects = []
        self.dblinks = []
        self.db_source = ''
        self.gi = ''
        self.keywords = []
        self.segment = ''
        self.source = ''
        self.organism = ''
        self.taxonomy = []
        self.references = []
        self.comment = ''
        self.features = []
        self.base_counts = ''
        self.origin = ''
        self.sequence = ''
        self.contig = ''
        self.primary = []
        self.wgs = ''
        self.wgs_scafld = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        output = self._locus_line()
        output += self._definition_line()
        output += self._accession_line()
        output += self._version_line()
        output += self._project_line()
        output += self._dblink_line()
        output += self._nid_line()
        output += self._pid_line()
        output += self._keywords_line()
        output += self._db_source_line()
        output += self._segment_line()
        output += self._source_line()
        output += self._organism_line()
        for reference in self.references:
            output += str(reference)
        output += self._comment_line()
        output += self._features_line()
        for feature in self.features:
            output += str(feature)
        output += self._base_count_line()
        output += self._origin_line()
        output += self._sequence_line()
        output += self._wgs_line()
        output += self._wgs_scafld_line()
        output += self._contig_line()
        # record terminator
        output += "//"
        return output

    def _locus_line(self):
        """Provide the output string for the LOCUS line.
        """
        output = "LOCUS"
        output += " " * 7 # 6-12 spaces
        output += "%-9s" % self.locus
        output += " " # 22 space
        output += "%7s" % self.size
        if "PROTEIN" in self.residue_type:
            output += " aa"
        else:
            output += " bp "

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in self.residue_type:
            output += "%17s" % self.residue_type
        # second case: ss-DNA types of records
        elif "-" in self.residue_type:
            output += "%7s" % self.residue_type
            output += " " * 10 # spaces for circular
        else:
            output += " " * 3 # spaces for stuff like ss-
            output += "%-4s" % self.residue_type
            output += " " * 10 # spaces for circular

        output += " " * 2
        output += "%3s" % self.data_file_division
        output += " " * 7 # spaces for 56-63
        output += "%11s" % self.date
        output += "\n"
        return output

    def _definition_line(self):
        """Provide output for the DEFINITION line.

        Always written; the text is wrapped by _wrapped_genbank.
        """
        output = Record.BASE_FORMAT % "DEFINITION"
        output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT)
        return output

    def _accession_line(self):
        """Output for the ACCESSION line.

        Returns an empty string when there are no accessions.
        """
        if not self.accession:
            return ""

        # accessions are space separated; trailing whitespace is stripped
        acc_info = " ".join(["%s" % accession
                             for accession in self.accession]).rstrip()
        output = Record.BASE_FORMAT % "ACCESSION"
        output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)
        return output

    def _version_line(self):
        """Output for the VERSION line.

        Returns an empty string when no version is set. The gi is always
        appended after the version, even when it is empty.
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        output += "  GI:"
        output += "%s\n" % self.gi
        return output

    def _project_line(self):
        """Output for the PROJECT line, or an empty string if no projects.
        """
        output = ""
        if len(self.projects) > 0:
            output = Record.BASE_FORMAT % "PROJECT"
            output += "%s\n" % "  ".join(self.projects)
        return output

    def _dblink_line(self):
        """Output for the DBLINK line, or an empty string if no dblinks.
        """
        output = ""
        if len(self.dblinks) > 0:
            output = Record.BASE_FORMAT % "DBLINK"
            # NOTE(review): the entries are joined with newlines and then
            # wrapped on spaces, so the embedded newlines do not appear to
            # receive the continuation indent -- confirm this is intended.
            dblink_info = "\n".join(self.dblinks)
            output += _wrapped_genbank(dblink_info, Record.GB_BASE_INDENT)
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.
        """
        if not self.nid:
            return ""
        output = Record.BASE_FORMAT % "NID"
        output += "%s\n" % self.nid
        return output

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete.
        """
        if not self.pid:
            return ""
        output = Record.BASE_FORMAT % "PID"
        output += "%s\n" % self.pid
        return output

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written: keywords are separated by "; " and the
        list is closed with a period, so an empty keyword list gives ".".
        """
        output = Record.BASE_FORMAT % "KEYWORDS"
        keyword_info = "; ".join(["%s" % keyword
                                  for keyword in self.keywords])
        keyword_info += "."
        output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line.
        """
        if not self.db_source:
            return ""
        output = Record.BASE_FORMAT % "DBSOURCE"
        output += "%s\n" % self.db_source
        return output

    def _segment_line(self):
        """Output for the SEGMENT line.
        """
        output = ""
        if self.segment:
            output += Record.BASE_FORMAT % "SEGMENT"
            output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
        return output

    def _source_line(self):
        """Output for SOURCE line on where the sample came from.
        """
        output = Record.BASE_FORMAT % "SOURCE"
        output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
        return output

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info.

        The organism name is followed by the taxonomy, "; " separated and
        closed with a period, starting on its own indented line.
        """
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        # Now that species names can be too long, this line can wrap (Bug 2591)
        output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT)
        output += " " * Record.GB_BASE_INDENT
        taxonomy_info = "; ".join(["%s" % tax for tax in self.taxonomy])
        taxonomy_info += "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)

        return output

    def _comment_line(self):
        """Output for the COMMENT lines.

        The comment is expected to carry its own line breaks; it is only
        indented, not re-wrapped.
        """
        output = ""
        if self.comment:
            output += Record.BASE_FORMAT % "COMMENT"
            output += _indent_genbank(self.comment,
                                      Record.GB_BASE_INDENT)
        return output

    def _features_line(self):
        """Output for the FEATURES line.

        Only the table header is produced here, and only when the record
        has features; the features themselves are written by __str__.
        """
        output = ""
        if len(self.features) > 0:
            output += Record.BASE_FEATURE_FORMAT % "FEATURES"
            output += "Location/Qualifiers\n"
        return output

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.
        """
        output = ""
        if self.base_counts:
            output += Record.BASE_FORMAT % "BASE COUNT  "
            # split up the base counts into their individual parts,
            # ignoring the empty strings produced by runs of spaces
            count_parts = [part for part in self.base_counts.split(" ")
                           if part]
            # deal with the standard case, with a normal origin line
            # like: 474 a    356 c    428 g    364 t
            if len(count_parts) % 2 == 0:
                for pos in range(0, len(count_parts), 2):
                    output += "%7s %s" % (count_parts[pos],
                                          count_parts[pos + 1])
            # deal with ugly ORIGIN lines like:
            # 1311257 a2224835 c2190093 g1309889 t
            # by just outputting the raw information
            else:
                output += self.base_counts
            output += "\n"
        return output

    def _origin_line(self):
        """Output for the ORIGIN line
        """
        output = ""
        # only output the ORIGIN line if we have a sequence
        if self.sequence:
            output += Record.BASE_FORMAT % "ORIGIN"
            if self.origin:
                output += _wrapped_genbank(self.origin,
                                           Record.GB_BASE_INDENT)
            else:
                output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        The sequence is written in lower case, 60 residues per row in six
        blocks of 10, each row starting with the 1-based position of its
        first residue. Returns an empty string when there is no sequence.
        """
        if not self.sequence:
            return ""

        seq_length = len(self.sequence)
        # collect the pieces and join once rather than growing a string
        pieces = []
        for row_start in range(0, seq_length, 60):
            pieces.append(Record.SEQUENCE_FORMAT % str(row_start + 1))
            # Stop at the end of the sequence, so a row that finishes
            # exactly on a 10-base boundary does not get a trailing blank
            # block.
            row_end = min(row_start + 60, seq_length)
            for start_pos in range(row_start, row_end, 10):
                seq_section = self.sequence[start_pos:start_pos + 10]
                pieces.append(" %s" % seq_section.lower())
            pieces.append("\n")
        return "".join(pieces)

    @staticmethod
    def _accession_range(value):
        """Turn a WGS style entry into the text written on its line.

        A list or tuple is joined with "-" (assumes such an entry holds the
        first and last accession of a range -- TODO confirm against the
        parser); anything else is converted with str(). Trailing newlines
        are dropped so callers can add exactly one.
        """
        if isinstance(value, (list, tuple)):
            text = "-".join([str(item) for item in value])
        else:
            text = str(value)
        return text.rstrip("\n")

    def _wgs_line(self):
        """Output for the WGS line, or an empty string if wgs is not set.

        The line is newline terminated so the next keyword starts on its
        own line.
        """
        output = ""
        if self.wgs:
            output += Record.BASE_FORMAT % "WGS"
            output += self._accession_range(self.wgs) + "\n"
        return output

    def _wgs_scafld_line(self):
        """Output for the WGS_SCAFLD line(s), or an empty string if unset.

        wgs_scafld is a list by default; one WGS_SCAFLD line is written per
        entry (assumes each entry corresponds to one line of the original
        record -- TODO confirm against the parser). A plain string is
        written as a single line.
        """
        if not self.wgs_scafld:
            return ""

        if isinstance(self.wgs_scafld, (list, tuple)):
            entries = self.wgs_scafld
        else:
            entries = [self.wgs_scafld]

        output = ""
        for entry in entries:
            output += Record.BASE_FORMAT % "WGS_SCAFLD"
            output += self._accession_range(entry) + "\n"
        return output

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq.

        The location is wrapped on commas rather than spaces.
        """
        output = ""
        if self.contig:
            output += Record.BASE_FORMAT % "CONTIG"
            output += _wrapped_genbank(self.contig,
                                       Record.GB_BASE_INDENT, split_char = ',')
        return output
00498         
00499 
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """
    def __init__(self):
        # every field of a fresh reference is an empty string
        for field_name in ("number", "bases", "authors", "consrtm", "title",
                           "journal", "medline_id", "pubmed_id", "remark"):
            setattr(self, field_name, '')

    def __str__(self):
        """Return the reference block in GenBank flat file layout.

        Sections are written in a fixed order; optional sections that are
        empty contribute nothing.
        """
        sections = [self._reference_line(),
                    self._authors_line(),
                    self._consrtm_line(),
                    self._title_line(),
                    self._journal_line(),
                    self._medline_line(),
                    self._pubmed_line(),
                    self._remark_line()]
        return "".join(sections)

    def _reference_line(self):
        """Output for REFERENCE lines.

        The keyword is always written. The number is added when set,
        padded to three columns and followed by the bases when those are
        set as well.
        """
        line = Record.BASE_FORMAT % "REFERENCE"
        if self.number and self.bases:
            line += "%-3s%s" % (self.number, self.bases)
        elif self.number:
            line += "%s" % self.number
        return line + "\n"

    def _authors_line(self):
        """Output for AUTHORS information.
        """
        if not self.authors:
            return ""
        label = Record.INTERNAL_FORMAT % "AUTHORS"
        return label + _wrapped_genbank(self.authors, Record.GB_BASE_INDENT)

    def _consrtm_line(self):
        """Output for CONSRTM information.
        """
        if not self.consrtm:
            return ""
        label = Record.INTERNAL_FORMAT % "CONSRTM"
        return label + _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT)

    def _title_line(self):
        """Output for TITLE information.
        """
        if not self.title:
            return ""
        label = Record.INTERNAL_FORMAT % "TITLE"
        return label + _wrapped_genbank(self.title, Record.GB_BASE_INDENT)

    def _journal_line(self):
        """Output for JOURNAL information.
        """
        if not self.journal:
            return ""
        label = Record.INTERNAL_FORMAT % "JOURNAL"
        return label + _wrapped_genbank(self.journal, Record.GB_BASE_INDENT)

    def _medline_line(self):
        """Output for MEDLINE information.

        Written on a single line without wrapping.
        """
        if not self.medline_id:
            return ""
        label = Record.INTERNAL_FORMAT % "MEDLINE"
        return label + (self.medline_id + "\n")

    def _pubmed_line(self):
        """Output for PUBMED information.

        Uses the three space internal indent (OTHER_INTERNAL_FORMAT) and is
        written on a single line without wrapping.
        """
        if not self.pubmed_id:
            return ""
        label = Record.OTHER_INTERNAL_FORMAT % "PUBMED"
        return label + (self.pubmed_id + "\n")

    def _remark_line(self):
        """Output for REMARK information.
        """
        if not self.remark:
            return ""
        label = Record.INTERNAL_FORMAT % "REMARK"
        return label + _wrapped_genbank(self.remark, Record.GB_BASE_INDENT)
00613     
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """
    def __init__(self):
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def __str__(self):
        """Return the feature in GenBank feature table layout.

        The key and the location (wrapped on commas) come first, followed
        by one wrapped, indented entry per qualifier.
        """
        pieces = [Record.INTERNAL_FEATURE_FORMAT % self.key,
                  _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                                   split_char = ',')]
        padding = " " * Record.GB_FEATURE_INDENT
        for qualifier in self.qualifiers:
            # qualifiers whose key matches one of the consumer's
            # remove_space_keys must not be wrapped on spaces
            space_wrap = 1
            for no_space_key in \
                Bio.GenBank._BaseGenBankConsumer.remove_space_keys:
                if no_space_key in qualifier.key:
                    space_wrap = 0
                    break

            pieces.append(padding)
            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           Record.GB_FEATURE_INDENT,
                                           space_wrap))
        return "".join(pieces)
00644 
class Qualifier(object):
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """
    def __init__(self):
        # both fields start out as empty strings
        self.key = self.value = ''