Back to index

python-biopython  1.60
Schema.py
Go to the documentation of this file.
00001 """Deal with Motifs or Signatures allowing ambiguity in the sequences.
00002 
00003 This module contains Schema classes which deal with Motifs and Signatures at
00004 a higher level, by introducing `don't care` (ambiguity) symbols into
00005 the sequences. For instance, you could combine the following Motifs:
00006 
00007 'GATC', 'GATA', 'GATG', 'GATT'
00008 
00009 as all falling under a schema like 'GAT*', where the star indicates a
00010 character can be anything. This helps us condense a whole ton of
00011 motifs or signatures.
00012 """
00013 # standard modules
00014 import random
00015 import re
00016 
00017 # biopython
00018 from Bio import Alphabet
00019 from Bio.Seq import MutableSeq
00020 
00021 # neural network libraries
00022 from Pattern import PatternRepository
00023 
00024 # genetic algorithm libraries
00025 from Bio.GA import Organism
00026 from Bio.GA.Evolver import GenerationEvolver
00027 from Bio.GA.Mutation.Simple import SinglePositionMutation
00028 from Bio.GA.Crossover.Point import SinglePointCrossover
00029 from Bio.GA.Repair.Stabilizing import AmbiguousRepair
00030 from Bio.GA.Selection.Tournament import TournamentSelection
00031 from Bio.GA.Selection.Diversity import DiversitySelection
00032 
class Schema(object):
    """Deal with motifs that have ambiguity characters in them.

    Every letter of a motif is looked up in an ambiguity mapping to find
    the sequence characters it may stand for. Motifs are converted into
    compiled regular expressions to speed up searching, and the compiled
    patterns are cached per motif so each motif is only encoded once.
    """
    def __init__(self, ambiguity_info):
        """Initialize with ambiguity information.

        Arguments:

        o ambiguity_info - A dictionary which maps letters in the motifs to
        the ambiguous characters which they might represent. For example,
        {'R' : 'AG'} specifies that Rs in the motif can match a A or a G.
        All letters in the motif must be represented in the ambiguity_info
        dictionary.
        """
        self._ambiguity_info = ambiguity_info

        # a cache of all encoded motifs (motif string -> compiled pattern)
        self._motif_cache = {}

    def _letter_matches(self, motif_letter):
        """Return the match information for a single motif letter.

        Raises a KeyError with a descriptive message if the letter is not
        present in the ambiguity information.
        """
        try:
            return self._ambiguity_info[motif_letter]
        except KeyError:
            raise KeyError("No match information for letter %s"
                           % motif_letter)

    def encode_motif(self, motif):
        """Encode the passed motif as a regular expression pattern object.

        Arguments:

        o motif - The motif we want to encode. This should be a string.

        Returns:
        A compiled regular expression pattern object that can be used
        for searching strings.

        Raises:
        KeyError if a motif letter has no ambiguity information, and
        ValueError if a letter maps to an empty set of matches.
        """
        regexp_parts = []

        for motif_letter in motif:
            letter_matches = self._letter_matches(motif_letter)

            # every matched character is escaped so that characters which
            # are special in regular expressions (".", "-", "^", "]" ...)
            # are always treated as literal sequence characters
            if len(letter_matches) > 1:
                escaped = "".join([re.escape(match_char)
                                   for match_char in letter_matches])
                regexp_parts.append("[" + escaped + "]")
            elif len(letter_matches) == 1:
                regexp_parts.append(re.escape(letter_matches))
            else:
                raise ValueError("Unexpected match information %s"
                                 % (letter_matches,))

        return re.compile("".join(regexp_parts))

    def find_ambiguous(self, motif):
        """Return the location of ambiguous items in the motif.

        This just checks through the motif and compares each letter
        against the ambiguity information. If a letter stands for multiple
        items, it is ambiguous.

        Returns a list of the (zero based) positions of ambiguous letters.
        Raises a KeyError if a letter has no ambiguity information.
        """
        ambig_positions = []
        for motif_letter_pos, motif_letter in enumerate(motif):
            if len(self._letter_matches(motif_letter)) > 1:
                ambig_positions.append(motif_letter_pos)

        return ambig_positions

    def num_ambiguous(self, motif):
        """Return the number of ambiguous letters in a given motif.
        """
        return len(self.find_ambiguous(motif))

    def find_matches(self, motif, query):
        """Return all non-overlapping motif matches in the query string.

        This utilizes the regular expression findall function, and will
        return a list of all non-overlapping occurrences in query that
        match the ambiguous motif.
        """
        # encode each motif only once; later searches reuse the pattern
        try:
            motif_pattern = self._motif_cache[motif]
        except KeyError:
            motif_pattern = self.encode_motif(motif)
            self._motif_cache[motif] = motif_pattern

        return motif_pattern.findall(query)

    def num_matches(self, motif, query):
        """Find the number of non-overlapping times motif occurs in query.
        """
        return len(self.find_matches(motif, query))

    def all_unambiguous(self):
        """Return a listing of all unambiguous letters allowed in motifs.

        The letters are returned in sorted order; a letter is unambiguous
        when it maps to exactly one match character.
        """
        return [letter for letter in sorted(self._ambiguity_info)
                if len(self._ambiguity_info[letter]) == 1]
00150 
00151 # --- helper classes and functions for the default SchemaFinder
00152 
00153 # -- Alphabets
00154 
class SchemaDNAAlphabet(Alphabet.Alphabet):
    """Alphabet of a simple Schema for DNA sequences.

    The alphabet holds the four unambiguous DNA letters plus a single
    wildcard character which can match any of them.

    o G,A,T,C - The standard unambiguous DNA alphabet.

    o * - Any letter
    """
    # every symbol which may show up in a schema
    letters = ["G", "A", "T", "C", "*"]

    # symbol -> the sequence characters that symbol can stand for;
    # the unambiguous letters only match themselves
    alphabet_matches = dict(G = "G", A = "A", T = "T", C = "C")
    # the wildcard matches any of the four DNA letters
    alphabet_matches["*"] = "GATC"
00172 
00173 # -- GA schema finder
00174 
class GeneticAlgorithmFinder(object):
    """Find schemas using a genetic algorithm approach.

    A population of random motifs is evolved with a genetic algorithm,
    and the resulting population is reported as the schemas found for a
    specific fitness function.

    The 'default' finder searches for ambiguous DNA elements. This
    can be overridden easily by creating a GeneticAlgorithmFinder
    with a different alphabet.
    """
    def __init__(self, alphabet = SchemaDNAAlphabet()):
        """Initialize a finder to get schemas using Genetic Algorithms.

        Arguments:

        o alphabet -- The alphabet which specifies the contents of the
        schemas we'll be generating. This alphabet must contain the
        attribute 'alphabet_matches', which is a dictionary specifying
        the potential ambiguities of each letter in the alphabet. These
        ambiguities will be used in building up the schema.
        """
        self.alphabet = alphabet

        # size of the starting population and the minimum number of
        # generations handed to the finisher in find_schemas
        self.initial_population = 500
        self.min_generations = 10

        self._set_up_genetic_algorithm()

    def _set_up_genetic_algorithm(self):
        """Overrideable function to set up the genetic algorithm parameters.

        The only job of this function is to build the pieces of the genetic
        algorithm (motif generator, mutation, crossover, repair and
        selection). To customize any of them, inherit from this class and
        override this function.
        """
        self.motif_generator = RandomMotifGenerator(self.alphabet)

        self.mutator = SinglePositionMutation(mutation_rate = 0.1)
        self.crossover = SinglePointCrossover(crossover_prob = 0.25)

        # NOTE(review): the meaning of the trailing 4 (repair) and
        # 2 (tournament) arguments is defined by the Bio.GA classes --
        # confirm against their documentation before changing them.
        repair_schema = Schema(self.alphabet.alphabet_matches)
        self.repair = AmbiguousRepair(repair_schema, 4)
        self.base_selector = TournamentSelection(self.mutator,
                                                 self.crossover,
                                                 self.repair,
                                                 2)

        # the diversity selector wraps the tournament selector and is
        # given the random motif function as well
        self.selector = DiversitySelection(self.base_selector,
                                           self.motif_generator.random_motif)

    def find_schemas(self, fitness, num_schemas):
        """Find the given number of unique schemas using a genetic algorithm

        Arguments:

        o fitness - A callable object (ie. function) which will evaluate
        the fitness of a motif.

        o num_schemas - The number of unique schemas with good fitness
        that we want to generate.

        Returns:
        A PatternRepository built from a dictionary mapping each evolved
        schema string to the fitness of its organism.
        """
        make_motif = self.motif_generator.random_motif
        population = Organism.function_population(make_motif,
                                                  self.initial_population,
                                                  fitness)
        finisher = SimpleFinisher(num_schemas, self.min_generations)

        # evolve until the finisher reports that we are done
        evolver = GenerationEvolver(population, self.selector)
        final_population = evolver.evolve(finisher.is_finished)

        # Genomes are MutableSeqs; go through a Seq so the repository keys
        # are just strings (and not array("c")s)
        fitness_by_schema = {}
        for organism in final_population:
            schema_string = organism.genome.toseq().tostring()
            fitness_by_schema[schema_string] = organism.fitness

        return PatternRepository(fitness_by_schema)
00254 
00255 # -- fitness classes
00256 
class DifferentialSchemaFitness(object):
    """Calculate fitness for schemas that differentiate between sequences.
    """
    def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
        """Initialize with different sequences to evaluate

        Arguments:

        o positive_seq - A list of SeqRecord objects which are the 'positive'
        sequences -- the ones we want to select for.

        o negative_seq - A list of SeqRecord objects which are the 'negative'
        sequences that we want to avoid selecting.

        o schema_evaluator - A Schema class which can be used to
        find motif matches in sequences.
        """
        self._pos_seqs = positive_seqs
        self._neg_seqs = negative_seqs
        self._schema_eval = schema_evaluator

    def _total_matches(self, motif, seq_records):
        """Add up the matches of motif over all of the given records.
        """
        total = 0
        for record in seq_records:
            total += self._schema_eval.num_matches(motif,
                                                   record.seq.tostring())
        return total

    def calculate_fitness(self, genome):
        """Calculate the fitness for a given schema.

        The base score is the number of occurrences of the schema in the
        positive sequences minus the number of occurrences in the
        negative sequences.

        That score is multiplied by four times the length of the schema
        and divided by (2 ** number of ambiguous characters) + 1. This
        helps select for schema which are longer and have less redundancy.
        """
        # the evaluator works on plain strings
        motif = genome.toseq().tostring()

        positive_hits = self._total_matches(motif, self._pos_seqs)
        negative_hits = self._total_matches(motif, self._neg_seqs)

        # ambiguity is penalized exponentially, plus one
        ambiguity_penalty = pow(2.0, self._schema_eval.num_ambiguous(motif))
        ambiguity_penalty += 1

        size_weight = len(motif) * 4.0

        return ((positive_hits - negative_hits) * size_weight) \
               / float(ambiguity_penalty)
00322 
class MostCountSchemaFitness(object):
    """Calculate a fitness giving weight to schemas that match many times.

    The more often a schema is found in a group of sequences, the
    better its fitness.
    """
    def __init__(self, seq_records, schema_evaluator):
        """Initialize with sequences to evaluate.

        Arguments:

        o seq_records -- A set of SeqRecord objects which we use to
        calculate the fitness.

        o schema_evaluator - A Schema class which can be used to
        find motif matches in sequences.
        """
        self._records = seq_records
        self._evaluator = schema_evaluator

    def calculate_fitness(self, genome):
        """Calculate the fitness of a genome based on schema matches.

        The fitness is simply the total number of times the genome matches
        in the set of seq_records. Matching more times gives a better
        fitness.
        """
        # the evaluator works on plain strings
        motif = genome.toseq().tostring()

        count_in = self._evaluator.num_matches
        return sum(count_in(motif, record.seq.tostring())
                   for record in self._records)
00362 
00363 # -- Helper classes
class RandomMotifGenerator(object):
    """Generate a random motif within given parameters.
    """
    def __init__(self, alphabet, min_size = 12, max_size = 17):
        """Initialize with the motif parameters.

        Arguments:

        o alphabet - An alphabet specifying what letters can be inserted in
        a motif. Its 'letters' attribute is used to pick the letters.

        o min_size, max_size - Specify the range of sizes for motifs. Both
        ends of the range are included.
        """
        self._alphabet = alphabet
        self._min_size = min_size
        self._max_size = max_size

    def random_motif(self):
        """Create a random motif within the given parameters.

        This returns a single motif as a MutableSeq with letters from the
        given alphabet. The size of the motif will be randomly chosen
        between min_size and max_size (both included).
        """
        # randint includes both end points, so max_size can really be
        # reached and min_size == max_size gives fixed size motifs
        # instead of raising an error on an empty range
        motif_size = random.randint(self._min_size, self._max_size)

        letters = self._alphabet.letters
        motif = "".join([random.choice(letters)
                         for letter_num in range(motif_size)])

        return MutableSeq(motif, self._alphabet)
00396 
class SimpleFinisher(object):
    """Determine when we are done evolving motifs.

    Evolution is halted once the GA has run for at least a specified
    number of generations and the population holds a given number of
    unique schema with positive fitness.
    """
    def __init__(self, num_schemas, min_generations = 100):
        """Initialize the finisher with its parameters.

        Arguments:

        o num_schemas -- the number of useful (positive fitness) schemas
        we want to generate

        o min_generations -- The minimum number of generations to allow
        the GA to proceed.
        """
        self.num_generations = 0

        self.num_schemas = num_schemas
        self.min_generations = min_generations

    def is_finished(self, organisms):
        """Determine when we can stop evolving the population.

        Every call counts as one generation. Returns 1 when evolution can
        stop and 0 when it has to continue.
        """
        self.num_generations += 1

        # too early to stop, no need to look at the population at all
        if self.num_generations < self.min_generations:
            return 0

        # collect the distinct genomes with positive fitness; a list is
        # used so genomes are compared by equality
        useful_genomes = []
        for organism in organisms:
            if organism.fitness > 0 and \
               organism.genome not in useful_genomes:
                useful_genomes.append(organism.genome)

        if len(useful_genomes) >= self.num_schemas:
            return 1

        return 0
00437 # ---
00438 
class SchemaFinder(object):
    """Find schema in a set of sequences using a genetic algorithm approach.

    Finding good schemas is very difficult because it takes forever to
    enumerate all of the potential schemas. This finder uses a genetic
    algorithm approach to evolve good schema which match many times in
    a set of sequences.

    The default implementation of the finder is ready to find schemas
    in a set of DNA sequences, but the finder can be customized to deal
    with any type of data.
    """
    def __init__(self, num_schemas = 100,
                 schema_finder = None):
        """Initialize the finder.

        Arguments:

        o num_schemas - The number of schemas to ask the underlying
        finder for.

        o schema_finder - The object doing the actual search. It must
        provide an 'alphabet' attribute (with 'alphabet_matches') and a
        'find_schemas(fitness, num_schemas)' function. If not given, a
        new GeneticAlgorithmFinder is created.
        """
        # create the default finder per instance; a default created in
        # the signature would be built once when the class is defined and
        # then shared (with all of its settings) by every SchemaFinder
        if schema_finder is None:
            schema_finder = GeneticAlgorithmFinder()

        self.num_schemas = num_schemas
        self._finder = schema_finder

        self.evaluator = Schema(self._finder.alphabet.alphabet_matches)

    def find(self, seq_records):
        """Find well-represented schemas in the given set of SeqRecords.

        Schemas are scored with MostCountSchemaFitness; the result of the
        underlying finder's find_schemas is returned.
        """
        fitness_evaluator = MostCountSchemaFitness(seq_records,
                                                   self.evaluator)

        return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
                                         self.num_schemas)

    def find_differences(self, first_records, second_records):
        """Find schemas which differentiate between the two sets of SeqRecords.

        Schemas are scored with DifferentialSchemaFitness, using
        first_records as the positive and second_records as the negative
        sequences.
        """
        fitness_evaluator = DifferentialSchemaFitness(first_records,
                                                      second_records,
                                                      self.evaluator)

        return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
                                         self.num_schemas)
00476 
class SchemaCoder(object):
    """Convert a sequence into a representation of ambiguous motifs (schemas).

    This takes a sequence, and returns the number of times specified
    motifs are found in the sequence. This lets you represent a sequence
    as just a count of (possibly ambiguous) motifs.
    """
    def __init__(self, schemas, ambiguous_converter):
        """Initialize the coder to convert sequences

        Arguments:

        o schema - A list of all of the schemas we want to search for
        in input sequences.

        o ambiguous_converter - A Schema class which can be
        used to convert motifs into regular expressions for searching.
        """
        self._schemas = schemas
        self._converter = ambiguous_converter

    def representation(self, sequence):
        """Represent the given input sequence as a bunch of motif counts.

        Arguments:

        o sequence - A Bio.Seq object we are going to represent as schemas.

        This takes the sequence, searches for the motifs within it, and then
        returns counts specifying the relative number of times each motif
        was found. The counts are divided by the largest count, so they
        range between zero and one. They are in the order the original
        motifs were passed into the initializer.

        If nothing was found the raw (all zero) counts are returned, and
        if the coder has no schemas an empty list is returned.
        """
        sequence_string = sequence.tostring()
        schema_counts = [self._converter.num_matches(schema, sequence_string)
                         for schema in self._schemas]

        # without schemas there is nothing to normalize (and max() would
        # fail on the empty list)
        if not schema_counts:
            return schema_counts

        # only normalize if we've actually found something, otherwise
        # we'll just return 0 for everything
        max_count = max(schema_counts)
        if max_count > 0:
            schema_counts = [float(count) / float(max_count)
                             for count in schema_counts]

        return schema_counts
00528 
def matches_schema(pattern, schema, ambiguity_character = '*'):
    """Determine whether or not the given pattern matches the schema.

    Arguments:

    o pattern - A string representing the pattern we want to check for
    matching. This pattern can contain ambiguity characters (which are
    assumed to be the same as those in the schema).

    o schema - A string schema with ambiguity characters.

    o ambiguity_character - The character used for ambiguity in the schema.

    Returns 1 if the pattern matches the schema and 0 otherwise. Patterns
    and schemas of different lengths never match.
    """
    if len(pattern) != len(schema):
        return 0

    # walk both strings together; a position only rules out a match when
    # neither side is ambiguous there and the letters differ
    for pattern_letter, schema_letter in zip(pattern, schema):
        if pattern_letter == schema_letter:
            continue
        if ambiguity_character in (pattern_letter, schema_letter):
            continue
        return 0

    return 1
00555 
class SchemaFactory(object):
    """Generate Schema from inputs of Motifs or Signatures.
    """
    def __init__(self, ambiguity_symbol = '*'):
        """Initialize the SchemaFactory

        Arguments:

        o ambiguity_symbol -- The symbol to use when specifying that
        a position is arbitrary.
        """
        self._ambiguity_symbol = ambiguity_symbol

    def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
        """Generate schema from a list of motifs.

        Arguments:

        o motif_repository - A MotifRepository class that has all of the
        motifs we want to convert to Schema. It must provide the functions
        get_top_percentage and count.

        o motif_percent - The percentage of motifs in the motif bank which
        should be matches. We'll try to create schema that match this
        percentage of motifs.

        o num_ambiguous - The number of ambiguous characters to include
        in each schema. The positions of these ambiguous characters will
        be randomly selected.

        Returns:
        A PatternRepository built from a dictionary mapping each schema to
        the summed counts of the motifs it represents.
        """
        # get all of the motifs we can deal with; work on a copy so that
        # removing matched motifs below never alters a list owned by the
        # repository
        all_motifs = list(motif_repository.get_top_percentage(motif_percent))

        # start building up schemas
        schema_info = {}
        # continue until we've built schema matching the desired percentage
        # of motifs
        total_count = self._get_num_motifs(motif_repository, all_motifs)
        matched_count = 0
        assert total_count > 0, "Expected to have motifs to match"
        while (float(matched_count) / float(total_count)) < motif_percent:

            new_schema, matching_motifs = \
                        self._get_unique_schema(list(schema_info),
                                                all_motifs, num_ambiguous)

            # get the number of counts for the new schema and clean up
            # the motif list
            schema_counts = 0
            for motif in matching_motifs:
                # get the counts for the motif
                schema_counts += motif_repository.count(motif)

                # remove the motif from the motif list since it is already
                # represented by this schema
                all_motifs.remove(motif)

            # add the schema info
            schema_info[new_schema] = schema_counts

            matched_count += schema_counts

        return PatternRepository(schema_info)

    def _get_num_motifs(self, repository, motif_list):
        """Return the number of motif counts for the list of motifs.
        """
        motif_count = 0
        for motif in motif_list:
            motif_count += repository.count(motif)

        return motif_count

    def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
        """Retrieve a unique schema from a motif.

        We don't want to end up with schema that match the same thing,
        since this could lead to ambiguous results, and be messy. This
        tries to create schema, and checks that they do not match any
        currently existing schema.

        Returns the new schema and the list of motifs matching it. An
        AssertionError is raised if no new schema could be found after
        150 tries.
        """
        # create a schema starting with a random motif
        # we'll keep doing this until we get a completely new schema that
        # doesn't match any old schema
        num_tries = 0

        while 1:
            # pick a motif to work from and make a schema from it
            cur_motif = random.choice(motif_list)

            num_tries += 1

            new_schema, matching_motifs = \
                        self._schema_from_motif(cur_motif, motif_list,
                                                num_ambiguous)

            has_match = 0
            for old_schema in cur_schemas:
                if matches_schema(new_schema, old_schema,
                                  self._ambiguity_symbol):
                    has_match = 1
                    break

            # if the schema doesn't match any other schema we've got
            # a good one
            if not(has_match):
                break

            # check for big loops in which we can't find a new schema
            assert num_tries < 150, \
                   "Could not generate schema in %s tries from %s with %s" \
                   % (num_tries, motif_list, cur_schemas)

        return new_schema, matching_motifs

    def _schema_from_motif(self, motif, motif_list, num_ambiguous):
        """Create a schema from a given starting motif.

        Arguments:

        o motif - A motif with the pattern we will start from.

        o motif_list - The total motifs we have to match to.

        o num_ambiguous - The number of ambiguous characters that should
        be added to the schema.

        Returns:

        o A string representing the newly generated schema.

        o A list of all of the motifs in motif_list that match the schema.

        Raises:
        ValueError if the motif does not have num_ambiguous positions
        which are not yet ambiguous.
        """
        assert motif in motif_list, \
               "Expected starting motif present in remaining motifs."

        # convert the motif into a list of characters so we can manipulate it
        new_schema_list = list(motif)

        # only positions which aren't already ambiguous can be converted
        open_positions = [pos for pos, letter in enumerate(new_schema_list)
                          if letter != self._ambiguity_symbol]
        num_to_add = max(0, num_ambiguous)
        if num_to_add > len(open_positions):
            # retrying random positions could never succeed here, so fail
            # loudly instead of looping forever
            raise ValueError("Cannot add %s ambiguous characters to motif %s"
                             % (num_ambiguous, motif))

        # convert random, distinct positions to ambiguous characters
        for ambig_pos in random.sample(open_positions, num_to_add):
            new_schema_list[ambig_pos] = self._ambiguity_symbol

        # convert the schema back to a string
        new_schema = ''.join(new_schema_list)

        # get the motifs that the schema matches
        matched_motifs = [candidate for candidate in motif_list
                          if matches_schema(candidate, new_schema,
                                            self._ambiguity_symbol)]

        return new_schema, matched_motifs

    def from_signatures(self, signature_repository, num_ambiguous):
        """Generate schema from signatures (not implemented yet).
        """
        raise NotImplementedError("Still need to code this.")