DEBUG = False import string from sets import Set K = 4 # Number of nucleotides to choose from # DNA - Array of DNA sequences # l - length of the l-mer def GreedyMotifSearch(DNA, l): t = len(DNA) # Number of DNA sequences n = len(DNA[0]) # Length of a single sequence (all should be the same) s = [0] * t # List of starting positions for the ith DNA sequence BestMotif = [0] * t # [0, 0, ... , 0] for s[0] in range(0, n - l + 1): for s[1] in range(0, n - 1 + 1): if Score(s, 2, DNA, l) > Score(BestMotif, 2, DNA, l): BestMotif[0] = s[0] BestMotif[1] = s[1] s[0] = BestMotif[0] s[1] = BestMotif[1] for i in range(2, t): for s[i] in range(0, n - l + 1): if Score(s, i+1, DNA, l) > Score(BestMotif, i+1, DNA, l): BestMotif[i] = s[i] s[i] = BestMotif[i] return BestMotif def Score(s, i, DNA, l): profile = [] s = s[0:i] for j in range(0,i): profile += [DNA[j][s[j]:s[j]+l]] profile = Transpose(profile) score = 0 for row in profile: score += Count(row) return score def Count(row): if row == []: return 0 uniqueSet = Set() for i in range(len(row)): if isinstance(row[i],str): # Convert all characters to lowercase row[i] = row[i].lower() uniqueSet.add(row[i]) # Get a set of unique elements counts = [row.count(item) for item in uniqueSet] counts.sort(lambda x, y: -1 * cmp(x,y)) # Descending sort return counts[0] # Highest count def Transpose(matrix): if not matrix: return [] return map(lambda *row: list(row), *matrix) def Nucleotides(s): nucleotide = ['a','c','g','t'] string = '' for c in s: if (c == 0): break string += nucleotide[c-1] return string def PrintConsensus(s, DNA, l): for i in range(0,len(s)): print DNA[i][s[i]:s[i]+l] DNA = [ "CGGGGCTATGCAACTGGGTCGTCACATTCCCCTTTCGATA", "TTTGAGGGTGCCCAATAAATGCAACTCCAAAGCGGACAAA", "GGATGCAACTGATGCCGTTTGACGACCTAAATCAACGGCC", "AAGGATGCAACTCCAGGAGCGCCTTTGCTGGTTCTACCTG", "AATTTTCTAAAAAGATTATAATGTCGGTCCATGCAACTTC", "CTGCTGTACAACTGAGATCATGCTGCATGCAACTTTCAAC", "TACATGATCTTTTGATGCAACTTGGATGAGGGAATGATGC" ] s = GreedyMotifSearch(DNA, 8) PrintConsensus(s, DNA, 8) DNA = ["CGGGGCTATcCAgCTGGGTCGTCACATTCCCCTTTCGATA" "TTTGAGGGTGCCCAATAAggGCAACTCCAAAGCGGACAAA", "GGATGgAtCTGATGCCGTTTGACGACCTAAATCAACGGCC", "AAGGAaGCAACcCCAGGAGCGCCTTTGCTGGTTCTACCTG", "AATTTTCTAAAAAGATTATAATGTCGGTCCtTGgAACTTC", "CTGCTGTACAACTGAGATCATGCTGCATGCcAtTTTCAAC", "TACATGATCTTTTGATGgcACTTGGATGAGGGAATGATGC"] s = GreedyMotifSearch(DNA, 8) PrintConsensus(s, DNA, 8) if DEBUG: print "Test Transpose Function" Matrix1 = [[1,0,0],[0,1,0],[0,0,1]] Matrix2 = [[1,2,3],[4,5,6]] Matrix3 = [[1,4],[2,5],[3,6]] print Transpose(Matrix1) == Matrix1 print Transpose(Matrix2) == Matrix3 print Transpose(Matrix3) == Matrix2 print "Testing Nucleotides Function" print Nucleotides([0,0,0,0]) == "" print Nucleotides([1,2,3,4]) == "acgt" print Nucleotides([4,3,2,1]) == "tgca" print "Testing Count Function" List1 = ['a','b','b','c','c','c'] List2 = ['a','b','c','d'] List3 = ['a','A','a','B','b','c',] List4 = [1,2,3,4] List5 = [] print Count(List1) == 3 print Count(List2) == 1 print Count(List3) == 3 print Count(List4) == 1 print Count(List5) == 0 print "Testing Score Function" DNA1 = [[1,2,3,4]] * 2 s = [0,0] l,t = 4, 2 print Score(s,2,DNA1,l) == l * t