import string DEBUG = True K = 4 # Number of nucleotides to choose from # DNA - Array of DNA sequences # l - length of the l-mer def BranchAndBoundMedianSearch(DNA, l): s = [0] * l # [0, 0, ... , 0] t = len(DNA) bestDistance = t * l i = 0 while True: if i < l: prefix = nucleotides(s) # Check the index optimisticDistance = totaldistance(prefix, DNA) if optimisticDistance > bestDistance: s, i = bypass(s,i,l,K) else: s,i = nextvertex(s,i,l,K) else: word = nucleotides(s) distance = totaldistance(word, DNA) if distance < bestDistance: if DEBUG: print word, ": ", distance bestDistance = distance bestWord = word s, i = nextvertex(s,i,l,K) if i == 0: break return bestWord # word - DNA sequence to compare # DNA - Sets of DNA sequences def totaldistance(word, DNA): total = 0 l = len(word) # Length of the string v for seq in DNA: n = len(seq) # Lenght of a DNA sequence best = l for i in range(0,n-l+1): s = seq[i:i+l] # Substring of the ith element in DNA. score = hamdist(word,s) if score < best: best = score if score == 0: break # We found a perfect match; Stop examining this sequence total += best return total def nucleotides(s): nucleotide = ['a','c','g','t'] string = '' for c in s: if (c == 0): break string += nucleotide[c-1] return string def hamdist(s1,s2): diff = 0 for ch1, ch2 in zip(s1,s2): if ch1 != 0 and ch2 != 0 and ch1.lower() != ch2.lower(): diff += 1 return diff # i - level in the tree # L - # of levels in the tree def nextvertex(a,i,L,k): if i < L: a[i] = 1 return [a, i+ 1] else: for j in reversed(range(0,L)): if a[j] < k: a[j] += 1 return [a,j+1] a[j] = 0 return [a,0] def bypass(a,i,L,k): for j in reversed(range(0,i)): if a[j]< k: a[j] += 1 return [a,j+1] a[j] = 0 return [a,0] DNA = [ "CGGGGCTATGCAACTGGGTCGTCACATTCCCCTTTCGATA", "TTTGAGGGTGCCCAATAAATGCAACTCCAAAGCGGACAAA", "GGATGCAACTGATGCCGTTTGACGACCTAAATCAACGGCC", "AAGGATGCAACTCCAGGAGCGCCTTTGCTGGTTCTACCTG", "AATTTTCTAAAAAGATTATAATGTCGGTCCATGCAACTTC", "CTGCTGTACAACTGAGATCATGCTGCATGCAACTTTCAAC", "TACATGATCTTTTGATGCAACTTGGATGAGGGAATGATGC" ] print BranchAndBoundMedianSearch(DNA,8) DNA = ["CGGGGCTATcCAgCTGGGTCGTCACATTCCCCTTTCGATA" "TTTGAGGGTGCCCAATAAggGCAACTCCAAAGCGGACAAA", "GGATGgAtCTGATGCCGTTTGACGACCTAAATCAACGGCC", "AAGGAaGCAACcCCAGGAGCGCCTTTGCTGGTTCTACCTG", "AATTTTCTAAAAAGATTATAATGTCGGTCCtTGgAACTTC", "CTGCTGTACAACTGAGATCATGCTGCATGCcAtTTTCAAC", "TACATGATCTTTTGATGgcACTTGGATGAGGGAATGATGC"] print BranchAndBoundMedianSearch(DNA,8) if DEBUG: print "Test Information" print totaldistance("ATGCAACT",DNA) == 0 A = [0,0,0,0] print hamdist(A,"asdf") == 0 print hamdist(DNA[0], DNA[1]) == 31 I = 0 i = 0 while True: A,I = nextvertex(A, I, 4, 2) i += 1 if I == 0: break print i == 31