import string
DEBUG = True
K = 4 # Number of nucleotides to choose from

# DNA - Array of DNA sequences
# l - length of the l-mer
def BranchAndBoundMedianSearch(DNA, l):
	s = [0] * l # [0, 0, ... , 0]
	t = len(DNA)
	bestDistance = t * l
	i = 0
	while True:
		if i < l:
			prefix = nucleotides(s) # Check the index
			optimisticDistance = totaldistance(prefix, DNA)
			if optimisticDistance > bestDistance:
				s, i = bypass(s,i,l,K)
			else:
				s,i = nextvertex(s,i,l,K)
		else:
			word = nucleotides(s)
			distance = totaldistance(word, DNA)
			if distance < bestDistance:
				if DEBUG: print word, ": ", distance
				bestDistance = distance
				bestWord = word
			s, i = nextvertex(s,i,l,K)
		if i == 0: break
	return bestWord
	
# word - DNA sequence to compare
# DNA - Sets of DNA sequences
def totaldistance(word, DNA):
	total = 0
	l = len(word) # Length of the string v
	for seq in DNA:
		n = len(seq) # Lenght of a DNA sequence
		best = l
		for i in range(0,n-l+1):
			s = seq[i:i+l] # Substring of the ith element in DNA. 
			score = hamdist(word,s)
			if score < best: best = score
			if score == 0: break # We found a perfect match; Stop examining this sequence
		total += best
	return total
	
def nucleotides(s):
	nucleotide = ['a','c','g','t']
	string = ''
	for c in s:
		if (c == 0): break
		string += nucleotide[c-1]
	return string
	
def hamdist(s1,s2):
	diff = 0
	for ch1, ch2 in zip(s1,s2):
		if ch1 != 0 and ch2 != 0 and ch1.lower() != ch2.lower(): 
			diff += 1
	return diff

# i - level in the tree
# L - # of levels in the tree
def nextvertex(a,i,L,k):
	if i < L:
		a[i] = 1
		return [a, i+ 1]
	else:
		for j in reversed(range(0,L)):
			if a[j] < k:
				a[j] += 1
				return [a,j+1]
			a[j] = 0
	return [a,0]
	
def bypass(a,i,L,k):
	for j in reversed(range(0,i)):
		if a[j]< k:
			a[j] += 1
			return [a,j+1]
		a[j] = 0
	return [a,0]
	
DNA = [ "CGGGGCTATGCAACTGGGTCGTCACATTCCCCTTTCGATA",
		"TTTGAGGGTGCCCAATAAATGCAACTCCAAAGCGGACAAA",
		"GGATGCAACTGATGCCGTTTGACGACCTAAATCAACGGCC",
		"AAGGATGCAACTCCAGGAGCGCCTTTGCTGGTTCTACCTG",
		"AATTTTCTAAAAAGATTATAATGTCGGTCCATGCAACTTC",
		"CTGCTGTACAACTGAGATCATGCTGCATGCAACTTTCAAC",
		"TACATGATCTTTTGATGCAACTTGGATGAGGGAATGATGC" ]
print BranchAndBoundMedianSearch(DNA,8)

DNA = ["CGGGGCTATcCAgCTGGGTCGTCACATTCCCCTTTCGATA" 
		"TTTGAGGGTGCCCAATAAggGCAACTCCAAAGCGGACAAA",
		"GGATGgAtCTGATGCCGTTTGACGACCTAAATCAACGGCC",
		"AAGGAaGCAACcCCAGGAGCGCCTTTGCTGGTTCTACCTG",
		"AATTTTCTAAAAAGATTATAATGTCGGTCCtTGgAACTTC",
		"CTGCTGTACAACTGAGATCATGCTGCATGCcAtTTTCAAC",
		"TACATGATCTTTTGATGgcACTTGGATGAGGGAATGATGC"]
print BranchAndBoundMedianSearch(DNA,8)

if DEBUG:
	print "Test Information"
	print totaldistance("ATGCAACT",DNA) == 0
	A = [0,0,0,0]
	print hamdist(A,"asdf") == 0
	print hamdist(DNA[0], DNA[1]) == 31
	I = 0
	i = 0
	while True:
		A,I = nextvertex(A, I, 4, 2)
		i += 1	
		if I == 0: break
	print i == 31