I'm having serious issues with a CS assignment I was given. It's supposed to take a set of "DNA strands" and align them as best as possible given a set of rules. I used the algorithm my prof gave me but my memoization algorithm is apparently too slow. Strands of length 60 shouldn't take more than ten seconds, but even strands of 15 take minutes before I get frustrated and kill the process. Can anyone point out where I can optimize stuff?
Code:#!/usr/bin/env python #encoding: utf-8 import random # for seed, random import sys # for stdout import time ############################################################################ #Declarations #The following dictionary structure will be referred to as Alignment: # #typedef dictionary { # STRING strand1; # STRING strand2; # STRING plusstr; # STRING minusstr; # INT score; #}Alignment; # #typedef dictionary { # Alignment alignment; # STRING strand1; # STRING strand2; # INT length; #}CachedAlignment; # #declare List<Alignment> cache; ############################################################################# #initialize cache and default null descriptor __NULLALIGN__ = { \ "alignment" : \ { "strand1" : "", \ "strand2" : "", \ "plusstr" : "", \ "minusstr" : "", \ "score" : 0 }, \ "strand1" : "", \ "strand2" : "", \ "length" : 0 \ } cache = list() #constants __MAXLEN__ = 15 __MINLEN__ = 2 __NOT_FOUND__ = -1 #Compares two Alignment records and returns a signed integer indicating #the results; -1 if A<B, 0 if A==B, and 1 if A>B. def compareAlignments(a,b) : #check first string result = cmp(a["strand1"],b["strand1"]) if(result == 0) : #first strands are equivalent, check second return cmp(a["strand2"],b["strand2"]) #first two aren't equivalent, return appropriate value else : return result #Performs an insertion sort on a list of Alignment records. def __insertion_sort(item,start,end) : global cache global __NOT_FOUND__ middle = int((start + end) / 2) compvalue = compareAlignments(cache[middle],item) if(end <= start): if(compvalue < 0) : cache.insert(middle + 1,item) return True elif(compvalue > 0) : cache.insert(middle,item) return True else : return False if(compvalue > 0) : return __insertion_sort(item,start,middle - 1) elif(compvalue < 0) : return __insertion_sort(item,middle + 1,end) else : return False def insertion_sort(item) : global cache if(len(cache) == 0) : cache += [item] return True else : return __insertion_sort(item,0,len(cache) - 1) #Performs a binary search on a list of Alignment records and returns the index #of the one that matches best. if the record is not found, -1 is returned. def __bsearch(item,start,end) : global cache global __NOT_FOUND__ if(end<=start): if(compareAlignments(cache[start],item) == 0) : return start else: return __NOT_FOUND__ middle = (start + end) / 2 compvalue = compareAlignments(cache[middle],item) #sys.stdout.write("start: " + str(start) + " end: " + str(end) + " middle: " + str(middle) + "\n") if(compvalue > 0) : return __bsearch(item,start,middle - 1) elif(compvalue < 0) : return __bsearch(item,middle + 1,end) else : return middle def bsearch(item) : global cache global __NOT_FOUND__ if(len(cache) == 0) : return __NOT_FOUND__ return __bsearch(item,0,len(cache) - 1) #Memoizes Alignment records. Returns boolean indicating if the item was memoized or not. def memoize(strand1,strand2,alignment) : global cache #get the longest length that this is matchable for. if(len(strand1) > len(strand2)) : length = len(strand2) else : length = len(strand1) item = { \ "alignment" : alignment, \ "length" : length, \ "strand1" : strand1, \ "strand2" : strand2, \ } return insertion_sort(item) #finds the longest matching pair possible. Slightly less than O(nlogn) average running time. def matchLongestPair(s1,s2) : global cache global __NULLALIGN__ #get the shortest length of the two strings if(len(s1) > len(s2)) : length = len(s2) else : length = len(s1) if(len(cache) == 0) : return __NULLALIGN__ #go from longest length to shortest. anything 2 or below in length is not worth checking. #there has to be a faster way of doing this... for i in xrange(length,__MINLEN__,-1) : #create a dummy record for searching. the score doesn't matter. tmprecord = {"strand1" : s1[0:i], "strand2" : s2[0:i]} index = bsearch(tmprecord) if(index != __NOT_FOUND__) : #match found! return CachedAlignment dict return cache[index] else : #ran out of options, no match found. return the null descriptor return __NULLALIGN__ # Computes the score of the optimal alignment of two DNA strands. #Returns Alignment record def findOptimalAlignment(strand1, strand2): # if one of the two strands is empty, then there is only # one possible alignment, and of course it's optimal if(len(strand1) == 0): return {"strand1" : "", "strand2" : strand2, "score" : -len(strand2), "minusstr" : "", "plusstr" : ""} if(len(strand2) == 0): return {"strand1" : strand1, "strand2" : "", "score" : -len(strand1), "minusstr" : "", "plusstr" : ""} #get minimum length to compare strings on if(len(strand1) > len(strand2)) : minlength = len(strand2) else : minlength = len(strand1) #attempt to retrieve cached alignment, if it exists. #cached_result is a CachedAlignment record, not Alignment. if(minlength >= __MINLEN__) : cached_result = matchLongestPair(strand1,strand2) #check to see if returned result is valid if(cached_result["length"] > 0) : #found a valid cached result! find the optimal alignment for the rest of the strand. rest = findOptimalAlignment(strand1[cached_result["length"]:],strand2[cached_result["length"]:]) #now we have everything we need. build a new Alignment record and return the result. result = dict() result["score"] = cached_result["alignment"]["score"] + rest["score"] result["strand1"] = cached_result["alignment"]["strand1"] + rest["strand1"] result["strand2"] = cached_result["alignment"]["strand2"] + rest["strand2"] result["plusstr"] = cached_result["alignment"]["plusstr"] + rest["plusstr"] result["minusstr"] = cached_result["alignment"]["minusstr"] + rest["minusstr"] if(minlength >= __MINLEN__) : memoize(strand1,strand2,result) return result # There's the scenario where the two leading bases of # each strand are forced to align, regardless of whether or not # they actually match. best = findOptimalAlignment(strand1[1:],strand2[1:]) best["strand1"] = strand1[0] + best["strand1"] best["strand2"] = strand2[0] + best["strand2"] if(strand1[0] == strand2[0]): best["score"] += 1 best["plusstr"] = "1" + best["plusstr"] best["minusstr"] = "-" + best["minusstr"] return best # no benefit from making other recursive calls else : best["score"] -= 1 best["plusstr"] = "-" + best["plusstr"] best["minusstr"] = "1" + best["minusstr"] # It's possible that the leading base of strand1 best # matches not the leading base of strand2, but the one after it. bestWithout = findOptimalAlignment(strand1,strand2[1:]) bestWithout["score"] -= 2 # penalize for insertion of space if bestWithout["score"] > best["score"]: bestWithout["strand1"] = "." + bestWithout["strand1"] bestWithout["strand2"] = strand2[0] + bestWithout["strand2"] bestWithout["plusstr"] = "-" + bestWithout["plusstr"] bestWithout["minusstr"] = "2" + bestWithout["minusstr"] best = bestWithout # opposite scenario bestWithout = findOptimalAlignment(strand1[1:],strand2) bestWithout["score"] -= 2 # penalize for insertion of space if bestWithout["score"] > best["score"]: bestWithout["strand1"] = strand1[0] + bestWithout["strand1"] bestWithout["strand2"] = "." + bestWithout["strand2"] bestWithout["plusstr"] = "-" + bestWithout["plusstr"] bestWithout["minusstr"] = "2" + bestWithout["minusstr"] best = bestWithout #not worth memoizing short stuff if(minlength >= __MINLEN__) : memoize(strand1,strand2,best) return best # Utility function that generates a random DNA string of # a random length drawn from the range [minlength, maxlength] def generateRandomDNAStrand(minlength, maxlength): assert minlength > 0, \ "Minimum length passed to generateRandomDNAStrand" \ "must be a positive number" assert maxlength >= minlength, \ "Maximum length passed to generateRandomDNAStrand must be at " \ "as large as the specified minimum length" strand = "" length = random.choice(xrange(minlength, maxlength + 1)) bases = ['A', 'T', 'G', 'C'] for i in xrange(0, length): strand += random.choice(bases) return strand # Method that just prints out the supplied alignment score. # This is more of a placeholder for what will ultimately # print out not only the score but the alignment as well. # # Modification: alignment is of type ALIGNMENT def printAlignment(alignment, out = sys.stdout): out.write("Optimal alignment score: " + str(alignment["score"]) + "\n\n" + \ " + " + alignment["plusstr"] + "\n" + \ " " + alignment["strand1"] + "\n" + \ " " + alignment["strand2"] + "\n" + \ " - " + alignment["minusstr"] + "\n\n\n") # Unit test main in place to do little more than # exercise the above algorithm. As written, it # generates two fairly short DNA strands and # determines the optimal alignment score. # # As you change the implementation of findOptimalAlignment # to use memoization, you should change the 8s to 40s and # the 10s to 60s and still see everything execute very # quickly. def main(): global __MAXLEN__ while (True): sys.stdout.write("Generate random DNA strands? ") answer = sys.stdin.readline() if answer == "no\n": break strand1 = generateRandomDNAStrand(__MAXLEN__ - 2, __MAXLEN__) strand2 = generateRandomDNAStrand(__MAXLEN__ - 2, __MAXLEN__) sys.stdout.write("Aligning these two strands: " + strand1 + "\n") sys.stdout.write(" " + strand2 + "\n") starttime = time.time() alignment = findOptimalAlignment(strand1, strand2) endtime = time.time() elapsed = endtime - starttime if(elapsed < 1) : elapsed *= 1000 unit = "ms" elif (elapsed > 60) : elapsed /= 60 unit = "min" else: unit = "s" sys.stdout.write("Time elapsed: " + str(elapsed) + " " + unit + ".\n") printAlignment(alignment) if __name__ == "__main__": main()
With regard to memoizing certain functions (which generally decreases computational complexity but increases space complexity) you can implement it in many ways.
I usually use dictionaries like this:
Code:cache = {} def fac(n): if n <= 1: return 1 if n in cache: return cache[n] cache[n] = n*fac(n-1) return cache[n]
I actually figured out what was making it so slow (it was 40x faster than the minimum requirements when I fixed it, instead of 20x slower). It was in my matchLongestPair function. I should've cut out the for-each loop, like so:
Code:def matchLongestPair(s1,s2) : global cache global __NULLALIGN__ #get the shortest length of the two strings if(len(s1) > len(s2)) : length = len(s2) else : length = len(s1) if(len(cache) == 0) : return __NULLALIGN__ tmprecord = {"strand1" : s1, "strand2" : s2} index = bsearch(tmprecord) if(index == __NOT_FOUND__) : return __NULLALIGN__ else : return cache[index]
Wow that looks really complex I hope i as good at that one day
If you ignore the bsearch and insert functions it's actually pretty simple. You can find the assignment itself (and an explanation of how it works) here.
Hey thanks![]()
There are 10 kinds of people in this world,
Those that can read binary and those who can't
There are currently 1 users browsing this thread. (0 members and 1 guests)
Bookmarks