""" CS146 Example with sequence alignment using recursion and memoization """ from functools import wraps import urllib.request # Python has rejected the certificate used for the CS dept. server so we bypass some of those checks import ssl ssl._create_default_https_context = ssl._create_unverified_context # Edge scores MATCH = 1 MISMATCH = -1 GAP = -1 # Record source of incoming best edge DIAG = 0 LEFT = 1 UPWD = 2 TAXI = 3 def countcalls(func): """Decorator to track the number of times a function is called. Provides `calls` attribute.""" countcalls.calls = 0 @wraps(func) def wrapper(*args, **kwargs): initial_calls = countcalls.calls countcalls.calls += 1 result = func(*args, **kwargs) wrapper.calls = countcalls.calls - initial_calls return result return wrapper @countcalls def global_align(lft_seq, top_seq, lft, top): """Perform Needleman-Wunsch global alignment of two sequences Args: lft_seq: String sequence on "left side" of the alignment grid top_seq: String sequence on "top" of the alignment grid lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] Returns: Score of global alignment of left_seq[lft-1] with top_seq[top-1] """ if lft == 0 and top == 0: return 0 elif lft == 0: return global_align(lft_seq, top_seq, lft, top-1) + GAP elif top == 0: return global_align(lft_seq, top_seq, lft-1, top) + GAP else: if lft_seq[lft-1] == top_seq[top-1]: diag = MATCH else: diag = MISMATCH return max( global_align(lft_seq, top_seq, lft, top-1) + GAP, global_align(lft_seq, top_seq, lft-1, top) + GAP, global_align(lft_seq, top_seq, lft-1, top-1) + diag ) # Test example # global_align("TGTCC", "TGATC", 5, 5) # print("Number of recursive calls:", global_align.calls) # Report number of recursive calls @countcalls def global_align_memo(lft_seq, top_seq, lft, top, scores): """Perform memoized Needleman-Wunsch global alignment of two sequences Args: lft_seq: String sequence on "left side" of the alignment grid top_seq: String sequence on "top" of the alignment grid lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] scores: Dictionary of memoized scores for each (lft, top) pair Returns: Score of global alignment of left_seq[lft-1] with top_seq[top-1] """ if (lft, top) in scores: return scores[(lft, top)] if lft == 0 and top == 0: edge = 0 elif lft == 0: edge = global_align_memo(lft_seq, top_seq, lft, top-1, scores) + GAP elif top == 0: edge = global_align_memo(lft_seq, top_seq, lft-1, top, scores) + GAP else: if lft_seq[lft-1] == top_seq[top-1]: diag = MATCH else: diag = MISMATCH edge = max( global_align_memo(lft_seq, top_seq, lft, top-1, scores) + GAP, global_align_memo(lft_seq, top_seq, lft-1, top, scores) + GAP, global_align_memo(lft_seq, top_seq, lft-1, top-1, scores) + diag ) scores[(lft, top)] = edge return edge # Test example # scores = {} # global_align_memo("TGTCC", "TGATC", 5, 5, scores) # print("Number of recursive calls:", global_align_memo.calls) # Report number of recursive calls def global_align_back(lft_seq, top_seq, lft, top, scores): """Perform memoized Needleman-Wunsch global alignment of two sequences, storing backtrack pointers Args: lft_seq: String sequence on "left side" of the alignment grid top_seq: String sequence on "top" of the alignment grid lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] scores: Dictionary of memoized scores for each (lft, top) pair Returns: Score of global alignment of left_seq[lft-1] with top_seq[top-1] """ if (lft, top) in scores: return scores[(lft, top)][0] if lft == 0 and top == 0: edge = (0, None) elif lft == 0: score = global_align_back(lft_seq, top_seq, lft, top-1, scores) edge = (score + GAP, LEFT) elif top == 0: score = global_align_back(lft_seq, top_seq, lft-1, top, scores) edge = (score + GAP, UPWD) else: if lft_seq[lft-1] == top_seq[top-1]: diag = MATCH else: diag = MISMATCH edge = max( (global_align_back(lft_seq, top_seq, lft, top-1, scores) + GAP, LEFT), (global_align_back(lft_seq, top_seq, lft-1, top, scores) + GAP, UPWD), (global_align_back(lft_seq, top_seq, lft-1, top-1, scores) + diag, DIAG) ) scores[(lft, top)] = edge return edge[0] def backtrack(lft_seq, top_seq, lft, top, scores): """Return actual alignments by backtracking via pointers in scores dictionary Args: lft_seq: String sequence on "left side" of the alignment grid top_seq: String sequence on "top" of the alignment grid lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] scores: Dictionary of memoized scores and pointers for each (lft, top) pair Returns: Tuple of (left, top) alignments with gaps inserted """ lft_align = "" top_align = "" while True: score, edge = scores[(lft, top)] if edge == DIAG: lft_align = lft_seq[lft-1] + lft_align top_align = top_seq[top-1] + top_align lft -= 1 top -= 1 elif edge == UPWD: lft_align = lft_seq[lft-1] + lft_align top_align = "-" + top_align lft -= 1 elif edge == LEFT: lft_align = "-" + lft_align top_align = top_seq[top-1] + top_align top -= 1 elif edge == TAXI: lft_align = lft_seq[:lft] + lft_align top_align = "-" * lft + top_align break else: break return (lft_align, top_align) # scores = {} # global_align_back("TGTCC", "TGATC", 5, 5, scores) # backtrack("TGTCC", "TGATC", 5, 5, scores) def fit_align_back(lft_seq, top_seq, lft, top, scores): """Perform memoized fitting alignment where top must fully align somewhere in a longer left sequence Args: lft_seq: String sequence on "left side" of the alignment grid top_seq: String sequence on "top" of the alignment grid lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1] scores: Dictionary of memoized scores for each (lft, top) pair Returns: Score of global alignment of left_seq[lft-1] with top_seq[top-1] """ if (lft, top) in scores: return scores[(lft, top)][0] if lft == 0 and top == 0: edge = (0, None) elif lft == 0: score = fit_align_back(lft_seq, top_seq, lft, top-1, scores) edge = (score + GAP, LEFT) elif top == 0: edge = max( (0, TAXI), (fit_align_back(lft_seq, top_seq, lft-1, top, scores) + GAP, UPWD) ) else: if lft_seq[lft-1] == top_seq[top-1]: diag = MATCH else: diag = MISMATCH edge = max( (fit_align_back(lft_seq, top_seq, lft, top-1, scores) + GAP, LEFT), (fit_align_back(lft_seq, top_seq, lft-1, top, scores) + GAP, UPWD), (fit_align_back(lft_seq, top_seq, lft-1, top-1, scores) + diag, DIAG) ) scores[(lft, top)] = edge return edge[0] def find_fit_start(lft, top, scores): """Find the start of the best "fitting" alignment in last column of lattice Args: lft: Length of the "left", longer, sequence top: Length of the "top", shorter, sequence scores: Dictionary of memoized alignment scores and pointers Returns: Tuple of best (lft, top) in last column """ best_node = (0, top) best_score, _ = scores[best_node] for i in range(1, lft+1): node_score, _ = scores[(i, top)] if node_score > best_score: best_node = (i, top) best_score = node_score return best_node def fitting_alignment(ref_seq, query_seq): """Perform fitting alignment query against a reference sequence Args: ref_seq: Longer reference sequence query_seq: Shorter query sequence Returns: Tuple with score, ref alignment and query alignment """ scores = {} fit_align_back(ref_seq, query_seq, len(ref_seq), len(query_seq), scores) ref_end, query_end = find_fit_start( len(ref_seq), len(query_seq), scores) assert query_end == len(query_seq), "Fitting alignment should end in last column" ref_align, query_align = backtrack(ref_seq, query_seq, ref_end, query_end, scores) # How many gaps at end of query_seq? tail = len(ref_seq) - ref_end return (scores[(ref_end, query_end)][0], ref_align + ref_seq[ref_end:], query_align + "-" * tail) # fitting_alignment("TTAGTAGGCTTAAGGTTA", "TAGATA") def pileup(ref, reads_url): """Generate text pileup of read sequences against a reference sequence Args: ref: Reference genome sequence reads_url: URL for text file of sequencing reads """ print(ref) with urllib.request.urlopen(reads_url) as url: for query_seq in url: query_seq = query_seq.decode("utf-8", "ignore") # Obtain a string from the raw bytes _, ref_align, query_align = fitting_alignment(ref, query_seq.strip()) assert len(ref_align) == len(query_align), "Alignment lengths should match" # Squash any gaps in the reference sequence, since they are hard to plot # in pileup. squashed_query = "" for i in range(len(ref_align)): if ref_align[i] != "-": squashed_query += query_align[i] print(squashed_query) REF_GENOME="AATCTGCCTTGTGGTCGGCTCTTACCTTCAGGCTGCTCTGAGCCCAGAGCAGAATGGTCATCACAGCTCTCCTCAACTTGGCATTGCCTGAGATCAGGATGGCTGCATGCCCAGAGGGACAAGCTGCCATTATCCCAACACAAACCATCACCCCTATTTTGTCGCGCCACAGAATCAGTAGGGGCACAGAGATGAAGGCAGC" # pileup(REF_GENOME, "https://www.cs.middlebury.edu/~mlinderman/courses/cs146/f24/classes/NA12878.ready.sample.txt")