"""
CS146 Example with sequence alignment using recursion and memoization
"""
from functools import wraps
import urllib.request

# Python has rejected the certificate used for the CS dept. server so we bypass some of those checks
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# Edge scores
MATCH = 1
MISMATCH = -1
GAP = -1

# Record source of incoming best edge
DIAG = 0
LEFT = 1
UPWD = 2
TAXI = 3

def countcalls(func):
    """Decorator to track the number of times a function is called. Provides `calls` attribute."""
    countcalls.calls = 0

    @wraps(func)
    def wrapper(*args, **kwargs):
        initial_calls = countcalls.calls
        countcalls.calls += 1
        result = func(*args, **kwargs)
        wrapper.calls = countcalls.calls - initial_calls
        return result

    return wrapper

@countcalls
def global_align(lft_seq, top_seq, lft, top):
    """Perform Needleman-Wunsch global alignment of two sequences

    Args:
        lft_seq: String sequence on "left side" of the alignment grid
        top_seq: String sequence on "top" of the alignment grid
        lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]

    Returns:
        Score of global alignment of left_seq[lft-1] with top_seq[top-1]
    """
    if lft == 0 and top == 0:
        return 0
    elif lft == 0:
        return global_align(lft_seq, top_seq, lft, top-1) + GAP
    elif top == 0:
        return global_align(lft_seq, top_seq, lft-1, top) + GAP
    else:
        if lft_seq[lft-1] == top_seq[top-1]:
            diag = MATCH
        else:
            diag = MISMATCH
        return max(
            global_align(lft_seq, top_seq, lft, top-1) + GAP,
            global_align(lft_seq, top_seq, lft-1, top) + GAP,
            global_align(lft_seq, top_seq, lft-1, top-1) + diag
        )

# Test example    
# global_align("TGTCC", "TGATC", 5, 5)
# print("Number of recursive calls:", global_align.calls) # Report number of recursive calls

@countcalls
def global_align_memo(lft_seq, top_seq, lft, top, scores):
    """Perform memoized Needleman-Wunsch global alignment of two sequences

    Args:
        lft_seq: String sequence on "left side" of the alignment grid
        top_seq: String sequence on "top" of the alignment grid
        lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        scores: Dictionary of memoized scores for each (lft, top) pair

    Returns:
        Score of global alignment of left_seq[lft-1] with top_seq[top-1]
    """
    if (lft, top) in scores:
        return scores[(lft, top)]

    if lft == 0 and top == 0:
        edge = 0
    elif lft == 0:
        edge = global_align_memo(lft_seq, top_seq, lft, top-1, scores) + GAP
    elif top == 0:
        edge = global_align_memo(lft_seq, top_seq, lft-1, top, scores) + GAP
    else:
        if lft_seq[lft-1] == top_seq[top-1]:
            diag = MATCH
        else:
            diag = MISMATCH
        edge = max(
            global_align_memo(lft_seq, top_seq, lft, top-1, scores) + GAP,
            global_align_memo(lft_seq, top_seq, lft-1, top, scores) + GAP,
            global_align_memo(lft_seq, top_seq, lft-1, top-1, scores) + diag
        )
    
    scores[(lft, top)] = edge
    return edge

# Test example
# scores = {}
# global_align_memo("TGTCC", "TGATC", 5, 5, scores)
# print("Number of recursive calls:", global_align_memo.calls) # Report number of recursive calls


def global_align_back(lft_seq, top_seq, lft, top, scores):
    """Perform memoized Needleman-Wunsch global alignment of two sequences, storing backtrack pointers

    Args:
        lft_seq: String sequence on "left side" of the alignment grid
        top_seq: String sequence on "top" of the alignment grid
        lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        scores: Dictionary of memoized scores for each (lft, top) pair

    Returns:
        Score of global alignment of left_seq[lft-1] with top_seq[top-1]
    """
    if (lft, top) in scores:
        return scores[(lft, top)][0]

    if lft == 0 and top == 0:
        edge = (0, None)
    elif lft == 0:
        score = global_align_back(lft_seq, top_seq, lft, top-1, scores)
        edge = (score + GAP, LEFT)
    elif top == 0:
        score = global_align_back(lft_seq, top_seq, lft-1, top, scores)
        edge = (score + GAP, UPWD)
    else:
        if lft_seq[lft-1] == top_seq[top-1]:
            diag = MATCH
        else:
            diag = MISMATCH
        edge = max(
            (global_align_back(lft_seq, top_seq, lft, top-1, scores) + GAP, LEFT),
            (global_align_back(lft_seq, top_seq, lft-1, top, scores) + GAP, UPWD),
            (global_align_back(lft_seq, top_seq, lft-1, top-1, scores) + diag, DIAG)
        )
    
    scores[(lft, top)] = edge
    return edge[0]

def backtrack(lft_seq, top_seq, lft, top, scores):
    """Return actual alignments by backtracking via pointers in scores dictionary

    Args:
        lft_seq: String sequence on "left side" of the alignment grid
        top_seq: String sequence on "top" of the alignment grid
        lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        scores: Dictionary of memoized scores and pointers for each (lft, top) pair

    Returns:
       Tuple of (left, top) alignments with gaps inserted
    """
    lft_align = ""
    top_align = ""
    while True:
        score, edge = scores[(lft, top)]
        if edge == DIAG:
            lft_align = lft_seq[lft-1] + lft_align
            top_align = top_seq[top-1] + top_align
            lft -= 1
            top -= 1
        elif edge == UPWD:
            lft_align = lft_seq[lft-1] + lft_align
            top_align = "-" + top_align
            lft -= 1
        elif edge == LEFT:
            lft_align = "-" + lft_align
            top_align = top_seq[top-1] + top_align
            top -= 1
        elif edge == TAXI:
            lft_align = lft_seq[:lft] + lft_align
            top_align = "-" * lft + top_align
            break
        else:
            break
    return (lft_align, top_align)        
            

# scores = {}
# global_align_back("TGTCC", "TGATC", 5, 5, scores)
# backtrack("TGTCC", "TGATC", 5, 5, scores)

def fit_align_back(lft_seq, top_seq, lft, top, scores):
    """Perform memoized fitting alignment where top must fully align somewhere in a 
    longer left sequence

    Args:
        lft_seq: String sequence on "left side" of the alignment grid
        top_seq: String sequence on "top" of the alignment grid
        lft: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        top: Index to align through, i.e., left_seq[lft-1] with top_seq[top-1]
        scores: Dictionary of memoized scores for each (lft, top) pair

    Returns:
        Score of global alignment of left_seq[lft-1] with top_seq[top-1]
    """
    if (lft, top) in scores:
        return scores[(lft, top)][0]

    if lft == 0 and top == 0:
        edge = (0, None)
    elif lft == 0:
        score = fit_align_back(lft_seq, top_seq, lft, top-1, scores)
        edge = (score + GAP, LEFT)
    elif top == 0:
        edge = max(
            (0, TAXI),
            (fit_align_back(lft_seq, top_seq, lft-1, top, scores) + GAP, UPWD)
        )
    else:
        if lft_seq[lft-1] == top_seq[top-1]:
            diag = MATCH
        else:
            diag = MISMATCH
        edge = max(
            (fit_align_back(lft_seq, top_seq, lft, top-1, scores) + GAP, LEFT),
            (fit_align_back(lft_seq, top_seq, lft-1, top, scores) + GAP, UPWD),
            (fit_align_back(lft_seq, top_seq, lft-1, top-1, scores) + diag, DIAG)
        )
    
    scores[(lft, top)] = edge
    return edge[0]

def find_fit_start(lft, top, scores):
    """Find the start of the best "fitting" alignment in last column of lattice

    Args:
        lft: Length of the "left", longer, sequence
        top: Length of the "top", shorter, sequence
        scores: Dictionary of memoized alignment scores and pointers

    Returns:
        Tuple of best (lft, top) in last column
    """
    best_node = (0, top)
    best_score, _ = scores[best_node]
    for i in range(1, lft+1):
        node_score, _ = scores[(i, top)]
        if node_score > best_score:
            best_node = (i, top)
            best_score = node_score
    return best_node

def fitting_alignment(ref_seq, query_seq):
    """Perform fitting alignment query against a reference sequence

    Args:
        ref_seq: Longer reference sequence
        query_seq: Shorter query sequence

    Returns:
        Tuple with score, ref alignment and query alignment
    """
    scores = {}
    fit_align_back(ref_seq, query_seq, len(ref_seq), len(query_seq), scores)
    
    ref_end, query_end = find_fit_start( len(ref_seq), len(query_seq), scores)
    assert query_end == len(query_seq), "Fitting alignment should end in last column"
    
    ref_align, query_align = backtrack(ref_seq, query_seq, ref_end, query_end, scores)
    # How many gaps at end of query_seq?
    tail = len(ref_seq) - ref_end
    return (scores[(ref_end, query_end)][0], ref_align + ref_seq[ref_end:], query_align + "-" * tail)

# fitting_alignment("TTAGTAGGCTTAAGGTTA", "TAGATA")


def pileup(ref, reads_url):
    """Generate text pileup of read sequences against a reference sequence

    Args:
        ref: Reference genome sequence
        reads_url: URL for text file of sequencing reads 
    """
    print(ref)
    with urllib.request.urlopen(reads_url) as url:
        for query_seq in url:
            query_seq = query_seq.decode("utf-8", "ignore") # Obtain a string from the raw bytes
            _, ref_align, query_align = fitting_alignment(ref, query_seq.strip())
            assert len(ref_align) == len(query_align), "Alignment lengths should match"
            # Squash any gaps in the reference sequence, since they are hard to plot
            # in pileup.
            squashed_query = ""
            for i in range(len(ref_align)):
                if ref_align[i] != "-":
                    squashed_query += query_align[i]

            print(squashed_query)


REF_GENOME="AATCTGCCTTGTGGTCGGCTCTTACCTTCAGGCTGCTCTGAGCCCAGAGCAGAATGGTCATCACAGCTCTCCTCAACTTGGCATTGCCTGAGATCAGGATGGCTGCATGCCCAGAGGGACAAGCTGCCATTATCCCAACACAAACCATCACCCCTATTTTGTCGCGCCACAGAATCAGTAGGGGCACAGAGATGAAGGCAGC"
# pileup(REF_GENOME, "https://www.cs.middlebury.edu/~mlinderman/courses/cs146/f24/classes/NA12878.ready.sample.txt")