Python/dynamic_programming/smith_waterman.py


# https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
# Score constants
"""
Score constants used in the Smith-Waterman algorithm. Matches are given a positive
score while mismatches are given a negative score. Gaps are also penalized.
"""
MATCH = 1
MISMATCH = -1
GAP = -2


def score_function(a: str, b: str) -> int:
    """
    Calculate the score for a character pair based on whether they match or mismatch.
    Returns 1 if the characters match, -1 if they mismatch.
    >>> score_function('A', 'A')
    1
    >>> score_function('A', 'C')
    -1
    """
    if a == b:
        return MATCH
    else:
        return MISMATCH


def smith_waterman(query: str, subject: str) -> list[list[int]]:
    """
    Perform the Smith-Waterman local sequence alignment algorithm.
    Returns a 2D list representing the score matrix. Each value in the matrix
    corresponds to the score of the best local alignment ending at that point.
    >>> smith_waterman('ACAC', 'CA')
    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
    """

    # Initialize score matrix
    m = len(query)
    n = len(subject)
    score = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            # Calculate scores for each cell
            match = score[i - 1][j - 1] + score_function(query[i - 1], subject[j - 1])
            delete = score[i - 1][j] + GAP
            insert = score[i][j - 1] + GAP

            # Take maximum score
            score[i][j] = max(0, match, delete, insert)

    return score


def traceback(score: list[list[int]], query: str, subject: str) -> str:
    r"""
    Perform traceback to find the optimal local alignment.
    Starts from the highest scoring cell in the matrix and traces back recursively
    until a 0 score is found. Returns the alignment strings.
    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')
    'CAC\nCA-'
    """

    # Traceback logic to find optimal alignment
    i = len(query)
    j = len(subject)
    align1 = ""
    align2 = ""

    while i > 0 and j > 0:
        if score[i][j] == score[i - 1][j - 1] + score_function(
            query[i - 1], subject[j - 1]
        ):
            # optimal path is a diagonal take both letters
            align1 = query[i - 1] + align1
            align2 = subject[j - 1] + align2
            i -= 1
            j -= 1
        elif score[i][j] == score[i - 1][j] + GAP:
            # optimal path is a vertical
            align1 = query[i - 1] + align1
            align2 = "-" + align2
            i -= 1
        else:
            # optimal path is a horizontal
            align1 = "-" + align1
            align2 = subject[j - 1] + align2
            j -= 1

    return f'{align1}\n{align2}'


if __name__ == "__main__":
    query = "HEAGAWGHEE"
    subject = "PAWHEAE"

    score = smith_waterman(query, subject)
    print(traceback(score, query, subject))