Python/dynamic_programming/smith_waterman.py


# https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
# Score constants
"""
Score constants used in the Smith-Waterman algorithm. Matches are given a positive
score while mismatches are given a negative score. Gaps are also penalized.
"""
# Reward added when the two aligned characters are identical.
MATCH: int = 1
# Penalty added when the two aligned characters differ.
MISMATCH: int = -1
# Penalty added for every gap ("-") introduced into either sequence.
GAP: int = -2


def score_function(a: str, b: str) -> int:
    """
    Score a single pair of aligned characters.

    Yields MATCH when both characters are equal and MISMATCH otherwise.
    >>> score_function('A', 'A')
    1
    >>> score_function('A', 'C')
    -1
    """
    return MATCH if a == b else MISMATCH


def smith_waterman(query: str, subject: str) -> list[list[int]]:
    """
    Build the Smith-Waterman local-alignment score matrix for two sequences.

    The matrix has len(query) + 1 rows and len(subject) + 1 columns; row 0 and
    column 0 stay at zero. Every other cell holds the best score of a local
    alignment ending at that pair of positions and is never below zero.
    >>> smith_waterman('ACAC', 'CA')
    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
    """
    row_count = len(query) + 1
    col_count = len(subject) + 1
    matrix = [[0 for _ in range(col_count)] for _ in range(row_count)]

    for row, query_char in enumerate(query, start=1):
        above = matrix[row - 1]
        current = matrix[row]
        for col, subject_char in enumerate(subject, start=1):
            candidates = (
                0,  # start a fresh local alignment here
                above[col - 1] + score_function(query_char, subject_char),  # diagonal
                above[col] + GAP,  # vertical move: gap in the subject
                current[col - 1] + GAP,  # horizontal move: gap in the query
            )
            current[col] = max(candidates)

    return matrix


def traceback(score: list[list[int]], query: str, subject: str) -> str:
    r"""
    Recover the best local alignment from a Smith-Waterman score matrix.

    The walk starts at the highest-scoring cell (the first one in row-major
    order when several cells tie) and follows the move that produced each
    cell -- diagonal, vertical or horizontal -- until it reaches a cell whose
    score is 0 or the edge of the matrix.

    Args:
        score: matrix with len(query) + 1 rows and len(subject) + 1 columns,
            as produced by smith_waterman(query, subject).
        query: first sequence (indexes the rows of the matrix).
        subject: second sequence (indexes the columns of the matrix).

    Returns:
        The aligned query fragment and the aligned subject fragment joined by
        a newline, with '-' marking gaps. When the matrix contains no
        positive score both fragments are empty and the result is '\n'.

    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')
    'CA\nCA'
    >>> traceback([[0, 0], [0, 0]], 'A', 'C')
    '\n'
    """

    # A local alignment ends at the maximum of the matrix, which is not
    # necessarily the bottom-right corner, so locate that cell first.
    best = 0
    i = j = 0
    for row_index, row in enumerate(score):
        for col_index, value in enumerate(row):
            if value > best:
                best = value
                i, j = row_index, col_index

    # Characters are collected back-to-front and reversed once at the end,
    # which avoids repeatedly rebuilding the strings by prepending.
    aligned_query: list[str] = []
    aligned_subject: list[str] = []

    # A zero cell marks the start of the local alignment, so stop there
    # instead of walking on to the border of the matrix.
    while i > 0 and j > 0 and score[i][j] > 0:
        if score[i][j] == score[i - 1][j - 1] + score_function(
            query[i - 1], subject[j - 1]
        ):
            # optimal path is a diagonal: take both letters
            aligned_query.append(query[i - 1])
            aligned_subject.append(subject[j - 1])
            i -= 1
            j -= 1
        elif score[i][j] == score[i - 1][j] + GAP:
            # optimal path is a vertical: gap in the subject
            aligned_query.append(query[i - 1])
            aligned_subject.append("-")
            i -= 1
        else:
            # optimal path is a horizontal: gap in the query
            aligned_query.append("-")
            aligned_subject.append(subject[j - 1])
            j -= 1

    align1 = "".join(reversed(aligned_query))
    align2 = "".join(reversed(aligned_subject))
    return f'{align1}\n{align2}'


if __name__ == "__main__":
    # Small demo: align two short sequences and print both aligned fragments.
    query, subject = "HEAGAWGHEE", "PAWHEAE"
    score = smith_waterman(query, subject)
    print(traceback(score, query, subject))
added smith waterman algorithm 2023-08-21 00:09:04 +01:00
			`# https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm`
			`# Score constants`
			`"""`
			`Score constants used in the Smith-Waterman algorithm. Matches are given a positive`
			`score while mismatches are given a negative score. Gaps are also penalized.`
			`"""`
			`MATCH = 1`
			`MISMATCH = -1`
			`GAP = -2`


			`def score_function(a: str, b: str) -> int:`
			`"""`
			`Calculate the score for a character pair based on whether they match or mismatch.`
			`Returns 1 if the characters match, -1 if they mismatch.`
			`>>> score_function('A', 'A')`
			`1`
			`>>> score_function('A', 'C')`
			`-1`
			`"""`
			`if a == b:`
			`return MATCH`
			`else:`
			`return MISMATCH`


			`def smith_waterman(query: str, subject: str) -> list[list[int]]:`
			`"""`
			`Perform the Smith-Waterman local sequence alignment algorithm.`
			`Returns a 2D list representing the score matrix. Each value in the matrix`
			`corresponds to the score of the best local alignment ending at that point.`
			`>>> smith_waterman('ACAC', 'CA')`
			`[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]`
			`"""`

			`# Initialize score matrix`
			`m = len(query)`
			`n = len(subject)`
			`score = [[0] * (n + 1) for _ in range(m + 1)]`

			`for i in range(1, m + 1):`
			`for j in range(1, n + 1):`
			`# Calculate scores for each cell`
			`match = score[i - 1][j - 1] + score_function(query[i - 1], subject[j - 1])`
			`delete = score[i - 1][j] + GAP`
			`insert = score[i][j - 1] + GAP`

			`# Take maximum score`
			`score[i][j] = max(0, match, delete, insert)`

			`return score`


			`def traceback(score: list[list[int]], query: str, subject: str) -> str:`
			`r"""`
			`Perform traceback to find the optimal local alignment.`
			`Starts from the highest scoring cell in the matrix and traces back recursively`
			`until a 0 score is found. Returns the alignment strings.`
			`>>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')`
			`'CAC\nCA-'`
			`"""`

			`# Traceback logic to find optimal alignment`
			`i = len(query)`
			`j = len(subject)`
			`align1 = ""`
			`align2 = ""`

			`while i > 0 and j > 0:`
			`if score[i][j] == score[i - 1][j - 1] + score_function(`
			`query[i - 1], subject[j - 1]`
			`):`
			`# optimal path is a diagonal take both letters`
			`align1 = query[i - 1] + align1`
			`align2 = subject[j - 1] + align2`
			`i -= 1`
			`j -= 1`
			`elif score[i][j] == score[i - 1][j] + GAP:`
			`# optimal path is a vertical`
			`align1 = query[i - 1] + align1`
			`align2 = "-" + align2`
			`i -= 1`
			`else:`
			`# optimal path is a horizontal`
			`align1 = "-" + align1`
			`align2 = subject[j - 1] + align2`
			`j -= 1`

			`return f'{align1}\n{align2}'`


			`if __name__ == "__main__":`
			`query = "HEAGAWGHEE"`
			`subject = "PAWHEAE"`

			`score = smith_waterman(query, subject)`
			`print(traceback(score, query, subject))`