added smith waterman algorithm (#9001)

* added smith waterman algorithm * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * descriptive names for the parameters a and b * doctesting lowercase upcase empty string cases * updated block quot,fixed traceback and doctests * shorter block quote Co-authored-by: Tianyi Zheng <tianyizheng02@gmail.com> * global vars to func params,more doctests * updated doctests * user access to SW params * formating --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tianyi Zheng <tianyizheng02@gmail.com>
2025-05-28 19:36:36 +00:00 · 2023-09-30 05:33:13 +01:00 · 2023-09-30 05:33:13 +01:00 · 467903aa33
commit 467903aa33
parent 35dd529c85
1 changed files with 193 additions and 0 deletions
--- a/dynamic_programming/smith_waterman.py
+++ b/dynamic_programming/smith_waterman.py
@ -0,0 +1,193 @@
+"""
+https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
+The Smith-Waterman algorithm is a dynamic programming algorithm used for sequence
+alignment. It is particularly useful for finding similarities between two sequences,
+such as DNA or protein sequences. In this implementation, gaps are penalized
+linearly, meaning that the score is reduced by a fixed amount for each gap introduced
+in the alignment. However, it's important to note that the Smith-Waterman algorithm
+supports other gap penalty methods as well.
+"""
+
+
+def score_function(
+    source_char: str,
+    target_char: str,
+    match: int = 1,
+    mismatch: int = -1,
+    gap: int = -2,
+) -> int:
+    """
+    Calculate the score for a character pair based on whether they match or mismatch.
+    Returns 1 if the characters match, -1 if they mismatch, and -2 if either of the
+    characters is a gap.
+    >>> score_function('A', 'A')
+    1
+    >>> score_function('A', 'C')
+    -1
+    >>> score_function('-', 'A')
+    -2
+    >>> score_function('A', '-')
+    -2
+    >>> score_function('-', '-')
+    -2
+    """
+    if "-" in (source_char, target_char):
+        return gap
+    return match if source_char == target_char else mismatch
+
+
+def smith_waterman(
+    query: str,
+    subject: str,
+    match: int = 1,
+    mismatch: int = -1,
+    gap: int = -2,
+) -> list[list[int]]:
+    """
+    Perform the Smith-Waterman local sequence alignment algorithm.
+    Returns a 2D list representing the score matrix. Each value in the matrix
+    corresponds to the score of the best local alignment ending at that point.
+    >>> smith_waterman('ACAC', 'CA')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+    >>> smith_waterman('acac', 'ca')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+    >>> smith_waterman('ACAC', 'ca')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+    >>> smith_waterman('acac', 'CA')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+    >>> smith_waterman('ACAC', '')
+    [[0], [0], [0], [0], [0]]
+    >>> smith_waterman('', 'CA')
+    [[0, 0, 0]]
+    >>> smith_waterman('ACAC', 'CA')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+
+    >>> smith_waterman('acac', 'ca')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+
+    >>> smith_waterman('ACAC', 'ca')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+
+    >>> smith_waterman('acac', 'CA')
+    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
+
+    >>> smith_waterman('ACAC', '')
+    [[0], [0], [0], [0], [0]]
+
+    >>> smith_waterman('', 'CA')
+    [[0, 0, 0]]
+
+    >>> smith_waterman('AGT', 'AGT')
+    [[0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3]]
+
+    >>> smith_waterman('AGT', 'GTA')
+    [[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 2, 0]]
+
+    >>> smith_waterman('AGT', 'GTC')
+    [[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0]]
+
+    >>> smith_waterman('AGT', 'G')
+    [[0, 0], [0, 0], [0, 1], [0, 0]]
+
+    >>> smith_waterman('G', 'AGT')
+    [[0, 0, 0, 0], [0, 0, 1, 0]]
+
+    >>> smith_waterman('AGT', 'AGTCT')
+    [[0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0], [0, 0, 0, 3, 1, 1]]
+
+    >>> smith_waterman('AGTCT', 'AGT')
+    [[0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3], [0, 0, 0, 1], [0, 0, 0, 1]]
+
+    >>> smith_waterman('AGTCT', 'GTC')
+    [[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3], [0, 0, 1, 1]]
+    """
+    # make both query and subject uppercase
+    query = query.upper()
+    subject = subject.upper()
+
+    # Initialize score matrix
+    m = len(query)
+    n = len(subject)
+    score = [[0] * (n + 1) for _ in range(m + 1)]
+    kwargs = {"match": match, "mismatch": mismatch, "gap": gap}
+
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            # Calculate scores for each cell
+            match = score[i - 1][j - 1] + score_function(
+                query[i - 1], subject[j - 1], **kwargs
+            )
+            delete = score[i - 1][j] + gap
+            insert = score[i][j - 1] + gap
+
+            # Take maximum score
+            score[i][j] = max(0, match, delete, insert)
+
+    return score
+
+
+def traceback(score: list[list[int]], query: str, subject: str) -> str:
+    r"""
+    Perform traceback to find the optimal local alignment.
+    Starts from the highest scoring cell in the matrix and traces back recursively
+    until a 0 score is found. Returns the alignment strings.
+    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')
+    'CA\nCA'
+    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'acac', 'ca')
+    'CA\nCA'
+    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'ca')
+    'CA\nCA'
+    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'acac', 'CA')
+    'CA\nCA'
+    >>> traceback([[0, 0, 0]], 'ACAC', '')
+    ''
+    """
+    # make both query and subject uppercase
+    query = query.upper()
+    subject = subject.upper()
+    # find the indices of the maximum value in the score matrix
+    max_value = float("-inf")
+    i_max = j_max = 0
+    for i, row in enumerate(score):
+        for j, value in enumerate(row):
+            if value > max_value:
+                max_value = value
+                i_max, j_max = i, j
+    # Traceback logic to find optimal alignment
+    i = i_max
+    j = j_max
+    align1 = ""
+    align2 = ""
+    gap = score_function("-", "-")
+    # guard against empty query or subject
+    if i == 0 or j == 0:
+        return ""
+    while i > 0 and j > 0:
+        if score[i][j] == score[i - 1][j - 1] + score_function(
+            query[i - 1], subject[j - 1]
+        ):
+            # optimal path is a diagonal take both letters
+            align1 = query[i - 1] + align1
+            align2 = subject[j - 1] + align2
+            i -= 1
+            j -= 1
+        elif score[i][j] == score[i - 1][j] + gap:
+            # optimal path is a vertical
+            align1 = query[i - 1] + align1
+            align2 = f"-{align2}"
+            i -= 1
+        else:
+            # optimal path is a horizontal
+            align1 = f"-{align1}"
+            align2 = subject[j - 1] + align2
+            j -= 1
+
+    return f"{align1}\n{align2}"
+
+
+if __name__ == "__main__":
+    query = "HEAGAWGHEE"
+    subject = "PAWHEAE"
+
+    score = smith_waterman(query, subject, match=1, mismatch=-1, gap=-2)
+    print(traceback(score, query, subject))