mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-02-26 10:58:40 +00:00
99 lines
2.9 KiB
Python
99 lines
2.9 KiB
Python
|
|
||
|
# https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
|
||
|
# Score constants
|
||
|
"""
|
||
|
Score constants used in the Smith-Waterman algorithm. Matches are given a positive
|
||
|
score while mismatches are given a negative score. Gaps are also penalized.
|
||
|
"""
|
||
|
MATCH = 1
|
||
|
MISMATCH = -1
|
||
|
GAP = -2
|
||
|
|
||
|
|
||
|
def score_function(a: str, b: str) -> int:
|
||
|
"""
|
||
|
Calculate the score for a character pair based on whether they match or mismatch.
|
||
|
Returns 1 if the characters match, -1 if they mismatch.
|
||
|
>>> score_function('A', 'A')
|
||
|
1
|
||
|
>>> score_function('A', 'C')
|
||
|
-1
|
||
|
"""
|
||
|
if a == b:
|
||
|
return MATCH
|
||
|
else:
|
||
|
return MISMATCH
|
||
|
|
||
|
|
||
|
def smith_waterman(query: str, subject: str) -> list[list[int]]:
|
||
|
"""
|
||
|
Perform the Smith-Waterman local sequence alignment algorithm.
|
||
|
Returns a 2D list representing the score matrix. Each value in the matrix
|
||
|
corresponds to the score of the best local alignment ending at that point.
|
||
|
>>> smith_waterman('ACAC', 'CA')
|
||
|
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
|
||
|
"""
|
||
|
|
||
|
# Initialize score matrix
|
||
|
m = len(query)
|
||
|
n = len(subject)
|
||
|
score = [[0] * (n + 1) for _ in range(m + 1)]
|
||
|
|
||
|
for i in range(1, m + 1):
|
||
|
for j in range(1, n + 1):
|
||
|
# Calculate scores for each cell
|
||
|
match = score[i - 1][j - 1] + score_function(query[i - 1], subject[j - 1])
|
||
|
delete = score[i - 1][j] + GAP
|
||
|
insert = score[i][j - 1] + GAP
|
||
|
|
||
|
# Take maximum score
|
||
|
score[i][j] = max(0, match, delete, insert)
|
||
|
|
||
|
return score
|
||
|
|
||
|
|
||
|
def traceback(score: list[list[int]], query: str, subject: str) -> str:
|
||
|
r"""
|
||
|
Perform traceback to find the optimal local alignment.
|
||
|
Starts from the highest scoring cell in the matrix and traces back recursively
|
||
|
until a 0 score is found. Returns the alignment strings.
|
||
|
>>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')
|
||
|
'CAC\nCA-'
|
||
|
"""
|
||
|
|
||
|
# Traceback logic to find optimal alignment
|
||
|
i = len(query)
|
||
|
j = len(subject)
|
||
|
align1 = ""
|
||
|
align2 = ""
|
||
|
|
||
|
while i > 0 and j > 0:
|
||
|
if score[i][j] == score[i - 1][j - 1] + score_function(
|
||
|
query[i - 1], subject[j - 1]
|
||
|
):
|
||
|
# optimal path is a diagonal take both letters
|
||
|
align1 = query[i - 1] + align1
|
||
|
align2 = subject[j - 1] + align2
|
||
|
i -= 1
|
||
|
j -= 1
|
||
|
elif score[i][j] == score[i - 1][j] + GAP:
|
||
|
# optimal path is a vertical
|
||
|
align1 = query[i - 1] + align1
|
||
|
align2 = "-" + align2
|
||
|
i -= 1
|
||
|
else:
|
||
|
# optimal path is a horizontal
|
||
|
align1 = "-" + align1
|
||
|
align2 = subject[j - 1] + align2
|
||
|
j -= 1
|
||
|
|
||
|
return f'{align1}\n{align2}'
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
query = "HEAGAWGHEE"
|
||
|
subject = "PAWHEAE"
|
||
|
|
||
|
score = smith_waterman(query, subject)
|
||
|
print(traceback(score, query, subject))
|