2021-09-07 11:37:03 +00:00
|
|
|
from __future__ import annotations
|
2020-10-06 08:31:15 +00:00
|
|
|
|
|
|
|
|
2023-09-28 18:24:46 +00:00
|
|
|
def knuth_morris_pratt(text: str, pattern: str) -> int:
    """
    The Knuth-Morris-Pratt Algorithm for finding a pattern within a piece of text
    with complexity O(n + m)

    1) Preprocess pattern to identify any suffixes that are identical to prefixes

        This tells us where to continue from if we get a mismatch between a character
        in our pattern and the text.

    2) Step through the text one character at a time and compare it to a character in
        the pattern updating our location within the pattern if necessary

    :param text: the string to search in
    :param pattern: the string to search for
    :return: index of the first occurrence of pattern in text, or -1 if it does
        not occur; an empty pattern matches at index 0, like ``str.find``

    >>> kmp = "knuth_morris_pratt"
    >>> all(
    ...     knuth_morris_pratt(kmp, s) == kmp.find(s)
    ...     for s in ("kn", "h_m", "rr", "tt", "not there")
    ... )
    True
    >>> knuth_morris_pratt("abc", "")
    0
    >>> knuth_morris_pratt("", "")
    0
    >>> knuth_morris_pratt("", "a")
    -1
    """
    # An empty pattern trivially matches at the start of the text (same as
    # str.find). Without this guard, pattern[j] below raises IndexError.
    if not pattern:
        return 0

    # 1) Construct the failure array
    failure = get_failure_array(pattern)

    # 2) Step through text searching for pattern
    last = len(pattern) - 1  # pattern index whose match completes the search
    i, j = 0, 0  # index into text, pattern
    while i < len(text):
        if pattern[j] == text[i]:
            if j == last:
                # i is at the end of the match; step back to its first character
                return i - j
            j += 1

        # if this is a prefix in our pattern
        # just go back far enough to continue
        elif j > 0:
            j = failure[j - 1]
            # retry the same text character against the shorter prefix
            continue
        i += 1
    return -1
|
2017-11-11 15:19:41 +00:00
|
|
|
|
|
|
|
|
2021-09-07 11:37:03 +00:00
|
|
|
def get_failure_array(pattern: str) -> list[int]:
    """
    Calculates the new index we should go to if we fail a comparison

    Entry ``k`` of the result is the length of the longest proper prefix of
    ``pattern[: k + 1]`` that is also a suffix of it.

    :param pattern: the pattern to preprocess
    :return: a list with one entry per character of pattern (empty when the
        pattern is empty)

    >>> get_failure_array("aabaabaaa")
    [0, 1, 0, 1, 2, 3, 4, 5, 2]
    >>> get_failure_array("a")
    [0]
    >>> get_failure_array("")
    []
    """
    # Keep len(result) == len(pattern): an empty pattern has no entries.
    if not pattern:
        return []

    failure = [0]  # a single character has no proper prefix
    i = 0  # length of the prefix currently matched
    j = 1  # position in pattern being processed
    while j < len(pattern):
        if pattern[i] == pattern[j]:
            i += 1
        elif i > 0:
            # fall back to the next shorter candidate prefix and retry position j
            i = failure[i - 1]
            continue
        j += 1
        failure.append(i)
    return failure
|
|
|
|
|
2017-11-11 15:19:41 +00:00
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # The search returns an index, so a bare `assert knuth_morris_pratt(...)`
    # is wrong: -1 (not found) is truthy and 0 (match at the start) is falsy.
    # Every check below therefore compares against an explicit expected value.

    # Test 1)
    pattern = "abc1abc12"
    text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc"
    text2 = "alskfjaldsk23adsfabcabc"
    assert knuth_morris_pratt(text1, pattern) == text1.find(pattern)
    # text2 does not contain the pattern
    assert knuth_morris_pratt(text2, pattern) == -1

    # Test 2)
    pattern = "ABABX"
    text = "ABABZABABYABABX"
    assert knuth_morris_pratt(text, pattern) == text.find(pattern)

    # Test 3)
    pattern = "AAAB"
    text = "ABAAAAAB"
    assert knuth_morris_pratt(text, pattern) == text.find(pattern)

    # Test 4)
    pattern = "abcdabcy"
    text = "abcxabcdabxabcdabcdabcy"
    assert knuth_morris_pratt(text, pattern) == text.find(pattern)

    # Test 5) -> Doctests
    kmp = "knuth_morris_pratt"
    assert all(
        knuth_morris_pratt(kmp, s) == kmp.find(s)
        for s in ("kn", "h_m", "rr", "tt", "not there")
    )

    # Test 6)
    pattern = "aabaabaaa"
    assert get_failure_array(pattern) == [0, 1, 0, 1, 2, 3, 4, 5, 2]
|