2018-01-07 12:49:51 +00:00
|
|
|
def kmp(pattern, text):
    """
    The Knuth-Morris-Pratt Algorithm for finding a pattern within a piece of text
    with complexity O(n + m)

    1) Preprocess pattern to identify any suffixes that are identical to prefixes

        This tells us where to continue from if we get a mismatch between a character
        in our pattern and the text.

    2) Step through the text one character at a time and compare it to a character in
        the pattern, updating our location within the pattern if necessary

    An empty pattern is considered to occur in every text (including an empty
    text), which matches the behaviour of ``"" in text``.

    :param pattern: the sequence to search for (indexable, supports ``len``)
    :param text: the sequence to search in (indexable, supports ``len``)
    :return: True if pattern occurs somewhere in text, otherwise False
    """
    # An empty pattern has no character to compare against: indexing
    # pattern[0] in the loop below would raise IndexError. It matches trivially.
    if len(pattern) == 0:
        return True

    # 1) Construct the failure array
    failure = get_failure_array(pattern)

    # 2) Step through text searching for pattern
    last = len(pattern) - 1  # index of the final pattern character
    i, j = 0, 0  # index into text, pattern
    while i < len(text):
        if pattern[j] == text[i]:
            if j == last:
                # every pattern character has been matched
                return True
            j += 1

        # if this is a prefix in our pattern
        # just go back far enough to continue
        elif j > 0:
            j = failure[j - 1]
            # retry the same text character against the shorter prefix
            continue
        i += 1
    return False
|
|
|
|
|
|
|
|
|
2018-01-07 12:49:51 +00:00
|
|
|
def get_failure_array(pattern):
    """
    Calculates the new index we should go to if we fail a comparison

    Entry ``k`` of the result is the length of the longest proper prefix of
    ``pattern[:k + 1]`` that is also a suffix of it, i.e. the pattern index
    to resume from after a mismatch that follows a match ending at ``k``.

    :param pattern: the sequence to preprocess (indexable, supports ``len``)
    :return: a list of ``len(pattern)`` integers; empty for an empty pattern
    """
    # Keep the table length equal to the pattern length: an empty pattern
    # has no positions, so it gets no entries.
    if len(pattern) == 0:
        return []

    failure = [0]  # a single character has no proper prefix
    i = 0  # length of the prefix currently being extended
    j = 1  # position in the pattern whose entry is being computed
    while j < len(pattern):
        if pattern[i] == pattern[j]:
            i += 1
        elif i > 0:
            # fall back to the next shorter candidate prefix and retry
            # the same position j
            i = failure[i - 1]
            continue
        j += 1
        failure.append(i)
    return failure
|
|
|
|
|
2017-11-11 15:19:41 +00:00
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
if __name__ == "__main__":
    # Test 1) one text that contains the pattern and one that does not
    needle = "abc1abc12"
    haystack_hit = "alskfjaldsabc1abc1abc12k23adsfabcabc"
    haystack_miss = "alskfjaldsk23adsfabcabc"
    assert kmp(needle, haystack_hit)
    assert not kmp(needle, haystack_miss)

    # Tests 2-4) patterns that must be found in their text
    for needle, haystack in (
        ("ABABX", "ABABZABABYABABX"),
        ("AAAB", "ABAAAAAB"),
        ("abcdabcy", "abcxabcdabxabcdabcdabcy"),
    ):
        assert kmp(needle, haystack)

    # Test 5) failure array of a pattern with overlapping prefixes
    assert get_failure_array("aabaabaaa") == [0, 1, 0, 1, 2, 3, 4, 5, 2]
|