def kmp(pattern, text):
    """
    The Knuth-Morris-Pratt Algorithm for finding a pattern within a piece of text
    with complexity O(n + m)

    1) Construct the failure array for the pattern (see get_failure_array).

    2) Step through the text comparing it with the current pattern character.
       On a match the pattern index advances; on a mismatch the failure array
       gives the pattern index to resume from, so the text index never moves
       backwards.

    :param pattern: the sequence to search for
    :param text: the sequence to search in
    :return: True if pattern occurs somewhere in text, otherwise False.
             An empty pattern is reported as found in any text.
    """
    # An empty pattern matches trivially; without this guard the first
    # pattern[j] lookup below raises IndexError for any non-empty text.
    if not pattern:
        return True

    # 1) Construct the failure array
    failure = get_failure_array(pattern)

    # 2) Step through text searching for pattern
    last = len(pattern) - 1
    i, j = 0, 0  # index into text, pattern
    # NOTE(review): the loop header is not visible in the diff hunks; it is
    # reconstructed from the loop body (text[i] lookups, `continue`, `i += 1`).
    while i < len(text):
        if pattern[j] == text[i]:
            if j == last:
                return True
            j += 1
        # if this is a prefix in our pattern
        # just go back far enough to continue
        elif j > 0:
            j = failure[j - 1]
            # retry the same text character against the shorter prefix
            continue
        i += 1
    return False


def get_failure_array(pattern):
    """
    Calculates the new index we should go to if we fail a comparison

    Entry k holds the length of the longest proper prefix of
    pattern[:k + 1] that is also a suffix of it.

    :param pattern: the sequence to preprocess
    :return: a list with one integer per element of pattern
             (an empty list for an empty pattern)
    """
    # One entry per pattern element, so nothing to report for no elements.
    if not pattern:
        return []

    failure = [0]
    i = 0  # length of the prefix currently matched
    j = 1  # position in pattern being processed
    while j < len(pattern):
        if pattern[i] == pattern[j]:
            i += 1
        elif i > 0:
            # fall back to the next shorter candidate prefix and retry j
            i = failure[i - 1]
            continue
        j += 1
        failure.append(i)
    return failure


if __name__ == '__main__':
    # Test 1)
    pattern = "abc1abc12"
    text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc"
    text2 = "alskfjaldsk23adsfabcabc"
    assert kmp(pattern, text1) and not kmp(pattern, text2)

    # Test 2)
    pattern = "ABABX"
    text = "ABABZABABYABABX"
    assert kmp(pattern, text)

    # Test 3)
    pattern = "AAAB"
    text = "ABAAAAAB"
    assert kmp(pattern, text)

    # Test 4)
    pattern = "abcdabcy"
    text = "abcxabcdabxabcdabcdabcy"
    assert kmp(pattern, text)

    # Test 5)
    pattern = "aabaabaaa"
    assert get_failure_array(pattern) == [0, 1, 0, 1, 2, 3, 4, 5, 2]
def rabin_karp(pattern, text):
    """
    The Rabin-Karp Algorithm for finding a pattern within a piece of text
    with complexity O(nm), most efficient when it is used with multiple patterns
    as it is able to check if any of a set of patterns match a section of text in O(1) given the precomputed hashes.

    This will be the simple version which only assumes one pattern is being searched for but it's not hard to modify

    1) Calculate pattern hash

    2) Step through the text one character at a time passing a window with the same length as the pattern
       calculating the hash of the text within the window compare it with the hash of the pattern. Only testing
       equality if the hashes match

    Note that this version hashes every window from scratch with the built-in
    hash() instead of updating a rolling hash.

    :param pattern: the sequence to search for (must be hashable)
    :param text: the sequence to search in (its slices must be hashable)
    :return: True if pattern occurs somewhere in text, otherwise False
    """
    p_len = len(pattern)
    p_hash = hash(pattern)

    # When the pattern is longer than the text the range is empty and the
    # search falls straight through to False.
    for start in range(len(text) - p_len + 1):
        # Slice the window once and reuse it for both checks; equal hashes
        # alone do not prove a match, so the comparison confirms it.
        window = text[start:start + p_len]
        if hash(window) == p_hash and window == pattern:
            return True
    return False


if __name__ == '__main__':
    # Test 1)
    pattern = "abc1abc12"
    text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc"
    text2 = "alskfjaldsk23adsfabcabc"
    assert rabin_karp(pattern, text1) and not rabin_karp(pattern, text2)

    # Test 2)
    pattern = "ABABX"
    text = "ABABZABABYABABX"
    assert rabin_karp(pattern, text)

    # Test 3)
    pattern = "AAAB"
    text = "ABAAAAAB"
    assert rabin_karp(pattern, text)

    # Test 4)
    pattern = "abcdabcy"
    text = "abcxabcdabxabcdabcdabcy"
    assert rabin_karp(pattern, text)