mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-27 15:01:08 +00:00
Aho-Corasick String Matching Algorithm (#346)
* add aho-corasick algorithm * Add a doctest and format with black
This commit is contained in:
parent
8b572e6cfd
commit
92268561a5
92
strings/aho-corasick.py
Normal file
92
strings/aho-corasick.py
Normal file
|
@ -0,0 +1,92 @@
|
|||
from collections import deque
|
||||
|
||||
|
||||
class Automaton:
    """Aho-Corasick automaton for finding many keywords in one pass over a text.

    The automaton is stored in ``self.adlist``, a list of state records. Each
    record is a dict with the keys:

    * ``"value"``       - the character on the edge leading into this state
      (``""`` for the root, which is always state ``0``),
    * ``"next_states"`` - indices of the child states,
    * ``"fail_state"``  - index of the state to fall back to on a mismatch,
    * ``"output"``      - keywords that end when this state is reached.

    >>> Automaton(["a", "a"]).search_in("aa")
    {'a': [0, 1]}
    """

    def __init__(self, keywords: "list[str]") -> None:
        """Build the trie for ``keywords`` and compute its failure links."""
        self.adlist = []
        # State 0 is the root; it has no incoming character.
        self.adlist.append(
            {"value": "", "next_states": [], "fail_state": 0, "output": []}
        )

        for keyword in keywords:
            self.add_keyword(keyword)
        self.set_fail_transitions()

    def find_next_state(self, current_state: int, char: str) -> "int | None":
        """Return the child of ``current_state`` reached via ``char``.

        Returns ``None`` when ``current_state`` has no outgoing edge labelled
        ``char``. The root (state ``0``) is never a child, so a returned index
        is always at least ``1``.
        """
        for state in self.adlist[current_state]["next_states"]:
            if char == self.adlist[state]["value"]:
                return state
        return None

    def add_keyword(self, keyword: str) -> None:
        """Insert ``keyword`` into the trie, creating states as needed.

        Empty keywords are ignored, and a keyword that is already present is
        not recorded twice. Failure links are *not* updated here; call
        ``set_fail_transitions`` after adding keywords to an existing
        automaton.
        """
        if not keyword:
            # An empty pattern would be attached to the root and reported at
            # inconsistent positions, so it is skipped.
            return

        current_state = 0
        for character in keyword:
            next_state = self.find_next_state(current_state, character)
            if next_state is None:
                self.adlist.append(
                    {
                        "value": character,
                        "next_states": [],
                        "fail_state": 0,
                        "output": [],
                    }
                )
                next_state = len(self.adlist) - 1
                self.adlist[current_state]["next_states"].append(next_state)
            current_state = next_state

        output = self.adlist[current_state]["output"]
        if keyword not in output:
            # Guard against duplicates so each match is reported only once.
            output.append(keyword)

    def set_fail_transitions(self) -> None:
        """Compute the failure link and merged output of every state.

        States are visited breadth-first so that the failure target of a state
        (always shallower than the state itself) is complete before its output
        is inherited. The method may be called repeatedly: inherited keywords
        are merged without duplication.
        """
        queue = deque()
        # Children of the root always fall back to the root.
        for node in self.adlist[0]["next_states"]:
            queue.append(node)
            self.adlist[node]["fail_state"] = 0

        while queue:
            parent = queue.popleft()
            for child in self.adlist[parent]["next_states"]:
                queue.append(child)
                char = self.adlist[child]["value"]

                # Follow the parent's failure chain until a state with an
                # outgoing edge on ``char`` is found, or the root is reached.
                state = self.adlist[parent]["fail_state"]
                target = self.find_next_state(state, char)
                while target is None and state != 0:
                    state = self.adlist[state]["fail_state"]
                    target = self.find_next_state(state, char)

                fail_state = 0 if target is None else target
                self.adlist[child]["fail_state"] = fail_state

                # Every keyword ending at the failure target also ends here.
                child_output = self.adlist[child]["output"]
                for keyword in self.adlist[fail_state]["output"]:
                    if keyword not in child_output:
                        child_output.append(keyword)

    def search_in(self, string: str) -> "dict[str, list[int]]":
        """Return the start indices of every keyword occurrence in ``string``.

        The result maps each keyword that occurs at least once to the list of
        its starting positions, in increasing order.

        >>> A = Automaton(["what", "hat", "ver", "er"])
        >>> A.search_in("whatever, err ... , wherever")
        {'what': [0], 'hat': [1], 'ver': [5, 25], 'er': [6, 10, 22, 26]}
        """
        result = {}  # keyword -> list of the start indices of its occurrences
        current_state = 0
        for index, char in enumerate(string):
            next_state = self.find_next_state(current_state, char)
            # On a mismatch, fall back along failure links until the character
            # can be consumed or the root is reached.
            while next_state is None and current_state != 0:
                current_state = self.adlist[current_state]["fail_state"]
                next_state = self.find_next_state(current_state, char)

            if next_state is None:
                current_state = 0
                continue

            current_state = next_state
            for key in self.adlist[current_state]["output"]:
                # ``index`` is the last character of the match.
                result.setdefault(key, []).append(index - len(key) + 1)
        return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()
|
Loading…
Reference in New Issue
Block a user