Python/strings/jaro_winkler.py
Maxim Smolskiy a42eb35702
Enable ruff E741 rule (#11370)
* Enable ruff E741 rule

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-19 21:30:22 +02:00

79 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"""
def jaro_winkler(str1: str, str2: str) -> float:
"""
JaroWinkler distance is a string metric measuring an edit distance between two
sequences.
Output value is between 0.0 and 1.0.
>>> jaro_winkler("martha", "marhta")
0.9611111111111111
>>> jaro_winkler("CRATE", "TRACE")
0.7333333333333334
>>> jaro_winkler("test", "dbdbdbdb")
0.0
>>> jaro_winkler("test", "test")
1.0
>>> jaro_winkler("hello world", "HeLLo W0rlD")
0.6363636363636364
>>> jaro_winkler("test", "")
0.0
>>> jaro_winkler("hello", "world")
0.4666666666666666
>>> jaro_winkler("hell**o", "*world")
0.4365079365079365
"""
def get_matched_characters(_str1: str, _str2: str) -> str:
matched = []
limit = min(len(_str1), len(_str2)) // 2
for i, char in enumerate(_str1):
left = int(max(0, i - limit))
right = int(min(i + limit + 1, len(_str2)))
if char in _str2[left:right]:
matched.append(char)
_str2 = f"{_str2[0:_str2.index(char)]} {_str2[_str2.index(char) + 1:]}"
return "".join(matched)
# matching characters
matching_1 = get_matched_characters(str1, str2)
matching_2 = get_matched_characters(str2, str1)
match_count = len(matching_1)
# transposition
transpositions = (
len([(c1, c2) for c1, c2 in zip(matching_1, matching_2) if c1 != c2]) // 2
)
if not match_count:
jaro = 0.0
else:
jaro = (
1
/ 3
* (
match_count / len(str1)
+ match_count / len(str2)
+ (match_count - transpositions) / match_count
)
)
# common prefix up to 4 characters
prefix_len = 0
for c1, c2 in zip(str1[:4], str2[:4]):
if c1 == c2:
prefix_len += 1
else:
break
return jaro + 0.1 * prefix_len * (1 - jaro)
if __name__ == "__main__":
import doctest
doctest.testmod()
print(jaro_winkler("hello", "world"))