Python/strings/jaro_winkler.py
pre-commit-ci[bot] 4fe50bc1fc
[pre-commit.ci] pre-commit autoupdate -- ruff 2025 stable format (#12521)
* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/astral-sh/ruff-pre-commit: v0.8.6 → v0.9.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.6...v0.9.1)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update maths/dual_number_automatic_differentiation.py

* Update maths/dual_number_automatic_differentiation.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dual_number_automatic_differentiation.py

* Update dual_number_automatic_differentiation.py

* No <fin-streamer> tag with the specified data-test attribute found.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Christian Clauss <cclauss@me.com>
2025-01-13 21:52:12 +01:00

81 lines
2.1 KiB
Python

"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"""
def jaro_winkler(str1: str, str2: str) -> float:
"""
Jaro-Winkler distance is a string metric measuring an edit distance between two
sequences.
Output value is between 0.0 and 1.0.
>>> jaro_winkler("martha", "marhta")
0.9611111111111111
>>> jaro_winkler("CRATE", "TRACE")
0.7333333333333334
>>> jaro_winkler("test", "dbdbdbdb")
0.0
>>> jaro_winkler("test", "test")
1.0
>>> jaro_winkler("hello world", "HeLLo W0rlD")
0.6363636363636364
>>> jaro_winkler("test", "")
0.0
>>> jaro_winkler("hello", "world")
0.4666666666666666
>>> jaro_winkler("hell**o", "*world")
0.4365079365079365
"""
def get_matched_characters(_str1: str, _str2: str) -> str:
matched = []
limit = min(len(_str1), len(_str2)) // 2
for i, char in enumerate(_str1):
left = int(max(0, i - limit))
right = int(min(i + limit + 1, len(_str2)))
if char in _str2[left:right]:
matched.append(char)
_str2 = (
f"{_str2[0 : _str2.index(char)]} {_str2[_str2.index(char) + 1 :]}"
)
return "".join(matched)
# matching characters
matching_1 = get_matched_characters(str1, str2)
matching_2 = get_matched_characters(str2, str1)
match_count = len(matching_1)
# transposition
transpositions = (
len([(c1, c2) for c1, c2 in zip(matching_1, matching_2) if c1 != c2]) // 2
)
if not match_count:
jaro = 0.0
else:
jaro = (
1
/ 3
* (
match_count / len(str1)
+ match_count / len(str2)
+ (match_count - transpositions) / match_count
)
)
# common prefix up to 4 characters
prefix_len = 0
for c1, c2 in zip(str1[:4], str2[:4]):
if c1 == c2:
prefix_len += 1
else:
break
return jaro + 0.1 * prefix_len * (1 - jaro)
if __name__ == "__main__":
import doctest
doctest.testmod()
print(jaro_winkler("hello", "world"))