"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance""" def jaro_winkler(str1: str, str2: str) -> float: """ Jaro-Winkler distance is a string metric measuring an edit distance between two sequences. Output value is between 0.0 and 1.0. >>> jaro_winkler("martha", "marhta") 0.9611111111111111 >>> jaro_winkler("CRATE", "TRACE") 0.7333333333333334 >>> jaro_winkler("test", "dbdbdbdb") 0.0 >>> jaro_winkler("test", "test") 1.0 >>> jaro_winkler("hello world", "HeLLo W0rlD") 0.6363636363636364 >>> jaro_winkler("test", "") 0.0 >>> jaro_winkler("hello", "world") 0.4666666666666666 >>> jaro_winkler("hell**o", "*world") 0.4365079365079365 """ def get_matched_characters(_str1: str, _str2: str) -> str: matched = [] limit = min(len(_str1), len(_str2)) // 2 for i, char in enumerate(_str1): left = int(max(0, i - limit)) right = int(min(i + limit + 1, len(_str2))) if char in _str2[left:right]: matched.append(char) _str2 = f"{_str2[0:_str2.index(char)]} {_str2[_str2.index(char) + 1:]}" return "".join(matched) # matching characters matching_1 = get_matched_characters(str1, str2) matching_2 = get_matched_characters(str2, str1) match_count = len(matching_1) # transposition transpositions = ( len([(c1, c2) for c1, c2 in zip(matching_1, matching_2) if c1 != c2]) // 2 ) if not match_count: jaro = 0.0 else: jaro = ( 1 / 3 * ( match_count / len(str1) + match_count / len(str2) + (match_count - transpositions) / match_count ) ) # common prefix up to 4 characters prefix_len = 0 for c1, c2 in zip(str1[:4], str2[:4]): if c1 == c2: prefix_len += 1 else: break return jaro + 0.1 * prefix_len * (1 - jaro) if __name__ == "__main__": import doctest doctest.testmod() print(jaro_winkler("hello", "world"))