From 154e5e8681b7ae9711fbef0b89f0ce365a8bf5bf Mon Sep 17 00:00:00 2001 From: Pedram_Mohajer <48964282+pedram-mohajer@users.noreply.github.com> Date: Sun, 26 Nov 2023 17:46:54 -0500 Subject: [PATCH] Update levenshtein_distance.py (#11171) * Update levenshtein_distance.py * Update levenshtein_distance.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update levenshtein_distance.py * Update levenshtein_distance.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update levenshtein_distance.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update levenshtein_distance.py * Update levenshtein_distance.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss --- strings/levenshtein_distance.py | 95 ++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 20 deletions(-) diff --git a/strings/levenshtein_distance.py b/strings/levenshtein_distance.py index 7be4074dc..3af660872 100644 --- a/strings/levenshtein_distance.py +++ b/strings/levenshtein_distance.py @@ -1,20 +1,9 @@ -""" -This is a Python implementation of the levenshtein distance. -Levenshtein distance is a string metric for measuring the -difference between two sequences. - -For doctests run following command: -python -m doctest -v levenshtein-distance.py -or -python3 -m doctest -v levenshtein-distance.py - -For manual testing run: -python levenshtein-distance.py -""" +from collections.abc import Callable def levenshtein_distance(first_word: str, second_word: str) -> int: - """Implementation of the levenshtein distance in Python. + """ + Implementation of the Levenshtein distance in Python. :param first_word: the first word to measure the difference. :param second_word: the second word to measure the difference. :return: the levenshtein distance between the two words. @@ -47,7 +36,7 @@ def levenshtein_distance(first_word: str, second_word: str) -> int: current_row = [i + 1] for j, c2 in enumerate(second_word): - # Calculate insertions, deletions and substitutions + # Calculate insertions, deletions, and substitutions insertions = previous_row[j + 1] + 1 deletions = current_row[j] + 1 substitutions = previous_row[j] + (c1 != c2) @@ -62,9 +51,75 @@ def levenshtein_distance(first_word: str, second_word: str) -> int: return previous_row[-1] -if __name__ == "__main__": - first_word = input("Enter the first word:\n").strip() - second_word = input("Enter the second word:\n").strip() +def levenshtein_distance_optimized(first_word: str, second_word: str) -> int: + """ + Compute the Levenshtein distance between two words (strings). + The function is optimized for efficiency by modifying rows in place. + :param first_word: the first word to measure the difference. + :param second_word: the second word to measure the difference. + :return: the Levenshtein distance between the two words. + Examples: + >>> levenshtein_distance_optimized("planet", "planetary") + 3 + >>> levenshtein_distance_optimized("", "test") + 4 + >>> levenshtein_distance_optimized("book", "back") + 2 + >>> levenshtein_distance_optimized("book", "book") + 0 + >>> levenshtein_distance_optimized("test", "") + 4 + >>> levenshtein_distance_optimized("", "") + 0 + >>> levenshtein_distance_optimized("orchestration", "container") + 10 + """ + if len(first_word) < len(second_word): + return levenshtein_distance_optimized(second_word, first_word) - result = levenshtein_distance(first_word, second_word) - print(f"Levenshtein distance between {first_word} and {second_word} is {result}") + if len(second_word) == 0: + return len(first_word) + + previous_row = list(range(len(second_word) + 1)) + + for i, c1 in enumerate(first_word): + current_row = [i + 1] + [0] * len(second_word) + + for j, c2 in enumerate(second_word): + insertions = previous_row[j + 1] + 1 + deletions = current_row[j] + 1 + substitutions = previous_row[j] + (c1 != c2) + current_row[j + 1] = min(insertions, deletions, substitutions) + + previous_row = current_row + + return previous_row[-1] + + +def benchmark_levenshtein_distance(func: Callable) -> None: + """ + Benchmark the Levenshtein distance function. + :param str: The name of the function being benchmarked. + :param func: The function to be benchmarked. + """ + from timeit import timeit + + stmt = f"{func.__name__}('sitting', 'kitten')" + setup = f"from __main__ import {func.__name__}" + number = 25_000 + result = timeit(stmt=stmt, setup=setup, number=number) + print(f"{func.__name__:<30} finished {number:,} runs in {result:.5f} seconds") + + +if __name__ == "__main__": + # Get user input for words + first_word = input("Enter the first word for Levenshtein distance:\n").strip() + second_word = input("Enter the second word for Levenshtein distance:\n").strip() + + # Calculate and print Levenshtein distances + print(f"{levenshtein_distance(first_word, second_word) = }") + print(f"{levenshtein_distance_optimized(first_word, second_word) = }") + + # Benchmark the Levenshtein distance functions + benchmark_levenshtein_distance(levenshtein_distance) + benchmark_levenshtein_distance(levenshtein_distance_optimized)