diff --git a/DIRECTORY.md b/DIRECTORY.md index 8e67c85c6..681d252b2 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -1167,6 +1167,7 @@ * [Snake Case To Camel Pascal Case](strings/snake_case_to_camel_pascal_case.py) * [Split](strings/split.py) * [Text Justification](strings/text_justification.py) + * [Top K Frequent Words](strings/top_k_frequent_words.py) * [Upper](strings/upper.py) * [Wave](strings/wave.py) * [Wildcard Pattern Matching](strings/wildcard_pattern_matching.py) diff --git a/data_structures/heap/heap.py b/data_structures/heap/heap.py index b14c55d9d..c1004f349 100644 --- a/data_structures/heap/heap.py +++ b/data_structures/heap/heap.py @@ -1,9 +1,28 @@ from __future__ import annotations +from abc import abstractmethod from collections.abc import Iterable +from typing import Generic, Protocol, TypeVar -class Heap: +class Comparable(Protocol): + @abstractmethod + def __lt__(self: T, other: T) -> bool: + pass + + @abstractmethod + def __gt__(self: T, other: T) -> bool: + pass + + @abstractmethod + def __eq__(self: T, other: object) -> bool: + pass + + +T = TypeVar("T", bound=Comparable) + + +class Heap(Generic[T]): """A Max Heap Implementation >>> unsorted = [103, 9, 1, 7, 11, 15, 25, 201, 209, 107, 5] @@ -27,7 +46,7 @@ class Heap: """ def __init__(self) -> None: - self.h: list[float] = [] + self.h: list[T] = [] self.heap_size: int = 0 def __repr__(self) -> str: @@ -79,7 +98,7 @@ class Heap: # fix the subsequent violation recursively if any self.max_heapify(violation) - def build_max_heap(self, collection: Iterable[float]) -> None: + def build_max_heap(self, collection: Iterable[T]) -> None: """build max heap from an unsorted array""" self.h = list(collection) self.heap_size = len(self.h) @@ -88,7 +107,7 @@ class Heap: for i in range(self.heap_size // 2 - 1, -1, -1): self.max_heapify(i) - def extract_max(self) -> float: + def extract_max(self) -> T: """get and remove max from heap""" if self.heap_size >= 2: me = self.h[0] @@ -102,7 +121,7 @@ class Heap: else: raise Exception("Empty heap") - def insert(self, value: float) -> None: + def insert(self, value: T) -> None: """insert a new value into the max heap""" self.h.append(value) idx = (self.heap_size - 1) // 2 @@ -144,7 +163,7 @@ if __name__ == "__main__": ]: print(f"unsorted array: {unsorted}") - heap = Heap() + heap: Heap[int] = Heap() heap.build_max_heap(unsorted) print(f"after build heap: {heap}") diff --git a/machine_learning/linear_discriminant_analysis.py b/machine_learning/linear_discriminant_analysis.py index c0a477be1..88c047157 100644 --- a/machine_learning/linear_discriminant_analysis.py +++ b/machine_learning/linear_discriminant_analysis.py @@ -399,7 +399,7 @@ def main(): if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q": print("\n" + "GoodBye!".center(100, "-") + "\n") break - system("clear" if name == "posix" else "cls") # noqa: S605 + system("cls" if name == "nt" else "clear") # noqa: S605 if __name__ == "__main__": diff --git a/strings/top_k_frequent_words.py b/strings/top_k_frequent_words.py new file mode 100644 index 000000000..f3d1e0cd5 --- /dev/null +++ b/strings/top_k_frequent_words.py @@ -0,0 +1,101 @@ +""" +Finds the top K most frequent words from the provided word list. + +This implementation aims to show how to solve the problem using the Heap class +already present in this repository. +Computing order statistics is, in fact, a typical usage of heaps. + +This is mostly shown for educational purposes, since the problem can be solved +in a few lines using collections.Counter from the Python standard library: + +from collections import Counter +def top_k_frequent_words(words, k_value): + return [x[0] for x in Counter(words).most_common(k_value)] +""" + + +from collections import Counter +from functools import total_ordering + +from data_structures.heap.heap import Heap + + +@total_ordering +class WordCount: + def __init__(self, word: str, count: int) -> None: + self.word = word + self.count = count + + def __eq__(self, other: object) -> bool: + """ + >>> WordCount('a', 1).__eq__(WordCount('b', 1)) + True + >>> WordCount('a', 1).__eq__(WordCount('a', 1)) + True + >>> WordCount('a', 1).__eq__(WordCount('a', 2)) + False + >>> WordCount('a', 1).__eq__(WordCount('b', 2)) + False + >>> WordCount('a', 1).__eq__(1) + NotImplemented + """ + if not isinstance(other, WordCount): + return NotImplemented + return self.count == other.count + + def __lt__(self, other: object) -> bool: + """ + >>> WordCount('a', 1).__lt__(WordCount('b', 1)) + False + >>> WordCount('a', 1).__lt__(WordCount('a', 1)) + False + >>> WordCount('a', 1).__lt__(WordCount('a', 2)) + True + >>> WordCount('a', 1).__lt__(WordCount('b', 2)) + True + >>> WordCount('a', 2).__lt__(WordCount('a', 1)) + False + >>> WordCount('a', 2).__lt__(WordCount('b', 1)) + False + >>> WordCount('a', 1).__lt__(1) + NotImplemented + """ + if not isinstance(other, WordCount): + return NotImplemented + return self.count < other.count + + +def top_k_frequent_words(words: list[str], k_value: int) -> list[str]: + """ + Returns the `k_value` most frequently occurring words, + in non-increasing order of occurrence. + In this context, a word is defined as an element in the provided list. + + In case `k_value` is greater than the number of distinct words, a value of k equal + to the number of distinct words will be considered, instead. + + >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3) + ['c', 'a', 'b'] + >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2) + ['c', 'a'] + >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1) + ['c'] + >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0) + [] + >>> top_k_frequent_words([], 1) + [] + >>> top_k_frequent_words(['a', 'a'], 2) + ['a'] + """ + heap: Heap[WordCount] = Heap() + count_by_word = Counter(words) + heap.build_max_heap( + [WordCount(word, count) for word, count in count_by_word.items()] + ) + return [heap.extract_max().word for _ in range(min(k_value, len(count_by_word)))] + + +if __name__ == "__main__": + import doctest + + doctest.testmod()