Solving the Top k most frequent words problem using a max-heap (#8685)

* Solving the `Top k most frequent words` problem using a max-heap
* Mentioning Python standard library solution in `Top k most frequent words` docstring
* ruff --fix .
* updating DIRECTORY.md

---------

Co-authored-by: Amos Paribocci <aparibocci@gmail.com>
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
2025-05-04 03:23:36 +00:00 · 2023-04-27 19:32:07 +02:00 · 2023-04-27 19:32:07 +02:00 · 4c1f876567
commit 4c1f876567
parent c1b3ea5355
4 changed files with 128 additions and 7 deletions
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -1167,6 +1167,7 @@
  * [Snake Case To Camel Pascal Case](strings/snake_case_to_camel_pascal_case.py)
  * [Split](strings/split.py)
  * [Text Justification](strings/text_justification.py)
+  * [Top K Frequent Words](strings/top_k_frequent_words.py)
  * [Upper](strings/upper.py)
  * [Wave](strings/wave.py)
  * [Wildcard Pattern Matching](strings/wildcard_pattern_matching.py)
--- a/data_structures/heap/heap.py
+++ b/data_structures/heap/heap.py
@@ -1,9 +1,28 @@
 from __future__ import annotations

+from abc import abstractmethod
 from collections.abc import Iterable
+from typing import Generic, Protocol, TypeVar


-class Heap:
+class Comparable(Protocol):  # structural type for values that support <, > and ==
+    @abstractmethod
+    def __lt__(self: T, other: T) -> bool:  # T is bound below (lazy annotations)
+        pass
+
+    @abstractmethod
+    def __gt__(self: T, other: T) -> bool:
+        pass
+
+    @abstractmethod
+    def __eq__(self: T, other: object) -> bool:  # accepts any object, unlike < and >
+        pass
+
+
+T = TypeVar("T", bound=Comparable)
+
+
+class Heap(Generic[T]):
    """A Max Heap Implementation

    >>> unsorted = [103, 9, 1, 7, 11, 15, 25, 201, 209, 107, 5]
@@ -27,7 +46,7 @@ class Heap:
    """

    def __init__(self) -> None:
-        self.h: list[float] = []
+        self.h: list[T] = []
        self.heap_size: int = 0

    def __repr__(self) -> str:
@@ -79,7 +98,7 @@ class Heap:
                # fix the subsequent violation recursively if any
                self.max_heapify(violation)

-    def build_max_heap(self, collection: Iterable[float]) -> None:
+    def build_max_heap(self, collection: Iterable[T]) -> None:
        """build max heap from an unsorted array"""
        self.h = list(collection)
        self.heap_size = len(self.h)
@@ -88,7 +107,7 @@ class Heap:
            for i in range(self.heap_size // 2 - 1, -1, -1):
                self.max_heapify(i)

-    def extract_max(self) -> float:
+    def extract_max(self) -> T:
        """get and remove max from heap"""
        if self.heap_size >= 2:
            me = self.h[0]
@@ -102,7 +121,7 @@ class Heap:
        else:
            raise Exception("Empty heap")

-    def insert(self, value: float) -> None:
+    def insert(self, value: T) -> None:
        """insert a new value into the max heap"""
        self.h.append(value)
        idx = (self.heap_size - 1) // 2
@@ -144,7 +163,7 @@ if __name__ == "__main__":
    ]:
        print(f"unsorted array: {unsorted}")

-        heap = Heap()
+        heap: Heap[int] = Heap()
        heap.build_max_heap(unsorted)
        print(f"after build heap: {heap}")

--- a/machine_learning/linear_discriminant_analysis.py
+++ b/machine_learning/linear_discriminant_analysis.py
@@ -399,7 +399,7 @@ def main():
        if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q":
            print("\n" + "GoodBye!".center(100, "-") + "\n")
            break
-        system("clear" if name == "posix" else "cls")  # noqa: S605
+        system("cls" if name == "nt" else "clear")  # noqa: S605


 if __name__ == "__main__":
--- a/strings/top_k_frequent_words.py
+++ b/strings/top_k_frequent_words.py
@@ -0,0 +1,101 @@
+"""
+Finds the top K most frequent words from the provided word list.
+
+This implementation aims to show how to solve the problem using the Heap class
+already present in this repository.
+Computing order statistics is, in fact, a typical usage of heaps.
+
+This is mostly shown for educational purposes, since the problem can be solved
+in a few lines using collections.Counter from the Python standard library:
+
+from collections import Counter
+def top_k_frequent_words(words, k_value):
+    return [x[0] for x in Counter(words).most_common(k_value)]
+"""
+
+
+from collections import Counter
+from functools import total_ordering
+
+from data_structures.heap.heap import Heap
+
+
+@total_ordering  # fills in <=, > and >= from the __eq__ and __lt__ defined below
+class WordCount:  # a word paired with its occurrence count; compared by count only
+    def __init__(self, word: str, count: int) -> None:
+        self.word = word  # payload only; never consulted by the comparisons
+        self.count = count  # the sole comparison key
+
+    def __eq__(self, other: object) -> bool:  # equal counts compare equal
+        """
+        >>> WordCount('a', 1).__eq__(WordCount('b', 1))
+        True
+        >>> WordCount('a', 1).__eq__(WordCount('a', 1))
+        True
+        >>> WordCount('a', 1).__eq__(WordCount('a', 2))
+        False
+        >>> WordCount('a', 1).__eq__(WordCount('b', 2))
+        False
+        >>> WordCount('a', 1).__eq__(1)
+        NotImplemented
+        """
+        if not isinstance(other, WordCount):
+            return NotImplemented  # signal an unsupported operand type
+        return self.count == other.count  # the words themselves may differ
+
+    def __lt__(self, other: object) -> bool:  # a smaller count orders first
+        """
+        >>> WordCount('a', 1).__lt__(WordCount('b', 1))
+        False
+        >>> WordCount('a', 1).__lt__(WordCount('a', 1))
+        False
+        >>> WordCount('a', 1).__lt__(WordCount('a', 2))
+        True
+        >>> WordCount('a', 1).__lt__(WordCount('b', 2))
+        True
+        >>> WordCount('a', 2).__lt__(WordCount('a', 1))
+        False
+        >>> WordCount('a', 2).__lt__(WordCount('b', 1))
+        False
+        >>> WordCount('a', 1).__lt__(1)
+        NotImplemented
+        """
+        if not isinstance(other, WordCount):
+            return NotImplemented  # signal an unsupported operand type
+        return self.count < other.count  # ties on count are never "less than"
+
+
+def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
+    """
+    Returns the `k_value` most frequently occurring words,
+    in non-increasing order of occurrence.
+    In this context, a word is defined as an element in the provided list.
+
+    In case `k_value` is greater than the number of distinct words, a value of k equal
+    to the number of distinct words will be considered, instead.
+
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
+    ['c', 'a', 'b']
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
+    ['c', 'a']
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
+    ['c']
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
+    []
+    >>> top_k_frequent_words([], 1)
+    []
+    >>> top_k_frequent_words(['a', 'a'], 2)
+    ['a']
+    """
+    heap: Heap[WordCount] = Heap()
+    count_by_word = Counter(words)  # distinct word -> occurrences; its size caps k
+    heap.build_max_heap(
+        [WordCount(word, count) for word, count in count_by_word.items()]
+    )  # entries compare by count, so extract_max yields the most frequent word left
+    return [heap.extract_max().word for _ in range(min(k_value, len(count_by_word)))]
+
+
+if __name__ == "__main__":  # only when run as a script, not on import
+    import doctest
+
+    doctest.testmod()  # run the doctest examples found in this module's docstrings