Solving the Top k most frequent words problem using a max-heap (#8685)

* Solving the `Top k most frequent words` problem using a max-heap

* Mentioning Python standard library solution in `Top k most frequent words` docstring

* ruff --fix .

* updating DIRECTORY.md

---------

Co-authored-by: Amos Paribocci <aparibocci@gmail.com>
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
This commit is contained in:
Christian Clauss 2023-04-27 19:32:07 +02:00 committed by GitHub
parent c1b3ea5355
commit 4c1f876567
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 128 additions and 7 deletions

View File

@ -1167,6 +1167,7 @@
* [Snake Case To Camel Pascal Case](strings/snake_case_to_camel_pascal_case.py) * [Snake Case To Camel Pascal Case](strings/snake_case_to_camel_pascal_case.py)
* [Split](strings/split.py) * [Split](strings/split.py)
* [Text Justification](strings/text_justification.py) * [Text Justification](strings/text_justification.py)
* [Top K Frequent Words](strings/top_k_frequent_words.py)
* [Upper](strings/upper.py) * [Upper](strings/upper.py)
* [Wave](strings/wave.py) * [Wave](strings/wave.py)
* [Wildcard Pattern Matching](strings/wildcard_pattern_matching.py) * [Wildcard Pattern Matching](strings/wildcard_pattern_matching.py)

View File

@ -1,9 +1,28 @@
from __future__ import annotations from __future__ import annotations
from abc import abstractmethod
from collections.abc import Iterable from collections.abc import Iterable
from typing import Generic, Protocol, TypeVar
class Heap: class Comparable(Protocol):
@abstractmethod
def __lt__(self: T, other: T) -> bool:
pass
@abstractmethod
def __gt__(self: T, other: T) -> bool:
pass
@abstractmethod
def __eq__(self: T, other: object) -> bool:
pass
T = TypeVar("T", bound=Comparable)
class Heap(Generic[T]):
"""A Max Heap Implementation """A Max Heap Implementation
>>> unsorted = [103, 9, 1, 7, 11, 15, 25, 201, 209, 107, 5] >>> unsorted = [103, 9, 1, 7, 11, 15, 25, 201, 209, 107, 5]
@ -27,7 +46,7 @@ class Heap:
""" """
def __init__(self) -> None: def __init__(self) -> None:
self.h: list[float] = [] self.h: list[T] = []
self.heap_size: int = 0 self.heap_size: int = 0
def __repr__(self) -> str: def __repr__(self) -> str:
@ -79,7 +98,7 @@ class Heap:
# fix the subsequent violation recursively if any # fix the subsequent violation recursively if any
self.max_heapify(violation) self.max_heapify(violation)
def build_max_heap(self, collection: Iterable[float]) -> None: def build_max_heap(self, collection: Iterable[T]) -> None:
"""build max heap from an unsorted array""" """build max heap from an unsorted array"""
self.h = list(collection) self.h = list(collection)
self.heap_size = len(self.h) self.heap_size = len(self.h)
@ -88,7 +107,7 @@ class Heap:
for i in range(self.heap_size // 2 - 1, -1, -1): for i in range(self.heap_size // 2 - 1, -1, -1):
self.max_heapify(i) self.max_heapify(i)
def extract_max(self) -> float: def extract_max(self) -> T:
"""get and remove max from heap""" """get and remove max from heap"""
if self.heap_size >= 2: if self.heap_size >= 2:
me = self.h[0] me = self.h[0]
@ -102,7 +121,7 @@ class Heap:
else: else:
raise Exception("Empty heap") raise Exception("Empty heap")
def insert(self, value: float) -> None: def insert(self, value: T) -> None:
"""insert a new value into the max heap""" """insert a new value into the max heap"""
self.h.append(value) self.h.append(value)
idx = (self.heap_size - 1) // 2 idx = (self.heap_size - 1) // 2
@ -144,7 +163,7 @@ if __name__ == "__main__":
]: ]:
print(f"unsorted array: {unsorted}") print(f"unsorted array: {unsorted}")
heap = Heap() heap: Heap[int] = Heap()
heap.build_max_heap(unsorted) heap.build_max_heap(unsorted)
print(f"after build heap: {heap}") print(f"after build heap: {heap}")

View File

@ -399,7 +399,7 @@ def main():
if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q": if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q":
print("\n" + "GoodBye!".center(100, "-") + "\n") print("\n" + "GoodBye!".center(100, "-") + "\n")
break break
system("clear" if name == "posix" else "cls") # noqa: S605 system("cls" if name == "nt" else "clear") # noqa: S605
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -0,0 +1,101 @@
"""
Finds the top K most frequent words from the provided word list.
This implementation aims to show how to solve the problem using the Heap class
already present in this repository.
Computing order statistics is, in fact, a typical usage of heaps.
This is mostly shown for educational purposes, since the problem can be solved
in a few lines using collections.Counter from the Python standard library:
    from collections import Counter
    def top_k_frequent_words(words, k_value):
        return [x[0] for x in Counter(words).most_common(k_value)]
"""
from collections import Counter
from functools import total_ordering
from data_structures.heap.heap import Heap
@total_ordering
class WordCount:
    """Pair a word with its number of occurrences, ordered by count alone.

    Both ``__eq__`` and ``__lt__`` look only at ``count``; the word itself never
    takes part in a comparison, so two different words with the same count
    compare equal. ``functools.total_ordering`` derives the remaining rich
    comparisons (``<=``, ``>``, ``>=``) from these two methods.

    Because ``__eq__`` is overridden without ``__hash__``, instances are
    unhashable.

    >>> WordCount('a', 1)
    WordCount(word='a', count=1)
    >>> WordCount('a', 2) > WordCount('b', 1)
    True
    >>> WordCount('a', 1) >= WordCount('b', 1)
    True
    """

    def __init__(self, word: str, count: int) -> None:
        """Store the word and how many times it occurred."""
        self.word = word
        self.count = count

    def __repr__(self) -> str:
        """
        Return a readable representation showing both fields.

        >>> repr(WordCount('spam', 3))
        "WordCount(word='spam', count=3)"
        """
        return f"{type(self).__name__}(word={self.word!r}, count={self.count!r})"

    def __eq__(self, other: object) -> bool:
        """
        Two instances are equal when their counts match, whatever the words are.
        Returns NotImplemented for anything that is not a WordCount.

        >>> WordCount('a', 1).__eq__(WordCount('b', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 2))
        False
        >>> WordCount('a', 1).__eq__(WordCount('b', 2))
        False
        >>> WordCount('a', 1).__eq__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            # Let Python try the reflected operation / fall back to identity.
            return NotImplemented
        return self.count == other.count

    def __lt__(self, other: object) -> bool:
        """
        An instance is smaller when its count is strictly lower.
        Returns NotImplemented for anything that is not a WordCount.

        >>> WordCount('a', 1).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 2))
        True
        >>> WordCount('a', 1).__lt__(WordCount('b', 2))
        True
        >>> WordCount('a', 2).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 2).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            # Let Python try the reflected operation or raise TypeError itself.
            return NotImplemented
        return self.count < other.count
def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
    """
    Return up to `k_value` of the most frequent words, most frequent first.

    Every element of `words` counts as one occurrence of that word.
    The words are tallied, the tallies are loaded into a max-heap, and the
    heap maximum is extracted once per requested result.
    If `k_value` exceeds the number of distinct words, only as many results
    as there are distinct words are returned.

    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
    ['c', 'a', 'b']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
    ['c', 'a']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
    ['c']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
    []
    >>> top_k_frequent_words([], 1)
    []
    >>> top_k_frequent_words(['a', 'a'], 2)
    ['a']
    """
    frequencies = Counter(words)
    entries = [WordCount(word, count) for word, count in frequencies.items()]

    max_heap: Heap[WordCount] = Heap()
    max_heap.build_max_heap(entries)

    # Never extract more items than the heap actually holds.
    how_many = min(k_value, len(frequencies))

    most_frequent: list[str] = []
    for _ in range(how_many):
        most_frequent.append(max_heap.extract_max().word)
    return most_frequent
if __name__ == "__main__":
    # When run as a script, execute the examples embedded in the docstrings.
    from doctest import testmod

    testmod()