mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-01-18 08:17:01 +00:00
Solving the Top k most frequent words
problem using a max-heap (#8685)
* Solving the `Top k most frequent words` problem using a max-heap
* Mentioning Python standard library solution in `Top k most frequent words` docstring
* ruff --fix .
* updating DIRECTORY.md

---------

Co-authored-by: Amos Paribocci <aparibocci@gmail.com>
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
This commit is contained in:
parent
c1b3ea5355
commit
4c1f876567
|
@ -1167,6 +1167,7 @@
|
|||
* [Snake Case To Camel Pascal Case](strings/snake_case_to_camel_pascal_case.py)
|
||||
* [Split](strings/split.py)
|
||||
* [Text Justification](strings/text_justification.py)
|
||||
* [Top K Frequent Words](strings/top_k_frequent_words.py)
|
||||
* [Upper](strings/upper.py)
|
||||
* [Wave](strings/wave.py)
|
||||
* [Wildcard Pattern Matching](strings/wildcard_pattern_matching.py)
|
||||
|
|
|
@ -1,9 +1,28 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from typing import Generic, Protocol, TypeVar
|
||||
|
||||
|
||||
class Heap:
|
||||
class Comparable(Protocol):
    """Structural type for values that support ``<``, ``>`` and ``==``.

    Used as the upper bound of the type variable ``T`` so that generic
    containers can compare their elements.
    """

    @abstractmethod
    def __lt__(self: T, other: T) -> bool:
        """Return whether ``self`` orders strictly before ``other``."""
        ...

    @abstractmethod
    def __gt__(self: T, other: T) -> bool:
        """Return whether ``self`` orders strictly after ``other``."""
        ...

    @abstractmethod
    def __eq__(self: T, other: object) -> bool:
        """Return whether ``self`` and ``other`` compare equal."""
        ...
|
||||
|
||||
|
||||
T = TypeVar("T", bound=Comparable)
|
||||
|
||||
|
||||
class Heap(Generic[T]):
|
||||
"""A Max Heap Implementation
|
||||
|
||||
>>> unsorted = [103, 9, 1, 7, 11, 15, 25, 201, 209, 107, 5]
|
||||
|
@ -27,7 +46,7 @@ class Heap:
|
|||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.h: list[float] = []
|
||||
self.h: list[T] = []
|
||||
self.heap_size: int = 0
|
||||
|
||||
def __repr__(self) -> str:
|
||||
|
@ -79,7 +98,7 @@ class Heap:
|
|||
# fix the subsequent violation recursively if any
|
||||
self.max_heapify(violation)
|
||||
|
||||
def build_max_heap(self, collection: Iterable[float]) -> None:
|
||||
def build_max_heap(self, collection: Iterable[T]) -> None:
|
||||
"""build max heap from an unsorted array"""
|
||||
self.h = list(collection)
|
||||
self.heap_size = len(self.h)
|
||||
|
@ -88,7 +107,7 @@ class Heap:
|
|||
for i in range(self.heap_size // 2 - 1, -1, -1):
|
||||
self.max_heapify(i)
|
||||
|
||||
def extract_max(self) -> float:
|
||||
def extract_max(self) -> T:
|
||||
"""get and remove max from heap"""
|
||||
if self.heap_size >= 2:
|
||||
me = self.h[0]
|
||||
|
@ -102,7 +121,7 @@ class Heap:
|
|||
else:
|
||||
raise Exception("Empty heap")
|
||||
|
||||
def insert(self, value: float) -> None:
|
||||
def insert(self, value: T) -> None:
|
||||
"""insert a new value into the max heap"""
|
||||
self.h.append(value)
|
||||
idx = (self.heap_size - 1) // 2
|
||||
|
@ -144,7 +163,7 @@ if __name__ == "__main__":
|
|||
]:
|
||||
print(f"unsorted array: {unsorted}")
|
||||
|
||||
heap = Heap()
|
||||
heap: Heap[int] = Heap()
|
||||
heap.build_max_heap(unsorted)
|
||||
print(f"after build heap: {heap}")
|
||||
|
||||
|
|
|
@ -399,7 +399,7 @@ def main():
|
|||
if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q":
|
||||
print("\n" + "GoodBye!".center(100, "-") + "\n")
|
||||
break
|
||||
system("clear" if name == "posix" else "cls") # noqa: S605
|
||||
system("cls" if name == "nt" else "clear") # noqa: S605
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
101
strings/top_k_frequent_words.py
Normal file
101
strings/top_k_frequent_words.py
Normal file
|
@ -0,0 +1,101 @@
|
|||
"""
|
||||
Finds the top K most frequent words from the provided word list.
|
||||
|
||||
This implementation aims to show how to solve the problem using the Heap class
|
||||
already present in this repository.
|
||||
Computing order statistics is, in fact, a typical usage of heaps.
|
||||
|
||||
This is mostly shown for educational purposes, since the problem can be solved
|
||||
in a few lines using collections.Counter from the Python standard library:
|
||||
|
||||
from collections import Counter
|
||||
def top_k_frequent_words(words, k_value):
|
||||
return [x[0] for x in Counter(words).most_common(k_value)]
|
||||
"""
|
||||
|
||||
|
||||
from collections import Counter
|
||||
from functools import total_ordering
|
||||
|
||||
from data_structures.heap.heap import Heap
|
||||
|
||||
|
||||
@total_ordering
class WordCount:
    """Pair a word with its number of occurrences, compared by count alone.

    Equality and ordering deliberately ignore ``word``: two instances with the
    same ``count`` compare equal even when their words differ.
    ``functools.total_ordering`` derives ``<=``, ``>`` and ``>=`` from the
    ``__eq__`` and ``__lt__`` methods defined here.

    Since ``__eq__`` is overridden without ``__hash__``, instances are
    unhashable.
    """

    def __init__(self, word: str, count: int) -> None:
        self.word = word
        self.count = count

    def __repr__(self) -> str:
        """
        Return a constructor-like representation, useful when debugging.

        >>> WordCount('a', 1)
        WordCount(word='a', count=1)
        """
        return f"{type(self).__name__}(word={self.word!r}, count={self.count!r})"

    def __eq__(self, other: object) -> bool:
        """
        Compare by ``count`` only; the words themselves are not considered.

        Returns ``NotImplemented`` when ``other`` is not a ``WordCount`` so
        that Python can fall back to the other operand's comparison.

        >>> WordCount('a', 1).__eq__(WordCount('b', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 2))
        False
        >>> WordCount('a', 1).__eq__(WordCount('b', 2))
        False
        >>> WordCount('a', 1).__eq__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            return NotImplemented
        return self.count == other.count

    def __lt__(self, other: object) -> bool:
        """
        Order by ``count`` only; the words themselves are not considered.

        Returns ``NotImplemented`` when ``other`` is not a ``WordCount``.

        >>> WordCount('a', 1).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 2))
        True
        >>> WordCount('a', 1).__lt__(WordCount('b', 2))
        True
        >>> WordCount('a', 2).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 2).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            return NotImplemented
        return self.count < other.count
|
||||
|
||||
|
||||
def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
    """
    Return the `k_value` most frequent words, most frequent first.

    Every element of `words` is treated as one word. The occurrences are
    tallied, the tallies are arranged into a max-heap keyed on the count, and
    the maximum is extracted repeatedly to produce the result in
    non-increasing order of occurrence.

    When `k_value` exceeds the number of distinct words, only as many words as
    there are distinct ones are returned.

    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
    ['c', 'a', 'b']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
    ['c', 'a']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
    ['c']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
    []
    >>> top_k_frequent_words([], 1)
    []
    >>> top_k_frequent_words(['a', 'a'], 2)
    ['a']
    """
    frequencies = Counter(words)
    entries = [WordCount(token, total) for token, total in frequencies.items()]

    heap: Heap[WordCount] = Heap()
    heap.build_max_heap(entries)

    # Never extract more items than the heap holds.
    how_many = min(k_value, len(frequencies))
    most_frequent = []
    for _ in range(how_many):
        most_frequent.append(heap.extract_max().word)
    return most_frequent
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run every doctest embedded in this module's docstrings.
    from doctest import testmod

    testmod()
|
Loading…
Reference in New Issue
Block a user