"""
|
|
Finds the top K most frequent words from the provided word list.
|
|
|
|
This implementation aims to show how to solve the problem using the Heap class
|
|
already present in this repository.
|
|
Computing order statistics is, in fact, a typical usage of heaps.
|
|
|
|
This is mostly shown for educational purposes, since the problem can be solved
|
|
in a few lines using collections.Counter from the Python standard library:
|
|
|
|
from collections import Counter
|
|
def top_k_frequent_words(words, k_value):
|
|
return [x[0] for x in Counter(words).most_common(k_value)]
|
|
"""

from collections import Counter
from functools import total_ordering

from data_structures.heap.heap import Heap

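# WordCount pairs a word with its occurrence count and compares instances by
# count alone, so a max-heap of WordCount objects yields the most frequent
# words first. @total_ordering derives the remaining comparison operators
# from the __eq__ and __lt__ defined below.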
@total_ordering
class WordCount:
    def __init__(self, word: str, count: int) -> None:
        self.word = word
        self.count = count

    def __eq__(self, other: object) -> bool:
        """
        >>> WordCount('a', 1).__eq__(WordCount('b', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 2))
        False
        >>> WordCount('a', 1).__eq__(WordCount('b', 2))
        False
        >>> WordCount('a', 1).__eq__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            return NotImplemented
        return self.count == other.count

    def __lt__(self, other: object) -> bool:
        """
        >>> WordCount('a', 1).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 2))
        True
        >>> WordCount('a', 1).__lt__(WordCount('b', 2))
        True
        >>> WordCount('a', 2).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 2).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            return NotImplemented
        return self.count < other.count


def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
    """
    Returns the `k_value` most frequently occurring words
    in non-increasing order of occurrence.
    In this context, a word is defined as an element in the provided list.

    If `k_value` is greater than the number of distinct words, the number of
    distinct words is used instead.

    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
    ['c', 'a', 'b']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
    ['c', 'a']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
    ['c']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
    []
    >>> top_k_frequent_words([], 1)
    []
    >>> top_k_frequent_words(['a', 'a'], 2)
    ['a']
    """
    heap: Heap[WordCount] = Heap()
    count_by_word = Counter(words)
    heap.build_max_heap(
        [WordCount(word, count) for word, count in count_by_word.items()]
    )
    return [heap.extract_max().word for _ in range(min(k_value, len(count_by_word)))]


if __name__ == "__main__":
    import doctest

    doctest.testmod()
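    # A small ad-hoc demonstration of the function above; the sample words are
    # illustrative and not part of the module's doctests.
    sample_words = ["heap", "counter", "heap", "word", "heap", "counter"]
    print(top_k_frequent_words(sample_words, 2))  # -> ['heap', 'counter']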