Python/strings/top_k_frequent_words.py

102 lines
3.1 KiB
Python
Raw Normal View History

"""
Finds the top K most frequent words from the provided word list.
This implementation aims to show how to solve the problem using the Heap class
already present in this repository.
Computing order statistics is, in fact, a typical usage of heaps.
This is mostly shown for educational purposes, since the problem can be solved
in a few lines using collections.Counter from the Python standard library:
from collections import Counter
def top_k_frequent_words(words, k_value):
return [x[0] for x in Counter(words).most_common(k_value)]
"""
from collections import Counter
from functools import total_ordering
from data_structures.heap.heap import Heap
@total_ordering
class WordCount:
def __init__(self, word: str, count: int) -> None:
self.word = word
self.count = count
def __eq__(self, other: object) -> bool:
"""
>>> WordCount('a', 1).__eq__(WordCount('b', 1))
True
>>> WordCount('a', 1).__eq__(WordCount('a', 1))
True
>>> WordCount('a', 1).__eq__(WordCount('a', 2))
False
>>> WordCount('a', 1).__eq__(WordCount('b', 2))
False
>>> WordCount('a', 1).__eq__(1)
NotImplemented
"""
if not isinstance(other, WordCount):
return NotImplemented
return self.count == other.count
def __lt__(self, other: object) -> bool:
"""
>>> WordCount('a', 1).__lt__(WordCount('b', 1))
False
>>> WordCount('a', 1).__lt__(WordCount('a', 1))
False
>>> WordCount('a', 1).__lt__(WordCount('a', 2))
True
>>> WordCount('a', 1).__lt__(WordCount('b', 2))
True
>>> WordCount('a', 2).__lt__(WordCount('a', 1))
False
>>> WordCount('a', 2).__lt__(WordCount('b', 1))
False
>>> WordCount('a', 1).__lt__(1)
NotImplemented
"""
if not isinstance(other, WordCount):
return NotImplemented
return self.count < other.count
def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
"""
Returns the `k_value` most frequently occurring words,
in non-increasing order of occurrence.
In this context, a word is defined as an element in the provided list.
In case `k_value` is greater than the number of distinct words, a value of k equal
to the number of distinct words will be considered, instead.
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
['c', 'a', 'b']
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
['c', 'a']
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
['c']
>>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
[]
>>> top_k_frequent_words([], 1)
[]
>>> top_k_frequent_words(['a', 'a'], 2)
['a']
"""
heap: Heap[WordCount] = Heap()
count_by_word = Counter(words)
heap.build_max_heap(
[WordCount(word, count) for word, count in count_by_word.items()]
)
return [heap.extract_max().word for _ in range(min(k_value, len(count_by_word)))]
if __name__ == "__main__":
import doctest
doctest.testmod()