"""
|
|
Finds the top K most frequent words from the provided word list.
|
|
|
|
This implementation aims to show how to solve the problem using the Heap class
|
|
already present in this repository.
|
|
Computing order statistics is, in fact, a typical usage of heaps.
|
|
|
|
This is mostly shown for educational purposes, since the problem can be solved
|
|
in a few lines using collections.Counter from the Python standard library:
|
|
|
|
from collections import Counter
|
|
def top_k_frequent_words(words, k_value):
|
|
return [x[0] for x in Counter(words).most_common(k_value)]
|
|
"""

from collections import Counter
from functools import total_ordering

from data_structures.heap.heap import Heap

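# WordCount pairs a word with its occurrence count and compares instances by
# count alone, so a max-heap of WordCount objects yields the most frequent
# words first. @total_ordering derives the remaining comparison operators
# from the __eq__ and __lt__ defined below.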
@total_ordering
class WordCount:
    def __init__(self, word: str, count: int) -> None:
        self.word = word
        self.count = count

    def __eq__(self, other: object) -> bool:
        """
        >>> WordCount('a', 1).__eq__(WordCount('b', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 2))
        False
        >>> WordCount('a', 1).__eq__(WordCount('b', 2))
        False
        >>> WordCount('a', 1).__eq__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            return NotImplemented
        return self.count == other.count

    def __lt__(self, other: object) -> bool:
        """
        >>> WordCount('a', 1).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 2))
        True
        >>> WordCount('a', 1).__lt__(WordCount('b', 2))
        True
        >>> WordCount('a', 2).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 2).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            return NotImplemented
        return self.count < other.count


def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
    """
    Returns the `k_value` most frequently occurring words
    in non-increasing order of occurrence.
    In this context, a word is defined as an element in the provided list.

    If `k_value` is greater than the number of distinct words, the number of
    distinct words is used instead.

    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
    ['c', 'a', 'b']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
    ['c', 'a']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
    ['c']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
    []
    >>> top_k_frequent_words([], 1)
    []
    >>> top_k_frequent_words(['a', 'a'], 2)
    ['a']
    """
    heap: Heap[WordCount] = Heap()
    count_by_word = Counter(words)
    heap.build_max_heap(
        [WordCount(word, count) for word, count in count_by_word.items()]
    )
    return [heap.extract_max().word for _ in range(min(k_value, len(count_by_word)))]


if __name__ == "__main__":
    import doctest

    doctest.testmod()
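    # A small ad-hoc demonstration of the function above; the sample words are
    # illustrative and not part of the module's doctests.
    sample_words = ["heap", "counter", "heap", "word", "heap", "counter"]
    print(top_k_frequent_words(sample_words, 2))  # -> ['heap', 'counter']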