feat: replace dict get with dunder method

2025-05-03 02:53:36 +00:00 · 2024-10-05 19:52:56 +02:00 · 2024-10-05 19:52:56 +02:00 · 790475a622
commit 790475a622
parent 5b58203a0f
1 changed files with 6 additions and 6 deletions
--- a/strings/bpe_tokenizer.py
+++ b/strings/bpe_tokenizer.py
@@ -7,7 +7,7 @@ import itertools
 from collections import OrderedDict


-def get_byte_pair_counts(ids: list[int]):
+def get_byte_pair_counts(ids: list[int]) -> dict:
    """Count consecutive byte-pairs of an encoded string.

    >>> ids = [73, 32, 97, 109, 32, 74, 111, 110, 83, 110, 111, 119, 46]
@@ -23,7 +23,7 @@ def get_byte_pair_counts(ids: list[int]):
    return counts


-def merge(ids: list[int], pair: tuple, idx: int):
+def merge(ids: list[int], pair: tuple, idx: int) -> list[int]:
    """Replace most occurring byte pair with new byte that is not used
    in the data. For utf-8 encoding, we start with 256 as the new byte

@@ -48,12 +48,12 @@ def merge(ids: list[int], pair: tuple, idx: int):
 class Tokenizer:
    """Tokenize a string using the byte-pair encoding algorithm"""

-    def __init__(self, num_merges: int = 20, verbose: bool = False):
+    def __init__(self, num_merges: int = 20, verbose: bool = False) -> None:
        self.num_merges = num_merges
        self.merges: dict = {}
        self.verbose = verbose

-    def encode(self, text: str):
+    def encode(self, text: str) -> list[int]:
        """Convert a string to tokens (bytes)

        >>> t = Tokenizer()
@@ -80,7 +80,7 @@ class Tokenizer:
        # start merging most frequently occurring byte pairs
        for i in range(num_merges):
            counts = get_byte_pair_counts(ids)
-            pair = max(counts, key=counts.get)
+            pair = max(counts, key=counts.__getitem__)

            if counts[pair] == 1:
                continue
@@ -93,7 +93,7 @@ class Tokenizer:

        return ids

-    def decode(self, ids: list[int]):
+    def decode(self, ids: list[int]) -> str:
        """Convert a list of tokens to the original string

        >>> t = Tokenizer()