mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-24 05:21:09 +00:00
Compare commits
8 Commits
c5cc8dd3ea
...
1a8738083a
Author | SHA1 | Date | |
---|---|---|---|
|
1a8738083a | ||
|
f3f32ae3ca | ||
|
e3bd7721c8 | ||
|
e3f3d668be | ||
|
aed9ddb168 | ||
|
790475a622 | ||
|
0f6994c67e | ||
|
5b58203a0f |
|
@ -16,7 +16,7 @@ repos:
|
||||||
- id: auto-walrus
|
- id: auto-walrus
|
||||||
|
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: v0.7.2
|
rev: v0.7.4
|
||||||
hooks:
|
hooks:
|
||||||
- id: ruff
|
- id: ruff
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
|
@ -42,7 +42,7 @@ repos:
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
|
|
||||||
- repo: https://github.com/abravalheri/validate-pyproject
|
- repo: https://github.com/abravalheri/validate-pyproject
|
||||||
rev: v0.22
|
rev: v0.23
|
||||||
hooks:
|
hooks:
|
||||||
- id: validate-pyproject
|
- id: validate-pyproject
|
||||||
|
|
||||||
|
|
|
@ -1280,6 +1280,7 @@
|
||||||
* [Barcode Validator](strings/barcode_validator.py)
|
* [Barcode Validator](strings/barcode_validator.py)
|
||||||
* [Bitap String Match](strings/bitap_string_match.py)
|
* [Bitap String Match](strings/bitap_string_match.py)
|
||||||
* [Boyer Moore Search](strings/boyer_moore_search.py)
|
* [Boyer Moore Search](strings/boyer_moore_search.py)
|
||||||
|
* [Bpe Tokenizer](strings/bpe_tokenizer.py)
|
||||||
* [Camel Case To Snake Case](strings/camel_case_to_snake_case.py)
|
* [Camel Case To Snake Case](strings/camel_case_to_snake_case.py)
|
||||||
* [Can String Be Rearranged As Palindrome](strings/can_string_be_rearranged_as_palindrome.py)
|
* [Can String Be Rearranged As Palindrome](strings/can_string_be_rearranged_as_palindrome.py)
|
||||||
* [Capitalize](strings/capitalize.py)
|
* [Capitalize](strings/capitalize.py)
|
||||||
|
|
|
@ -172,7 +172,7 @@ def solved(values):
|
||||||
|
|
||||||
def from_file(filename, sep="\n"):
|
def from_file(filename, sep="\n"):
|
||||||
"Parse a file into a list of strings, separated by sep."
|
"Parse a file into a list of strings, separated by sep."
|
||||||
return open(filename).read().strip().split(sep) # noqa: SIM115
|
return open(filename).read().strip().split(sep)
|
||||||
|
|
||||||
|
|
||||||
def random_puzzle(assignments=17):
|
def random_puzzle(assignments=17):
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python3
|
#!python
|
||||||
import os
|
import os
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
129
strings/bpe_tokenizer.py
Normal file
129
strings/bpe_tokenizer.py
Normal file
|
@ -0,0 +1,129 @@
|
||||||
|
"""Byte-Pair Encoding: Subword-based tokenization algorithm used
|
||||||
|
by state-of-the-art language models.
|
||||||
|
|
||||||
|
Wikipedia: https://en.wikipedia.org/wiki/Byte_pair_encoding"""
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
|
||||||
|
def get_byte_pair_counts(ids: list[int]) -> dict:
|
||||||
|
"""Count consecutive byte-pairs of an encoded string.
|
||||||
|
|
||||||
|
>>> ids = [73, 32, 97, 109, 32, 74, 111, 110, 83, 110, 111, 119, 46]
|
||||||
|
>>> get_byte_pair_counts(ids)
|
||||||
|
{(73, 32): 1, (32, 97): 1, (97, 109): 1, (109, 32): 1, (32, 74): 1, (74, 111): 1, (111, 110): 1, (110, 83): 1, (83, 110): 1, (110, 111): 1, (111, 119): 1, (119, 46): 1}
|
||||||
|
>>> ids = [2, 3, 6, 2, 3, 6, 2, 5]
|
||||||
|
>>> get_byte_pair_counts(ids)
|
||||||
|
{(2, 3): 2, (3, 6): 2, (6, 2): 2, (2, 5): 1}
|
||||||
|
""" # noqa: E501
|
||||||
|
counts: dict = {}
|
||||||
|
for pair in itertools.pairwise(ids):
|
||||||
|
counts[pair] = counts.get(pair, 0) + 1
|
||||||
|
return counts
|
||||||
|
|
||||||
|
|
||||||
|
def merge(ids: list[int], pair: tuple, idx: int) -> list[int]:
|
||||||
|
"""Replace most occurring byte pair with new byte that is not used
|
||||||
|
in the data. For utf-8 encoding, we start with 256 as the new byte
|
||||||
|
|
||||||
|
>>> ids = [2, 3, 6, 2, 3, 6, 2, 5]
|
||||||
|
>>> pair = (2, 3)
|
||||||
|
>>> idx = 256
|
||||||
|
>>> merge(ids, pair, idx)
|
||||||
|
[256, 6, 256, 6, 2, 5]
|
||||||
|
"""
|
||||||
|
new_ids = []
|
||||||
|
i = 0
|
||||||
|
while i < len(ids):
|
||||||
|
if i < len(ids) - 1 and (ids[i] == pair[0] and ids[i + 1] == pair[1]):
|
||||||
|
new_ids.append(idx)
|
||||||
|
i += 2
|
||||||
|
else:
|
||||||
|
new_ids.append(ids[i])
|
||||||
|
i += 1
|
||||||
|
return new_ids
|
||||||
|
|
||||||
|
|
||||||
|
class Tokenizer:
|
||||||
|
"""Tokenize a string using the byte-pair encoding algorithm"""
|
||||||
|
|
||||||
|
def __init__(self, num_merges: int = 20, verbose: bool = False) -> None:
|
||||||
|
self.num_merges = num_merges
|
||||||
|
self.merges: dict = {}
|
||||||
|
self.verbose = verbose
|
||||||
|
|
||||||
|
def encode(self, text: str) -> list[int]:
|
||||||
|
"""Convert a string to tokens (bytes)
|
||||||
|
|
||||||
|
>>> t = Tokenizer()
|
||||||
|
>>> text = "I am JonSnow."
|
||||||
|
>>> t.encode(text)
|
||||||
|
[73, 32, 97, 109, 32, 74, 111, 110, 83, 110, 111, 119, 46]
|
||||||
|
|
||||||
|
>>> t = Tokenizer()
|
||||||
|
>>> text = ""
|
||||||
|
>>> t.encode(text)
|
||||||
|
[]
|
||||||
|
"""
|
||||||
|
text_b = text.encode("utf-8") # raw bytes
|
||||||
|
tokens = list(map(int, text_b)) # convert to list of integers
|
||||||
|
|
||||||
|
if self.verbose:
|
||||||
|
print(f"Input text: {text}")
|
||||||
|
print(f"Tokens: {tokens}")
|
||||||
|
|
||||||
|
ids = list(tokens) # create a copy of tokens
|
||||||
|
self.merges = OrderedDict() # store a mapping of merges (int, int) -> int
|
||||||
|
max_merges = len(tokens) - 1
|
||||||
|
num_merges = min(self.num_merges, max_merges)
|
||||||
|
# start merging most frequently occurring byte pairs
|
||||||
|
for i in range(num_merges):
|
||||||
|
counts = get_byte_pair_counts(ids)
|
||||||
|
pair = max(counts, key=counts.__getitem__)
|
||||||
|
|
||||||
|
if counts[pair] == 1:
|
||||||
|
continue
|
||||||
|
|
||||||
|
idx = 256 + i # create new token for every merge step
|
||||||
|
if self.verbose:
|
||||||
|
print(f"Merging {pair} into a new token {idx}")
|
||||||
|
ids = merge(ids, pair, idx)
|
||||||
|
self.merges[pair] = idx
|
||||||
|
|
||||||
|
return ids
|
||||||
|
|
||||||
|
def decode(self, ids: list[int]) -> str:
|
||||||
|
"""Convert a list of tokens to the original string
|
||||||
|
|
||||||
|
>>> t = Tokenizer()
|
||||||
|
>>> ids = [73, 32, 97, 109, 32, 74, 111, 110, 83, 110, 111, 119, 46]
|
||||||
|
>>> t.decode(ids)
|
||||||
|
'I am JonSnow.'
|
||||||
|
|
||||||
|
>>> t = Tokenizer()
|
||||||
|
>>> ids = []
|
||||||
|
>>> t.decode(ids)
|
||||||
|
''
|
||||||
|
"""
|
||||||
|
vocab = {idx: bytes([idx]) for idx in range(256)} # original vocabulary
|
||||||
|
# The iteration of items should be in the order of
|
||||||
|
# their insertion. This is the default behavior in Python 3
|
||||||
|
# but we use an OrderedDict explicitly here
|
||||||
|
for (p0, p1), idx in self.merges.items():
|
||||||
|
vocab[idx] = vocab[p0] + vocab[p1]
|
||||||
|
|
||||||
|
if self.verbose:
|
||||||
|
print("Vocabulary (after merging): {vocab}")
|
||||||
|
|
||||||
|
tokens = b"".join(vocab[idx] for idx in ids)
|
||||||
|
# handle UnicodeDecodeError by replacing the invalid
|
||||||
|
# start byte to conform to utf-8 format
|
||||||
|
text = tokens.decode("utf-8", errors="replace")
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
|
||||||
|
doctest.testmod()
|
Loading…
Reference in New Issue
Block a user