Fix Base58 encoding/decoding

2025-05-06 20:33:58 +00:00 · 2024-10-16 16:33:32 -04:00 · 2024-10-16 16:33:32 -04:00 · 5ae5d95672
commit 5ae5d95672
parent 03a42510b0
1 changed files with 142 additions and 0 deletions
--- a/ciphers/base58.py
+++ b/ciphers/base58.py
@ -0,0 +1,142 @@
+B64_CHARSET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+
+
+def base64_encode(data: bytes) -> bytes:
+    """Encodes data according to RFC4648.
+
+    The data is first transformed to binary and appended with binary digits so that its
+    length becomes a multiple of 6, then each 6 binary digits will match a character in
+    the B64_CHARSET string. The number of appended binary digits would later determine
+    how many "=" signs should be added, the padding.
+    For every 2 binary digits added, a "=" sign is added in the output.
+    We can add any binary digits to make it a multiple of 6, for instance, consider the
+    following example:
+    "AA" -> 0010100100101001 -> 001010 010010 1001
+    As can be seen above, 2 more binary digits should be added, so there's 4
+    possibilities here: 00, 01, 10 or 11.
+    That being said, Base64 encoding can be used in Steganography to hide data in these
+    appended digits.
+
+    >>> from base64 import b64encode
+    >>> a = b"This pull request is part of Hacktoberfest20!"
+    >>> b = b"https://tools.ietf.org/html/rfc4648"
+    >>> c = b"A"
+    >>> base64_encode(a) == b64encode(a)
+    True
+    >>> base64_encode(b) == b64encode(b)
+    True
+    >>> base64_encode(c) == b64encode(c)
+    True
+    >>> base64_encode("abc")
+    Traceback (most recent call last):
+      ...
+    TypeError: a bytes-like object is required, not 'str'
+    """
+    # Make sure the supplied data is a bytes-like object
+    if not isinstance(data, bytes):
+        msg = f"a bytes-like object is required, not '{data.__class__.__name__}'"
+        raise TypeError(msg)
+
+    binary_stream = "".join(bin(byte)[2:].zfill(8) for byte in data)
+
+    padding_needed = len(binary_stream) % 6 != 0
+
+    if padding_needed:
+        # The padding that will be added later
+        padding = b"=" * ((6 - len(binary_stream) % 6) // 2)
+
+        # Append binary_stream with arbitrary binary digits (0's by default) to make its
+        # length a multiple of 6.
+        binary_stream += "0" * (6 - len(binary_stream) % 6)
+    else:
+        padding = b""
+
+    # Encode every 6 binary digits to their corresponding Base64 character
+    return (
+        "".join(
+            B64_CHARSET[int(binary_stream[index : index + 6], 2)]
+            for index in range(0, len(binary_stream), 6)
+        ).encode()
+        + padding
+    )
+
+
+def base64_decode(encoded_data: str) -> bytes:
+    """Decodes data according to RFC4648.
+
+    This does the reverse operation of base64_encode.
+    We first transform the encoded data back to a binary stream, take off the
+    previously appended binary digits according to the padding, at this point we
+    would have a binary stream whose length is multiple of 8, the last step is
+    to convert every 8 bits to a byte.
+
+    >>> from base64 import b64decode
+    >>> a = "VGhpcyBwdWxsIHJlcXVlc3QgaXMgcGFydCBvZiBIYWNrdG9iZXJmZXN0MjAh"
+    >>> b = "aHR0cHM6Ly90b29scy5pZXRmLm9yZy9odG1sL3JmYzQ2NDg="
+    >>> c = "QQ=="
+    >>> base64_decode(a) == b64decode(a)
+    True
+    >>> base64_decode(b) == b64decode(b)
+    True
+    >>> base64_decode(c) == b64decode(c)
+    True
+    >>> base64_decode("abc")
+    Traceback (most recent call last):
+      ...
+    AssertionError: Incorrect padding
+    """
+    # Make sure encoded_data is either a string or a bytes-like object
+    if not isinstance(encoded_data, bytes) and not isinstance(encoded_data, str):
+        msg = (
+            "argument should be a bytes-like object or ASCII string, "
+            f"not '{encoded_data.__class__.__name__}'"
+        )
+        raise TypeError(msg)
+
+    # In case encoded_data is a bytes-like object, make sure it contains only
+    # ASCII characters so we convert it to a string object
+    if isinstance(encoded_data, bytes):
+        try:
+            encoded_data = encoded_data.decode("utf-8")
+        except UnicodeDecodeError:
+            raise ValueError("base64 encoded data should only contain ASCII characters")
+
+    padding = encoded_data.count("=")
+
+    # Check if the encoded string contains non base64 characters
+    if padding:
+        assert all(
+            char in B64_CHARSET for char in encoded_data[:-padding]
+        ), "Invalid base64 character(s) found."
+    else:
+        assert all(
+            char in B64_CHARSET for char in encoded_data
+        ), "Invalid base64 character(s) found."
+
+    # Check the padding
+    assert len(encoded_data) % 4 == 0 and padding < 3, "Incorrect padding"
+
+    if padding:
+        # Remove padding if there is one
+        encoded_data = encoded_data[:-padding]
+
+        binary_stream = "".join(
+            bin(B64_CHARSET.index(char))[2:].zfill(6) for char in encoded_data
+        )[: -padding * 2]
+    else:
+        binary_stream = "".join(
+            bin(B64_CHARSET.index(char))[2:].zfill(6) for char in encoded_data
+        )
+
+    data = [
+        int(binary_stream[index : index + 8], 2)
+        for index in range(0, len(binary_stream), 8)
+    ]
+
+    return bytes(data)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()