Python/ciphers/base64_encoding.py

B64_CHARSET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"


def base64_encode(data: bytes) -> bytes:
    """Encodes data according to RFC4648.

    The data is first transformed to binary and appended with binary digits so that its
    length becomes a multiple of 6, then each 6 binary digits will match a character in
    the B64_CHARSET string. The number of appended binary digits would later determine
    how many "=" sign should be added, the padding.
    For every 2 binary digits added, a "=" sign is added in the output.
    We can add any binary digits to make it a multiple of 6, for instance, consider the
    following example:
    "AA" -> 0010100100101001 -> 001010 010010 1001
    As can be seen above, 2 more binary digits should be added, so there's 4
    possibilities here: 00, 01, 10 or 11.
    That being said, Base64 encoding can be used in Steganography to hide data in these
    appended digits.

    >>> from base64 import b64encode
    >>> a = b"This pull request is part of Hacktoberfest20!"
    >>> b = b"https://tools.ietf.org/html/rfc4648"
    >>> c = b"A"
    >>> base64_encode(a) == b64encode(a)
    True
    >>> base64_encode(b) == b64encode(b)
    True
    >>> base64_encode(c) == b64encode(c)
    True
    >>> base64_encode("abc")
    Traceback (most recent call last):
      ...
    TypeError: a bytes-like object is required, not 'str'
    """
    # Make sure the supplied data is a bytes-like object
    if not isinstance(data, bytes):
        raise TypeError(
            f"a bytes-like object is required, not '{data.__class__.__name__}'"
        )

    binary_stream = "".join(bin(byte)[2:].zfill(8) for byte in data)

    padding_needed = len(binary_stream) % 6 != 0

    if padding_needed:
        # The padding that will be added later
        padding = b"=" * ((6 - len(binary_stream) % 6) // 2)

        # Append binary_stream with arbitrary binary digits (0's by default) to make its
        # length a multiple of 6.
        binary_stream += "0" * (6 - len(binary_stream) % 6)
    else:
        padding = b""

    # Encode every 6 binary digits to their corresponding Base64 character
    return (
        "".join(
            B64_CHARSET[int(binary_stream[index : index + 6], 2)]
            for index in range(0, len(binary_stream), 6)
        ).encode()
        + padding
    )


def base64_decode(encoded_data: str) -> bytes:
    """Decodes data according to RFC4648.

    This does the reverse operation of base64_encode.
    We first transform the encoded data back to a binary stream, take off the
    previously appended binary digits according to the padding, at this point we
    would have a binary stream whose length is multiple of 8, the last step is
    to convert every 8 bits to a byte.

    >>> from base64 import b64decode
    >>> a = "VGhpcyBwdWxsIHJlcXVlc3QgaXMgcGFydCBvZiBIYWNrdG9iZXJmZXN0MjAh"
    >>> b = "aHR0cHM6Ly90b29scy5pZXRmLm9yZy9odG1sL3JmYzQ2NDg="
    >>> c = "QQ=="
    >>> base64_decode(a) == b64decode(a)
    True
    >>> base64_decode(b) == b64decode(b)
    True
    >>> base64_decode(c) == b64decode(c)
    True
    >>> base64_decode("abc")
    Traceback (most recent call last):
      ...
    AssertionError: Incorrect padding
    """
    # Make sure encoded_data is either a string or a bytes-like object
    if not isinstance(encoded_data, bytes) and not isinstance(encoded_data, str):
        raise TypeError(
            "argument should be a bytes-like object or ASCII string, not "
            f"'{encoded_data.__class__.__name__}'"
        )

    # In case encoded_data is a bytes-like object, make sure it contains only
    # ASCII characters so we convert it to a string object
    if isinstance(encoded_data, bytes):
        try:
            encoded_data = encoded_data.decode("utf-8")
        except UnicodeDecodeError:
            raise ValueError("base64 encoded data should only contain ASCII characters")

    padding = encoded_data.count("=")

    # Check if the encoded string contains non base64 characters
    if padding:
        assert all(
            char in B64_CHARSET for char in encoded_data[:-padding]
        ), "Invalid base64 character(s) found."
    else:
        assert all(
            char in B64_CHARSET for char in encoded_data
        ), "Invalid base64 character(s) found."

    # Check the padding
    assert len(encoded_data) % 4 == 0 and padding < 3, "Incorrect padding"

    if padding:
        # Remove padding if there is one
        encoded_data = encoded_data[:-padding]

        binary_stream = "".join(
            bin(B64_CHARSET.index(char))[2:].zfill(6) for char in encoded_data
        )[: -padding * 2]
    else:
        binary_stream = "".join(
            bin(B64_CHARSET.index(char))[2:].zfill(6) for char in encoded_data
        )

    data = [
        int(binary_stream[index : index + 8], 2)
        for index in range(0, len(binary_stream), 8)
    ]

    return bytes(data)


if __name__ == "__main__":
    import doctest

    doctest.testmod()
Replace base64_cipher.py with an easy to understand version (#3925) * rename base64_cipher.py to base64_encoding.py * edit base64_encoding.py * import necessary modules inside doctests * make it behave like the official implementation * replace format with f-string where possible * replace format with f-string Co-authored-by: Christian Clauss <cclauss@me.com> * fix: syntax error due to closing parenthese * reformat code Co-authored-by: Christian Clauss <cclauss@me.com> 2020-11-25 12:38:02 +00:00			`B64_CHARSET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"`


			`def base64_encode(data: bytes) -> bytes:`
			`"""Encodes data according to RFC4648.`

			`The data is first transformed to binary and appended with binary digits so that its`
			`length becomes a multiple of 6, then each 6 binary digits will match a character in`
			`the B64_CHARSET string. The number of appended binary digits would later determine`
			`how many "=" sign should be added, the padding.`
			`For every 2 binary digits added, a "=" sign is added in the output.`
			`We can add any binary digits to make it a multiple of 6, for instance, consider the`
			`following example:`
			`"AA" -> 0010100100101001 -> 001010 010010 1001`
			`As can be seen above, 2 more binary digits should be added, so there's 4`
			`possibilities here: 00, 01, 10 or 11.`
			`That being said, Base64 encoding can be used in Steganography to hide data in these`
			`appended digits.`

			`>>> from base64 import b64encode`
			`>>> a = b"This pull request is part of Hacktoberfest20!"`
			`>>> b = b"https://tools.ietf.org/html/rfc4648"`
			`>>> c = b"A"`
			`>>> base64_encode(a) == b64encode(a)`
			`True`
			`>>> base64_encode(b) == b64encode(b)`
			`True`
			`>>> base64_encode(c) == b64encode(c)`
			`True`
			`>>> base64_encode("abc")`
			`Traceback (most recent call last):`
			`...`
			`TypeError: a bytes-like object is required, not 'str'`
			`"""`
			`# Make sure the supplied data is a bytes-like object`
			`if not isinstance(data, bytes):`
			`raise TypeError(`
			`f"a bytes-like object is required, not '{data.__class__.__name__}'"`
			`)`

			`binary_stream = "".join(bin(byte)[2:].zfill(8) for byte in data)`

			`padding_needed = len(binary_stream) % 6 != 0`

			`if padding_needed:`
			`# The padding that will be added later`
			`padding = b"=" * ((6 - len(binary_stream) % 6) // 2)`

			`# Append binary_stream with arbitrary binary digits (0's by default) to make its`
			`# length a multiple of 6.`
			`binary_stream += "0" * (6 - len(binary_stream) % 6)`
			`else:`
			`padding = b""`

			`# Encode every 6 binary digits to their corresponding Base64 character`
			`return (`
			`"".join(`
			`B64_CHARSET[int(binary_stream[index : index + 6], 2)]`
			`for index in range(0, len(binary_stream), 6)`
			`).encode()`
			`+ padding`
			`)`


			`def base64_decode(encoded_data: str) -> bytes:`
			`"""Decodes data according to RFC4648.`

			`This does the reverse operation of base64_encode.`
			`We first transform the encoded data back to a binary stream, take off the`
			`previously appended binary digits according to the padding, at this point we`
			`would have a binary stream whose length is multiple of 8, the last step is`
			`to convert every 8 bits to a byte.`

			`>>> from base64 import b64decode`
			`>>> a = "VGhpcyBwdWxsIHJlcXVlc3QgaXMgcGFydCBvZiBIYWNrdG9iZXJmZXN0MjAh"`
			`>>> b = "aHR0cHM6Ly90b29scy5pZXRmLm9yZy9odG1sL3JmYzQ2NDg="`
			`>>> c = "QQ=="`
			`>>> base64_decode(a) == b64decode(a)`
			`True`
			`>>> base64_decode(b) == b64decode(b)`
			`True`
			`>>> base64_decode(c) == b64decode(c)`
			`True`
			`>>> base64_decode("abc")`
			`Traceback (most recent call last):`
			`...`
			`AssertionError: Incorrect padding`
			`"""`
			`# Make sure encoded_data is either a string or a bytes-like object`
			`if not isinstance(encoded_data, bytes) and not isinstance(encoded_data, str):`
			`raise TypeError(`
			`"argument should be a bytes-like object or ASCII string, not "`
			`f"'{encoded_data.__class__.__name__}'"`
			`)`

			`# In case encoded_data is a bytes-like object, make sure it contains only`
			`# ASCII characters so we convert it to a string object`
			`if isinstance(encoded_data, bytes):`
			`try:`
			`encoded_data = encoded_data.decode("utf-8")`
			`except UnicodeDecodeError:`
			`raise ValueError("base64 encoded data should only contain ASCII characters")`

			`padding = encoded_data.count("=")`

			`# Check if the encoded string contains non base64 characters`
			`if padding:`
			`assert all(`
			`char in B64_CHARSET for char in encoded_data[:-padding]`
			`), "Invalid base64 character(s) found."`
			`else:`
			`assert all(`
			`char in B64_CHARSET for char in encoded_data`
			`), "Invalid base64 character(s) found."`

			`# Check the padding`
			`assert len(encoded_data) % 4 == 0 and padding < 3, "Incorrect padding"`

			`if padding:`
			`# Remove padding if there is one`
			`encoded_data = encoded_data[:-padding]`

			`binary_stream = "".join(`
			`bin(B64_CHARSET.index(char))[2:].zfill(6) for char in encoded_data`
			`)[: -padding * 2]`
			`else:`
			`binary_stream = "".join(`
			`bin(B64_CHARSET.index(char))[2:].zfill(6) for char in encoded_data`
			`)`

			`data = [`
			`int(binary_stream[index : index + 8], 2)`
			`for index in range(0, len(binary_stream), 8)`
			`]`

			`return bytes(data)`


			`if __name__ == "__main__":`
			`import doctest`

			`doctest.testmod()`