# Python/compression/lz77.py

"""
LZ77 compression algorithm
- lossless data compression published in papers by Abraham Lempel and Jacob Ziv in 1977
- also known as LZ1 or sliding-window compression
- forms the basis for many variations including LZW, LZSS, LZMA and others

It uses a “sliding window” method. Within the sliding window we have:
  - search buffer
  - look ahead buffer
len(sliding_window) = len(search_buffer) + len(look_ahead_buffer)

LZ77 manages a dictionary that uses triples composed of:
    - Offset into search buffer, it's the distance from the current position back
      to the start of the matched phrase.
    - Length of the match, it's the number of characters that make up a phrase.
    - The indicator is represented by a character that is going to be encoded next.

As a file is parsed, the dictionary is dynamically updated to reflect the compressed
data contents and size.

Examples:
"cabracadabrarrarrad" <-> [(0, 0, 'c'), (0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'),
                           (3, 1, 'c'), (2, 1, 'd'), (7, 4, 'r'), (3, 5, 'd')]
"ababcbababaa" <-> [(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')]
"aacaacabcabaaac" <-> [(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')]

Sources:
en.wikipedia.org/wiki/LZ77_and_LZ78
"""

from dataclasses import dataclass

__version__ = "0.1"
__author__ = "Lucia Harcekova"


@dataclass
class Token:
    """
    Triplet (offset, length, indicator) produced by LZ77 compression and consumed
    by LZ77 decompression.
    """

    # Distance to step back from the end of the already processed text to reach
    # the start of the match; 0 when nothing was matched.
    offset: int
    # Number of characters copied from that position.
    length: int
    # Literal character that follows the copied characters.
    indicator: str

    def __repr__(self) -> str:
        """
        >>> token = Token(1, 2, "c")
        >>> repr(token)
        '(1, 2, c)'
        >>> str(token)
        '(1, 2, c)'
        """
        return f"({self.offset}, {self.length}, {self.indicator})"


class LZ77Compressor:
    """
    Class containing compress and decompress methods using LZ77 compression algorithm.

    The search buffer holds at most ``window_size - lookahead_buffer_size``
    characters. ``lookahead_buffer_size`` is only used to derive that size; the
    length of a match is not limited by it.
    """

    def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> None:
        self.window_size = window_size
        self.lookahead_buffer_size = lookahead_buffer_size
        self.search_buffer_size = self.window_size - self.lookahead_buffer_size

    def compress(self, text: str) -> list[Token]:
        """
        Compress the given string text using LZ77 compression algorithm.

        Args:
            text: string to be compressed

        Returns:
            output: the compressed text as a list of Tokens (empty for empty text)

        >>> lz77_compressor = LZ77Compressor()
        >>> str(lz77_compressor.compress("ababcbababaa"))
        '[(0, 0, a), (0, 0, b), (2, 2, c), (4, 3, a), (2, 2, a)]'
        >>> str(lz77_compressor.compress("aacaacabcabaaac"))
        '[(0, 0, a), (1, 1, c), (3, 4, b), (3, 3, a), (1, 2, c)]'
        >>> str(lz77_compressor.compress("aaa"))
        '[(0, 0, a), (1, 1, a)]'
        >>> lz77_compressor.compress("")
        []
        """

        output: list[Token] = []
        search_buffer = ""

        # while there are still characters in text to compress
        while text:
            # find the next encoding phrase
            # - triplet with offset, length, indicator (the next encoding character)
            token = self._find_encoding_token(text, search_buffer)

            # a token always consumes the matched characters plus the indicator
            consumed = token.length + 1

            # update the search buffer:
            # - add new characters from text into it
            # - check if size exceed the max search buffer size, if so, drop the
            #   oldest elements
            search_buffer += text[:consumed]
            if len(search_buffer) > self.search_buffer_size:
                search_buffer = search_buffer[-self.search_buffer_size :]

            # update the text
            text = text[consumed:]

            # append the token to output
            output.append(token)

        return output

    def decompress(self, tokens: list[Token]) -> str:
        """
        Convert the list of tokens into an output string.

        Args:
            tokens: list containing triplets (offset, length, char)

        Returns:
            output: decompressed text

        Tests:
            >>> lz77_compressor = LZ77Compressor()
            >>> lz77_compressor.decompress([Token(0, 0, 'c'), Token(0, 0, 'a'),
            ... Token(0, 0, 'b'), Token(0, 0, 'r'), Token(3, 1, 'c'),
            ... Token(2, 1, 'd'), Token(7, 4, 'r'), Token(3, 5, 'd')])
            'cabracadabrarrarrad'
            >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(0, 0, 'b'),
            ... Token(2, 2, 'c'), Token(4, 3, 'a'), Token(2, 2, 'a')])
            'ababcbababaa'
            >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'c'),
            ... Token(3, 4, 'b'), Token(3, 3, 'a'), Token(1, 2, 'c')])
            'aacaacabcabaaac'
            >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'a')])
            'aaa'
        """

        # Collect single characters in a list and join once at the end instead of
        # rebuilding the output string for every copied character.
        output: list[str] = []

        for token in tokens:
            # Copy one character at a time: the source position moves along with
            # the output, so a match may overlap the text it is producing.
            for _ in range(token.length):
                output.append(output[-token.offset])
            # extend() adds the indicator character by character, keeping the
            # list made of single characters.
            output.extend(token.indicator)

        return "".join(output)

    def _find_encoding_token(self, text: str, search_buffer: str) -> Token:
        """Finds the encoding token for the first character in the text.

        The match length is capped at ``len(text) - 1`` so that there is always a
        character left to act as the indicator.

        Raises:
            ValueError: if text is empty.

        Tests:
            >>> lz77_compressor = LZ77Compressor()
            >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset
            7
            >>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac").length
            1
            >>> lz77_compressor._find_encoding_token("abc", "xyz").offset
            0
            >>> lz77_compressor._find_encoding_token("", "xyz").offset
            Traceback (most recent call last):
                ...
            ValueError: We need some text to work with.
            >>> lz77_compressor._find_encoding_token("abc", "").offset
            0
            >>> lz77_compressor._find_encoding_token("aa", "a")
            (1, 1, a)
            >>> lz77_compressor._find_encoding_token("a", "a")
            (0, 0, a)
        """

        if not text:
            raise ValueError("We need some text to work with.")

        # Initialise result parameters to default values
        length, offset = 0, 0

        # The last character of text is reserved for the indicator.
        max_length = len(text) - 1
        buffer_size = len(search_buffer)

        # With an empty search buffer the loop body never runs and the default
        # (0, 0, first character) token is returned.
        for i, character in enumerate(search_buffer):
            if character != text[0]:
                continue
            found_length = min(
                self._match_length_from_index(text, search_buffer, 0, i), max_length
            )
            # if the found length is bigger than the current or if it's equal,
            # which means it's offset is smaller: update offset and length.
            # A match capped down to zero is not a match at all.
            if found_length and found_length >= length:
                offset, length = buffer_size - i, found_length

        return Token(offset, length, text[length])

    def _match_length_from_index(
        self, text: str, window: str, text_index: int, window_index: int
    ) -> int:
        """Calculate the longest possible match of text and window characters from
        text_index in text and window_index in window.

        The match may run past the end of window: from there on, the characters
        already matched in text are used as the continuation of the window. The
        match stops at the first mismatch or at the end of text.

        Args:
            text: text that is being encoded
            window: sliding window
            text_index: index of character in text
            window_index: index of character in sliding window

        Returns:
            The maximum match between text and window, from given indexes.

        Raises:
            IndexError: if text is not empty and window_index is past the end of
                window.

        Tests:
            >>> lz77_compressor = LZ77Compressor(13, 6)
            >>> lz77_compressor._match_length_from_index("rarrad", "adabrar", 0, 4)
            5
            >>> lz77_compressor._match_length_from_index("adabrarrarrad",
            ...     "cabrac", 0, 1)
            1
            >>> lz77_compressor._match_length_from_index("aa", "a", 0, 0)
            2
        """
        if not text:
            return 0

        window_size = len(window)
        if window_index >= window_size:
            raise IndexError("string index out of range")

        text_size = len(text)
        matched = 0
        # Iterative on purpose: a recursive version needs one stack frame per
        # matched character and fails on long repetitive inputs.
        while text_index + matched < text_size:
            position = window_index + matched
            if position < window_size:
                reference = window[position]
            else:
                # Past the end of the window the reference is the text that has
                # just been matched (overlapping match).
                reference = text[text_index + position - window_size]
            if text[text_index + matched] != reference:
                break
            matched += 1

        return matched


if __name__ == "__main__":
    import doctest

    # Run every doctest embedded in this module first.
    doctest.testmod()

    # Round-trip demo: compress a sample, print the tokens of a second sample,
    # then check that decompression restores the first sample.
    compressor = LZ77Compressor(window_size=13, lookahead_buffer_size=6)
    sample = "cabracadabrarrarrad"
    tokens = compressor.compress(sample)
    print(compressor.compress("ababcbababaa"))
    restored = compressor.decompress(tokens)
    assert restored == sample, "The LZ77 algorithm returned the invalid result."
Add LZ77 compression algorithm (#8059) * - add "lz77_compressor" class with compress and decompress methods using LZ77 compression algorithm * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * - use "list" instead "List", formatting * - fix spelling * - add Python type hints * - add 'Token' class to represent triplet (offset, length, indicator) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * - add test, hange type rom List to list * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * - remove extra import * - remove extra types in comments * - better test * - edit comments * - add return types * - add tests for __str__ and __repr__ * Update lz77.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2022-12-28 17:34:35 +00:00			`"""`
			`LZ77 compression algorithm`
			`- lossless data compression published in papers by Abraham Lempel and Jacob Ziv in 1977`
			`- also known as LZ1 or sliding-window compression`
			`- form the basis for many variations including LZW, LZSS, LZMA and others`

			`It uses a “sliding window” method. Within the sliding window we have:`
			`- search buffer`
			`- look ahead buffer`
			`len(sliding_window) = len(search_buffer) + len(look_ahead_buffer)`

			`LZ77 manages a dictionary that uses triples composed of:`
			`- Offset into search buffer, it's the distance between the start of a phrase and`
			`the beginning of a file.`
			`- Length of the match, it's the number of characters that make up a phrase.`
			`- The indicator is represented by a character that is going to be encoded next.`

			`As a file is parsed, the dictionary is dynamically updated to reflect the compressed`
			`data contents and size.`

			`Examples:`
			`"cabracadabrarrarrad" <-> [(0, 0, 'c'), (0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'),`
			`(3, 1, 'c'), (2, 1, 'd'), (7, 4, 'r'), (3, 5, 'd')]`
			`"ababcbababaa" <-> [(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')]`
			`"aacaacabcabaaac" <-> [(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')]`

			`Sources:`
			`en.wikipedia.org/wiki/LZ77_and_LZ78`
			`"""`

			`from dataclasses import dataclass`

			`__version__ = "0.1"`
			`__author__ = "Lucia Harcekova"`


			`@dataclass`
			`class Token:`
			`"""`
			`Dataclass representing triplet called token consisting of length, offset`
			`and indicator. This triplet is used during LZ77 compression.`
			`"""`

			`offset: int`
			`length: int`
			`indicator: str`

			`def __repr__(self) -> str:`
			`"""`
			`>>> token = Token(1, 2, "c")`
			`>>> repr(token)`
			`'(1, 2, c)'`
			`>>> str(token)`
			`'(1, 2, c)'`
			`"""`
			`return f"({self.offset}, {self.length}, {self.indicator})"`


			`class LZ77Compressor:`
			`"""`
			`Class containing compress and decompress methods using LZ77 compression algorithm.`
			`"""`

			`def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> None:`
			`self.window_size = window_size`
			`self.lookahead_buffer_size = lookahead_buffer_size`
			`self.search_buffer_size = self.window_size - self.lookahead_buffer_size`

			`def compress(self, text: str) -> list[Token]:`
			`"""`
			`Compress the given string text using LZ77 compression algorithm.`

			`Args:`
			`text: string to be compressed`

			`Returns:`
			`output: the compressed text as a list of Tokens`

			`>>> lz77_compressor = LZ77Compressor()`
			`>>> str(lz77_compressor.compress("ababcbababaa"))`
			`'[(0, 0, a), (0, 0, b), (2, 2, c), (4, 3, a), (2, 2, a)]'`
			`>>> str(lz77_compressor.compress("aacaacabcabaaac"))`
			`'[(0, 0, a), (1, 1, c), (3, 4, b), (3, 3, a), (1, 2, c)]'`
			`"""`

			`output = []`
			`search_buffer = ""`

			`# while there are still characters in text to compress`
			`while text:`
			`# find the next encoding phrase`
			`# - triplet with offset, length, indicator (the next encoding character)`
			`token = self._find_encoding_token(text, search_buffer)`

			`# update the search buffer:`
			`# - add new characters from text into it`
			`# - check if size exceed the max search buffer size, if so, drop the`
			`# oldest elements`
			`search_buffer += text[: token.length + 1]`
			`if len(search_buffer) > self.search_buffer_size:`
			`search_buffer = search_buffer[-self.search_buffer_size :]`

			`# update the text`
			`text = text[token.length + 1 :]`

			`# append the token to output`
			`output.append(token)`

			`return output`

			`def decompress(self, tokens: list[Token]) -> str:`
			`"""`
			`Convert the list of tokens into an output string.`

			`Args:`
			`tokens: list containing triplets (offset, length, char)`

			`Returns:`
			`output: decompressed text`

			`Tests:`
			`>>> lz77_compressor = LZ77Compressor()`
			`>>> lz77_compressor.decompress([Token(0, 0, 'c'), Token(0, 0, 'a'),`
			`... Token(0, 0, 'b'), Token(0, 0, 'r'), Token(3, 1, 'c'),`
			`... Token(2, 1, 'd'), Token(7, 4, 'r'), Token(3, 5, 'd')])`
			`'cabracadabrarrarrad'`
			`>>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(0, 0, 'b'),`
			`... Token(2, 2, 'c'), Token(4, 3, 'a'), Token(2, 2, 'a')])`
			`'ababcbababaa'`
			`>>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'c'),`
			`... Token(3, 4, 'b'), Token(3, 3, 'a'), Token(1, 2, 'c')])`
			`'aacaacabcabaaac'`
			`"""`

			`output = ""`

			`for token in tokens:`
			`for _ in range(token.length):`
			`output += output[-token.offset]`
			`output += token.indicator`

			`return output`

			`def _find_encoding_token(self, text: str, search_buffer: str) -> Token:`
			`"""Finds the encoding token for the first character in the text.`

			`Tests:`
			`>>> lz77_compressor = LZ77Compressor()`
			`>>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset`
			`7`
			`>>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac").length`
			`1`
			`>>> lz77_compressor._find_encoding_token("abc", "xyz").offset`
			`0`
			`>>> lz77_compressor._find_encoding_token("", "xyz").offset`
			`Traceback (most recent call last):`
			`...`
			`ValueError: We need some text to work with.`
			`>>> lz77_compressor._find_encoding_token("abc", "").offset`
			`0`
			`"""`

			`if not text:`
			`raise ValueError("We need some text to work with.")`

			`# Initialise result parameters to default values`
			`length, offset = 0, 0`

			`if not search_buffer:`
			`return Token(offset, length, text[length])`

			`for i, character in enumerate(search_buffer):`
			`found_offset = len(search_buffer) - i`
			`if character == text[0]:`
			`found_length = self._match_length_from_index(text, search_buffer, 0, i)`
			`# if the found length is bigger than the current or if it's equal,`
			`# which means it's offset is smaller: update offset and length`
			`if found_length >= length:`
			`offset, length = found_offset, found_length`

			`return Token(offset, length, text[length])`

			`def _match_length_from_index(`
			`self, text: str, window: str, text_index: int, window_index: int`
			`) -> int:`
			`"""Calculate the longest possible match of text and window characters from`
			`text_index in text and window_index in window.`

			`Args:`
			`text: _description_`
			`window: sliding window`
			`text_index: index of character in text`
			`window_index: index of character in sliding window`

			`Returns:`
			`The maximum match between text and window, from given indexes.`

			`Tests:`
			`>>> lz77_compressor = LZ77Compressor(13, 6)`
			`>>> lz77_compressor._match_length_from_index("rarrad", "adabrar", 0, 4)`
			`5`
			`>>> lz77_compressor._match_length_from_index("adabrarrarrad",`
			`... "cabrac", 0, 1)`
			`1`
			`"""`
			`if not text or text[text_index] != window[window_index]:`
			`return 0`
			`return 1 + self._match_length_from_index(`
			`text, window + text[text_index], text_index + 1, window_index + 1`
			`)`


			`if __name__ == "__main__":`
			`from doctest import testmod`

			`testmod()`
			`# Initialize compressor class`
			`lz77_compressor = LZ77Compressor(window_size=13, lookahead_buffer_size=6)`

			`# Example`
			`TEXT = "cabracadabrarrarrad"`
			`compressed_text = lz77_compressor.compress(TEXT)`
			`print(lz77_compressor.compress("ababcbababaa"))`
			`decompressed_text = lz77_compressor.decompress(compressed_text)`
			`assert decompressed_text == TEXT, "The LZ77 algorithm returned the invalid result."`