Python/compression/burrows_wheeler.py
Caeden 4d0c830d2c
Add flake8 pluin flake8 bugbear to pre-commit (#7132)
* ci(pre-commit): Add ``flake8-builtins`` additional dependency to ``pre-commit`` (#7104)

* refactor: Fix ``flake8-builtins`` (#7104)

* fix(lru_cache): Fix naming conventions in docstrings (#7104)

* ci(pre-commit): Order additional dependencies alphabetically (#7104)

* fix(lfu_cache): Correct function name in docstring (#7104)

* Update strings/snake_case_to_camel_pascal_case.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update data_structures/stacks/next_greater_element.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update digital_image_processing/index_calculation.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update graphs/prim.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update hashes/djb2.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* refactor: Rename `_builtin` to `builtin_` ( #7104)

* fix: Rename all instances (#7104)

* refactor: Update variable names (#7104)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* ci: Create ``tox.ini`` and ignore ``A003`` (#7123)

* revert: Remove function name changes (#7104)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Rename tox.ini to .flake8

* Update data_structures/heap/heap.py

Co-authored-by: Dhruv Manilawala <dhruvmanila@gmail.com>

* refactor: Rename `next_` to `next_item` (#7104)

* ci(pre-commit): Add `flake8` plugin `flake8-bugbear` (#7127)

* refactor: Follow `flake8-bugbear` plugin (#7127)

* fix: Correct `knapsack` code (#7127)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: Christian Clauss <cclauss@me.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Dhruv Manilawala <dhruvmanila@gmail.com>
2022-10-13 18:03:06 +02:00

177 lines
6.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
The BurrowsWheeler transform (BWT, also called block-sorting compression)
rearranges a character string into runs of similar characters. This is useful
for compression, since it tends to be easy to compress a string that has runs
of repeated characters by techniques such as move-to-front transform and
run-length encoding. More importantly, the transformation is reversible,
without needing to store any additional data except the position of the first
original character. The BWT is thus a "free" method of improving the efficiency
of text compression algorithms, costing only some extra computation.
"""
from __future__ import annotations
from typing import TypedDict
class BWTTransformDict(TypedDict):
bwt_string: str
idx_original_string: int
def all_rotations(s: str) -> list[str]:
"""
:param s: The string that will be rotated len(s) times.
:return: A list with the rotations.
:raises TypeError: If s is not an instance of str.
Examples:
>>> all_rotations("^BANANA|") # doctest: +NORMALIZE_WHITESPACE
['^BANANA|', 'BANANA|^', 'ANANA|^B', 'NANA|^BA', 'ANA|^BAN', 'NA|^BANA',
'A|^BANAN', '|^BANANA']
>>> all_rotations("a_asa_da_casa") # doctest: +NORMALIZE_WHITESPACE
['a_asa_da_casa', '_asa_da_casaa', 'asa_da_casaa_', 'sa_da_casaa_a',
'a_da_casaa_as', '_da_casaa_asa', 'da_casaa_asa_', 'a_casaa_asa_d',
'_casaa_asa_da', 'casaa_asa_da_', 'asaa_asa_da_c', 'saa_asa_da_ca',
'aa_asa_da_cas']
>>> all_rotations("panamabanana") # doctest: +NORMALIZE_WHITESPACE
['panamabanana', 'anamabananap', 'namabananapa', 'amabananapan',
'mabananapana', 'abananapanam', 'bananapanama', 'ananapanamab',
'nanapanamaba', 'anapanamaban', 'napanamabana', 'apanamabanan']
>>> all_rotations(5)
Traceback (most recent call last):
...
TypeError: The parameter s type must be str.
"""
if not isinstance(s, str):
raise TypeError("The parameter s type must be str.")
return [s[i:] + s[:i] for i in range(len(s))]
def bwt_transform(s: str) -> BWTTransformDict:
"""
:param s: The string that will be used at bwt algorithm
:return: the string composed of the last char of each row of the ordered
rotations and the index of the original string at ordered rotations list
:raises TypeError: If the s parameter type is not str
:raises ValueError: If the s parameter is empty
Examples:
>>> bwt_transform("^BANANA")
{'bwt_string': 'BNN^AAA', 'idx_original_string': 6}
>>> bwt_transform("a_asa_da_casa")
{'bwt_string': 'aaaadss_c__aa', 'idx_original_string': 3}
>>> bwt_transform("panamabanana")
{'bwt_string': 'mnpbnnaaaaaa', 'idx_original_string': 11}
>>> bwt_transform(4)
Traceback (most recent call last):
...
TypeError: The parameter s type must be str.
>>> bwt_transform('')
Traceback (most recent call last):
...
ValueError: The parameter s must not be empty.
"""
if not isinstance(s, str):
raise TypeError("The parameter s type must be str.")
if not s:
raise ValueError("The parameter s must not be empty.")
rotations = all_rotations(s)
rotations.sort() # sort the list of rotations in alphabetically order
# make a string composed of the last char of each rotation
response: BWTTransformDict = {
"bwt_string": "".join([word[-1] for word in rotations]),
"idx_original_string": rotations.index(s),
}
return response
def reverse_bwt(bwt_string: str, idx_original_string: int) -> str:
"""
:param bwt_string: The string returned from bwt algorithm execution
:param idx_original_string: A 0-based index of the string that was used to
generate bwt_string at ordered rotations list
:return: The string used to generate bwt_string when bwt was executed
:raises TypeError: If the bwt_string parameter type is not str
:raises ValueError: If the bwt_string parameter is empty
:raises TypeError: If the idx_original_string type is not int or if not
possible to cast it to int
:raises ValueError: If the idx_original_string value is lower than 0 or
greater than len(bwt_string) - 1
>>> reverse_bwt("BNN^AAA", 6)
'^BANANA'
>>> reverse_bwt("aaaadss_c__aa", 3)
'a_asa_da_casa'
>>> reverse_bwt("mnpbnnaaaaaa", 11)
'panamabanana'
>>> reverse_bwt(4, 11)
Traceback (most recent call last):
...
TypeError: The parameter bwt_string type must be str.
>>> reverse_bwt("", 11)
Traceback (most recent call last):
...
ValueError: The parameter bwt_string must not be empty.
>>> reverse_bwt("mnpbnnaaaaaa", "asd") # doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
TypeError: The parameter idx_original_string type must be int or passive
of cast to int.
>>> reverse_bwt("mnpbnnaaaaaa", -1)
Traceback (most recent call last):
...
ValueError: The parameter idx_original_string must not be lower than 0.
>>> reverse_bwt("mnpbnnaaaaaa", 12) # doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: The parameter idx_original_string must be lower than
len(bwt_string).
>>> reverse_bwt("mnpbnnaaaaaa", 11.0)
'panamabanana'
>>> reverse_bwt("mnpbnnaaaaaa", 11.4)
'panamabanana'
"""
if not isinstance(bwt_string, str):
raise TypeError("The parameter bwt_string type must be str.")
if not bwt_string:
raise ValueError("The parameter bwt_string must not be empty.")
try:
idx_original_string = int(idx_original_string)
except ValueError:
raise TypeError(
"The parameter idx_original_string type must be int or passive"
" of cast to int."
)
if idx_original_string < 0:
raise ValueError("The parameter idx_original_string must not be lower than 0.")
if idx_original_string >= len(bwt_string):
raise ValueError(
"The parameter idx_original_string must be lower than" " len(bwt_string)."
)
ordered_rotations = [""] * len(bwt_string)
for _ in range(len(bwt_string)):
for i in range(len(bwt_string)):
ordered_rotations[i] = bwt_string[i] + ordered_rotations[i]
ordered_rotations.sort()
return ordered_rotations[idx_original_string]
if __name__ == "__main__":
entry_msg = "Provide a string that I will generate its BWT transform: "
s = input(entry_msg).strip()
result = bwt_transform(s)
print(
f"Burrows Wheeler transform for string '{s}' results "
f"in '{result['bwt_string']}'"
)
original_string = reverse_bwt(result["bwt_string"], result["idx_original_string"])
print(
f"Reversing Burrows Wheeler transform for entry '{result['bwt_string']}' "
f"we get original string '{original_string}'"
)