"""
https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform

The Burrows–Wheeler transform (BWT, also called block-sorting compression)
rearranges a character string into runs of similar characters. This is useful
for compression, since it tends to be easy to compress a string that has runs
of repeated characters by techniques such as move-to-front transform and
run-length encoding. More importantly, the transformation is reversible,
without needing to store any additional data except the position of the first
original character. The BWT is thus a "free" method of improving the efficiency
of text compression algorithms, costing only some extra computation.
"""
from typing import List, Dict


def all_rotations(s: str) -> List[str]:
    """
    Build every left rotation of s, starting with s itself.

    :param s: The string that will be rotated len(s) times.
    :return: A list with the rotations; rotation i is s[i:] + s[:i]. An empty
    string yields an empty list.
    :raises TypeError: If s is not an instance of str.
    Examples:

    >>> all_rotations("^BANANA|") # doctest: +NORMALIZE_WHITESPACE
    ['^BANANA|', 'BANANA|^', 'ANANA|^B', 'NANA|^BA', 'ANA|^BAN', 'NA|^BANA',
    'A|^BANAN', '|^BANANA']
    >>> all_rotations("a_asa_da_casa") # doctest: +NORMALIZE_WHITESPACE
    ['a_asa_da_casa', '_asa_da_casaa', 'asa_da_casaa_', 'sa_da_casaa_a',
    'a_da_casaa_as', '_da_casaa_asa', 'da_casaa_asa_', 'a_casaa_asa_d',
    '_casaa_asa_da', 'casaa_asa_da_', 'asaa_asa_da_c', 'saa_asa_da_ca',
    'aa_asa_da_cas']
    >>> all_rotations("panamabanana") # doctest: +NORMALIZE_WHITESPACE
    ['panamabanana', 'anamabananap', 'namabananapa', 'amabananapan',
    'mabananapana', 'abananapanam', 'bananapanama', 'ananapanamab',
    'nanapanamaba', 'anapanamaban', 'napanamabana', 'apanamabanan']
    >>> all_rotations(5)
    Traceback (most recent call last):
        ...
    TypeError: The parameter s type must be str.
    """
    if not isinstance(s, str):
        raise TypeError("The parameter s type must be str.")

    return [s[i:] + s[:i] for i in range(len(s))]


def bwt_transform(s: str) -> Dict:
    """
    Compute the Burrows-Wheeler transform of s.

    :param s: The string that will be used at bwt algorithm
    :return: A dict with two keys: "bwt_string", the string composed of the
    last char of each row of the sorted rotations, and "idx_original_string",
    the 0-based index of s inside the sorted rotations list
    :raises TypeError: If the s parameter type is not str
    :raises ValueError: If the s parameter is empty
    Examples:

    >>> bwt_transform("^BANANA")
    {'bwt_string': 'BNN^AAA', 'idx_original_string': 6}
    >>> bwt_transform("a_asa_da_casa")
    {'bwt_string': 'aaaadss_c__aa', 'idx_original_string': 3}
    >>> bwt_transform("panamabanana")
    {'bwt_string': 'mnpbnnaaaaaa', 'idx_original_string': 11}
    >>> bwt_transform(4)
    Traceback (most recent call last):
        ...
    TypeError: The parameter s type must be str.
    >>> bwt_transform('')
    Traceback (most recent call last):
        ...
    ValueError: The parameter s must not be empty.
    """
    if not isinstance(s, str):
        raise TypeError("The parameter s type must be str.")
    if not s:
        raise ValueError("The parameter s must not be empty.")

    # Rotations in lexicographic (code point) order; the transform is the
    # column formed by the last character of each sorted rotation.
    rotations = sorted(all_rotations(s))
    return {
        "bwt_string": "".join(word[-1] for word in rotations),
        "idx_original_string": rotations.index(s),
    }


def reverse_bwt(bwt_string: str, idx_original_string: int) -> str:
    """
    Rebuild the original string from its Burrows-Wheeler transform.

    :param bwt_string: The string returned from bwt algorithm execution
    :param idx_original_string: A 0-based index of the string that was used to
    generate bwt_string at ordered rotations list. Values accepted by int()
    (e.g. floats) are converted with int() before use.
    :return: The string used to generate bwt_string when bwt was executed
    :raises TypeError: If the bwt_string parameter type is not str
    :raises ValueError: If the bwt_string parameter is empty
    :raises TypeError: If the idx_original_string type is not int or if not
    possible to cast it to int
    :raises ValueError: If the idx_original_string value is lower than 0 or
    greater than len(bwt_string) - 1

    >>> reverse_bwt("BNN^AAA", 6)
    '^BANANA'
    >>> reverse_bwt("aaaadss_c__aa", 3)
    'a_asa_da_casa'
    >>> reverse_bwt("mnpbnnaaaaaa", 11)
    'panamabanana'
    >>> reverse_bwt(4, 11)
    Traceback (most recent call last):
        ...
    TypeError: The parameter bwt_string type must be str.
    >>> reverse_bwt("", 11)
    Traceback (most recent call last):
        ...
    ValueError: The parameter bwt_string must not be empty.
    >>> reverse_bwt("mnpbnnaaaaaa", "asd") # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: The parameter idx_original_string type must be int or passive
    of cast to int.
    >>> reverse_bwt("mnpbnnaaaaaa", None) # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: The parameter idx_original_string type must be int or passive
    of cast to int.
    >>> reverse_bwt("mnpbnnaaaaaa", -1)
    Traceback (most recent call last):
        ...
    ValueError: The parameter idx_original_string must not be lower than 0.
    >>> reverse_bwt("mnpbnnaaaaaa", 12) # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    ValueError: The parameter idx_original_string must be lower than
    len(bwt_string).
    >>> reverse_bwt("mnpbnnaaaaaa", 11.0)
    'panamabanana'
    >>> reverse_bwt("mnpbnnaaaaaa", 11.4)
    'panamabanana'
    """
    if not isinstance(bwt_string, str):
        raise TypeError("The parameter bwt_string type must be str.")
    if not bwt_string:
        raise ValueError("The parameter bwt_string must not be empty.")
    try:
        idx_original_string = int(idx_original_string)
    except (TypeError, ValueError):
        # int() raises ValueError for unparsable strings but TypeError for
        # unsupported types such as None or list; report both uniformly with
        # the documented message.
        raise TypeError(
            "The parameter idx_original_string type must be int or passive"
            " of cast to int."
        ) from None
    if idx_original_string < 0:
        raise ValueError(
            "The parameter idx_original_string must not be lower than 0."
        )
    if idx_original_string >= len(bwt_string):
        raise ValueError(
            "The parameter idx_original_string must be lower than"
            " len(bwt_string)."
        )

    # Naive inversion: repeatedly prepend the BWT column to the table and
    # re-sort. After len(bwt_string) rounds each row is a full rotation and
    # the table equals the sorted rotation list built by the forward pass.
    ordered_rotations = [""] * len(bwt_string)
    for _ in range(len(bwt_string)):
        ordered_rotations = sorted(
            char + suffix
            for char, suffix in zip(bwt_string, ordered_rotations)
        )
    return ordered_rotations[idx_original_string]


def main() -> None:
    """Read a string from stdin, print its BWT and the reversed BWT."""
    entry_msg = "Provide a string that I will generate its BWT transform: "
    s = input(entry_msg).strip()
    result = bwt_transform(s)
    bwt_output_msg = (
        "Burrows Wheeler transform for string '{}' results in '{}'"
    )
    print(bwt_output_msg.format(s, result["bwt_string"]))
    original_string = reverse_bwt(
        result["bwt_string"], result["idx_original_string"]
    )
    fmt = (
        "Reversing Burrows Wheeler transform for entry '{}' we get original"
        " string '{}'"
    )
    print(fmt.format(result["bwt_string"], original_string))


if __name__ == "__main__":
    main()