commented doctest

This commit is contained in:
Isidro Arias 2023-04-07 17:21:38 +02:00
parent 78d19fd19b
commit 8b1bec0dc6

View File

@ -1,73 +1,101 @@
""" """
See https://en.wikipedia.org/wiki/Bloom_filter See https://en.wikipedia.org/wiki/Bloom_filter
>>> b = Bloom() The use of this data structure is to test membership in a set.
Compared to python built-in set() it is more space-efficent.
In the following example, only 8 bits of memory will be used:
>>> b = Bloom(size=8)
>>> "Titanic" in b
False
Initially the filter contains all zeros:
>>> b.bitstring
'00000000'
When an element is added, two bits are set to 1
since there are 2 hash functions:
>>> b.add("Titanic") >>> b.add("Titanic")
>>> b.bitstring
'01100000'
>>> "Titanic" in b
True
However, sometimes only one bit is added
because both hash functions return the same value
>>> b.add("Avatar") >>> b.add("Avatar")
>>> b.exists("Titanic") >>> b.format_hash("Avatar")
'00000100'
>>> b.bitstring
'01100100'
Not added elements should return False ...
>>> "The Goodfather" in b
False
>>> b.format_hash("The Goodfather")
'00011000'
>>> "Interstellar" in b
False
>>> "Parasite" in b
False
>>> "Pulp Fiction" in b
False
but sometimes there are false positives:
>>> "Ratatouille" in b
True True
>>> b.exists("Avatar") >>> b.format_hash("Ratatouille")
True '01100000'
>>> b.exists("The Goodfather")
False >>> b.estimated_error_rate()
>>> b.exists("Interstellar") 0.140625
False
>>> b.exists("Parasite")
False
>>> b.exists("Pulp Fiction")
False
""" """
from hashlib import md5, sha256 from hashlib import md5, sha256
from random import choices from random import choices
from string import ascii_lowercase from string import ascii_lowercase
HASH_FUNCTIONS = (sha256, md5)
class Bloom: class Bloom:
# number of hash functions is fixed
HASH_FUNCTIONS = (sha256, md5)
def __init__(self, size: int = 8) -> None: def __init__(self, size: int = 8) -> None:
self.bitstring = 0b0 self.bitarray = 0b0
self.size = size self.size = size
def add(self, value: str) -> None: def add(self, value: str) -> None:
h = self.hash_(value) h = self.hash_(value)
self.bitstring |= h self.bitarray |= h
# print(
# f"""\
# [add] value = {value}
# hash = {self.format_bin(h)}
# filter = {self.format_bin(self.bitstring)}
# """
# )
def exists(self, value: str) -> bool: def exists(self, value: str) -> bool:
h = self.hash_(value) h = self.hash_(value)
res = (h & self.bitstring) == h return (h & self.bitarray) == h
# print( def __contains__(self, other):
# f"""\ return self.exists(other)
# [exists] value = {value}
# hash = {self.format_bin(h)}
# filter = {self.format_bin(self.bitstring)}
# res = {res}
# """
# )
return res
def format_bin(self, value: int) -> str: def format_bin(self, bitarray: int) -> str:
res = bin(value)[2:] res = bin(bitarray)[2:]
return res.zfill(self.size) return res.zfill(self.size)
@property
def bitstring(self):
return self.format_bin(self.bitarray)
def hash_(self, value: str) -> int: def hash_(self, value: str) -> int:
res = 0b0 res = 0b0
for func in self.HASH_FUNCTIONS: for func in HASH_FUNCTIONS:
b = func(value.encode()).digest() b = func(value.encode()).digest()
position = int.from_bytes(b, "little") % self.size position = int.from_bytes(b, "little") % self.size
res |= 2**position res |= 2**position
return res return res
def format_hash(self, value: str) -> str:
return self.format_bin(self.hash_(value))
def estimated_error_rate(self):
n_ones = bin(self.bitarray).count("1")
k = len(HASH_FUNCTIONS)
return (n_ones / self.size) ** k
def random_string(size: int) -> str: def random_string(size: int) -> str:
return "".join(choices(ascii_lowercase + " ", k=size)) return "".join(choices(ascii_lowercase + " ", k=size))
@ -76,7 +104,7 @@ def random_string(size: int) -> str:
def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
b = Bloom(size=filter_bits) b = Bloom(size=filter_bits)
k = len(b.HASH_FUNCTIONS) k = len(HASH_FUNCTIONS)
estimated_error_rate_beforehand = ( estimated_error_rate_beforehand = (
1 - (1 - 1 / filter_bits) ** (k * added_elements) 1 - (1 - 1 / filter_bits) ** (k * added_elements)
) ** k ) ** k
@ -85,7 +113,7 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
for _ in range(added_elements): for _ in range(added_elements):
b.add(not_added.pop()) b.add(not_added.pop())
n_ones = bin(b.bitstring).count("1") n_ones = bin(b.bitarray).count("1")
estimated_error_rate = (n_ones / filter_bits) ** k estimated_error_rate = (n_ones / filter_bits) ** k
errors = 0 errors = 0