mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-02-07 18:10:55 +00:00
Bloom filter with tests
This commit is contained in:
parent
b2b8585e63
commit
173ab0ea96
103
data_structures/hashing/bloom_filter.py
Normal file
103
data_structures/hashing/bloom_filter.py
Normal file
|
@ -0,0 +1,103 @@
|
|||
"""
|
||||
See https://en.wikipedia.org/wiki/Bloom_filter
|
||||
"""
|
||||
from hashlib import sha256, md5
|
||||
from random import randint, choices
|
||||
import string
|
||||
|
||||
|
||||
class Bloom:
    """Bloom filter backed by a single Python integer used as a bit array.

    Every value is mapped to at most two bit positions: one derived from its
    SHA-256 digest and one from its MD5 digest.  ``add`` switches those bits
    on, ``exists`` reports whether all of them are already on.  A ``False``
    answer is therefore definitive, whereas ``True`` may be a false positive.

    >>> Bloom(size=8).format_bin(5)
    '00000101'
    """

    def __init__(self, size: int = 8, *, verbose: bool = True) -> None:
        """Create an empty filter.

        Args:
            size: number of bit positions in the filter; must be positive.
            verbose: when true (the default, which keeps the historical
                output) ``add`` and ``exists`` print the hash and filter
                bits on every call.

        Raises:
            ValueError: if ``size`` is not a positive number.
        """
        # Fail fast: size 0 would raise ZeroDivisionError inside ``hash`` and
        # a negative size would produce a negative bit position.
        if size <= 0:
            raise ValueError(f"size must be a positive integer, got {size!r}")
        self.bitstring = 0b0
        self.size = size
        self.verbose = verbose

    def add(self, value: str) -> None:
        """Switch on the bits that ``value`` hashes to."""
        h = self.hash(value)
        self.bitstring |= h
        if self.verbose:
            print(
                f"""\
[add] value = {value}
hash = {self.format_bin(h)}
filter = {self.format_bin(self.bitstring)}
"""
            )

    def exists(self, value: str) -> bool:
        """Return True if every bit of ``value``'s hash is set in the filter.

        ``False`` means the value was never added; ``True`` means it was
        either added or collides with bits set by other values.
        """
        h = self.hash(value)
        res = (h & self.bitstring) == h

        if self.verbose:
            print(
                f"""\
[exists] value = {value}
hash = {self.format_bin(h)}
filter = {self.format_bin(self.bitstring)}
res = {res}
"""
            )
        return res

    def __contains__(self, value: str) -> bool:
        """Support ``value in bloom`` as a synonym for ``exists(value)``."""
        return self.exists(value)

    def format_bin(self, value: int) -> str:
        """Return ``value`` in binary, left-padded with zeros to ``size`` digits."""
        res = bin(value)[2:]
        return res.zfill(self.size)

    def hash(self, value: str) -> int:
        """Return the bit mask for ``value``.

        Each of the two digests is read as a little-endian integer and
        reduced modulo ``size`` to pick one bit position.  Both digests may
        select the same position, so the mask has one or two bits set.
        """
        res = 0b0
        for func in (sha256, md5):
            digest = func(value.encode()).digest()
            position = int.from_bytes(digest, "little") % self.size
            # Integer shift instead of 2**position: same value, stays an int.
            res |= 1 << position
        return res
|
||||
|
||||
|
||||
def test_movies():
    """Check the filter's guarantees on a handful of film titles.

    Verifies that an empty filter reports nothing as present, that added
    titles are always found (no false negatives), and that a lookup for a
    title that was never added still yields a proper boolean.
    """
    b = Bloom()

    # Every hash has at least one bit set, so an empty filter matches nothing.
    assert not b.exists("titanic")

    b.add("titanic")
    b.add("avatar")

    # Added values must always be reported as present.
    assert b.exists("titanic")
    assert b.exists("avatar")

    # Titles that were never added may be false positives in such a small
    # filter, so the answer itself cannot be pinned -- only its type.
    # (The previous ``in (True, False)`` checks could never fail.)
    for title in ("the goodfather", "interstellar", "Parasite", "Pulp fiction"):
        assert isinstance(b.exists(title), bool)
|
||||
|
||||
|
||||
def random_string(size):
    """Return ``size`` characters drawn at random from a-z and the space."""
    alphabet = string.ascii_lowercase + " "
    picked = choices(alphabet, k=size)
    return "".join(picked)
|
||||
|
||||
|
||||
def test_probability(m=64, n=20):
    """Compare the measured false-positive rate with the predicted one.

    Fills a filter of ``m`` bits with ``n`` random strings, then looks up
    random strings that were not added and counts how many are reported as
    present.  The measured rate must lie within 0.05 of the rate predicted
    from the fraction of bits that are actually set.

    Args:
        m: number of bits in the filter.
        n: number of random strings to add (duplicates collapse, so slightly
            fewer may end up in the filter).
    """
    b = Bloom(size=m)

    added = {random_string(10) for _ in range(n)}
    for item in added:
        b.add(item)

    # number of hash functions is fixed
    k = 2

    # A lookup is a false positive when each of its k positions lands on a
    # set bit, hence (fraction of set bits) ** k.
    n_ones = bin(b.bitstring).count("1")
    expected_probability = (n_ones / m) ** k

    # Closed-form estimate that does not inspect the filter; printed for
    # comparison only.
    expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k

    # Drop anything that was really added so every hit below is a genuine
    # false positive.
    not_added = {random_string(10) for _ in range(1000)} - added
    fails = 0
    for candidate in not_added:
        if b.exists(candidate):
            fails += 1
    fail_rate = fails / len(not_added)

    print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}")
    print(f"{expected_probability=}")
    print(f"{expected_probability_wikipedia=}")

    assert (
        abs(expected_probability - fail_rate) <= 0.05
    )  # 5% margin calculated experimentally
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run each self-check in order when the module is executed as a script.
    for check in (test_movies, test_probability):
        check()
|
Loading…
Reference in New Issue
Block a user