mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-02-07 18:10:55 +00:00
commented doctest
This commit is contained in:
parent
78d19fd19b
commit
8b1bec0dc6
|
@ -1,73 +1,101 @@
|
||||||
"""
|
"""
|
||||||
See https://en.wikipedia.org/wiki/Bloom_filter
|
See https://en.wikipedia.org/wiki/Bloom_filter
|
||||||
|
|
||||||
>>> b = Bloom()
|
The use of this data structure is to test membership in a set.
|
||||||
|
Compared to Python's built-in set() it is more space-efficient.
|
||||||
|
In the following example, only 8 bits of memory will be used:
|
||||||
|
>>> b = Bloom(size=8)
|
||||||
|
>>> "Titanic" in b
|
||||||
|
False
|
||||||
|
|
||||||
|
Initially the filter contains all zeros:
|
||||||
|
>>> b.bitstring
|
||||||
|
'00000000'
|
||||||
|
|
||||||
|
When an element is added, two bits are set to 1
|
||||||
|
since there are 2 hash functions:
|
||||||
>>> b.add("Titanic")
|
>>> b.add("Titanic")
|
||||||
|
>>> b.bitstring
|
||||||
|
'01100000'
|
||||||
|
>>> "Titanic" in b
|
||||||
|
True
|
||||||
|
|
||||||
|
However, sometimes only one bit is added
|
||||||
|
because both hash functions return the same value
|
||||||
>>> b.add("Avatar")
|
>>> b.add("Avatar")
|
||||||
>>> b.exists("Titanic")
|
>>> b.format_hash("Avatar")
|
||||||
|
'00000100'
|
||||||
|
>>> b.bitstring
|
||||||
|
'01100100'
|
||||||
|
|
||||||
|
Elements that were not added should return False ...
|
||||||
|
>>> "The Goodfather" in b
|
||||||
|
False
|
||||||
|
>>> b.format_hash("The Goodfather")
|
||||||
|
'00011000'
|
||||||
|
>>> "Interstellar" in b
|
||||||
|
False
|
||||||
|
>>> "Parasite" in b
|
||||||
|
False
|
||||||
|
>>> "Pulp Fiction" in b
|
||||||
|
False
|
||||||
|
|
||||||
|
but sometimes there are false positives:
|
||||||
|
>>> "Ratatouille" in b
|
||||||
True
|
True
|
||||||
>>> b.exists("Avatar")
|
>>> b.format_hash("Ratatouille")
|
||||||
True
|
'01100000'
|
||||||
>>> b.exists("The Goodfather")
|
|
||||||
False
|
>>> b.estimated_error_rate()
|
||||||
>>> b.exists("Interstellar")
|
0.140625
|
||||||
False
|
|
||||||
>>> b.exists("Parasite")
|
|
||||||
False
|
|
||||||
>>> b.exists("Pulp Fiction")
|
|
||||||
False
|
|
||||||
"""
|
"""
|
||||||
from hashlib import md5, sha256
|
from hashlib import md5, sha256
|
||||||
from random import choices
|
from random import choices
|
||||||
from string import ascii_lowercase
|
from string import ascii_lowercase
|
||||||
|
|
||||||
|
HASH_FUNCTIONS = (sha256, md5)
|
||||||
|
|
||||||
|
|
||||||
class Bloom:
|
class Bloom:
|
||||||
# number of hash functions is fixed
|
|
||||||
HASH_FUNCTIONS = (sha256, md5)
|
|
||||||
|
|
||||||
def __init__(self, size: int = 8) -> None:
|
def __init__(self, size: int = 8) -> None:
|
||||||
self.bitstring = 0b0
|
self.bitarray = 0b0
|
||||||
self.size = size
|
self.size = size
|
||||||
|
|
||||||
def add(self, value: str) -> None:
|
def add(self, value: str) -> None:
|
||||||
h = self.hash_(value)
|
h = self.hash_(value)
|
||||||
self.bitstring |= h
|
self.bitarray |= h
|
||||||
|
|
||||||
# print(
|
|
||||||
# f"""\
|
|
||||||
# [add] value = {value}
|
|
||||||
# hash = {self.format_bin(h)}
|
|
||||||
# filter = {self.format_bin(self.bitstring)}
|
|
||||||
# """
|
|
||||||
# )
|
|
||||||
|
|
||||||
def exists(self, value: str) -> bool:
|
def exists(self, value: str) -> bool:
|
||||||
h = self.hash_(value)
|
h = self.hash_(value)
|
||||||
res = (h & self.bitstring) == h
|
return (h & self.bitarray) == h
|
||||||
|
|
||||||
# print(
|
def __contains__(self, other):
|
||||||
# f"""\
|
return self.exists(other)
|
||||||
# [exists] value = {value}
|
|
||||||
# hash = {self.format_bin(h)}
|
|
||||||
# filter = {self.format_bin(self.bitstring)}
|
|
||||||
# res = {res}
|
|
||||||
# """
|
|
||||||
# )
|
|
||||||
return res
|
|
||||||
|
|
||||||
def format_bin(self, value: int) -> str:
|
def format_bin(self, bitarray: int) -> str:
|
||||||
res = bin(value)[2:]
|
res = bin(bitarray)[2:]
|
||||||
return res.zfill(self.size)
|
return res.zfill(self.size)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def bitstring(self):
|
||||||
|
return self.format_bin(self.bitarray)
|
||||||
|
|
||||||
def hash_(self, value: str) -> int:
|
def hash_(self, value: str) -> int:
|
||||||
res = 0b0
|
res = 0b0
|
||||||
for func in self.HASH_FUNCTIONS:
|
for func in HASH_FUNCTIONS:
|
||||||
b = func(value.encode()).digest()
|
b = func(value.encode()).digest()
|
||||||
position = int.from_bytes(b, "little") % self.size
|
position = int.from_bytes(b, "little") % self.size
|
||||||
res |= 2**position
|
res |= 2**position
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
def format_hash(self, value: str) -> str:
|
||||||
|
return self.format_bin(self.hash_(value))
|
||||||
|
|
||||||
|
def estimated_error_rate(self):
|
||||||
|
n_ones = bin(self.bitarray).count("1")
|
||||||
|
k = len(HASH_FUNCTIONS)
|
||||||
|
return (n_ones / self.size) ** k
|
||||||
|
|
||||||
|
|
||||||
def random_string(size: int) -> str:
|
def random_string(size: int) -> str:
|
||||||
return "".join(choices(ascii_lowercase + " ", k=size))
|
return "".join(choices(ascii_lowercase + " ", k=size))
|
||||||
|
@ -76,7 +104,7 @@ def random_string(size: int) -> str:
|
||||||
def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
|
def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
|
||||||
b = Bloom(size=filter_bits)
|
b = Bloom(size=filter_bits)
|
||||||
|
|
||||||
k = len(b.HASH_FUNCTIONS)
|
k = len(HASH_FUNCTIONS)
|
||||||
estimated_error_rate_beforehand = (
|
estimated_error_rate_beforehand = (
|
||||||
1 - (1 - 1 / filter_bits) ** (k * added_elements)
|
1 - (1 - 1 / filter_bits) ** (k * added_elements)
|
||||||
) ** k
|
) ** k
|
||||||
|
@ -85,7 +113,7 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
|
||||||
for _ in range(added_elements):
|
for _ in range(added_elements):
|
||||||
b.add(not_added.pop())
|
b.add(not_added.pop())
|
||||||
|
|
||||||
n_ones = bin(b.bitstring).count("1")
|
n_ones = bin(b.bitarray).count("1")
|
||||||
estimated_error_rate = (n_ones / filter_bits) ** k
|
estimated_error_rate = (n_ones / filter_bits) ** k
|
||||||
|
|
||||||
errors = 0
|
errors = 0
|
||||||
|
|
Loading…
Reference in New Issue
Block a user