From 8b1bec0dc6d5474d749210a70c0221f6527b2258 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Fri, 7 Apr 2023 17:21:38 +0200 Subject: [PATCH] commented doctest --- data_structures/hashing/bloom_filter.py | 108 +++++++++++++++--------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 02e01713b..9f6048911 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -1,73 +1,101 @@ """ See https://en.wikipedia.org/wiki/Bloom_filter ->>> b = Bloom() +The use of this data structure is to test membership in a set. +Compared to Python's built-in set() it is more space-efficient. +In the following example, only 8 bits of memory will be used: +>>> b = Bloom(size=8) +>>> "Titanic" in b +False + +Initially the filter contains all zeros: +>>> b.bitstring +'00000000' + +When an element is added, two bits are set to 1 +since there are 2 hash functions: >>> b.add("Titanic") +>>> b.bitstring +'01100000' +>>> "Titanic" in b +True + +However, sometimes only one bit is added +because both hash functions return the same value >>> b.add("Avatar") ->>> b.exists("Titanic") +>>> b.format_hash("Avatar") +'00000100' +>>> b.bitstring +'01100100' + +Not added elements should return False ... 
+>>> "The Goodfather" in b +False +>>> b.format_hash("The Goodfather") +'00011000' +>>> "Interstellar" in b +False +>>> "Parasite" in b +False +>>> "Pulp Fiction" in b +False + +but sometimes there are false positives: +>>> "Ratatouille" in b True ->>> b.exists("Avatar") -True ->>> b.exists("The Goodfather") -False ->>> b.exists("Interstellar") -False ->>> b.exists("Parasite") -False ->>> b.exists("Pulp Fiction") -False +>>> b.format_hash("Ratatouille") +'01100000' + +>>> b.estimated_error_rate() +0.140625 """ from hashlib import md5, sha256 from random import choices from string import ascii_lowercase +HASH_FUNCTIONS = (sha256, md5) + class Bloom: - # number of hash functions is fixed - HASH_FUNCTIONS = (sha256, md5) - def __init__(self, size: int = 8) -> None: - self.bitstring = 0b0 + self.bitarray = 0b0 self.size = size def add(self, value: str) -> None: h = self.hash_(value) - self.bitstring |= h - - # print( - # f"""\ - # [add] value = {value} - # hash = {self.format_bin(h)} - # filter = {self.format_bin(self.bitstring)} - # """ - # ) + self.bitarray |= h def exists(self, value: str) -> bool: h = self.hash_(value) - res = (h & self.bitstring) == h + return (h & self.bitarray) == h - # print( - # f"""\ - # [exists] value = {value} - # hash = {self.format_bin(h)} - # filter = {self.format_bin(self.bitstring)} - # res = {res} - # """ - # ) - return res + def __contains__(self, other): + return self.exists(other) - def format_bin(self, value: int) -> str: - res = bin(value)[2:] + def format_bin(self, bitarray: int) -> str: + res = bin(bitarray)[2:] return res.zfill(self.size) + @property + def bitstring(self): + return self.format_bin(self.bitarray) + def hash_(self, value: str) -> int: res = 0b0 - for func in self.HASH_FUNCTIONS: + for func in HASH_FUNCTIONS: b = func(value.encode()).digest() position = int.from_bytes(b, "little") % self.size res |= 2**position return res + def format_hash(self, value: str) -> str: + return self.format_bin(self.hash_(value)) + + 
def estimated_error_rate(self): + n_ones = bin(self.bitarray).count("1") + k = len(HASH_FUNCTIONS) + return (n_ones / self.size) ** k + def random_string(size: int) -> str: return "".join(choices(ascii_lowercase + " ", k=size)) @@ -76,7 +104,7 @@ def random_string(size: int) -> str: def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: b = Bloom(size=filter_bits) - k = len(b.HASH_FUNCTIONS) + k = len(HASH_FUNCTIONS) estimated_error_rate_beforehand = ( 1 - (1 - 1 / filter_bits) ** (k * added_elements) ) ** k @@ -85,7 +113,7 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: for _ in range(added_elements): b.add(not_added.pop()) - n_ones = bin(b.bitstring).count("1") + n_ones = bin(b.bitarray).count("1") estimated_error_rate = (n_ones / filter_bits) ** k errors = 0