Compare commits

..

3 Commits

Author SHA1 Message Date
isidroas
14bdd174bb
Bloom Filter (#8615)
* Bloom filter with tests

* has functions constant

* fix type

* isort

* passing ruff

* type hints

* type hints

* from fail to erro

* captital leter

* type hints requested by boot

* descriptive name for m

* more descriptibe arguments II

* moved movies_test to doctest

* commented doctest

* removed test_probability

* estimated error

* added types

* again hash_

* Update data_structures/hashing/bloom_filter.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* from b to bloom

* Update data_structures/hashing/bloom_filter.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update data_structures/hashing/bloom_filter.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* syntax error in dict comprehension

* from goodfather to godfather

* removed Interestellar

* forgot the last Godfather

* Revert "removed Interestellar"

This reverts commit 35fa5f5c4bf101d073aad43c37b0a423d8975071.

* pretty dict

* Apply suggestions from code review

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update bloom_filter.py

---------

Co-authored-by: Christian Clauss <cclauss@me.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-04-08 19:39:24 +02:00
Christian Clauss
2f9b03393c
Delete queue_on_two_stacks.py which duplicates queue_by_two_stacks.py (#8624)
* Delete queue_on_two_stacks.py which duplicates queue_by_two_stacks.py

* updating DIRECTORY.md

---------

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
2023-04-08 17:46:19 +05:30
amirsoroush
5cb0a000c4
Queue implementation using two Stacks (#8617)
* Queue implementation using two Stacks

* fix typo in queue/queue_on_two_stacks.py

* add 'iterable' to queue_on_two_stacks initializer

* make queue_on_two_stacks.py generic class

* fix ruff-UP007 in queue_on_two_stacks.py

* enhance readability in queue_on_two_stacks.py

* Create queue_by_two_stacks.py

---------

Co-authored-by: Christian Clauss <cclauss@me.com>
2023-04-08 13:41:08 +02:00
3 changed files with 221 additions and 0 deletions

View File

@ -232,6 +232,7 @@
* [Double Ended Queue](data_structures/queue/double_ended_queue.py) * [Double Ended Queue](data_structures/queue/double_ended_queue.py)
* [Linked Queue](data_structures/queue/linked_queue.py) * [Linked Queue](data_structures/queue/linked_queue.py)
* [Priority Queue Using List](data_structures/queue/priority_queue_using_list.py) * [Priority Queue Using List](data_structures/queue/priority_queue_using_list.py)
* [Queue By Two Stacks](data_structures/queue/queue_by_two_stacks.py)
* [Queue On List](data_structures/queue/queue_on_list.py) * [Queue On List](data_structures/queue/queue_on_list.py)
* [Queue On Pseudo Stack](data_structures/queue/queue_on_pseudo_stack.py) * [Queue On Pseudo Stack](data_structures/queue/queue_on_pseudo_stack.py)
* Stacks * Stacks

View File

@ -0,0 +1,105 @@
"""
See https://en.wikipedia.org/wiki/Bloom_filter
The use of this data structure is to test membership in a set.
Compared to Python's built-in set() it is more space-efficient.
In the following example, only 8 bits of memory will be used:
>>> bloom = Bloom(size=8)
Initially, the filter contains all zeros:
>>> bloom.bitstring
'00000000'
When an element is added, two bits are set to 1
since there are 2 hash functions in this implementation:
>>> "Titanic" in bloom
False
>>> bloom.add("Titanic")
>>> bloom.bitstring
'01100000'
>>> "Titanic" in bloom
True
However, sometimes only one bit is added
because both hash functions return the same value
>>> bloom.add("Avatar")
>>> "Avatar" in bloom
True
>>> bloom.format_hash("Avatar")
'00000100'
>>> bloom.bitstring
'01100100'
Elements that were not added should return False ...
>>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction")
>>> {
... film: bloom.format_hash(film) for film in not_present_films
... } # doctest: +NORMALIZE_WHITESPACE
{'The Godfather': '00000101',
'Interstellar': '00000011',
'Parasite': '00010010',
'Pulp Fiction': '10000100'}
>>> any(film in bloom for film in not_present_films)
False
but sometimes there are false positives:
>>> "Ratatouille" in bloom
True
>>> bloom.format_hash("Ratatouille")
'01100000'
The probability of a false positive increases with the number of elements added
and decreases with the number of bits in the bitarray.
>>> bloom.estimated_error_rate
0.140625
>>> bloom.add("The Godfather")
>>> bloom.estimated_error_rate
0.25
>>> bloom.bitstring
'01100101'
"""
from hashlib import md5, sha256
# Hash constructors used to derive bit positions; each contributes one position
# per element, so an element sets at most len(HASH_FUNCTIONS) bits.
HASH_FUNCTIONS = (sha256, md5)


class Bloom:
    """Bloom filter whose bits are stored in a single Python ``int``.

    Membership tests can yield false positives (an element that was never
    added may be reported as present) but never false negatives.

    Attributes:
        bitarray: Integer bit field; bit ``i`` is set when some added element
            hashed to position ``i``.
        size: Number of bit positions available in the filter.
    """

    def __init__(self, size: int = 8) -> None:
        """Create an empty filter with ``size`` bit positions.

        Raises:
            ValueError: If ``size`` is not positive. Positions are computed
                modulo ``size``, so a zero size would otherwise only fail
                later with ``ZeroDivisionError``.
        """
        if size < 1:
            raise ValueError("size must be a positive integer")
        self.bitarray = 0b0
        self.size = size

    def add(self, value: str) -> None:
        """Record ``value`` by setting the bits its hashes select."""
        self.bitarray |= self.hash_(value)

    def exists(self, value: str) -> bool:
        """Return True if every bit selected by ``value`` is already set."""
        mask = self.hash_(value)
        return (mask & self.bitarray) == mask

    def __contains__(self, other: str) -> bool:
        """Support ``value in bloom``; delegates to :meth:`exists`."""
        return self.exists(other)

    def format_bin(self, bitarray: int) -> str:
        """Return ``bitarray`` in binary, left-padded with zeros to ``size``."""
        return f"{bitarray:b}".zfill(self.size)

    @property
    def bitstring(self) -> str:
        """The filter's current bits as a zero-padded binary string."""
        return self.format_bin(self.bitarray)

    def hash_(self, value: str) -> int:
        """Return an int mask with one bit set per hash function.

        Each digest is read as a little-endian integer and reduced modulo
        ``size`` to choose a position. When two hash functions choose the
        same position the mask has fewer set bits than hash functions.
        """
        encoded = value.encode()  # encode once, reuse for every hash function
        mask = 0
        for func in HASH_FUNCTIONS:
            position = int.from_bytes(func(encoded).digest(), "little") % self.size
            mask |= 1 << position
        return mask

    def format_hash(self, value: str) -> str:
        """Return the bit mask of ``value`` as a zero-padded binary string."""
        return self.format_bin(self.hash_(value))

    @property
    def estimated_error_rate(self) -> float:
        """Estimated false-positive rate.

        Computed as the fraction of set bits raised to the number of hash
        functions.
        """
        n_ones = bin(self.bitarray).count("1")
        return (n_ones / self.size) ** len(HASH_FUNCTIONS)

View File

@ -0,0 +1,115 @@
"""Queue implementation using two stacks"""
from collections.abc import Iterable
from typing import Generic, TypeVar
_T = TypeVar("_T")
class QueueByTwoStacks(Generic[_T]):
def __init__(self, iterable: Iterable[_T] | None = None) -> None:
"""
>>> QueueByTwoStacks()
Queue(())
>>> QueueByTwoStacks([10, 20, 30])
Queue((10, 20, 30))
>>> QueueByTwoStacks((i**2 for i in range(1, 4)))
Queue((1, 4, 9))
"""
self._stack1: list[_T] = list(iterable or [])
self._stack2: list[_T] = []
def __len__(self) -> int:
"""
>>> len(QueueByTwoStacks())
0
>>> from string import ascii_lowercase
>>> len(QueueByTwoStacks(ascii_lowercase))
26
>>> queue = QueueByTwoStacks()
>>> for i in range(1, 11):
... queue.put(i)
...
>>> len(queue)
10
>>> for i in range(2):
... queue.get()
1
2
>>> len(queue)
8
"""
return len(self._stack1) + len(self._stack2)
def __repr__(self) -> str:
"""
>>> queue = QueueByTwoStacks()
>>> queue
Queue(())
>>> str(queue)
'Queue(())'
>>> queue.put(10)
>>> queue
Queue((10,))
>>> queue.put(20)
>>> queue.put(30)
>>> queue
Queue((10, 20, 30))
"""
return f"Queue({tuple(self._stack2[::-1] + self._stack1)})"
def put(self, item: _T) -> None:
"""
Put `item` into the Queue
>>> queue = QueueByTwoStacks()
>>> queue.put(10)
>>> queue.put(20)
>>> len(queue)
2
>>> queue
Queue((10, 20))
"""
self._stack1.append(item)
def get(self) -> _T:
"""
Get `item` from the Queue
>>> queue = QueueByTwoStacks((10, 20, 30))
>>> queue.get()
10
>>> queue.put(40)
>>> queue.get()
20
>>> queue.get()
30
>>> len(queue)
1
>>> queue.get()
40
>>> queue.get()
Traceback (most recent call last):
...
IndexError: Queue is empty
"""
# To reduce number of attribute look-ups in `while` loop.
stack1_pop = self._stack1.pop
stack2_append = self._stack2.append
if not self._stack2:
while self._stack1:
stack2_append(stack1_pop())
if not self._stack2:
raise IndexError("Queue is empty")
return self._stack2.pop()
if __name__ == "__main__":
    # Run the doctests embedded in this module when executed as a script.
    import doctest

    doctest.testmod()