mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-30 16:31:08 +00:00
Add hashmap implementation (#7967)
This commit is contained in:
parent
8959211100
commit
b797e437ae
162
data_structures/hashing/hash_map.py
Normal file
162
data_structures/hashing/hash_map.py
Normal file
|
@ -0,0 +1,162 @@
|
||||||
|
"""
|
||||||
|
Hash map with open addressing.
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/Hash_table
|
||||||
|
|
||||||
|
Another hash map implementation, with a good explanation.
|
||||||
|
Modern Dictionaries by Raymond Hettinger
|
||||||
|
https://www.youtube.com/watch?v=p33CVV29OG8
|
||||||
|
"""
|
||||||
|
from collections.abc import Iterator, MutableMapping
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Generic, TypeVar
|
||||||
|
|
||||||
|
KEY = TypeVar("KEY")
|
||||||
|
VAL = TypeVar("VAL")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
|
||||||
|
class _Item(Generic[KEY, VAL]):
|
||||||
|
key: KEY
|
||||||
|
val: VAL
|
||||||
|
|
||||||
|
|
||||||
|
class _DeletedItem(_Item):
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__(None, None)
|
||||||
|
|
||||||
|
def __bool__(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
_deleted = _DeletedItem()
|
||||||
|
|
||||||
|
|
||||||
|
class HashMap(MutableMapping[KEY, VAL]):
|
||||||
|
"""
|
||||||
|
Hash map with open addressing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, initial_block_size: int = 8, capacity_factor: float = 0.75
|
||||||
|
) -> None:
|
||||||
|
self._initial_block_size = initial_block_size
|
||||||
|
self._buckets: list[_Item | None] = [None] * initial_block_size
|
||||||
|
assert 0.0 < capacity_factor < 1.0
|
||||||
|
self._capacity_factor = capacity_factor
|
||||||
|
self._len = 0
|
||||||
|
|
||||||
|
def _get_bucket_index(self, key: KEY) -> int:
|
||||||
|
return hash(key) % len(self._buckets)
|
||||||
|
|
||||||
|
def _get_next_ind(self, ind: int) -> int:
|
||||||
|
"""
|
||||||
|
Get next index.
|
||||||
|
|
||||||
|
Implements linear open addressing.
|
||||||
|
"""
|
||||||
|
return (ind + 1) % len(self._buckets)
|
||||||
|
|
||||||
|
def _try_set(self, ind: int, key: KEY, val: VAL) -> bool:
|
||||||
|
"""
|
||||||
|
Try to add value to the bucket.
|
||||||
|
|
||||||
|
If bucket is empty or key is the same, does insert and return True.
|
||||||
|
|
||||||
|
If bucket has another key or deleted placeholder,
|
||||||
|
that means that we need to check next bucket.
|
||||||
|
"""
|
||||||
|
stored = self._buckets[ind]
|
||||||
|
if not stored:
|
||||||
|
self._buckets[ind] = _Item(key, val)
|
||||||
|
self._len += 1
|
||||||
|
return True
|
||||||
|
elif stored.key == key:
|
||||||
|
self._buckets[ind] = _Item(key, val)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _is_full(self) -> bool:
|
||||||
|
"""
|
||||||
|
Return true if we have reached safe capacity.
|
||||||
|
|
||||||
|
So we need to increase the number of buckets to avoid collisions.
|
||||||
|
"""
|
||||||
|
limit = len(self._buckets) * self._capacity_factor
|
||||||
|
return len(self) >= int(limit)
|
||||||
|
|
||||||
|
def _is_sparse(self) -> bool:
|
||||||
|
"""Return true if we need twice fewer buckets when we have now."""
|
||||||
|
if len(self._buckets) <= self._initial_block_size:
|
||||||
|
return False
|
||||||
|
limit = len(self._buckets) * self._capacity_factor / 2
|
||||||
|
return len(self) < limit
|
||||||
|
|
||||||
|
def _resize(self, new_size: int) -> None:
|
||||||
|
old_buckets = self._buckets
|
||||||
|
self._buckets = [None] * new_size
|
||||||
|
self._len = 0
|
||||||
|
for item in old_buckets:
|
||||||
|
if item:
|
||||||
|
self._add_item(item.key, item.val)
|
||||||
|
|
||||||
|
def _size_up(self) -> None:
|
||||||
|
self._resize(len(self._buckets) * 2)
|
||||||
|
|
||||||
|
def _size_down(self) -> None:
|
||||||
|
self._resize(len(self._buckets) // 2)
|
||||||
|
|
||||||
|
def _iterate_buckets(self, key: KEY) -> Iterator[int]:
|
||||||
|
ind = self._get_bucket_index(key)
|
||||||
|
for _ in range(len(self._buckets)):
|
||||||
|
yield ind
|
||||||
|
ind = self._get_next_ind(ind)
|
||||||
|
|
||||||
|
def _add_item(self, key: KEY, val: VAL) -> None:
|
||||||
|
for ind in self._iterate_buckets(key):
|
||||||
|
if self._try_set(ind, key, val):
|
||||||
|
break
|
||||||
|
|
||||||
|
def __setitem__(self, key: KEY, val: VAL) -> None:
|
||||||
|
if self._is_full():
|
||||||
|
self._size_up()
|
||||||
|
|
||||||
|
self._add_item(key, val)
|
||||||
|
|
||||||
|
def __delitem__(self, key: KEY) -> None:
|
||||||
|
for ind in self._iterate_buckets(key):
|
||||||
|
item = self._buckets[ind]
|
||||||
|
if item is None:
|
||||||
|
raise KeyError(key)
|
||||||
|
if item is _deleted:
|
||||||
|
continue
|
||||||
|
if item.key == key:
|
||||||
|
self._buckets[ind] = _deleted
|
||||||
|
self._len -= 1
|
||||||
|
break
|
||||||
|
if self._is_sparse():
|
||||||
|
self._size_down()
|
||||||
|
|
||||||
|
def __getitem__(self, key: KEY) -> VAL:
|
||||||
|
for ind in self._iterate_buckets(key):
|
||||||
|
item = self._buckets[ind]
|
||||||
|
if item is None:
|
||||||
|
break
|
||||||
|
if item is _deleted:
|
||||||
|
continue
|
||||||
|
if item.key == key:
|
||||||
|
return item.val
|
||||||
|
raise KeyError(key)
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return self._len
|
||||||
|
|
||||||
|
def __iter__(self) -> Iterator[KEY]:
|
||||||
|
yield from (item.key for item in self._buckets if item)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
val_string = " ,".join(
|
||||||
|
f"{item.key}: {item.val}" for item in self._buckets if item
|
||||||
|
)
|
||||||
|
return f"HashMap({val_string})"
|
97
data_structures/hashing/tests/test_hash_map.py
Normal file
97
data_structures/hashing/tests/test_hash_map.py
Normal file
|
@ -0,0 +1,97 @@
|
||||||
|
from operator import delitem, getitem, setitem
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from data_structures.hashing.hash_map import HashMap
|
||||||
|
|
||||||
|
|
||||||
|
def _get(k):
|
||||||
|
return getitem, k
|
||||||
|
|
||||||
|
|
||||||
|
def _set(k, v):
|
||||||
|
return setitem, k, v
|
||||||
|
|
||||||
|
|
||||||
|
def _del(k):
|
||||||
|
return delitem, k
|
||||||
|
|
||||||
|
|
||||||
|
def _run_operation(obj, fun, *args):
|
||||||
|
try:
|
||||||
|
return fun(obj, *args), None
|
||||||
|
except Exception as e:
|
||||||
|
return None, e
|
||||||
|
|
||||||
|
|
||||||
|
# Each scenario below is a sequence of (operator, *args) tuples replayed
# against both HashMap and the built-in dict; outcomes must match.

_add_items = (
    _set("key_a", "val_a"),
    _set("key_b", "val_b"),
)

# Second write to the same key must replace the first value.
_overwrite_items = [
    _set("key_a", "val_a"),
    _set("key_a", "val_b"),
]

# Interleaved inserts and deletes, including re-inserting a deleted key.
_delete_items = [
    _set("key_a", "val_a"),
    _set("key_b", "val_b"),
    _del("key_a"),
    _del("key_b"),
    _set("key_a", "val_a"),
    _del("key_a"),
]

# Reads/deletes of absent keys: both mappings must raise (and stringify)
# the same KeyError.
_access_absent_items = [
    _get("key_a"),
    _del("key_a"),
    _set("key_a", "val_a"),
    _del("key_a"),
    _del("key_a"),
    _get("key_a"),
]

_add_with_resize_up = [
    *[_set(x, x) for x in range(5)],  # guaranteed upsize
]

_add_with_resize_down = [
    *[_set(x, x) for x in range(5)],  # guaranteed upsize
    *[_del(x) for x in range(5)],
    _set("key_a", "val_b"),
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "operations",
    (
        pytest.param(_add_items, id="add items"),
        pytest.param(_overwrite_items, id="overwrite items"),
        pytest.param(_delete_items, id="delete items"),
        pytest.param(_access_absent_items, id="access absent items"),
        pytest.param(_add_with_resize_up, id="add with resize up"),
        pytest.param(_add_with_resize_down, id="add with resize down"),
    ),
)
def test_hash_map_is_the_same_as_dict(operations):
    """Replay each operation on a HashMap and a dict; every result,
    exception message, key set, length, and item set must agree."""
    my = HashMap(initial_block_size=4)
    py = {}
    # Plain iteration: the original wrapped this in enumerate() and
    # discarded the index.
    for fun, *args in operations:
        my_res, my_exc = _run_operation(my, fun, *args)
        py_res, py_exc = _run_operation(py, fun, *args)
        assert my_res == py_res
        assert str(my_exc) == str(py_exc)
        assert set(py) == set(my)
        assert len(py) == len(my)
        assert set(my.items()) == set(py.items())
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_new_methods_was_added_to_api():
    """HashMap's public surface must be a strict subset of dict's."""

    def _public(name: str) -> bool:
        return not name.startswith("_")

    dict_api = set(filter(_public, dir({})))
    hashmap_api = set(filter(_public, dir(HashMap())))

    # Strict superset: dict has everything HashMap exposes, and more.
    assert dict_api > hashmap_api
|
Loading…
Reference in New Issue
Block a user