Add hashmap implementation (#7967)

Andrey 2023-03-14 01:31:27 +01:00 committed by GitHub
parent 8959211100
commit b797e437ae
2 changed files with 259 additions and 0 deletions

data_structures/hashing/hash_map.py
@@ -0,0 +1,162 @@
"""
Hash map with open addressing.
https://en.wikipedia.org/wiki/Hash_table
For another hash map implementation, with a good explanation, see
"Modern Dictionaries" by Raymond Hettinger:
https://www.youtube.com/watch?v=p33CVV29OG8
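
A minimal usage sketch; the class implements the full MutableMapping
interface, so it behaves like a dict:

>>> hm = HashMap()
>>> hm["foo"] = 1
>>> hm["foo"]
1
>>> len(hm)
1
>>> del hm["foo"]
>>> "foo" in hm
False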
"""
from collections.abc import Iterator, MutableMapping
from dataclasses import dataclass
from typing import Generic, TypeVar
KEY = TypeVar("KEY")
VAL = TypeVar("VAL")
@dataclass(frozen=True, slots=True)
class _Item(Generic[KEY, VAL]):
key: KEY
val: VAL
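
# A deleted bucket is marked with a falsy "tombstone" rather than reset to
# None: lookups must keep probing past it (the sought key may live further
# along the probe chain), while inserts are free to reuse the slot.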
class _DeletedItem(_Item):
def __init__(self) -> None:
super().__init__(None, None)
def __bool__(self) -> bool:
return False
_deleted = _DeletedItem()
class HashMap(MutableMapping[KEY, VAL]):
"""
Hash map with open addressing.
"""
def __init__(
self, initial_block_size: int = 8, capacity_factor: float = 0.75
) -> None:
self._initial_block_size = initial_block_size
self._buckets: list[_Item | None] = [None] * initial_block_size
assert 0.0 < capacity_factor < 1.0
self._capacity_factor = capacity_factor
self._len = 0
def _get_bucket_index(self, key: KEY) -> int:
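        """Map a key to its home bucket via the built-in hash()."""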
return hash(key) % len(self._buckets)
def _get_next_ind(self, ind: int) -> int:
"""
Get next index.
Implements linear open addressing.
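
        For example, probing wraps around at the end of the bucket table
        (shown here with the default block size of 8):

        >>> hm = HashMap(initial_block_size=8)
        >>> hm._get_next_ind(3)
        4
        >>> hm._get_next_ind(7)
        0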
"""
return (ind + 1) % len(self._buckets)
def _try_set(self, ind: int, key: KEY, val: VAL) -> bool:
"""
        Try to add the value to the bucket.

        If the bucket is empty or holds a deleted placeholder, or it already
        holds the same key, perform the insert/overwrite and return True.
        If the bucket holds a different key, return False so the caller
        probes the next bucket.
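
        For example, starting from an empty map:

        >>> hm = HashMap(initial_block_size=8)
        >>> hm._try_set(0, "a", 1)   # empty bucket: insert
        True
        >>> hm._try_set(0, "a", 2)   # same key: overwrite in place
        True
        >>> hm._try_set(0, "b", 3)   # occupied by another key: probe on
        False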
"""
stored = self._buckets[ind]
if not stored:
self._buckets[ind] = _Item(key, val)
self._len += 1
return True
elif stored.key == key:
self._buckets[ind] = _Item(key, val)
return True
else:
return False
def _is_full(self) -> bool:
"""
        Return true if we have reached the safe capacity, meaning the number
        of buckets should be increased to keep collisions rare.
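
        For example, with 8 buckets and the default 0.75 factor, the map
        counts as full once it holds int(8 * 0.75) == 6 items:

        >>> hm = HashMap(initial_block_size=8)
        >>> for i in range(5):
        ...     hm[i] = i
        >>> hm._is_full()
        False
        >>> hm[5] = 5
        >>> hm._is_full()
        True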
"""
limit = len(self._buckets) * self._capacity_factor
return len(self) >= int(limit)
def _is_sparse(self) -> bool:
"""Return true if we need twice fewer buckets when we have now."""
if len(self._buckets) <= self._initial_block_size:
return False
limit = len(self._buckets) * self._capacity_factor / 2
return len(self) < limit
def _resize(self, new_size: int) -> None:
old_buckets = self._buckets
self._buckets = [None] * new_size
self._len = 0
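        # Re-adding rehashes every live item for the new bucket count;
        # empty slots and tombstones are both falsy, so they are skipped.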
for item in old_buckets:
if item:
self._add_item(item.key, item.val)
def _size_up(self) -> None:
self._resize(len(self._buckets) * 2)
def _size_down(self) -> None:
self._resize(len(self._buckets) // 2)
def _iterate_buckets(self, key: KEY) -> Iterator[int]:
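        """Yield up to len(self._buckets) probe indices for the key,
        starting at its home bucket and wrapping around the table."""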
ind = self._get_bucket_index(key)
for _ in range(len(self._buckets)):
yield ind
ind = self._get_next_ind(ind)
def _add_item(self, key: KEY, val: VAL) -> None:
for ind in self._iterate_buckets(key):
if self._try_set(ind, key, val):
break
def __setitem__(self, key: KEY, val: VAL) -> None:
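        # Grow first so the probe loop below always finds a usable slot.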
if self._is_full():
self._size_up()
self._add_item(key, val)
def __delitem__(self, key: KEY) -> None:
for ind in self._iterate_buckets(key):
item = self._buckets[ind]
if item is None:
raise KeyError(key)
if item is _deleted:
continue
if item.key == key:
self._buckets[ind] = _deleted
self._len -= 1
break
if self._is_sparse():
self._size_down()
def __getitem__(self, key: KEY) -> VAL:
for ind in self._iterate_buckets(key):
item = self._buckets[ind]
if item is None:
break
if item is _deleted:
continue
if item.key == key:
return item.val
raise KeyError(key)
def __len__(self) -> int:
return self._len
def __iter__(self) -> Iterator[KEY]:
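        # Keys come out in bucket order, not insertion order.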
yield from (item.key for item in self._buckets if item)
def __repr__(self) -> str:
        val_string = ", ".join(
f"{item.key}: {item.val}" for item in self._buckets if item
)
return f"HashMap({val_string})"

data_structures/hashing/tests/test_hash_map.py
@@ -0,0 +1,97 @@
from operator import delitem, getitem, setitem
import pytest
from data_structures.hashing.hash_map import HashMap
def _get(k):
return getitem, k
def _set(k, v):
return setitem, k, v
def _del(k):
return delitem, k
def _run_operation(obj, fun, *args):
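    """Apply fun(obj, *args) and return a (result, exception) pair so that
    HashMap and dict behaviour can be compared even when the call raises."""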
try:
return fun(obj, *args), None
except Exception as e:
return None, e
_add_items = (
_set("key_a", "val_a"),
_set("key_b", "val_b"),
)
_overwrite_items = [
_set("key_a", "val_a"),
_set("key_a", "val_b"),
]
_delete_items = [
_set("key_a", "val_a"),
_set("key_b", "val_b"),
_del("key_a"),
_del("key_b"),
_set("key_a", "val_a"),
_del("key_a"),
]
_access_absent_items = [
_get("key_a"),
_del("key_a"),
_set("key_a", "val_a"),
_del("key_a"),
_del("key_a"),
_get("key_a"),
]
_add_with_resize_up = [
*[_set(x, x) for x in range(5)], # guaranteed upsize
]
_add_with_resize_down = [
    *[_set(x, x) for x in range(5)],  # guaranteed upsize
    *[_del(x) for x in range(5)],  # emptying the map triggers a downsize
    _set("key_a", "val_b"),
]
@pytest.mark.parametrize(
"operations",
(
pytest.param(_add_items, id="add items"),
pytest.param(_overwrite_items, id="overwrite items"),
pytest.param(_delete_items, id="delete items"),
pytest.param(_access_absent_items, id="access absent items"),
pytest.param(_add_with_resize_up, id="add with resize up"),
pytest.param(_add_with_resize_down, id="add with resize down"),
),
)
def test_hash_map_is_the_same_as_dict(operations):
my = HashMap(initial_block_size=4)
py = {}
    for fun, *args in operations:
my_res, my_exc = _run_operation(my, fun, *args)
py_res, py_exc = _run_operation(py, fun, *args)
assert my_res == py_res
assert str(my_exc) == str(py_exc)
assert set(py) == set(my)
assert len(py) == len(my)
assert set(my.items()) == set(py.items())
def test_no_new_methods_was_added_to_api():
def is_public(name: str) -> bool:
return not name.startswith("_")
dict_public_names = {name for name in dir({}) if is_public(name)}
hash_public_names = {name for name in dir(HashMap()) if is_public(name)}
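    # dict's public API must be a strict superset of HashMap's: HashMap
    # introduces no public names of its own.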
assert dict_public_names > hash_public_names