From b797e437aeadcac50556d6606a547dc634cf5329 Mon Sep 17 00:00:00 2001
From: Andrey
Date: Tue, 14 Mar 2023 01:31:27 +0100
Subject: [PATCH] Add hashmap implementation (#7967)

---
 data_structures/hashing/hash_map.py | 165 ++++++++++++++++++
 .../hashing/tests/test_hash_map.py | 97 +++++++++++
 2 files changed, 262 insertions(+)
 create mode 100644 data_structures/hashing/hash_map.py
 create mode 100644 data_structures/hashing/tests/test_hash_map.py

diff --git a/data_structures/hashing/hash_map.py b/data_structures/hashing/hash_map.py
new file mode 100644
index 000000000..1dfcc8bbf
--- /dev/null
+++ b/data_structures/hashing/hash_map.py
@@ -0,0 +1,165 @@
+"""
+Hash map with open addressing.
+
+https://en.wikipedia.org/wiki/Hash_table
+
+Another hash map implementation, with a good explanation:
+"Modern Dictionaries" by Raymond Hettinger
+https://www.youtube.com/watch?v=p33CVV29OG8
+"""
+from collections.abc import Iterator, MutableMapping
+from dataclasses import dataclass
+from typing import Generic, TypeVar
+
+KEY = TypeVar("KEY")
+VAL = TypeVar("VAL")
+
+
+@dataclass(frozen=True, slots=True)
+class _Item(Generic[KEY, VAL]):
+    key: KEY
+    val: VAL
+
+
+class _DeletedItem(_Item):
+    def __init__(self) -> None:
+        super().__init__(None, None)
+
+    def __bool__(self) -> bool:
+        return False
+
+
+_deleted = _DeletedItem()
+
+
+class HashMap(MutableMapping[KEY, VAL]):
+    """
+    Hash map with open addressing.
+    """
+
+    def __init__(
+        self, initial_block_size: int = 8, capacity_factor: float = 0.75
+    ) -> None:
+        self._initial_block_size = initial_block_size
+        self._buckets: list[_Item | None] = [None] * initial_block_size
+        assert 0.0 < capacity_factor < 1.0
+        self._capacity_factor = capacity_factor
+        self._len = 0
+
+    def _get_bucket_index(self, key: KEY) -> int:
+        return hash(key) % len(self._buckets)
+
+    def _get_next_ind(self, ind: int) -> int:
+        """
+        Get the next index.
+
+        Implements linear open addressing.
+        """
+        return (ind + 1) % len(self._buckets)
+
+    def _try_set(self, ind: int, key: KEY, val: VAL) -> bool:
+        """
+        Try to add the value to the bucket.
+
+        If the bucket is empty or holds the same key, insert and return True.
+
+        If the bucket holds another key or a deleted placeholder,
+        the caller needs to check the next bucket.
+        """
+        stored = self._buckets[ind]
+        if not stored:
+            self._buckets[ind] = _Item(key, val)
+            self._len += 1
+            return True
+        elif stored.key == key:
+            self._buckets[ind] = _Item(key, val)
+            return True
+        else:
+            return False
+
+    def _is_full(self) -> bool:
+        """
+        Return true if we have reached the safe capacity.
+
+        Past that point we need to increase the number of buckets to avoid collisions.
+        """
+        limit = len(self._buckets) * self._capacity_factor
+        return len(self) >= int(limit)
+
+    def _is_sparse(self) -> bool:
+        """Return true if we need half as many buckets as we have now."""
+        if len(self._buckets) <= self._initial_block_size:
+            return False
+        limit = len(self._buckets) * self._capacity_factor / 2
+        return len(self) < limit
+
+    def _resize(self, new_size: int) -> None:
+        old_buckets = self._buckets
+        self._buckets = [None] * new_size
+        self._len = 0
+        for item in old_buckets:
+            if item:
+                self._add_item(item.key, item.val)
+
+    def _size_up(self) -> None:
+        self._resize(len(self._buckets) * 2)
+
+    def _size_down(self) -> None:
+        self._resize(len(self._buckets) // 2)
+
+    def _iterate_buckets(self, key: KEY) -> Iterator[int]:
+        ind = self._get_bucket_index(key)
+        for _ in range(len(self._buckets)):
+            yield ind
+            ind = self._get_next_ind(ind)
+
+    def _add_item(self, key: KEY, val: VAL) -> None:
+        for ind in self._iterate_buckets(key):
+            if self._try_set(ind, key, val):
+                break
+
+    def __setitem__(self, key: KEY, val: VAL) -> None:
+        if self._is_full():
+            self._size_up()
+
+        self._add_item(key, val)
+
+    def __delitem__(self, key: KEY) -> None:
+        for ind in self._iterate_buckets(key):
+            item = self._buckets[ind]
+            if item is None:
+                raise KeyError(key)
+            if item is _deleted:
+                continue
+            if item.key == key:
+                self._buckets[ind] = _deleted
+                self._len -= 1
+                break
+        else:
+            # Probed every bucket and met only tombstones or other keys.
+            raise KeyError(key)
+        if self._is_sparse():
+            self._size_down()
+
+    def __getitem__(self, key: KEY) -> VAL:
+        for ind in self._iterate_buckets(key):
+            item = self._buckets[ind]
+            if item is None:
+                break
+            if item is _deleted:
+                continue
+            if item.key == key:
+                return item.val
+        raise KeyError(key)
+
+    def __len__(self) -> int:
+        return self._len
+
+    def __iter__(self) -> Iterator[KEY]:
+        yield from (item.key for item in self._buckets if item)
+
+    def __repr__(self) -> str:
+        val_string = ", ".join(
+            f"{item.key}: {item.val}" for item in self._buckets if item
+        )
+        return f"HashMap({val_string})"
diff --git a/data_structures/hashing/tests/test_hash_map.py b/data_structures/hashing/tests/test_hash_map.py
new file mode 100644
index 000000000..929e67311
--- /dev/null
+++ b/data_structures/hashing/tests/test_hash_map.py
@@ -0,0 +1,97 @@
+from operator import delitem, getitem, setitem
+
+import pytest
+
+from data_structures.hashing.hash_map import HashMap
+
+
+def _get(k):
+    return getitem, k
+
+
+def _set(k, v):
+    return setitem, k, v
+
+
+def _del(k):
+    return delitem, k
+
+
+def _run_operation(obj, fun, *args):
+    try:
+        return fun(obj, *args), None
+    except Exception as e:
+        return None, e
+
+
+_add_items = (
+    _set("key_a", "val_a"),
+    _set("key_b", "val_b"),
+)
+
+_overwrite_items = [
+    _set("key_a", "val_a"),
+    _set("key_a", "val_b"),
+]
+
+_delete_items = [
+    _set("key_a", "val_a"),
+    _set("key_b", "val_b"),
+    _del("key_a"),
+    _del("key_b"),
+    _set("key_a", "val_a"),
+    _del("key_a"),
+]
+
+_access_absent_items = [
+    _get("key_a"),
+    _del("key_a"),
+    _set("key_a", "val_a"),
+    _del("key_a"),
+    _del("key_a"),
+    _get("key_a"),
+]
+
+_add_with_resize_up = [
+    *[_set(x, x) for x in range(5)],  # guaranteed upsize
+]
+
+_add_with_resize_down = [
+    *[_set(x, x) for x in range(5)],  # guaranteed upsize
+    *[_del(x) for x in range(5)],
+    _set("key_a", "val_b"),
+]
+
+
+@pytest.mark.parametrize(
+    "operations",
+    (
+        pytest.param(_add_items, id="add items"),
+        pytest.param(_overwrite_items, id="overwrite items"),
+        pytest.param(_delete_items, id="delete items"),
+        pytest.param(_access_absent_items, id="access absent items"),
+        pytest.param(_add_with_resize_up, id="add with resize up"),
+        pytest.param(_add_with_resize_down, id="add with resize down"),
+    ),
+)
+def test_hash_map_is_the_same_as_dict(operations):
+    my = HashMap(initial_block_size=4)
+    py = {}
+    for fun, *args in operations:
+        my_res, my_exc = _run_operation(my, fun, *args)
+        py_res, py_exc = _run_operation(py, fun, *args)
+        assert my_res == py_res
+        assert str(my_exc) == str(py_exc)
+        assert set(py) == set(my)
+        assert len(py) == len(my)
+        assert set(my.items()) == set(py.items())
+
+
+def test_no_new_methods_were_added_to_api():
+    def is_public(name: str) -> bool:
+        return not name.startswith("_")
+
+    dict_public_names = {name for name in dir({}) if is_public(name)}
+    hash_public_names = {name for name in dir(HashMap()) if is_public(name)}
+
+    assert dict_public_names > hash_public_names
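
A minimal usage sketch of the HashMap added above (not part of the patch). It
exercises only behaviour defined in the patch; like the test module, it
assumes the repository root is on sys.path so the package import resolves:

    from data_structures.hashing.hash_map import HashMap

    hm = HashMap(initial_block_size=4)
    hm["key_a"] = 1
    hm["key_a"] = 2       # same key: the bucket is overwritten, len stays 1
    hm["key_b"] = 3
    assert len(hm) == 2 and hm["key_a"] == 2
    assert "key_b" in hm              # __contains__ is inherited from MutableMapping
    del hm["key_b"]
    assert hm.get("key_b") is None    # .get() is inherited as well
    print(hm)                         # HashMap(key_a: 2)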
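
The tombstone mechanics can be observed directly. This sketch assumes CPython,
where hash(n) == n for small non-negative integers, so keys 0 and 4 both map
to bucket 0 of 4:

    from data_structures.hashing.hash_map import HashMap

    hm = HashMap(initial_block_size=4)
    hm[0] = "a"          # hash(0) % 4 == 0 -> stored in bucket 0
    hm[4] = "b"          # hash(4) % 4 == 0 too -> linear probe lands in bucket 1
    del hm[0]            # bucket 0 becomes the _deleted tombstone, not None
    assert hm[4] == "b"  # lookup probes past the tombstone instead of stopping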
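
The resize thresholds follow from the defaults in the patch
(capacity_factor=0.75; buckets double in _size_up, halve in _size_down, and
never drop below initial_block_size). Peeking at the private _buckets
attribute here is purely for illustration:

    from data_structures.hashing.hash_map import HashMap

    hm = HashMap(initial_block_size=4)
    for x in range(4):
        hm[x] = x        # 4th insert: len 3 >= int(4 * 0.75) -> double to 8 buckets
    assert len(hm._buckets) == 8
    del hm[0]            # len 3 is not yet < 8 * 0.75 / 2 == 3.0
    del hm[1]            # len 2 < 3.0 -> halve back to 4 buckets
    assert len(hm._buckets) == 4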