Python/machine_learning/similarity_search.py

"""
Similarity Search : https://en.wikipedia.org/wiki/Similarity_search
Similarity search is a search algorithm for finding the nearest vector from
vectors, used in natural language processing.
In this algorithm, it calculates distance with euclidean distance and
returns a list containing two data for each vector:
    1. the nearest vector
    2. distance between the vector and the nearest vector (float)
"""
from __future__ import annotations

import math

import numpy as np


def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates euclidean distance between two data.
    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Euclidean distance of input_a and input_b. By using math.sqrt(),
             result will be float.

    >>> euclidean(np.array([0]), np.array([1]))
    1.0
    >>> euclidean(np.array([0, 1]), np.array([1, 1]))
    1.0
    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
    1.0
    """
    return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b)))


def similarity_search(
    dataset: np.ndarray, value_array: np.ndarray
) -> list[list[list[float] | float]]:
    """
    :param dataset: Set containing the vectors. Should be ndarray.
    :param value_array: vector/vectors we want to know the nearest vector from dataset.
    :return: Result will be a list containing
            1. the nearest vector
            2. distance from the vector

    >>> dataset = np.array([[0], [1], [2]])
    >>> value_array = np.array([[0]])
    >>> similarity_search(dataset, value_array)
    [[[0], 0.0]]

    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]

    These are the errors that might occur:

    1. If dimensions are different.
    For example, dataset has 2d array and value_array has 1d array:
    >>> dataset = np.array([[1]])
    >>> value_array = np.array([1])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
    ...
    ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1

    2. If data's shapes are different.
    For example, dataset has shape of (3, 2) and value_array has (2, 3).
    We are expecting same shapes of two arrays, so it is wrong.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
    ...
    ValueError: Wrong input data's shape... dataset : 2, value_array : 3

    3. If data types are different.
    When trying to compare, we are expecting same types so they should be same.
    If not, it'll come up with errors.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)
    >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)
    >>> similarity_search(dataset, value_array)  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
    ...
    TypeError: Input data have different datatype...
    dataset : float32, value_array : int32
    """

    if dataset.ndim != value_array.ndim:
        raise ValueError(
            f"Wrong input data's dimensions... dataset : {dataset.ndim}, "
            f"value_array : {value_array.ndim}"
        )

    try:
        if dataset.shape[1] != value_array.shape[1]:
            raise ValueError(
                f"Wrong input data's shape... dataset : {dataset.shape[1]}, "
                f"value_array : {value_array.shape[1]}"
            )
    except IndexError:
        if dataset.ndim != value_array.ndim:
            raise TypeError("Wrong shape")

    if dataset.dtype != value_array.dtype:
        raise TypeError(
            f"Input data have different datatype... dataset : {dataset.dtype}, "
            f"value_array : {value_array.dtype}"
        )

    answer = []

    for value in value_array:
        dist = euclidean(value, dataset[0])
        vector = dataset[0].tolist()

        for dataset_value in dataset[1:]:
            temp_dist = euclidean(value, dataset_value)

            if dist > temp_dist:
                dist = temp_dist
                vector = dataset_value.tolist()

        answer.append([vector, dist])

    return answer


if __name__ == "__main__":
    import doctest

    doctest.testmod()
add similarity_search.py in machine_learning (#3864) * add similarity_search.py in machine_learning adding similarity_search algorithm in machine_learning * fix pre-commit test, apply feedback isort, codespell changed. applied feedback(np -> np.ndarray) * apply feedback add type hints to euclidean method * apply feedback - changed euclidean's type hints - changed few TypeError to ValueError - changed range(len()) to enumerate() - changed error's strings to f-string - implemented without type() - add euclidean's explanation * apply feedback - deleted try/catch in euclidean - added error tests - name change(value -> value_array) * # doctest: +NORMALIZE_WHITESPACE * Update machine_learning/similarity_search.py * placate flake8 Co-authored-by: Christian Clauss <cclauss@me.com> 2020-11-13 14:26:17 +00:00			`"""`
			`Similarity Search : https://en.wikipedia.org/wiki/Similarity_search`
			`Similarity search is a search algorithm for finding the nearest vector from`
			`vectors, used in natural language processing.`
			`In this algorithm, it calculates distance with euclidean distance and`
			`returns a list containing two data for each vector:`
			`1. the nearest vector`
			`2. distance between the vector and the nearest vector (float)`
			`"""`
Pyupgrade to Python 3.9 (#4718) * Pyupgrade to Python 3.9 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2021-09-07 11:37:03 +00:00			`from __future__ import annotations`

add similarity_search.py in machine_learning (#3864) * add similarity_search.py in machine_learning adding similarity_search algorithm in machine_learning * fix pre-commit test, apply feedback isort, codespell changed. applied feedback(np -> np.ndarray) * apply feedback add type hints to euclidean method * apply feedback - changed euclidean's type hints - changed few TypeError to ValueError - changed range(len()) to enumerate() - changed error's strings to f-string - implemented without type() - add euclidean's explanation * apply feedback - deleted try/catch in euclidean - added error tests - name change(value -> value_array) * # doctest: +NORMALIZE_WHITESPACE * Update machine_learning/similarity_search.py * placate flake8 Co-authored-by: Christian Clauss <cclauss@me.com> 2020-11-13 14:26:17 +00:00			`import math`

			`import numpy as np`


			`def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:`
			`"""`
			`Calculates euclidean distance between two data.`
			`:param input_a: ndarray of first vector.`
			`:param input_b: ndarray of second vector.`
			`:return: Euclidean distance of input_a and input_b. By using math.sqrt(),`
			`result will be float.`

			`>>> euclidean(np.array([0]), np.array([1]))`
			`1.0`
			`>>> euclidean(np.array([0, 1]), np.array([1, 1]))`
			`1.0`
			`>>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))`
			`1.0`
			`"""`
			`return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b)))`


[mypy] Add/fix type annotations for similarity search in machine learning (#4088) * [mypy] Add/fix type annotations for similarity search in machine learning * fix annotation * fix annotation (Union) * isort 2021-01-22 04:40:21 +00:00			`def similarity_search(`
			`dataset: np.ndarray, value_array: np.ndarray`
Pyupgrade to Python 3.9 (#4718) * Pyupgrade to Python 3.9 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2021-09-07 11:37:03 +00:00			`) -> list[list[list[float] \| float]]:`
add similarity_search.py in machine_learning (#3864) * add similarity_search.py in machine_learning adding similarity_search algorithm in machine_learning * fix pre-commit test, apply feedback isort, codespell changed. applied feedback(np -> np.ndarray) * apply feedback add type hints to euclidean method * apply feedback - changed euclidean's type hints - changed few TypeError to ValueError - changed range(len()) to enumerate() - changed error's strings to f-string - implemented without type() - add euclidean's explanation * apply feedback - deleted try/catch in euclidean - added error tests - name change(value -> value_array) * # doctest: +NORMALIZE_WHITESPACE * Update machine_learning/similarity_search.py * placate flake8 Co-authored-by: Christian Clauss <cclauss@me.com> 2020-11-13 14:26:17 +00:00			`"""`
			`:param dataset: Set containing the vectors. Should be ndarray.`
			`:param value_array: vector/vectors we want to know the nearest vector from dataset.`
			`:return: Result will be a list containing`
			`1. the nearest vector`
			`2. distance from the vector`

			`>>> dataset = np.array([[0], [1], [2]])`
			`>>> value_array = np.array([[0]])`
			`>>> similarity_search(dataset, value_array)`
			`[[[0], 0.0]]`

			`>>> dataset = np.array([[0, 0], [1, 1], [2, 2]])`
			`>>> value_array = np.array([[0, 1]])`
			`>>> similarity_search(dataset, value_array)`
			`[[[0, 0], 1.0]]`

			`>>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])`
			`>>> value_array = np.array([[0, 0, 1]])`
			`>>> similarity_search(dataset, value_array)`
			`[[[0, 0, 0], 1.0]]`

			`>>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])`
			`>>> value_array = np.array([[0, 0, 0], [0, 0, 1]])`
			`>>> similarity_search(dataset, value_array)`
			`[[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]`

			`These are the errors that might occur:`

			`1. If dimensions are different.`
			`For example, dataset has 2d array and value_array has 1d array:`
			`>>> dataset = np.array([[1]])`
			`>>> value_array = np.array([1])`
			`>>> similarity_search(dataset, value_array)`
			`Traceback (most recent call last):`
			`...`
			`ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1`

			`2. If data's shapes are different.`
			`For example, dataset has shape of (3, 2) and value_array has (2, 3).`
			`We are expecting same shapes of two arrays, so it is wrong.`
			`>>> dataset = np.array([[0, 0], [1, 1], [2, 2]])`
			`>>> value_array = np.array([[0, 0, 0], [0, 0, 1]])`
			`>>> similarity_search(dataset, value_array)`
			`Traceback (most recent call last):`
			`...`
			`ValueError: Wrong input data's shape... dataset : 2, value_array : 3`

			`3. If data types are different.`
			`When trying to compare, we are expecting same types so they should be same.`
			`If not, it'll come up with errors.`
			`>>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)`
			`>>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)`
			`>>> similarity_search(dataset, value_array) # doctest: +NORMALIZE_WHITESPACE`
			`Traceback (most recent call last):`
			`...`
			`TypeError: Input data have different datatype...`
			`dataset : float32, value_array : int32`
			`"""`

			`if dataset.ndim != value_array.ndim:`
			`raise ValueError(`
			`f"Wrong input data's dimensions... dataset : {dataset.ndim}, "`
			`f"value_array : {value_array.ndim}"`
			`)`

			`try:`
			`if dataset.shape[1] != value_array.shape[1]:`
			`raise ValueError(`
			`f"Wrong input data's shape... dataset : {dataset.shape[1]}, "`
			`f"value_array : {value_array.shape[1]}"`
			`)`
			`except IndexError:`
			`if dataset.ndim != value_array.ndim:`
			`raise TypeError("Wrong shape")`

			`if dataset.dtype != value_array.dtype:`
			`raise TypeError(`
			`f"Input data have different datatype... dataset : {dataset.dtype}, "`
			`f"value_array : {value_array.dtype}"`
			`)`

			`answer = []`

			`for value in value_array:`
			`dist = euclidean(value, dataset[0])`
			`vector = dataset[0].tolist()`

			`for dataset_value in dataset[1:]:`
			`temp_dist = euclidean(value, dataset_value)`

			`if dist > temp_dist:`
			`dist = temp_dist`
			`vector = dataset_value.tolist()`

			`answer.append([vector, dist])`

			`return answer`


			`if __name__ == "__main__":`
			`import doctest`

			`doctest.testmod()`