From ae4d7d4d0433b865c1a3e35dbbb7d43f2dc8ab2c Mon Sep 17 00:00:00 2001
From: Steve Kim <54872857+SteveKimSR@users.noreply.github.com>
Date: Fri, 13 Nov 2020 23:26:17 +0900
Subject: [PATCH] add similarity_search.py in machine_learning (#3864)

* add similarity_search.py in machine_learning
adding similarity_search algorithm in machine_learning

* fix pre-commit test, apply feedback

isort, codespell changed.
applied feedback(np -> np.ndarray)

* apply feedback

add type hints to euclidean method

* apply feedback

- changed euclidean's type hints
- changed few TypeError to ValueError
- changed range(len()) to enumerate()
- changed error's strings to f-string
- implemented without type()
- add euclidean's explanation

* apply feedback

- deleted try/catch in euclidean
- added error tests
- name change(value -> value_array)

* # doctest: +NORMALIZE_WHITESPACE

* Update machine_learning/similarity_search.py

* placate flake8

Co-authored-by: Christian Clauss <cclauss@me.com>
---
 machine_learning/similarity_search.py | 137 ++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 machine_learning/similarity_search.py

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
new file mode 100644
index 000000000..6bfb12ed8
--- /dev/null
+++ b/machine_learning/similarity_search.py
@@ -0,0 +1,137 @@
+"""
+Similarity Search : https://en.wikipedia.org/wiki/Similarity_search
+Similarity search is a search algorithm for finding, among a set of vectors,
+the one nearest to a given vector; it is used in natural language processing.
+This implementation measures closeness with the Euclidean distance and
+returns a list containing two items for each query vector:
+    1. the nearest vector
+    2. distance between the vector and the nearest vector (float)
+"""
+import math
+
+import numpy as np
+
+
+def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
+    """
+    Calculates euclidean distance between two data.
+    :param input_a: ndarray of first vector.
+    :param input_b: ndarray of second vector, paired element-wise with input_a.
+    :return: Euclidean distance of input_a and input_b as a Python float.
+
+    Elements are cast to float first so narrow integer dtypes cannot wrap around:
+    >>> euclidean(np.array([0], dtype=np.uint8), np.array([20], dtype=np.uint8))
+    20.0
+    >>> euclidean(np.array([0, 1]), np.array([1, 1]))
+    1.0
+    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
+    1.0
+    """
+    return math.sqrt(sum((float(a) - float(b)) ** 2 for a, b in zip(input_a, input_b)))
+
+
+def similarity_search(dataset: np.ndarray, value_array: np.ndarray) -> list:
+    """
+    Find the nearest dataset vector for every vector in value_array.
+    :param dataset: Set containing the vectors. Should be a 2-D ndarray.
+    :param value_array: vector/vectors we want to know the nearest vector from dataset.
+    :return: Result will be a list containing, for each vector in value_array,
+            1. the nearest vector
+            2. distance from the vector
+
+    >>> dataset = np.array([[0], [1], [2]])
+    >>> value_array = np.array([[0]])
+    >>> similarity_search(dataset, value_array)
+    [[[0], 0.0]]
+
+    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
+    >>> value_array = np.array([[0, 1]])
+    >>> similarity_search(dataset, value_array)
+    [[[0, 0], 1.0]]
+
+    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
+    >>> value_array = np.array([[0, 0, 1]])
+    >>> similarity_search(dataset, value_array)
+    [[[0, 0, 0], 1.0]]
+
+    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
+    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
+    >>> similarity_search(dataset, value_array)
+    [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]
+
+    These are the errors that might occur:
+    1. If dimensions are different.
+    For example, dataset has 2d array and value_array has 1d array:
+    >>> dataset = np.array([[1]])
+    >>> value_array = np.array([1])
+    >>> similarity_search(dataset, value_array)
+    Traceback (most recent call last):
+    ...
+    ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1
+
+    2. If the vectors' lengths are different.
+    For example, dataset has shape of (3, 2) and value_array has (2, 3).
+    Both arrays must have the same number of columns, so it is wrong.
+    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
+    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
+    >>> similarity_search(dataset, value_array)
+    Traceback (most recent call last):
+    ...
+    ValueError: Wrong input data's shape... dataset : 2, value_array : 3
+
+    3. If data types are different.
+    Both arrays are expected to have the same dtype; otherwise an error is raised.
+    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)
+    >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)
+    >>> similarity_search(dataset, value_array)  # doctest: +NORMALIZE_WHITESPACE
+    Traceback (most recent call last):
+    ...
+    TypeError: Input data have different datatype...
+    dataset : float32, value_array : int32
+
+    4. If the arrays are not 2-D, so the vector length (shape[1]) does not exist.
+    >>> similarity_search(np.array([0, 1]), np.array([0, 1]))
+    Traceback (most recent call last):
+    ...
+    TypeError: Wrong shape
+    """
+    if dataset.ndim != value_array.ndim:
+        raise ValueError(
+            f"Wrong input data's dimensions... dataset : {dataset.ndim}, "
+            f"value_array : {value_array.ndim}"
+        )
+
+    try:
+        if dataset.shape[1] != value_array.shape[1]:
+            raise ValueError(
+                f"Wrong input data's shape... dataset : {dataset.shape[1]}, "
+                f"value_array : {value_array.shape[1]}"
+            )
+    except IndexError:
+        # Arrays with fewer than two dimensions have no shape[1] to compare.
+        raise TypeError("Wrong shape") from None
+
+    if dataset.dtype != value_array.dtype:
+        raise TypeError(
+            f"Input data have different datatype... dataset : {dataset.dtype}, "
+            f"value_array : {value_array.dtype}"
+        )
+
+    answer = []
+    for value in value_array:
+        # Seed with the first dataset row, then keep any strictly closer row.
+        dist = euclidean(value, dataset[0])
+        vector = dataset[0].tolist()
+        for dataset_value in dataset[1:]:
+            temp_dist = euclidean(value, dataset_value)
+            if dist > temp_dist:
+                dist = temp_dist
+                vector = dataset_value.tolist()
+        answer.append([vector, dist])
+    return answer
+
+
+if __name__ == "__main__":
+    from doctest import testmod
+
+    testmod()