2020-11-13 14:26:17 +00:00
|
|
|
"""
|
|
|
|
Similarity Search : https://en.wikipedia.org/wiki/Similarity_search
|
|
|
|
Similarity search is a search algorithm for finding the nearest vector from
|
|
|
|
vectors, used in natural language processing.
|
|
|
|
In this algorithm, it calculates distance with euclidean distance and
|
|
|
|
returns a list containing two data for each vector:
|
|
|
|
1. the nearest vector
|
|
|
|
2. distance between the vector and the nearest vector (float)
|
|
|
|
"""
|
2021-09-07 11:37:03 +00:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2020-11-13 14:26:17 +00:00
|
|
|
import math
|
|
|
|
|
|
|
|
import numpy as np
|
2022-10-29 15:38:40 +00:00
|
|
|
from numpy.linalg import norm
|
2020-11-13 14:26:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates euclidean distance between two data.

    Both inputs are converted to float64 before they are subtracted, so
    narrow or unsigned integer dtypes (e.g. uint8) cannot wrap around while
    the differences are computed and squared.

    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Euclidean distance of input_a and input_b as a built-in float.
    :raises ValueError: if input_a and input_b do not have the same shape.

    >>> euclidean(np.array([0]), np.array([1]))
    1.0
    >>> euclidean(np.array([0, 1]), np.array([1, 1]))
    1.0
    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
    1.0
    >>> euclidean(np.array([0], dtype=np.uint8), np.array([200], dtype=np.uint8))
    200.0
    >>> euclidean(np.array([0, 1]), np.array([1]))
    Traceback (most recent call last):
        ...
    ValueError: Inputs must have the same shape... input_a : (2,), input_b : (1,)
    """
    vec_a = np.asarray(input_a, dtype=np.float64)
    vec_b = np.asarray(input_b, dtype=np.float64)

    # A distance between vectors of different sizes is undefined; fail loudly
    # instead of comparing only a common prefix or broadcasting.
    if vec_a.shape != vec_b.shape:
        raise ValueError(
            f"Inputs must have the same shape... input_a : {vec_a.shape}, "
            f"input_b : {vec_b.shape}"
        )

    # Flatten so the dot product is always the plain sum of squared differences.
    diff = (vec_a - vec_b).ravel()
    return math.sqrt(float(np.dot(diff, diff)))
|
|
|
|
|
|
|
|
|
2021-01-22 04:40:21 +00:00
|
|
|
def similarity_search(
    dataset: np.ndarray, value_array: np.ndarray
) -> list[list[list[float] | float]]:
    """
    Finds, for every vector in value_array, the nearest vector in dataset
    using the euclidean distance.

    :param dataset: Set containing the vectors. Should be a 2d ndarray with
        one vector per row.
    :param value_array: vector/vectors we want to know the nearest vector from
        dataset. Should be a 2d ndarray with one vector per row.
    :return: Result will be a list containing, for each vector of value_array,
            1. the nearest vector
            2. distance from the vector
        When several dataset vectors are equally near, the first one wins.
    :raises ValueError: if the number of dimensions or the vector lengths of
        the two arrays differ.
    :raises TypeError: if the arrays have fewer than two dimensions or their
        dtypes differ.

    >>> dataset = np.array([[0], [1], [2]])
    >>> value_array = np.array([[0]])
    >>> similarity_search(dataset, value_array)
    [[[0], 0.0]]

    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]

    These are the errors that might occur:

    1. If dimensions are different.
    For example, dataset has 2d array and value_array has 1d array:
    >>> dataset = np.array([[1]])
    >>> value_array = np.array([1])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
        ...
    ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1

    2. If data's shapes are different.
    For example, dataset has shape of (3, 2) and value_array has (2, 3).
    Every vector must have the same length in both arrays, so it is wrong.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
        ...
    ValueError: Wrong input data's shape... dataset : 2, value_array : 3

    3. If data types are different.
    When trying to compare, we are expecting same types so they should be same.
    If not, it'll come up with errors.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)
    >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)
    >>> similarity_search(dataset, value_array)  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: Input data have different datatype...
    dataset : float32, value_array : int32

    4. If the arrays are not at least 2d.
    Each row must be a vector, so 1d arrays are rejected:
    >>> dataset = np.array([1, 2])
    >>> value_array = np.array([1])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
        ...
    TypeError: Wrong shape
    """
    if dataset.ndim != value_array.ndim:
        raise ValueError(
            f"Wrong input data's dimensions... dataset : {dataset.ndim}, "
            f"value_array : {value_array.ndim}"
        )

    # Both arrays share the same ndim from here on. Below two dimensions there
    # is no per-vector axis (shape[1]) to compare, so reject the input up front
    # instead of failing later with an unrelated error.
    if dataset.ndim < 2:
        raise TypeError("Wrong shape")

    if dataset.shape[1] != value_array.shape[1]:
        raise ValueError(
            f"Wrong input data's shape... dataset : {dataset.shape[1]}, "
            f"value_array : {value_array.shape[1]}"
        )

    if dataset.dtype != value_array.dtype:
        raise TypeError(
            f"Input data have different datatype... dataset : {dataset.dtype}, "
            f"value_array : {value_array.dtype}"
        )

    answer = []

    for value in value_array:
        # Start with the first dataset vector as the best candidate so far.
        dist = euclidean(value, dataset[0])
        vector = dataset[0].tolist()

        for dataset_value in dataset[1:]:
            temp_dist = euclidean(value, dataset_value)

            # Strict comparison keeps the earliest vector on ties.
            if dist > temp_dist:
                dist = temp_dist
                vector = dataset_value.tolist()

        answer.append([vector, dist])

    return answer
|
|
|
|
|
|
|
|
|
2022-10-29 15:38:40 +00:00
|
|
|
def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates cosine similarity between two data.

    Both inputs are converted to float64 first, so the dot product of narrow
    integer dtypes (e.g. int8) cannot overflow.

    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Cosine similarity of input_a and input_b, i.e. their dot product
        divided by the product of their norms, as a built-in float. If either
        vector has zero norm the division is 0/0 and the result is nan.

    >>> cosine_similarity(np.array([1]), np.array([1]))
    1.0
    >>> cosine_similarity(np.array([1, 2]), np.array([6, 32]))
    0.9615239476408232
    """
    vec_a = np.asarray(input_a, dtype=np.float64)
    vec_b = np.asarray(input_b, dtype=np.float64)
    # float() turns the NumPy scalar into the built-in float promised by the
    # annotation (and keeps the doctest output free of np.float64(...) reprs).
    return float(np.dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b)))
|
|
|
|
|
|
|
|
|
2020-11-13 14:26:17 +00:00
|
|
|
if __name__ == "__main__":
    # Executed as a script: run every doctest embedded in this module.
    from doctest import testmod

    testmod()
|