# Python/machine_learning/k_nearest_neighbours.py

"""
k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning
algorithm used for classification. Given some labelled training data, a given
point is classified using its k nearest neighbours according to some distance
metric. The most commonly occurring label among the neighbours becomes the label
of the given point. In effect, the label of the given point is decided by a
majority vote.

This implementation uses the commonly used Euclidean distance metric, but other
distance metrics can also be used.

Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
"""

from collections import Counter
from heapq import nsmallest

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split


class KNN:
    """
    k-Nearest Neighbours classifier using the Euclidean distance metric.

    The classifier stores the labelled training points and, for each query,
    returns the label that occurs most often among the k closest points.
    """

    def __init__(
        self,
        train_data: np.ndarray[float],
        train_target: np.ndarray[int],
        class_labels: list[str],
    ) -> None:
        """
        Create a kNN classifier using the given training data and class labels

        :param train_data: training points, one point per row
        :param train_target: for each training point, the index of its class
            in ``class_labels``
        :param class_labels: class names, indexed by the values in
            ``train_target``
        """
        # Materialise the pairs: a bare zip() is a one-shot iterator, so it
        # would be exhausted by the first classify() call and every later
        # call would see no training data at all.
        self.data = list(zip(train_data, train_target))
        self.labels = class_labels

    @staticmethod
    def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
        """
        Calculate the Euclidean distance between two points
        >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
        5.0
        >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
        10.0
        """
        # Convert the NumPy scalar to a built-in float so the return value
        # matches the annotation and has a stable repr across NumPy versions.
        return float(np.linalg.norm(a - b))

    def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
        """
        Classify a given point using the kNN algorithm

        :param pred_point: the point to classify
        :param k: number of nearest neighbours that take part in the vote
        :return: the label of the most common class among the k neighbours
        :raises ValueError: if ``k`` is less than 1 or there is no training data

        >>> train_X = np.array(
        ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
        ... )
        >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
        >>> classes = ['A', 'B']
        >>> knn = KNN(train_X, train_y, classes)
        >>> point = np.array([1.2, 1.2])
        >>> knn.classify(point)
        'A'

        The same classifier can be queried repeatedly:
        >>> knn.classify(point)
        'A'
        >>> knn.classify(np.array([3, 3]), k=3)
        'B'

        >>> knn.classify(point, k=0)
        Traceback (most recent call last):
            ...
        ValueError: k must be a positive integer
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        if not self.data:
            raise ValueError("cannot classify without training data")

        # Distances of all points from the point to be classified
        distances = (
            (self._euclidean_distance(data_point[0], pred_point), data_point[1])
            for data_point in self.data
        )

        # Choosing k points with the shortest distances
        votes = (i[1] for i in nsmallest(k, distances))

        # Most commonly occurring class is the one into which the point is classified
        result = Counter(votes).most_common(1)[0][0]
        return self.labels[result]


if __name__ == "__main__":
    from doctest import testmod

    # Run the doctests embedded in this module before the demo.
    testmod()

    # Demo: classify a single hand-picked measurement against the iris dataset.
    iris = datasets.load_iris()
    iris_classes = iris["target_names"]
    X, y = np.array(iris["data"]), np.array(iris["target"])

    # Fixed random_state keeps the train/test split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    classifier = KNN(X_train, y_train, iris_classes)
    iris_point = np.array([4.4, 3.1, 1.3, 1.4])
    prediction = classifier.classify(iris_point, k=3)
    print(prediction)
Consolidate the two existing kNN implementations (#8903) * Add type hints to k_nearest_neighbours.py * Refactor k_nearest_neighbours.py into class * Add documentation to k_nearest_neighbours.py * Use heap-based priority queue for k_nearest_neighbours.py * Delete knn_sklearn.py * updating DIRECTORY.md * Use optional args in k_nearest_neighbours.py for demo purposes * Fix wrong function arg in k_nearest_neighbours.py --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2023-09-27 12:01:18 +00:00			`"""`
			`k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning`
			`algorithm used for classification. Given some labelled training data, a given`
			`point is classified using its k nearest neighbours according to some distance`
			`metric. The most commonly occurring label among the neighbours becomes the label`
			`of the given point. In effect, the label of the given point is decided by a`
			`majority vote.`

			`This implementation uses the commonly used Euclidean distance metric, but other`
			`distance metrics can also be used.`

			`Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm`
			`"""`

Add pure implementation of K-Nearest Neighbours (#1278) * Pure implementation of KNN added * Comments and test case added * doctest added 2019-10-06 18:50:50 +00:00			`from collections import Counter`
Consolidate the two existing kNN implementations (#8903) * Add type hints to k_nearest_neighbours.py * Refactor k_nearest_neighbours.py into class * Add documentation to k_nearest_neighbours.py * Use heap-based priority queue for k_nearest_neighbours.py * Delete knn_sklearn.py * updating DIRECTORY.md * Use optional args in k_nearest_neighbours.py for demo purposes * Fix wrong function arg in k_nearest_neighbours.py --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2023-09-27 12:01:18 +00:00			`from heapq import nsmallest`
isort --profile black . (#2181) * updating DIRECTORY.md * isort --profile black . * Black after * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-07-06 07:44:19 +00:00
			`import numpy as np`
Add pure implementation of K-Nearest Neighbours (#1278) * Pure implementation of KNN added * Comments and test case added * doctest added 2019-10-06 18:50:50 +00:00			`from sklearn import datasets`
			`from sklearn.model_selection import train_test_split`

Consolidate the two existing kNN implementations (#8903) * Add type hints to k_nearest_neighbours.py * Refactor k_nearest_neighbours.py into class * Add documentation to k_nearest_neighbours.py * Use heap-based priority queue for k_nearest_neighbours.py * Delete knn_sklearn.py * updating DIRECTORY.md * Use optional args in k_nearest_neighbours.py for demo purposes * Fix wrong function arg in k_nearest_neighbours.py --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2023-09-27 12:01:18 +00:00
			`class KNN:`
			`def __init__(`
			`self,`
			`train_data: np.ndarray[float],`
			`train_target: np.ndarray[int],`
			`class_labels: list[str],`
			`) -> None:`
			`"""`
			`Create a kNN classifier using the given training data and class labels`
			`"""`
			`self.data = zip(train_data, train_target)`
			`self.labels = class_labels`

			`@staticmethod`
			`def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:`
			`"""`
			`Calculate the Euclidean distance between two points`
			`>>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))`
			`5.0`
			`>>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))`
			`10.0`
			`"""`
			`return np.linalg.norm(a - b)`

			`def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:`
			`"""`
			`Classify a given point using the kNN algorithm`
			`>>> train_X = np.array(`
			`... [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]`
			`... )`
			`>>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])`
			`>>> classes = ['A', 'B']`
			`>>> knn = KNN(train_X, train_y, classes)`
			`>>> point = np.array([1.2, 1.2])`
			`>>> knn.classify(point)`
			`'A'`
			`"""`
			`# Distances of all points from the point to be classified`
			`distances = (`
			`(self._euclidean_distance(data_point[0], pred_point), data_point[1])`
			`for data_point in self.data`
			`)`

			`# Choosing k points with the shortest distances`
			`votes = (i[1] for i in nsmallest(k, distances))`

			`# Most commonly occurring class is the one into which the point is classified`
			`result = Counter(votes).most_common(1)[0][0]`
			`return self.labels[result]`
Add pure implementation of K-Nearest Neighbours (#1278) * Pure implementation of KNN added * Comments and test case added * doctest added 2019-10-06 18:50:50 +00:00

			`if __name__ == "__main__":`
Consolidate the two existing kNN implementations (#8903) * Add type hints to k_nearest_neighbours.py * Refactor k_nearest_neighbours.py into class * Add documentation to k_nearest_neighbours.py * Use heap-based priority queue for k_nearest_neighbours.py * Delete knn_sklearn.py * updating DIRECTORY.md * Use optional args in k_nearest_neighbours.py for demo purposes * Fix wrong function arg in k_nearest_neighbours.py --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2023-09-27 12:01:18 +00:00			`import doctest`

			`doctest.testmod()`

			`iris = datasets.load_iris()`

			`X = np.array(iris["data"])`
			`y = np.array(iris["target"])`
			`iris_classes = iris["target_names"]`

			`X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)`
			`iris_point = np.array([4.4, 3.1, 1.3, 1.4])`
			`classifier = KNN(X_train, y_train, iris_classes)`
			`print(classifier.classify(iris_point, k=3))`