From 76767d2f09d15aeff0a54cfc44652207eda2314e Mon Sep 17 00:00:00 2001
From: Tianyi Zheng
Date: Wed, 27 Sep 2023 08:01:18 -0400
Subject: [PATCH] Consolidate the two existing kNN implementations (#8903)

* Add type hints to k_nearest_neighbours.py

* Refactor k_nearest_neighbours.py into class

* Add documentation to k_nearest_neighbours.py

* Use heap-based priority queue for k_nearest_neighbours.py

* Delete knn_sklearn.py

* updating DIRECTORY.md

* Use optional args in k_nearest_neighbours.py for demo purposes

* Fix wrong function arg in k_nearest_neighbours.py

---------

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
---
 DIRECTORY.md                             |   1 -
 machine_learning/k_nearest_neighbours.py | 118 ++++++++++++++---------
 machine_learning/knn_sklearn.py          |  31 ------
 3 files changed, 74 insertions(+), 76 deletions(-)
 delete mode 100644 machine_learning/knn_sklearn.py

diff --git a/DIRECTORY.md b/DIRECTORY.md
index d81e4ec1e..902999460 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -507,7 +507,6 @@
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
-  * [Knn Sklearn](machine_learning/knn_sklearn.py)
   * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py)
   * [Linear Regression](machine_learning/linear_regression.py)
   * Local Weighted Learning
diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
index 2a90cfe59..a43757c5c 100644
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -1,58 +1,88 @@
+"""
+k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning
+algorithm used for classification. Given some labelled training data, a given
+point is classified using its k nearest neighbours according to some distance
+metric. The most commonly occurring label among the neighbours becomes the label
+of the given point. In effect, the label of the given point is decided by a
+majority vote.
+
+This implementation uses the commonly used Euclidean distance metric, but other
+distance metrics can also be used.
+
+Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
+"""
+
 from collections import Counter
+from heapq import nsmallest
 
 import numpy as np
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 
-data = datasets.load_iris()
 
-X = np.array(data["data"])
-y = np.array(data["target"])
-classes = data["target_names"]
+class KNN:
+    def __init__(
+        self,
+        train_data: np.ndarray[float],
+        train_target: np.ndarray[int],
+        class_labels: list[str],
+    ) -> None:
+        """
+        Create a kNN classifier using the given training data and class labels
+        """
+        self.data = zip(train_data, train_target)
+        self.labels = class_labels
 
-X_train, X_test, y_train, y_test = train_test_split(X, y)
+    @staticmethod
+    def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
+        """
+        Calculate the Euclidean distance between two points
+        >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
+        5.0
+        >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
+        10.0
+        """
+        return np.linalg.norm(a - b)
 
+    def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
+        """
+        Classify a given point using the kNN algorithm
+        >>> train_X = np.array(
+        ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
+        ... )
+        >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
+        >>> classes = ['A', 'B']
+        >>> knn = KNN(train_X, train_y, classes)
+        >>> point = np.array([1.2, 1.2])
+        >>> knn.classify(point)
+        'A'
+        """
+        # Distances of all points from the point to be classified
+        distances = (
+            (self._euclidean_distance(data_point[0], pred_point), data_point[1])
+            for data_point in self.data
+        )
 
-def euclidean_distance(a, b):
-    """
-    Gives the euclidean distance between two points
-    >>> euclidean_distance([0, 0], [3, 4])
-    5.0
-    >>> euclidean_distance([1, 2, 3], [1, 8, 11])
-    10.0
-    """
-    return np.linalg.norm(np.array(a) - np.array(b))
+        # Choosing k points with the shortest distances
+        votes = (i[1] for i in nsmallest(k, distances))
 
-
-def classifier(train_data, train_target, classes, point, k=5):
-    """
-    Classifies the point using the KNN algorithm
-    k closest points are found (ranked in ascending order of euclidean distance)
-    Params:
-    :train_data: Set of points that are classified into two or more classes
-    :train_target: List of classes in the order of train_data points
-    :classes: Labels of the classes
-    :point: The data point that needs to be classified
-
-    >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
-    >>> y_train = [0, 0, 0, 0, 1, 1, 1]
-    >>> classes = ['A','B']; point = [1.2,1.2]
-    >>> classifier(X_train, y_train, classes,point)
-    'A'
-    """
-    data = zip(train_data, train_target)
-    # List of distances of all points from the point to be classified
-    distances = []
-    for data_point in data:
-        distance = euclidean_distance(data_point[0], point)
-        distances.append((distance, data_point[1]))
-    # Choosing 'k' points with the least distances.
-    votes = [i[1] for i in sorted(distances)[:k]]
-    # Most commonly occurring class among them
-    # is the class into which the point is classified
-    result = Counter(votes).most_common(1)[0][0]
-    return classes[result]
+        # Most commonly occurring class is the one into which the point is classified
+        result = Counter(votes).most_common(1)[0][0]
+        return self.labels[result]
 
 
 if __name__ == "__main__":
-    print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4]))
+    import doctest
+
+    doctest.testmod()
+
+    iris = datasets.load_iris()
+
+    X = np.array(iris["data"])
+    y = np.array(iris["target"])
+    iris_classes = iris["target_names"]
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    iris_point = np.array([4.4, 3.1, 1.3, 1.4])
+    classifier = KNN(X_train, y_train, iris_classes)
+    print(classifier.classify(iris_point, k=3))
diff --git a/machine_learning/knn_sklearn.py b/machine_learning/knn_sklearn.py
deleted file mode 100644
index 4a621a424..000000000
--- a/machine_learning/knn_sklearn.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from sklearn.datasets import load_iris
-from sklearn.model_selection import train_test_split
-from sklearn.neighbors import KNeighborsClassifier
-
-# Load iris file
-iris = load_iris()
-iris.keys()
-
-
-print(f"Target names: \n {iris.target_names} ")
-print(f"\n Features: \n {iris.feature_names}")
-
-# Train set e Test set
-X_train, X_test, y_train, y_test = train_test_split(
-    iris["data"], iris["target"], random_state=4
-)
-
-# KNN
-
-knn = KNeighborsClassifier(n_neighbors=1)
-knn.fit(X_train, y_train)
-
-# new array to test
-X_new = [[1, 2, 1, 4], [2, 3, 4, 5]]
-
-prediction = knn.predict(X_new)
-
-print(
-    f"\nNew array: \n {X_new}\n\nTarget Names Prediction: \n"
-    f" {iris['target_names'][prediction]}"
-)
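
Reviewer note (not part of the patch): a minimal usage sketch of the consolidated KNN class, reusing the toy data from the classify() doctest above. The import path is an assumption; it presumes the snippet is run from the repository root so that machine_learning/k_nearest_neighbours.py is importable.

    # Illustrative sketch only; the import path is assumed, not defined by the patch.
    import numpy as np

    from machine_learning.k_nearest_neighbours import KNN

    # Toy training set from the classify() doctest: label 0 ('A') clusters near
    # the origin, label 1 ('B') clusters near (3, 3).
    train_X = np.array([[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]])
    train_y = np.array([0, 0, 0, 0, 1, 1, 1])

    knn = KNN(train_X, train_y, ["A", "B"])
    print(knn.classify(np.array([1.2, 1.2]), k=5))  # prints 'A'

Of the five nearest neighbours of (1.2, 1.2), four carry label 0 and one carries label 1, so the majority vote returns 'A', matching the doctest. Note that __init__ stores the training data as a zip iterator, so each KNN instance supports a single classify() call in this demo setting.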