Consolidate the two existing kNN implementations (#8903)

* Add type hints to k_nearest_neighbours.py
* Refactor k_nearest_neighbours.py into class
* Add documentation to k_nearest_neighbours.py
* Use heap-based priority queue for k_nearest_neighbours.py
* Delete knn_sklearn.py
* updating DIRECTORY.md
* Use optional args in k_nearest_neighbours.py for demo purposes
* Fix wrong function arg in k_nearest_neighbours.py

---------

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
2025-04-20 04:37:36 +00:00 · 2023-09-27 08:01:18 -04:00 · 2023-09-27 08:01:18 -04:00 · 76767d2f09
commit 76767d2f09
parent 5830b29e7e
3 changed files with 74 additions and 76 deletions
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -507,7 +507,6 @@
  * [Gradient Descent](machine_learning/gradient_descent.py)
  * [K Means Clust](machine_learning/k_means_clust.py)
  * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
- * [Knn Sklearn](machine_learning/knn_sklearn.py)
  * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py)
  * [Linear Regression](machine_learning/linear_regression.py)
  * Local Weighted Learning
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -1,58 +1,88 @@
 """
 k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning
 algorithm used for classification. Given some labelled training data, a given
 point is classified using its k nearest neighbours according to some distance
 metric. The most commonly occurring label among the neighbours becomes the label
 of the given point. In effect, the label of the given point is decided by a
 majority vote.
 This implementation uses the commonly used Euclidean distance metric, but other
 distance metrics can also be used.
 Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
 """
 from collections import Counter
 from heapq import nsmallest
 import numpy as np
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 data = datasets.load_iris()
-X = np.array(data["data"])
+class KNN:
-y = np.array(data["target"])
+    def __init__(
-classes = data["target_names"]
+        self,
-
+        train_data: np.ndarray[float],
-X_train, X_test, y_train, y_test = train_test_split(X, y)
+        train_target: np.ndarray[int],
-
+        class_labels: list[str],
-
+    ) -> None:
 def euclidean_distance(a, b):
        """
-    Gives the euclidean distance between two points
+        Create a kNN classifier using the given training data and class labels
-    >>> euclidean_distance([0, 0], [3, 4])
+        """
        self.data = zip(train_data, train_target)
        self.labels = class_labels
    @staticmethod
    def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
        """
        Calculate the Euclidean distance between two points
        >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
        5.0
-    >>> euclidean_distance([1, 2, 3], [1, 8, 11])
+        >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
        10.0
        """
-    return np.linalg.norm(np.array(a) - np.array(b))
+        return np.linalg.norm(a - b)
-
+    def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
 def classifier(train_data, train_target, classes, point, k=5):
        """
-    Classifies the point using the KNN algorithm
+        Classify a given point using the kNN algorithm
-    k closest points are found (ranked in ascending order of euclidean distance)
+        >>> train_X = np.array(
-    Params:
+        ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
-    :train_data: Set of points that are classified into two or more classes
+        ... )
-    :train_target: List of classes in the order of train_data points
+        >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
-    :classes: Labels of the classes
+        >>> classes = ['A', 'B']
-    :point: The data point that needs to be classified
+        >>> knn = KNN(train_X, train_y, classes)
-
+        >>> point = np.array([1.2, 1.2])
-    >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
+        >>> knn.classify(point)
    >>> y_train = [0, 0, 0, 0, 1, 1, 1]
    >>> classes = ['A','B']; point = [1.2,1.2]
    >>> classifier(X_train, y_train, classes,point)
        'A'
        """
-    data = zip(train_data, train_target)
+        # Distances of all points from the point to be classified
-    # List of distances of all points from the point to be classified
+        distances = (
-    distances = []
+            (self._euclidean_distance(data_point[0], pred_point), data_point[1])
-    for data_point in data:
+            for data_point in self.data
-        distance = euclidean_distance(data_point[0], point)
+        )
-        distances.append((distance, data_point[1]))
+
-    # Choosing 'k' points with the least distances.
+        # Choosing k points with the shortest distances
-    votes = [i[1] for i in sorted(distances)[:k]]
+        votes = (i[1] for i in nsmallest(k, distances))
-    # Most commonly occurring class among them
+
-    # is the class into which the point is classified
+        # Most commonly occurring class is the one into which the point is classified
        result = Counter(votes).most_common(1)[0][0]
-    return classes[result]
+        return self.labels[result]
 if __name__ == "__main__":
-    print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4]))
+    import doctest
    doctest.testmod()
    iris = datasets.load_iris()
    X = np.array(iris["data"])
    y = np.array(iris["target"])
    iris_classes = iris["target_names"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    iris_point = np.array([4.4, 3.1, 1.3, 1.4])
    classifier = KNN(X_train, y_train, iris_classes)
    print(classifier.classify(iris_point, k=3))
--- a/machine_learning/knn_sklearn.py
+++ b/machine_learning/knn_sklearn.py
@@ -1,31 +0,0 @@
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
 from sklearn.neighbors import KNeighborsClassifier
 # Load iris file
 iris = load_iris()
 iris.keys()
 print(f"Target names: \n {iris.target_names} ")
 print(f"\n Features: \n {iris.feature_names}")
 # Train set e Test set
 X_train, X_test, y_train, y_test = train_test_split(
    iris["data"], iris["target"], random_state=4
 )
 # KNN
 knn = KNeighborsClassifier(n_neighbors=1)
 knn.fit(X_train, y_train)
 # new array to test
 X_new = [[1, 2, 1, 4], [2, 3, 4, 5]]
 prediction = knn.predict(X_new)
 print(
    f"\nNew array: \n {X_new}\n\nTarget Names Prediction: \n"
    f" {iris['target_names'][prediction]}"
 )