Mirror of https://github.com/TheAlgorithms/Python.git, synced 2024-11-27 15:01:08 +00:00
Consolidate the two existing kNN implementations (#8903)
* Add type hints to k_nearest_neighbours.py
* Refactor k_nearest_neighbours.py into class
* Add documentation to k_nearest_neighbours.py
* Use heap-based priority queue for k_nearest_neighbours.py
* Delete knn_sklearn.py
* updating DIRECTORY.md
* Use optional args in k_nearest_neighbours.py for demo purposes
* Fix wrong function arg in k_nearest_neighbours.py

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
This commit is contained in:
parent
5830b29e7e
commit
76767d2f09
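The heap-based change called out in the commit message replaces a full sort of the distance list with `heapq.nsmallest`. A minimal sketch of that idea, using made-up (distance, label) pairs that are not taken from the diff:

```python
from heapq import nsmallest

# Toy (distance, label) pairs -- purely illustrative values
distances = [(2.5, 1), (0.3, 0), (1.1, 0), (4.0, 1), (0.9, 0)]

# Previous approach: sort all n distances, then slice off the first k
votes_sorted = [label for _, label in sorted(distances)[:3]]

# Heap-based approach: select only the k smallest, without a full sort
votes_heap = [label for _, label in nsmallest(3, distances)]

assert votes_sorted == votes_heap  # both give [0, 0, 0]
```

For the small Iris demo the difference is negligible; the benefit of the heap-based selection shows up mainly on large training sets.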
DIRECTORY.md
@@ -507,7 +507,6 @@
 * [Gradient Descent](machine_learning/gradient_descent.py)
 * [K Means Clust](machine_learning/k_means_clust.py)
 * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
-* [Knn Sklearn](machine_learning/knn_sklearn.py)
 * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py)
 * [Linear Regression](machine_learning/linear_regression.py)
 * Local Weighted Learning
machine_learning/k_nearest_neighbours.py
@@ -1,58 +1,88 @@
+"""
+k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning
+algorithm used for classification. Given some labelled training data, a given
+point is classified using its k nearest neighbours according to some distance
+metric. The most commonly occurring label among the neighbours becomes the label
+of the given point. In effect, the label of the given point is decided by a
+majority vote.
+
+This implementation uses the commonly used Euclidean distance metric, but other
+distance metrics can also be used.
+
+Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
+"""
+
 from collections import Counter
+from heapq import nsmallest
 
 import numpy as np
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 
-data = datasets.load_iris()
-
-X = np.array(data["data"])
-y = np.array(data["target"])
-classes = data["target_names"]
 
+class KNN:
+    def __init__(
+        self,
+        train_data: np.ndarray[float],
+        train_target: np.ndarray[int],
+        class_labels: list[str],
+    ) -> None:
+        """
+        Create a kNN classifier using the given training data and class labels
+        """
+        self.data = zip(train_data, train_target)
+        self.labels = class_labels
 
-X_train, X_test, y_train, y_test = train_test_split(X, y)
+    @staticmethod
+    def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
+        """
+        Calculate the Euclidean distance between two points
+        >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
+        5.0
+        >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
+        10.0
+        """
+        return np.linalg.norm(a - b)
 
+    def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
+        """
+        Classify a given point using the kNN algorithm
+        >>> train_X = np.array(
+        ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
+        ... )
+        >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
+        >>> classes = ['A', 'B']
+        >>> knn = KNN(train_X, train_y, classes)
+        >>> point = np.array([1.2, 1.2])
+        >>> knn.classify(point)
+        'A'
+        """
+        # Distances of all points from the point to be classified
+        distances = (
+            (self._euclidean_distance(data_point[0], pred_point), data_point[1])
+            for data_point in self.data
+        )
 
-def euclidean_distance(a, b):
-    """
-    Gives the euclidean distance between two points
-    >>> euclidean_distance([0, 0], [3, 4])
-    5.0
-    >>> euclidean_distance([1, 2, 3], [1, 8, 11])
-    10.0
-    """
-    return np.linalg.norm(np.array(a) - np.array(b))
+        # Choosing k points with the shortest distances
+        votes = (i[1] for i in nsmallest(k, distances))
 
-
-def classifier(train_data, train_target, classes, point, k=5):
-    """
-    Classifies the point using the KNN algorithm
-    k closest points are found (ranked in ascending order of euclidean distance)
-    Params:
-    :train_data: Set of points that are classified into two or more classes
-    :train_target: List of classes in the order of train_data points
-    :classes: Labels of the classes
-    :point: The data point that needs to be classified
-
-    >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
-    >>> y_train = [0, 0, 0, 0, 1, 1, 1]
-    >>> classes = ['A','B']; point = [1.2,1.2]
-    >>> classifier(X_train, y_train, classes,point)
-    'A'
-    """
-    data = zip(train_data, train_target)
-    # List of distances of all points from the point to be classified
-    distances = []
-    for data_point in data:
-        distance = euclidean_distance(data_point[0], point)
-        distances.append((distance, data_point[1]))
-    # Choosing 'k' points with the least distances.
-    votes = [i[1] for i in sorted(distances)[:k]]
-    # Most commonly occurring class among them
-    # is the class into which the point is classified
-    result = Counter(votes).most_common(1)[0][0]
-    return classes[result]
+        # Most commonly occurring class is the one into which the point is classified
+        result = Counter(votes).most_common(1)[0][0]
+        return self.labels[result]
 
 
 if __name__ == "__main__":
-    print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4]))
+    import doctest
+
+    doctest.testmod()
+
+    iris = datasets.load_iris()
+
+    X = np.array(iris["data"])
+    y = np.array(iris["target"])
+    iris_classes = iris["target_names"]
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    iris_point = np.array([4.4, 3.1, 1.3, 1.4])
+    classifier = KNN(X_train, y_train, iris_classes)
+    print(classifier.classify(iris_point, k=3))
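For reference, a small usage sketch of the KNN class introduced above. The toy points, label names, and the import line are illustrative assumptions, not part of the commit:

```python
import numpy as np

from machine_learning.k_nearest_neighbours import KNN  # assumed import path

# Two toy clusters around (0, 0) and (3, 3); the labels index into the names list
train_X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [3.0, 3.0], [2.0, 3.0], [3.0, 2.0]])
train_y = np.array([0, 0, 0, 1, 1, 1])
names = ["blue", "red"]

knn = KNN(train_X, train_y, names)
print(knn.classify(np.array([2.5, 2.5]), k=3))  # prints: red
```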
machine_learning/knn_sklearn.py
@@ -1,31 +0,0 @@
-from sklearn.datasets import load_iris
-from sklearn.model_selection import train_test_split
-from sklearn.neighbors import KNeighborsClassifier
-
-# Load iris file
-iris = load_iris()
-iris.keys()
-
-
-print(f"Target names: \n {iris.target_names} ")
-print(f"\n Features: \n {iris.feature_names}")
-
-# Train set and test set
-X_train, X_test, y_train, y_test = train_test_split(
-    iris["data"], iris["target"], random_state=4
-)
-
-# KNN
-
-knn = KNeighborsClassifier(n_neighbors=1)
-knn.fit(X_train, y_train)
-
-# new array to test
-X_new = [[1, 2, 1, 4], [2, 3, 4, 5]]
-
-prediction = knn.predict(X_new)
-
-print(
-    f"\nNew array: \n {X_new}\n\nTarget Names Prediction: \n"
-    f" {iris['target_names'][prediction]}"
-)
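With knn_sklearn.py removed, one quick way to sanity-check the consolidated implementation is to compare it against scikit-learn's KNeighborsClassifier on the same Iris split used in the new demo block. A rough sketch, assuming the KNN class above is importable at the path shown:

```python
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from machine_learning.k_nearest_neighbours import KNN  # assumed import path

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris["data"], iris["target"], random_state=0
)
point = np.array([4.4, 3.1, 1.3, 1.4])

# Library baseline: predict a class index, then map it to a class name
sk_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
sk_label = iris["target_names"][sk_knn.predict([point])[0]]

# Consolidated implementation from this commit
custom_label = KNN(X_train, y_train, iris["target_names"]).classify(point, k=3)

print(sk_label, custom_label)  # the two labels are expected to agree for this point
```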