"""DBSCAN (Density-Based Spatial Clustering of Applications with Noise).

Reference: https://en.wikipedia.org/wiki/DBSCAN
"""

import math
import os
from typing import Optional, Union

import pandas as pd


class DbScan:
    """
    Classify 2-D points as core, border or noise using the DBSCAN criteria.

    Reference Website : https://en.wikipedia.org/wiki/DBSCAN
    Reference YouTube Video : https://youtu.be/-p354tQsKrs?si=t1IxCFhrOB-RAcIU

    Functions:
    ----------
    __init__() : Constructor that sets minPts, radius and file.
    perform_dbscan() : Invoked by the constructor; computes, for every point,
                       the list of points lying within ``radius`` of it.
    print_dbscan() : Prints every point, its neighbourhood and whether it is
                     a core point, a border point or plain noise.
    plot_dbscan() : Plots the points, highlighting the core points.

    To create an object
    -------------------
    import dbscan
    obj = dbscan.DbScan(minpts, radius, file)
    obj.print_dbscan()
    obj.plot_dbscan()
    """

    # Built-in sample data set, kept immutable so that instances can never
    # share (and accidentally mutate) one default list.
    _DEFAULT_POINTS = (
        (3, 7),
        (4, 6),
        (5, 5),
        (6, 4),
        (7, 3),
        (6, 2),
        (7, 2),
        (8, 4),
        (3, 3),
        (2, 6),
        (3, 5),
        (2, 4),
    )

    def __init__(
        self,
        minpts: int,
        radius: float,
        file: Optional[Union[str, "os.PathLike[str]", list[dict[str, float]]]] = None,
    ) -> None:
        """
        Constructor

        Args:
        -----------
        minpts (int) : Minimum number of points (the point itself included)
                       that must lie within the radius for a point to be
                       considered a core point.
        radius (float) : Neighbourhood radius; a point is a neighbour when
                         its Euclidean distance is strictly smaller.
        file : Either the path of a CSV file with ``x`` and ``y`` columns, or
               an in-memory list of ``{"x": ..., "y": ...}`` records.  When
               omitted, a built-in 12-point sample data set is used.

        >>> DbScan(4, 1.9).file[0]
        {'x': 3, 'y': 7}
        """
        self.minpts = minpts
        self.radius = radius
        if file is None:
            # Fresh copy per instance: no state is shared between objects.
            file = [{"x": x, "y": y} for x, y in self._DEFAULT_POINTS]
        self.file = file
        self.dict1 = self.perform_dbscan()

    def _load_data(self) -> pd.DataFrame:
        """Return the input points as a DataFrame with ``x`` and ``y`` columns."""
        if isinstance(self.file, (str, os.PathLike)):
            return pd.read_csv(self.file)
        return pd.DataFrame(self.file)

    def perform_dbscan(self) -> dict[int, list[int]]:
        """
        Compute the neighbourhood of every point.

        Return:
        --------
        Dictionary mapping each 1-based point number to the ascending list
        of 1-based point numbers lying strictly within ``radius`` of it
        (a point is its own neighbour whenever ``radius`` is positive).
        Points without any neighbour are omitted.

        >>> result = DbScan(4, 1.9).perform_dbscan()
        >>> for key in sorted(result):
        ...     print(key, sorted(result[key]))
        1 [1, 2, 10]
        2 [1, 2, 3, 11]
        3 [2, 3, 4]
        4 [3, 4, 5]
        5 [4, 5, 6, 7, 8]
        6 [5, 6, 7]
        7 [5, 6, 7]
        8 [5, 8]
        9 [9, 12]
        10 [1, 10, 11]
        11 [2, 10, 11, 12]
        12 [9, 11, 12]
        """
        data = self._load_data()
        # Pull the coordinates out of pandas once; per-element Series
        # look-ups inside the quadratic loop are needlessly slow.
        points = list(zip(data["x"].tolist(), data["y"].tolist()))
        radius = self.radius

        neighbourhoods: dict[int, list[int]] = {}
        for i, (xi, yi) in enumerate(points, start=1):
            within = [
                j
                for j, (xj, yj) in enumerate(points, start=1)
                if math.hypot(xj - xi, yj - yi) < radius
            ]
            if within:
                neighbourhoods[i] = within
        return neighbourhoods

    def print_dbscan(self) -> None:
        """
        Outputs:
        --------
        Prints each point with its neighbourhood and its classification:
        ``Core``, ``Noise ---> Border`` (a non-core point lying in the
        neighbourhood of some core point) or ``Noise``.

        >>> DbScan(4, 1.9).print_dbscan()
        1   [1, 2, 10] ---> Noise ---> Border
        2   [1, 2, 3, 11] ---> Core
        3   [2, 3, 4] ---> Noise ---> Border
        4   [3, 4, 5] ---> Noise ---> Border
        5   [4, 5, 6, 7, 8] ---> Core
        6   [5, 6, 7] ---> Noise ---> Border
        7   [5, 6, 7] ---> Noise ---> Border
        8   [5, 8] ---> Noise ---> Border
        9   [9, 12] ---> Noise
        10   [1, 10, 11] ---> Noise ---> Border
        11   [2, 10, 11, 12] ---> Core
        12   [9, 11, 12] ---> Noise ---> Border
        """
        # Determine the core points once instead of rescanning per point.
        core_points = [
            point
            for point, neighbours in self.dict1.items()
            if len(neighbours) >= self.minpts
        ]
        core_set = set(core_points)

        for point, neighbours in self.dict1.items():
            print(point, " ", neighbours, end=" ---> ")
            if point in core_set:
                print("Core")
            elif any(point in self.dict1[core] for core in core_points):
                print("Noise ---> Border")
            else:
                print("Noise")

    def plot_dbscan(self) -> None:
        """
        Output:
        -------
        A matplotlib plot showing core points in red (with their radius
        drawn as a blue circle) and all other points in green, followed by
        the message ``Plotted Successfully`` on standard output.

        >>> DbScan(4, 1.9).plot_dbscan()  # doctest: +SKIP
        Plotted Successfully
        """
        # Imported lazily so that clustering and printing work even where
        # matplotlib (or a display backend) is unavailable.
        import matplotlib.pyplot as plt

        data = self._load_data()
        xs = data["x"].tolist()
        ys = data["y"].tolist()

        for point, neighbours in self.dict1.items():
            x, y = xs[point - 1], ys[point - 1]
            if len(neighbours) >= self.minpts:
                plt.scatter(x, y, color="red")
                circle = plt.Circle((x, y), self.radius, color="blue", fill=False)
                plt.gca().add_artist(circle)
            else:
                plt.scatter(x, y, color="green")
            plt.text(x, y, "P" + str(point), ha="center", va="bottom")

        plt.xlabel("X")
        plt.ylabel("Y")
        plt.title("DBSCAN Clustering")
        plt.legend(["Core", "Noise"])
        plt.show()
        print("Plotted Successfully")


if __name__ == "__main__":
    import doctest

    doctest.testmod()