Final Update of DBSCAN

2025-05-07 12:54:01 +00:00 · 2024-10-02 19:42:19 +05:30 · 2024-10-02 19:42:19 +05:30 · cd539e378d
commit cd539e378d
parent 59f4a0e046
1 changed files with 190 additions and 0 deletions
--- a/machine_learning/dbscan.py
+++ b/machine_learning/dbscan.py
@ -0,0 +1,190 @@
+'''
+
+Author : Gowtham Kamalasekar
+LinkedIn : https://www.linkedin.com/in/gowtham-kamalasekar/
+
+'''
+import math
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from typing import dict, list
+
+class DbScan:
+    
+    '''
+    DBSCAN Algorithm :
+    Density-Based Spatial Clustering Of Applications With Noise
+    Refer this website for more details : https://en.wikipedia.org/wiki/DBSCAN
+
+    Functions:
+    ----------
+        __init__()       : Constructor that sets minPts, radius and file
+        perform_dbscan() : Invoked by constructor and calculates the core
+                            and noise points and returns a dictionary.
+        print_dbscan()   : Prints the core and noise points along
+                           with stating if the noise are border points or not.
+        plot_dbscan()    : Plots the points to show the core and noise point.
+
+    To create a object
+    ------------------
+    import dbscan
+    obj = dbscan.DbScan(minpts, radius, file)
+    obj.print_dbscan()
+    obj.plot_dbscan()
+    '''
+    def __init__(self, minpts : int, radius : int, file : str =
+                 ({'x': 3, 'y': 7}, {'x': 4, 'y': 6}, {'x': 5, 'y': 5},
+                  {'x': 6, 'y': 4},{'x': 7, 'y': 3}, {'x': 6, 'y': 2},
+                  {'x': 7, 'y': 2}, {'x': 8, 'y': 4},{'x': 3, 'y': 3},
+                  {'x': 2, 'y': 6}, {'x': 3, 'y': 5}, {'x': 2, 'y': 4})
+                 ) -> None:
+        '''
+        Constructor
+
+        Args:
+        -----------
+            minpts (int) : Minimum number of points needed to be
+                           within the radius to considered as core
+            radius (int) : The radius from a given core point where
+                           other core points can be considered as core
+            file (csv)   : CSV file location. Should contain x and y
+                           coordinate value for each point.
+
+            Example :
+            minPts = 4
+            radius = 1.9
+            file = 'data_dbscan.csv'
+
+            File Structure of CSV Data:
+            ---------------------------
+            _____
+            x | y
+            -----
+            3 | 7
+            4 | 6
+            5 | 5
+            6 | 4
+            7 | 3
+            -----
+        '''
+        self.minpts = minpts
+        self.radius = radius
+        self.file = file
+        self.dict1 = self.perform_dbscan()
+    def perform_dbscan(self) -> dict[int, list[int]]:
+        '''
+        Args:
+        -----------
+            None
+
+        Return:
+        --------
+            Dictionary with points and the list
+            of points that lie in its radius
+
+        >>> result = DbScan(4, 1.9).perform_dbscan()
+        >>> for key in sorted(result):
+        ...     print(key, sorted(result[key]))
+        1 [1, 2, 10]
+        2 [1, 2, 3, 11]
+        3 [2, 3, 4]
+        4 [3, 4, 5]
+        5 [4, 5, 6, 7, 8]
+        6 [5, 6, 7]
+        7 [5, 6, 7]
+        8 [5, 8]
+        9 [9, 12]
+        10 [1, 10, 11]
+        11 [2, 10, 11, 12]
+        12 [9, 11, 12]
+
+        '''
+        if type(self.file) is str:
+            data = pd.read_csv(self.file) 
+        else:
+            data = pd.DataFrame(list(self.file))
+        e = self.radius
+        dict1 = {}
+        for i in range(len(data)):
+            for j in range(len(data)):
+                dist = math.sqrt(pow(data['x'][j] - data['x'][i],2)
+                                 + pow(data['y'][j] - data['y'][i],2))
+                if dist < e:
+                    if i+1 in dict1:
+                        dict1[i+1].append(j+1)
+                    else:
+                        dict1[i+1] = [j+1,]
+        return dict1
+    def print_dbscan(self) -> None:
+        '''
+        Outputs:
+        --------
+        Prints each point and if it is a core or a noise (w/ border)
+
+        >>> DbScan(4,1.9).print_dbscan()
+        1   [1, 2, 10] ---> Noise ---> Border
+        2   [1, 2, 3, 11] ---> Core
+        3   [2, 3, 4] ---> Noise ---> Border
+        4   [3, 4, 5] ---> Noise ---> Border
+        5   [4, 5, 6, 7, 8] ---> Core
+        6   [5, 6, 7] ---> Noise ---> Border
+        7   [5, 6, 7] ---> Noise ---> Border
+        8   [5, 8] ---> Noise ---> Border
+        9   [9, 12] ---> Noise
+        10   [1, 10, 11] ---> Noise ---> Border
+        11   [2, 10, 11, 12] ---> Core
+        12   [9, 11, 12] ---> Noise ---> Border
+        '''
+        for i in self.dict1:
+            print(i," ",self.dict1[i], end=' ---> ')
+            if len(self.dict1[i]) >= self.minpts:
+                print("Core")
+            else:
+                for j in self.dict1:
+                    if (
+                        i != j 
+                        and len(self.dict1[j]) >= self.minpts 
+                        and i in self.dict1[j]
+                    ):
+                        print("Noise ---> Border")
+                        break
+                else:
+                    print("Noise")
+    def plot_dbscan(self) -> None:
+        '''
+        Output:
+        -------
+        A matplotlib plot that show points as core and noise along
+        with the circle that lie within it.
+
+        >>> DbScan(4,1.9).plot_dbscan()
+        Plotted Successfully
+        '''
+        if type(self.file) is str:
+            data = pd.read_csv(self.file) 
+        else:
+            data = pd.DataFrame(list(self.file))
+        e = self.radius
+        for i in self.dict1:
+            if len(self.dict1[i]) >= self.minpts:
+                plt.scatter(data['x'][i-1], data['y'][i-1], color='red')
+                circle = plt.Circle((data['x'][i-1], data['y'][i-1]),
+                                    e, color='blue', fill=False)
+                plt.gca().add_artist(circle)
+                plt.text(data['x'][i-1], data['y'][i-1],
+                         'P'+str(i), ha='center', va='bottom')
+            else:
+                plt.scatter(data['x'][i-1], data['y'][i-1], color='green')
+                plt.text(data['x'][i-1], data['y'][i-1],
+                         'P'+str(i), ha='center', va='bottom')
+        plt.xlabel('X')
+        plt.ylabel('Y')
+        plt.title('DBSCAN Clustering')
+        plt.legend(['Core','Noise'])
+        plt.show()
+        print("Plotted Successfully")
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()