191 lines
6.1 KiB
Python
Raw Normal View History

2024-10-02 16:33:05 +05:30
'''
2024-10-02 16:24:52 +05:30
Author : Gowtham Kamalasekar
LinkedIn : https://www.linkedin.com/in/gowtham-kamalasekar/
2024-10-02 16:33:05 +05:30
'''
2024-10-01 21:20:12 +05:30
import math
import os
from collections.abc import Iterable, Mapping
from typing import Union

import pandas as pd


class DbScan:
    '''
    DBSCAN Algorithm :
    Density-Based Spatial Clustering Of Applications With Noise
    Refer this website for more details : https://en.wikipedia.org/wiki/DBSCAN

    Functions:
    ----------
    __init__() : Constructor that sets minPts, radius and file
    perform_dbscan() : Invoked by constructor; finds, for every point,
                       the points lying within ``radius`` of it and
                       returns them as a dictionary.
    print_dbscan() : Prints every point as core or noise, stating
                     whether a noise point is also a border point.
    plot_dbscan() : Plots the points to show the core and noise points.

    To create a object
    ------------------
    import dbscan
    obj = dbscan.DbScan(minpts, radius, file)
    obj.print_dbscan()
    obj.plot_dbscan()
    '''

    # Sample data set used when the caller does not supply ``file``.
    _SAMPLE_POINTS = (
        {'x': 3, 'y': 7}, {'x': 4, 'y': 6}, {'x': 5, 'y': 5},
        {'x': 6, 'y': 4}, {'x': 7, 'y': 3}, {'x': 6, 'y': 2},
        {'x': 7, 'y': 2}, {'x': 8, 'y': 4}, {'x': 3, 'y': 3},
        {'x': 2, 'y': 6}, {'x': 3, 'y': 5}, {'x': 2, 'y': 4},
    )

    def __init__(
        self,
        minpts: int,
        radius: float,
        file: Union[
            str, os.PathLike, Iterable[Mapping[str, float]]
        ] = _SAMPLE_POINTS,
    ) -> None:
        '''
        Constructor

        Args:
        -----------
        minpts (int) : Minimum number of points (the point itself
                       included) that must lie within ``radius`` for a
                       point to be considered as core
        radius (float) : Points whose distance is strictly smaller than
                       this value are neighbours of each other
        file : Either the location of a CSV file with an ``x`` and a
                       ``y`` column, or an iterable of mappings that
                       each hold an ``x`` and a ``y`` value. Defaults
                       to a built-in 12 point sample.

        Example :
        minPts = 4
        radius = 1.9
        file = 'data_dbscan.csv'

        File Structure of CSV Data:
        ---------------------------
        x,y
        3,7
        4,6
        5,5
        6,4
        7,3
        '''
        self.minpts = minpts
        self.radius = radius
        self.file = file
        # Neighbourhood map: 1-based point number -> 1-based neighbours.
        self.dict1 = self.perform_dbscan()

    def _load_points(self) -> tuple[list[float], list[float]]:
        '''Return the x and the y coordinates as two parallel lists.'''
        if isinstance(self.file, (str, os.PathLike)):
            data = pd.read_csv(self.file)
        else:
            data = pd.DataFrame(list(self.file))
        # Plain lists avoid a pandas label lookup per distance computation.
        return data['x'].tolist(), data['y'].tolist()

    def perform_dbscan(self) -> dict[int, list[int]]:
        '''
        Args:
        -----------
        None

        Return:
        --------
        Dictionary that maps every (1-based) point number to the list
        of point numbers lying strictly within ``radius`` of it. A point
        without any such neighbour (only possible when ``radius`` is not
        positive) gets no entry.

        >>> result = DbScan(4, 1.9).perform_dbscan()
        >>> for key in sorted(result):
        ...     print(key, sorted(result[key]))
        1 [1, 2, 10]
        2 [1, 2, 3, 11]
        3 [2, 3, 4]
        4 [3, 4, 5]
        5 [4, 5, 6, 7, 8]
        6 [5, 6, 7]
        7 [5, 6, 7]
        8 [5, 8]
        9 [9, 12]
        10 [1, 10, 11]
        11 [2, 10, 11, 12]
        12 [9, 11, 12]
        '''
        xs, ys = self._load_points()
        points = list(zip(xs, ys))
        neighbours: dict[int, list[int]] = {}
        for number, (x, y) in enumerate(points, start=1):
            within = [
                other
                for other, (ox, oy) in enumerate(points, start=1)
                if math.hypot(ox - x, oy - y) < self.radius
            ]
            if within:
                neighbours[number] = within
        return neighbours

    def print_dbscan(self) -> None:
        '''
        Outputs:
        --------
        Prints each point and if it is a core or a noise (w/ border)

        >>> DbScan(4,1.9).print_dbscan()
        1   [1, 2, 10] ---> Noise ---> Border
        2   [1, 2, 3, 11] ---> Core
        3   [2, 3, 4] ---> Noise ---> Border
        4   [3, 4, 5] ---> Noise ---> Border
        5   [4, 5, 6, 7, 8] ---> Core
        6   [5, 6, 7] ---> Noise ---> Border
        7   [5, 6, 7] ---> Noise ---> Border
        8   [5, 8] ---> Noise ---> Border
        9   [9, 12] ---> Noise
        10   [1, 10, 11] ---> Noise ---> Border
        11   [2, 10, 11, 12] ---> Core
        12   [9, 11, 12] ---> Noise ---> Border
        '''
        # Work out the core points once instead of once per noise point.
        cores = [
            point
            for point, near in self.dict1.items()
            if len(near) >= self.minpts
        ]
        core_set = set(cores)
        for point, near in self.dict1.items():
            if point in core_set:
                label = "Core"
            elif any(point in self.dict1[core] for core in cores):
                # Not dense enough itself, but inside a core point's radius.
                label = "Noise ---> Border"
            else:
                label = "Noise"
            print(f"{point}   {near} ---> {label}")

    def plot_dbscan(self) -> None:
        '''
        Output:
        -------
        A matplotlib plot that shows the core points (red, with a circle
        of the given radius around each) and the noise points (green).

        The example is skipped under doctest because ``plt.show()`` may
        block until the window is closed.

        >>> DbScan(4,1.9).plot_dbscan()  # doctest: +SKIP
        Plotted Successfully
        '''
        # Imported here so that clustering works without matplotlib.
        import matplotlib.pyplot as plt

        xs, ys = self._load_points()
        core_x, core_y, noise_x, noise_y = [], [], [], []
        for point, near in self.dict1.items():
            x, y = xs[point - 1], ys[point - 1]
            if len(near) >= self.minpts:
                core_x.append(x)
                core_y.append(y)
                circle = plt.Circle((x, y), self.radius,
                                    color='blue', fill=False)
                plt.gca().add_artist(circle)
            else:
                noise_x.append(x)
                noise_y.append(y)
            plt.text(x, y, 'P' + str(point), ha='center', va='bottom')
        # One labelled artist per class keeps the legend truthful whatever
        # the order in which core and noise points occur in the data.
        if core_x:
            plt.scatter(core_x, core_y, color='red', label='Core')
        if noise_x:
            plt.scatter(noise_x, noise_y, color='green', label='Noise')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.title('DBSCAN Clustering')
        if core_x or noise_x:
            plt.legend()
        plt.show()
        print("Plotted Successfully")
2024-10-02 16:24:52 +05:30
if __name__ == "__main__":
    # Executed as a script: run the examples embedded in the docstrings.
    from doctest import testmod

    testmod()