From 4617aa78b2a61fcfebc136f7fc217ea980205067 Mon Sep 17 00:00:00 2001 From: Kaushik Amar Das Date: Sun, 29 Sep 2019 14:14:41 +0530 Subject: [PATCH] DBSCAN algorithm (#1207) * Added dbscan in two formats. A jupyter notebook file for the storytelling and a .py file for people that just want to look at the code. The code in both is essentially the same. With a few things different in the .py file for plotting the clusters. * fixed LGTM problems * Some requested changes implemented. Still need to do docstring * implememted all changes as requested --- machine_learning/dbscan/dbscan.ipynb | 376 +++++++++++++++++++++++++++ machine_learning/dbscan/dbscan.py | 271 +++++++++++++++++++ 2 files changed, 647 insertions(+) create mode 100644 machine_learning/dbscan/dbscan.ipynb create mode 100644 machine_learning/dbscan/dbscan.py diff --git a/machine_learning/dbscan/dbscan.ipynb b/machine_learning/dbscan/dbscan.ipynb new file mode 100644 index 000000000..603a4cd40 --- /dev/null +++ b/machine_learning/dbscan/dbscan.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DBSCAN\n", + "This implementation and notebook is inspired from the original DBSCAN algorithm and article as given in \n", + "[DBSCAN Wikipedia](https://en.wikipedia.org/wiki/DBSCAN).\n", + "\n", + "Stands for __Density-based spatial clustering of applications with noise__ . \n", + "\n", + "DBSCAN is clustering algorithm that tries to captures the intuition that if two points belong to the same cluster they should be close to one another. It does so by finding regions that are densely packed together, i.e, the points that have many close neighbours.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### When to use ?\n", + "\n", + "1. You need a robust clustering algorithm.\n", + "2. You don't know how many clusters there are in the dataset\n", + "3. You find it difficult to guess the number of clusters there are just by eyeballing the dataset.\n", + "4. The clusters are of arbitrary shapes.\n", + "5. You want to detect outliers/noise." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Why DBSCAN ? \n", + "\n", + "This algorithm is way better than other clustering algorithms such as [k-means](https://en.wikipedia.org/wiki/K-means_clustering) whose only job is to find circular blobs. It is smart enough to figure out the number of clusters in the dataset on its own, unlike k-means where you need to specify 'k'. It can also find clusters of arbitrary shapes, not just circular blobs. Its too robust to be affected by outliers (the noise points) and isn't fooled by them, unlike k-means where the entire centroid get pulled thanks to pesky outliers. Plus, you can fine-tune its parameters depending on what you are clustering.\n", + "\n", + "#### Have a look at these [neat animations](https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/) of DBSCAN to see for yourself." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## First lets grab a dataset\n", + "We will take the moons dataset which is pretty good at showing the power of DBSCAN. \n", + "\n", + "Lets generate 200 random points in the shape of two moons" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_moons\n", + "\n", + "x, label = make_moons(n_samples=200, noise=0.1, random_state=19)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize the dataset using matplotlib\n", + "You will observe that the points are in the shape of two crescent moons. \n", + "\n", + "The challenge here is to cluster the two moons. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAD6CAYAAACs/ECRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO2df5AlV3XfP2dnZ4QWsLX7VsZroZmViAIWToxhIoNMUcIQkFUpLSkrFeHRejGixlrFKVwpUqxqEyelZMv8qApgEyFvFJmFmUgCYjuKC0WWQIQ/jAQjol+grLRapEUqGVa7WLAlh5VWN390P03Pm/75+tft199PVdfrH7e7b9/Xfc+955x7rjnnEEII0V82tJ0BIYQQ7SJBIIQQPUeCQAgheo4EgRBC9BwJAiGE6DkSBEII0XMqEQRmdqOZ/dDMHko4vmBmD5jZg2b212b2y5Fjj4f77zOzlSryI4QQIj9WxTgCM3sbcAL4nHPul2KOXwg87Jz7kZn9BvDvnXO/Gh57HJh3zj2T935bt25127dvL51vIYToE/fee+8zzrkzR/dvrOLizrmvm9n2lON/Hdm8G3h1mftt376dlRV1HoQQoghm9kTc/jZsBFcCt0W2HfBXZnavmS22kB8hhOg1lfQI8mJmbycQBG+N7H6rc+4pM/s54A4z+7/Oua/HnLsILALMzs42kl8hhOgDjfUIzOwfAjcAO5xzx4b7nXNPhb8/BP4cuCDufOfcfufcvHNu/swz16m4hBBCjEkjgsDMZoE/A3Y65x6J7H+5mb1yuA68C4j1PBJCCFEPlaiGzOwm4CJgq5k9Cfw7YBrAOXc98AfAALjOzABecM7NA68C/jzctxH4b865/1VFnoQQQuSjKq+h92Yc/wDwgZj9h4FfXn+GEDWwvAx798KRIzA7C/v2wcJC27kSonUaNRYL0RrLy7C4CM89F2w/8USwDRIGovcoxIToB3v3rgqBIc89F+wXoudIEIh+cORIsf1C9AgJAlE/y8uwfTts2BD8Li83n4eksScakyKEBIGomaFu/oknwLlV3XxRYVBWmOzbB5s2rd23aVOwX4ieI0Eg6qUK3XwVwmRhAfbvh7k5MAt+9++XoVgIKoo+2jTz8/NOQec6woYNQeU9ihm8+GK+a2zfHlT+o8zNweOPl8mdEL3CzO4Nx3CtQT0CUS9V6OZl6BWiViQIRL1UoZuXoVeIWpEgEPVShW5ehl4hakWCQATU6eK5sBDo8l98MfgtaqD10dDrg0usEBUhQSCqc/Gsk6gw2bcv8DpqqxLuQnkJUQAJAtGt8AtxlfAVV8DWrfVUxHEt/y6VlxA5kPuoqMbFsymSXEkhsBtUqTIaDVQ3vMeoEBjiY3kJEUHuoyKZLnnlpLmMVt0qT2r5T03Fp/exvITIgQSBqNYrp24jalZlW+XYgqRrnTolLyYxUUgQiOq8csY1ohYRHnFCK8qWLcXynEaS0BmWj09eTEKUwTnXueVNb3qTEx6xtOTc3JxzQfW/fpmbSz9306a16TdtCvannbNhQ/y9BoN8eTULfrPuUzRvQngMsOJi6tRKKmbgRuCHwEMJxw34I+AQ8ADwxsixXcCj4bIrz/0kCDwirrIcXcySz08SIGnCw7ngmkXvNa7QySs4hPCcugXB24A3pgiCS4DbQoHwZuCecP8W4HD4uzlc35x1PwkCj0jrCeSp1NMq9LRKeBwBMq7QEWJCSBIEldgInHNfB46nJNkBfC7My93AGWa2DXg3cIdz7rhz7kfAHcDFVeRJNESWcTbLiJqkh9+yJd3eMI6BW8HrhIilKWPxWcD3I9tPhvuS9ouukObFk8eImlShQ/qgrXEM3El5dU5hIkSv6YzXkJktmtmKma0cPXq07eyIIUkV+dJSvrhCSRX68YQOZrT1XjSGUZrHkcJEiB7TlCB4Cjg7sv3qcF/S/nU45/Y75+adc/NnnnlmbRkVBanC9TSuQq9jkFs0r3E89xzs3Bk8h1l9YSuE8IymBMGtwG9bwJuBZ51zTwO3A+8ys81mthl4V7ivn3Q1omXZ6KJx1BV6ephXs/jjLhJq49gxeP/7u/M/CDEmlQgCM7sJ+AbwWjN70syuNLOrzOyqMMmXCTyCDgH/BbgawDl3HPgPwLfC5dpwX/9QRMu11B16Om/P4uRJBZMTE4+CzvlCH+blHUbuPHIkqIj37WtvNG5cQLkkFExOTAgKOuc7k+7a6EuPZ6h+27kTTj8dBoOgok8KJAcKJicmHgkCX+hSBNBx8CGG/6gwOnYM/u7v4Kqr4Iwz4s+ZmSlml+iqnUf0GgkCX2hiXt64Siqt4qqyUvOhx5MkjD7zmUAojDIYwI035ldf+dLrEaIoccONfV8mNsRE0YBoRWLgxMXZmZ52bmYmPvZO1QHXfAjvkBTOoqp8+fCMQqRAQogJGYu7SNLMWWleNWkze40y9LOv0ng9Tp6rpkgZjGMg7tJMb6KXyFg8SYyjby+igjlypHpVTt3uoHkoomYbxzYz6XYeMbFIEHSRcSrpIpXR7Gx6pTau7aCOgWdFWFgI9P5ZjGubacLOI0QNSBB0kXFannGV1PR04BUTZVhxJVVql1zSbYPopz61/rlmZlbdSMv0VHzo9QgxDnGGA9+XiTUW52VcQ26cgTnN6Bx3bBIMoppsRvQU6pyPQDTMuC3PUdUMrB/pC6tqnw9+EE6cWHsNH9xAy9K2ikoIz5DXUF+J8+KZmQna988/H3/Opk3BaNw4n/tJCoVRNz6F2hC9Ql5DPuDTqNM4z6OTJ5OFAKyml0F0fMYZdObTeyMmkzh9ke9LJ20EVQ/QyrpXlg68yOCqInMJTxpV2FWiFLWx7N69/r+q670REw91Tl7f9NJJQdCUkTWvwMkz6XzXjcJliSvLmZlgRPa4o7GTBLBZ/P2T0vfpfxCVkSQIZCNoiqZGnSaNnp2aggMHVnXR49oI+uQOWcdo7CLhxtPur9HKYgxkI2ibpkadJnnvnDq1Vhcd53l0443wp3+6um8wqMa/PkqX9N11jMYuMuisqgGCQmQR103wfemkaqgpG0GWyqeISqFqW0CTdpIqKKI+m5vLr/7LW65p9x8M+mGjEZWCxhG0TFOjTuNanFHytnLrCKnsw5wERYgry5mZYER2lKzR2ON6VCX9lxs3Bi68Vf0vQsRJh6ILcDFwkGBO4j0xxz8B3BcujwB/Gzl2KnLs1jz362SPoEmWlpybmirXI6jDuF3EUOoLVXsNpfWK8txrMKj+fxG9gbq8hoAp4DHgXGAGuB84PyX9vwRujGyfKHrP3guCPKqFsmqYOirtSQhPUZakMhgM8v1fXRSmwhuSBEEVqqELgEPOucPOuZPAzcCOlPTvBW6q4L7+U4dhNK/Kpqwqqg7jtqJzJqvmjh3LpzZTqGtRB3HSocgCXAbcENneCXw6Ie0c8DQwFdn3ArAC3A28J+U+i2G6ldnZ2TqFZjXUZRj1bTzCONdtajCajwPfio7fGG3pd83gLryCGlVDRQTBh4E/Htl3Vvh7LvA48Jqse3qrGopWPGV19Ek0qRrwsSLNi68VZlK+iuj+u/y/iFapUxC8Bbg9sn0NcE1C2v8DXJhyrc8Cl2Xd00tBEPeB11FhS8+eD5/LKcko7KPgEhNFkiCowkbwLeA8MzvHzGaAy4FbRxOZ2euAzcA3Ivs2m9lp4fpW4NeA71aQp+aJc42Mo6wuV3r2fPgcLjsuDLYmtREtUloQOOdeAH4PuB14GPiCc+47ZnatmV0aSXo5cHMolYb8IrBiZvcDdwEfcc51UxDkqWCqqLBVYeSjbaNqXkeBaLq9e4P3o+g8CV0arS38JK6b4PvipWooSRUxNZXt5lmXvrfPuuQ2VS15711FHqVSEgVA0UdrZpwPMuucMhW5Koj2BGFe+0QVdow81+hzg0CsQYKgCYp+cGkfcdmK3Gdj6aST17OrCg+wrGuoQSAiJAkChaFuk7TQ1LOz+cMVF722whfXS95Q00VCUo97ryruISYGhaH2jeXloLKOY3a2vNdL28bSPpPXs6sKD7Csa/jsPSW8QYKgCop6bQzDRJw6tf7Y8CMuW5HLzbQ98np2VeEBlnUNNQhEHuL0Rb4vXtkIxtHBpnkYRQ3FVXiUyEjYb2QjEBGQsbgmihpll5bi08cZCVWRiyzyRqLVeyRcsiCQsbgsRYyycfMER5EBTxQh7n3q27zSohAyFtdFER1sWhgK6e9FUbJmfNOIY5ETCYKyVDUZuVpxIo60yjzNI6iOqUbFxCJBUJYinh9JvYe5OQmBIWrFrpJVmaf1Rrs2P7RolzjDge+LV8biIsiDIx2Vz1qyHBHiysvMud27NaWliIUaw1CLvChyaDpqxa4lazDYwgLs2hW8S0OcgwMHYMuW+HOLjB9Q76xZWizvjY3dSQQMY8+L9WgU7FqSwoxEK/Mvf3m919pzz8Hppwe2qlGPorwOCaMeSUO1FOj9rYOWy1s9grpQayqb0TKqohU7SeRxREgSksePl+t9qnfWLG17gMXpi3xfvLcRSNedTVwZzcw4Nz2dXW59GiCV9ax1RZmVjaFZ0sq7wvoEjSyuibgPVSGgs0kqo8EgveKTkF1LXeWhd7hZ0sq7wv+iVkEAXAwcBA4Be2KOvw84CtwXLh+IHNsFPBouu/LczxtBkPQR5g0h0WfGbXGqglpPHT0kCdxmSSvvCntntQkCYAp4DDgXmAHuB84fSfM+4NMx524BDoe/m8P1zVn39EYQpAWPU2WVzrgVulQW1ZMkSPqkgvOBpPJuoEdQhbH4AuCQc+6wc+4kcDOwI+e57wbucM4dd879CLiDoHfRDZIMdadOKQR0FuOGyVZY5WpJG7S2sBDEvnrxxeA3zdAs54jyJJV3AyHlqxAEZwHfj2w/Ge4b5TfN7AEz+5KZnV3wXD9JGyms8QLpjDumQvMsVEsV3kEKZ1EvTYw/iusmFFmAy4AbIts7GVEDAQPgtHD9d4GvhusfAv5NJN2/BT6UcJ9FYAVYmZ2dLdwlyk2R7nAePaq619WjMq2OKlRtstt0Bmq0EbwFuD2yfQ1wTUr6KeDZcP29wJ9Ejv0J8N6se9ZmIxjHQJZWKcngJnynikpcdpvOkCQIqlANfQs4z8zOMbMZ4HLg1mgCM9sW2bwUeDhcvx14l5ltNrPNwLvCfe1Q9SCaotfrq561r8/tA1Wo2mS3yYfP73mcdCi6AJcAjxB4D+0N910LXBqu/yHwHQKPoruA10XOfT+B2+kh4Hfy3K+2HkHRlk1Wi7/I9frae+jrc1dFFWqystfQf5iNJ2WEBpTloGg3OSt9kev1Vc/a1+euAk8ql5fyIrtNMp6850mCQFNVRik69V/WNJVFrldkystJoq/PXQXbt8cHpdOUp/7hyXuuqSrzUNRNKytIWhWT1ky6nrWvz10FitbaHTx/zyUIRsk7iGZ5GX7yk/X7p6fXGtryXq+v/vF9fe4q8LxyERF8f8/j9EW+L42GmCg67HswqP5ek05fn7ssPtkIRDYevOfIRjAGaTr+nTu90PmJnrO8HLgjHzkS9AT27dMIdpGIbATjkDYOQN1y4QNF4gHFMY5vu8/+8HUyyc8d103wfWlMNdTQZBFCtMK4I+n7+N5PyHOjcQRjkOX764HOT4TovyjOOL7tnvjDN86EPHeSIJBqKI0sS3/ZbrlYz7iqCkW/TCeuXMdxP+2ry2rZ5/ZdrRQnHXxfvPAaEtUzbvd7QlprtbF793o1p5lzL3+5egR5Gfe5l5YCT8LR81pSKyHVkPAezVpWPWlTHYJz09PFKqg4YT10m57kRtI4Ied3706furYF4SlBIPwnb4U++sHFtbj60ErNQ5JwjVbgRXu8nrVyG6NoyPk0AdxSQyVJEGgcgWiOLJ/3pNg5r3gFDAbBeVu2wI9/DM8/v3p8ejoYv3Hy5Oq+tBhRfSIpxs2Qcce9KM7RWpLKI40WykrjCMbFdyNPV8hj0N23D2Zm1p974sTqeceOrRUCEGy/8pWaGjSOrHEt44576avROImiz+1TeAmQaiiVCfEd9oK8+v8kNU/WIntAPEk6/bLvcl+NxkkklUeceqhFewpyHx2Dqmcs6zN5W5DHj493fY3ojicaARdgair4HfaaYLwer+9B1JomqTyuumptT3VpCZ55xr/eapx08H3xYmSxKEbeFmSWcbPqlm2fKdvjlWv1WjpQHshYPAYyiFVH3kl64tKNMjMT2ASOH1egtTLo/e4dtRqLzexiMztoZofMbE/M8X9lZt81swfM7CtmNhc5dsrM7guXW0fPbRV1f6sj7yQ9cel27167feONQfdaI7rLIYPveEyiA0lcN6HIAkwRTFp/LjBDMEH9+SNp3g5sCtd3A7dEjp0oek+NLBaiAmTwLU7HHUio0Vh8AXDIOXfYOXcSuBnYMSJs7nLODfv6dwOvruC+zaB4QmJSUY+3OEUdSDrSe6hCEJwFfD+y/WS4L4krgdsi2y8zsxUzu9vM3lNBfvykIy+E6BFF5+gWxdRpHQqG2Kj7qJldAcwDH4/snnOB8eK3gE+a2WsSzl0MBcbK0aNH68tkHRV2h16ITiChWh3q8eZneTl45+KIc1/ukvt5nL6oyAK8Bbg9sn0NcE1MuncCDwM/l3KtzwKXZd2zNhtBnP5vejoYAFLGRiBdbHWME/yrI/pb4TF5BuaNvnceDn6krqBzwEbgMHAOq8bi14+k+RUCg/J5I/s3A6eF61uBRxkxNMcttQmCPD7s4xiGNB6hOvJMFtRhY14jSFAWJ+m9m5paFQJ5g8612ACsTRAE1+YS4JGwst8b7rsWuDRcvxP4AXBfuNwa7r8QeDAUHg8CV+a5X22CICtaYNIfmfVhqUdQHVlCVWWdjgTleIz73o0uMzOtlnWtgqDppdUewWhLPq+qQh9fNWRV9GnCXOUtQTkuZd676DI97aUgUKyhKHHudHFEDUN5DELyzqiOLJfHtJhDMtBrENk4LC8HEXBHyfveRXn++ck0Frex1DqgLKrmGQyCrtyoVI9GD5T+v3mKThCilu8q6hEUI++MbFnvnSd1A1INjUHSTExR1Y4+LP9YWvLyI2ycOIEpNWUxinzfHZg5T4KgKHkk/PDj0oflH30X0GnvpbyG8lOmx+9h3SBBUJS8hmN9WH7i4UfYKGnujnpP81O2QeFZ3SBBUJS8XgB9qly6hmcfYaPkeX/17mYzYQ2KJEGg+QiSKDIZteK3C9/I+/7q3c1meTnw9DlypPPzX2jy+qLkdSUFud4J/8j7/urdzaYH8ZgkCJKI8/0fDOLTbtnSbN6EyGL4/g7nKE5Ccz0LJAjSGW0JfOpTMD29Pt1PfqKBSsI/FhaCdzcJzT0gQiQIirCwAD/zM+v3nzzp52hBIZJa/FNTGt0uXkKCoCjHj8fvl65V+EhSSI4DByQExEtIEBQlqYUlXWu7aLKaeBTnql2uvho2bgzKfuPGYNtDJAiKonle/SNuBrgrroCtWyUQoBdeL15y9dXwmc/AqVPB9qlTwbaHwkCCoChqYflHXARYgGPHFHG0COpVVcv+/cX2t0h/BEGVL7laWH6RZp/xdY5Y39C82tUz7Ank3d8i/RAEesknmyz7jAz52XRpovWukDSGI2tsRwv0QxDoJe8+aT26rFG0MuRnowlrqmdxsdj+FqlEEJjZxWZ20MwOmdmemOOnmdkt4fF7zGx75Ng14f6DZvbuKvKzDr3k3SarRze028SN/JYhPx/yhque666D3btXewBTU8H2dde1m6844iLRFVmAKYJJ688FZggmoj9/JM3VwPXh+uXALeH6+WH604BzwutMZd2zcPTRvsem7zplJgfpaJTIxpmwKJsiHmqcs/gC4JBz7rBz7iRwM7BjJM0O4EC4/iXgHWZm4f6bnXM/dc59DzgUXq9a5PLZbYr06GTIH48i3nDyLkqmo2WzsYJrnAV8P7L9JPCrSWmccy+Y2bPAINx/98i5Z8XdxMwWgUWA2aLd1eHLPCGhZHvH7Gx8SGWpLaplYSH7mxiq6YY2t6Gabnh+n+lw2XTGWOyc2++cm3fOzZ955pnFL6CWYncZt0fX0dZZpVRdBnK8SC7TDpdNFYLgKeDsyParw32xacxsI/CzwLGc57aHKhI/GGcQn1yG6ymDvjtepJVpl8smznBQZCFQLx0mMPYOjcWvH0nzL1hrLP5CuP561hqLD1OHsXgcZDzzlzwGYTkI1FMGfSnXpHcs7fk7UDbUOWcxcAnwCIHXz95w37XApeH6y4AvEhiDvwmcGzl3b3jeQeA38tyv1cnrPfpTe0leAZ00Z69ZO/lugzrKoA8NpLRnTCvTDpRNrYKg6aXVyev7VJH4SF4BLUFeXxlMsovu0pJzU1PJ5ZZVpp6XjQRBUVSR+EleAd2B1lntqAyKEVdebbX6axIoEgRF0UfkJxpcVgyVQX6S3q0irf4qyrvGukeCYBz0EfnHOB+J/keRh6Te5rA3MBQGSe9PVRV4jdoICQIxOQwrdljV5yZ9oOrZibxk9Qiy3p+qKvAa7ZNJgqAzA8qEeImFhdVBZsPY7kk+8kUG+WjcSD8Z/u9PPBGMU4kyug3J709V4whaCAAoQSC6SVoFH63Q40JTwPqPUwPQ+kn0f4fgvx9W/nNzwXYccZV70Qo8qeHRRmy0uG6C74tUQz0jTsefps9N8/yQu6mIkvW/F3VOyKuGzEorryEJAhEh6YMZDOI/0CQf8KyPU+NG+knW/17UxpS3Am+p4ZEkCKQaEn6TpAKC+O5z2nywaXGKNDFLP8n634vGucob3NKzuEQSBMJvkj6M48fjP9C5ufj0c3PpH6fmrOgnef73OiIXe9bwkCAQfpP2wcR9oONW6AsLsGvX2mkFd+1SuPJJJ0+Lvw5vMt8aHnH6It8X2Qh6RFMDyDTeQMRR53vRwkBHZCwWnaWJD0ZeQ37hy2jwCXsvkgSBVEPCf5qYXS7NeKeBZs3i05gOz4y6dSFBIAQk2yK2bPGnUuoLbUz5mCTsPTPq1oUEgRCQbLyDzs5D21maboWn9UB8M+rWhASBEJDsPXL8eHz6rqoG6lZzVXH9plvhaT2QcebL7iJxhoO8C7AFuAN4NPzdHJPmDcA3gO8ADwD/PHLss8D3gPvC5Q157itjscikKmPjJBkL6/aMqur6TXtw9WhUOXV4DQEfA/aE63uAj8ak+fvAeeH6LwBPA2e4VUFwWdH7ShCIVKqsSCbJrbRuoVbl9Zv0GpokYZ9BXYLgILAtXN8GHMxxzv0RwSBBIKqn6g/bF1fGstTd8u1qy7queEIekiQIytoIXuWcezpc/xvgVWmJzewCYAZ4LLJ7n5k9YGafMLPTSuZHiOqNjU24rzZB3br3rnrYFLED+OTaWiGZgsDM7jSzh2KWHdF0obRxKdfZBnwe+B3n3Ivh7muA1wH/iMDe8OGU8xfNbMXMVo4ePZr9ZKK/NFkhdWmMQd0eMF32sMkr7NtwbW2CuG5C3oWcqiHgZ4Bvk6IGAi4C/jLPfaUaEqk0pdfvov2gbrVGh9Umueiq+iuEmmwEH2etsfhjMWlmgK8Avx9zbChEDPgk8JE895UgEJm0GZZiaI+YtEqwSXwVKB03LNclCAZhJf8ocCewJdw/D9wQrl8BPM+qi+hLbqLAV4EHgYeAJeAVee4rQSC8IG2WtC70DnzF556Wz3nLQS2CoK1FgkDkps6WZVqPoM6Woq+t5arwvdXd4fKXIBD9o40BVHXrjjveIs1FU3r4Dlfo45IkCCw41i3m5+fdyspK29kQvrN9e+DeN8rcXOAZUgXLy8EENklTZFZ5L2jmmdqmqf9tcXGtB9CmTZMZPiKCmd3rnJsf3a9YQ2JyaSJ42cJC4HKYRNWuk30Ii1yFG2qWW++kuoGOiQSBmFyaGk+QdL3BYLzWZVol1tVBW0UoG+gtz6CvPgjUIsTpi3xfZCMQufBtPEEenXTWtSbFRtCGET9qbPbdIF0TyFgseklTBsGs++StwPNUUG0bOcvevw5hFs1THsP9pAjUgkgQCNEmeVugaRWZDx4uVVSgdQQFzPLeirv+qEDbvduPMq6RJEEgryEhmmDDhqA6GsVsrbE5yWPGbO35bXm4VOHRk7csyuYpSlZ59cSLSF5DQozSZMC4JGPuhg1r7xvnMTMqBKA9D5ckY2pWRRylaoN3moE3r7G5515EEgSin+TxLKlSUMRV8BCMP4jeN85jJqnX3oaHS1JlbZZePtGyPHECpqfXHi8TpTQpT3Nza6OJpv2fffciitMX+b7IRiBKk6WnLqMLTzKmLi05NzVVXD/uk4fL0lKyHSMpP3FlOTPj3GBQjT4+z3+VlcanMq4RZCwWIkJWGINxK4asCmec8Alte7iMCrY0r5w4IdhEJZvmyZRHALddxg0hQSBElKzKadx4N1nXLSNgini0VOViGldBJpXNYBBfmaYJjrrJ8igadSntqddQ65X6OIsEgShNXaqCLAHSRMuzynsklcPoc27aFAiCPGnrVLuMVuZJeaozDx4jQSDEKFnqhHEqUx8GhFWlillayq5Eo8+QNT9DdJmebmaEd9oygaqfLCQIhCjKOBW2D7rmKlQxWZVqnFDJMz9DVI1UNUXuPzXVOyHgXLIgkPuoEEnkndB89JwyAdPKsrwc3DeOIn76cX71Q5JcPZNcZOM4fjx/XvKS19Vz0yY4cGCiBoqVRYJAiKoZR4CkUWQ8w969QZt3FLNifvpplerpp8POnevzEicEB4P4a9QRLTUtCmxbgrkrxHUT8i7AFuAOgjmL7wA2J6Q7xep8xbdG9p8D3AMcAm4BZvLcV6oh0RuKqprS9PTD6+VRdxUxEqepWOoeQ5B1rx7aAdKgpsnrPwbsCdf3AB9NSHciYf8XgMvD9euB3XnuK0EgOkUZ43BRw29a+iIVZVG30bzPPxgEhuK6KuseuICWoS5BcBDYFq5vAw4mpFsnCAADngE2httvAW7Pc18JAtEZyrZSi45nSLtfUaGSdyAZ5H+enozg9ZUkQVAq+qiZ/a1z7oxw3YAfDbdH0r0QqoVeAD7inPsLM9sK3O2c+3thmrOB25xzv5R1X0UfFZ2hbLTOcc5fXg5sBUeOBHrzffsCnXjZqJ9pUfYM6NoAAAehSURBVD7zPk/VkUdFIcaOPmpmd5rZQzHLjmi6UNokSZW58Oa/BXzSzF4zxgMsmtmKma0cPXq06OlCtEPZYGZF5+9NEgJQPupnmrE57/P0YarNLhLXTci7kFM1NHLOZ4HLkGpI9IEqVCGjOvYkQ2sT01wmjdTN+zxNjayWnSAWarIRfJy1xuKPxaTZDJwWrm8l8DA6P9z+ImuNxVfnua8EgegMVVZ8VYTF8GGayai9YhgMrqoKW55DqdQlCAbAV8LK/U5gS7h/HrghXL8QeBC4P/y9MnL+ucA3CdxHvzgUGFmLBIHoFEUq37S0dQXKq/N50q5RR4UtY3QqSYJAU1UK4QtXXw3XX7/WmBqdLjHL0FrFNJJNUVdeZYxORVNVCuEzy8vrhQCsnS4xy9Ba1LDcJnXNCCZj9FhIEAjhA0mhIWC1csyq6NuOc1SEuirsLglDj5AgEMIH0lrCw8oxT0VfdZyjuqirwu6SMPQICQIhfCBtUvho5diVij5KXNC8OivsLpZRy2xsOwNCCILKfnFxbehnM7jqqm5XZMvLa5/riSeCbQieq8vPNkGoRyCED8S1kD//ebjuurZzVo64eQ2iBnDhBXIfFULUh9w5vULuo0KI5pE7ZyeQIBBC1IfcOTuBBIEQoj7kztkJJAiEqIIi8wr3Dblzeo/cR4UoS5aLpBCeox6BEGWRi6ToOBIEQpSlrgBqQjSEBIEQZZGLpOg4EgRClEUukqLjSBAIURa5SIqOI68hIapAAdREhynVIzCzLWZ2h5k9Gv5ujknzdjO7L7L8PzN7T3jss2b2vcixN5TJjxBCiOKUVQ3tAb7inDuPYBL7PaMJnHN3Oefe4Jx7A/DrwHPAX0WS/OvhcefcfSXzI4QQoiBlBcEO4EC4fgB4T0b6y4DbnHPPZaQTQgjREGUFwaucc0+H638DvCoj/eXATSP79pnZA2b2CTM7rWR+hBBCFCTTWGxmdwI/H3NozbBJ55wzs8TJDcxsG/APgNsju68hECAzwH7gw8C1CecvAosAs/LPFkKIyig1MY2ZHQQucs49HVb0X3POvTYh7QeB1zvnFhOOXwR8yDn3T3Lc9yjwxNgZL8dW4JmW7l2GruYbupt35bt5upr3pvI955w7c3RnWffRW4FdwEfC3/+Rkva9BD2AlzCzbaEQMQL7wkN5bhr3IE1hZitxM/z4TlfzDd3Nu/LdPF3Ne9v5Lmsj+Ajwj83sUeCd4TZmNm9mNwwTmdl24Gzgf4+cv2xmDwIPEkjE/1gyP0IIIQpSqkfgnDsGvCNm/wrwgcj248BZMel+vcz9hRBClEchJoqzv+0MjElX8w3dzbvy3TxdzXur+S5lLBZCCNF91CMQQoieI0GQgZn9MzP7jpm9aGaJVn0zu9jMDprZITNbF2qjafLEgQrTnYrEerq16XyO5CW1DM3sNDO7JTx+T+iE0Do58v0+MzsaKecPxF2naczsRjP7oZnFeutZwB+Fz/WAmb2x6TzGkSPfF5nZs5Hy/oOm8xiHmZ1tZneZ2XfDOuWDMWnaKXPnnJaUBfhF4LXA14D5hDRTwGPAuQSD4+4Hzm853x8D9oTre4CPJqQ70XYZ5y1D4Grg+nD9cuCWjuT7fcCn285rTN7fBrwReCjh+CXAbYABbwbuaTvPOfN9EfCXbeczJl/bgDeG668EHol5V1opc/UIMnDOPeycO5iR7ALgkHPusHPuJHAzQRymNikaB6pt8pRh9Jm+BLwjHIPSJj7+97lwzn0dOJ6SZAfwORdwN3BGOHC0VXLk20ucc087574drv8EeJj13pStlLkEQTWcBXw/sv0kMe6yDZM3DtTLzGzFzO4ehgdviTxl+FIa59wLwLPAoJHcJZP3v//NsKv/JTM7u5mslcbH9zovbzGz+83sNjN7fduZGSVUa/4KcM/IoVbKXBPTkB5PyTmXNlq6VSqKAzXnnHvKzM4FvmpmDzrnHqs6rz3nfwI3Oed+ama/S9Cr0Ria+vg2wXt9wswuAf4COK/lPL2Emb0C+O/A7zvnftx2fkCCAADn3DtLXuIpgpHTQ14d7quVtHyb2Q8iITy2AT9MuMZT4e9hM/saQSulDUGQpwyHaZ40s43AzwLHmsleIpn5dsHAyyE3ENhvukAr73VZopWrc+7LZnadmW11zrUeg8jMpgmEwLJz7s9ikrRS5lINVcO3gPPM7BwzmyEwZLbqgcNqHChIiANlZpuHob/NbCvwa8B3G8vhWvKUYfSZLgO+6kILW4tk5ntEx3spgW64C9wK/HboyfJm4NmIutFbzOznh7YjM7uAoJ5ru8FAmKf/CjzsnPtPCcnaKfO2Lem+L8A/JdDT/RT4AXB7uP8XgC9H0l1C4AXwGIFKqe18DwhmjXsUuBPYEu6fB24I1y8kiPN0f/h7Zct5XleGBGHJLw3XXwZ8ETgEfBM4t+1yzpnvPwS+E5bzXcDr2s5zmK+bgKeB58N3/ErgKuCq8LgB/zl8rgdJ8JrzMN+/Fynvu4EL285zmK+3Ag54ALgvXC7xocw1slgIIXqOVENCCNFzJAiEEKLnSBAIIUTPkSAQQoieI0EghBA9R4JACCF6jgSBEEL0HAkCIYToOf8fbbT41ArTNJwAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(x[:,0], x[:,1],'ro')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Abstract of the Algorithm\n", + "The DBSCAN algorithm can be abstracted into the following steps:\n", + "\n", + "- Find the points in the $ε$ (eps) neighborhood of every point, and identify the core points with more than min_pts neighbors.\n", + "- Find the connected components of core points on the neighbor graph, ignoring all non-core points.\n", + "- Assign each non-core point to a nearby cluster if the cluster is an $ε$ (eps) neighbor, otherwise assign it to noise.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the points\n", + "Initially we label all the points in the dataset as __undefined__ .\n", + "\n", + "__points__ is our database of all points in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "points = { (point[0],point[1]):{'label':'undefined'} for point in x }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def euclidean_distance(q, p):\n", + " \"\"\"\n", + " Calculates the Euclidean distance\n", + " between points P and Q\n", + " \"\"\"\n", + " a = pow((q[0] - p[0]), 2)\n", + " b = pow((q[1] - p[1]), 2)\n", + " return pow((a + b), 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def find_neighbors(db, q, eps):\n", + " \"\"\"\n", + " Finds all points in the DB that\n", + " are within a distance of eps from Q\n", + " \"\"\"\n", + " return [p for p in db if euclidean_distance(q, p) <= eps]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_cluster(db, clusters):\n", + " \"\"\"\n", + " Extracts all the points in the DB and puts them together\n", + " as seperate clusters and finally plots them\n", + " \"\"\"\n", + " temp = []\n", + " noise = []\n", + " for i in clusters:\n", + " stack = []\n", + " for k, v in db.items():\n", + " if v[\"label\"] == i:\n", + " stack.append(k)\n", + " elif v[\"label\"] == \"noise\":\n", + " noise.append(k)\n", + " temp.append(stack)\n", + "\n", + " color = iter(plt.cm.rainbow(np.linspace(0, 1, len(clusters))))\n", + " for i in range(0, len(temp)):\n", + " c = next(color)\n", + " x = [l[0] for l in temp[i]]\n", + " y = [l[1] for l in temp[i]]\n", + " plt.plot(x, y, \"ro\", c=c)\n", + "\n", + " x = [l[0] for l in noise]\n", + " y = [l[1] for l in noise]\n", + " plt.plot(x, y, \"ro\", c=\"0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implementation of DBSCAN\n", + "\n", + "Initialize an empty list, clusters = $[ ]$ and cluster identifier, c = 0\n", + "\n", + "1. For each point p in our database/dict db :\n", + "\n", + " 1.1 Check if p is already labelled. If it's already labelled (means it already been associated to a cluster), continue to the next point,i.e, go to step 1\n", + " \n", + " 1.2. Find the list of neighbors of p , i.e, points that are within a distance of eps from p\n", + " \n", + " 1.3. If p does not have atleast min_pts neighbours, we label it as noise and go back to step 1\n", + " \n", + " 1.4. Initialize the cluster, by incrementing c by 1\n", + " \n", + " 1.5. Append the cluster identifier c to clusters\n", + " \n", + " 1.6. Label p with the cluster identifier c\n", + " \n", + " 1.7 Remove p from the list of neighbors (p will be detected as its own neighbor because it is within eps of itself)\n", + " \n", + " 1.8. Initialize the seed_set as a copy of neighbors\n", + " \n", + " 1.9. While the seed_set is not empty:\n", + " 1.9.1. Removing the 1st point from seed_set and initialise it as q\n", + " 1.9.2. If it's label is noise, label it with c\n", + " 1.9.3. If it's not unlabelled, go back to step 1.9\n", + " 1.9.4. Label q with c\n", + " 1.9.5. Find the neighbours of q \n", + " 1.9.6. If there are atleast min_pts neighbors, append them to the seed_set" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def dbscan(db,eps,min_pts):\n", + " '''\n", + " Implementation of the DBSCAN algorithm\n", + " '''\n", + " clusters = []\n", + " c = 0\n", + " for p in db:\n", + " if db[p][\"label\"] != \"undefined\":\n", + " continue\n", + " neighbors = find_neighbors(db, p, eps)\n", + " if len(neighbors) < min_pts:\n", + " db[p][\"label\"] = \"noise\"\n", + " continue\n", + " c += 1\n", + " clusters.append(c)\n", + " db[p][\"label\"] = c\n", + " neighbors.remove(p)\n", + " seed_set = neighbors.copy()\n", + " while seed_set != []:\n", + " q = seed_set.pop(0)\n", + " if db[q][\"label\"] == \"noise\":\n", + " db[q][\"label\"] = c\n", + " if db[q][\"label\"] != \"undefined\":\n", + " continue\n", + " db[q][\"label\"] = c\n", + " neighbors_n = find_neighbors(db, q, eps)\n", + " if len(neighbors_n) >= min_pts:\n", + " seed_set = seed_set + neighbors_n\n", + " return db, clusters\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lets run it!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAD6CAYAAACs/ECRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO2de7Ac1X3nPz9dSVhabIyuiEMAXeGEPJwQv7Q4cVIpHGwHUxtkJ94KzjWRHbvuIsW7SaV2N6RUlcRkVYtx1RpSsSAqwhpbd40fix05MSEY2+s/EhxECpCBxcgEYalILCQvsRbWQuK3f3QP6ju3n9Pv6e+nampmTp/uPtPTfX7n/F7H3B0hhBDDZUXbDRBCCNEuEgRCCDFwJAiEEGLgSBAIIcTAkSAQQoiBI0EghBADpxJBYGa3mNl3zOwbCdvnzexBM9tnZn9rZq+ObHsiLL/fzPZW0R4hhBD5sSriCMzsF4BjwMfd/aditr8ReMTdv2tmbwP+yN3fEG57Atjk7k/nPd/69et948aNpdsthBBD4r777nva3c8aL19ZxcHd/WtmtjFl+99Gvt4DnFvmfBs3bmTvXk0ehBCiCGZ2IK68DRvB+4A7It8d+Bszu8/MFlpojxBCDJpKZgR5MbM3EQiCn48U/7y7HzKzHwDuMrP/7e5fi9l3AVgA2LBhQyPtFUKIIdDYjMDMfhq4Gdjs7kdG5e5+KHz/DvA54KK4/d19l7tvcvdNZ521TMUlhBBiQhoRBGa2AbgduNLdvxkp/1dm9tLRZ+CtQKznkRBCiHqoRDVkZp8ELgbWm9lB4A+BVQDufhPwB8AssNPMAE64+ybgFcDnwrKVwP9w97+uok1CCCHyUZXX0Lsytr8feH9M+ePAq5fvIUT17FuEu7fDM0/CGRvgkh1w4XzbrRKifRo1FgvRFvsW4QsL8PyzwfdnDgTfQcJACKWYEIPg7u2nhMCI558NyoUYOhIEYhA882SxciGGhASBqJ19i3D9RvjgiuB932LzbTgjIfQkqVyIISFBIGplpJt/5gDgp3TzRYVBWWFyyQ5YtXZp2aq1QbkQQ0eCQNRKFbr5KoTJhfPwy7vgjDnAgvdf3iVDsRAgryFRM1Xo5tOESZGO/MJ5dfxCxKEZgaiVKnTzMvQKUS8SBKJWqtDNy9ArRL1IEIhaqUI3L0OvEPUiG4EA6k2/UFY3P9q3S+khlK5CTBOVLFXZNJs2bXKtUFYd4+kXIBhxd9Wrpu1OuG/XS4gRZnZfmPBzCVINiV6lX4hzJb393XDd+noC1eLiF/p0vYTIg1RDoldeOXGdMMBzR6pPIpeUqC7u/NDN6yVEHjQjEL3yyknrbKselSeN/G0mvn4Xr5cQeZAgEJV65dSdVyirs61yVJ50LD8pLyYxXUgQiMrSL0yaCqKI8IgTWlHWrCvW5jQSZ0pzSlchpgt5DYnSvOjFcyB++xlz8DtPJO9b1ANn3yJ87jfAX1i+bc0s/Oenc7Q1h8eRvIPEtFGr15CZ3WJm3zGz2IXnLeBPzGy/mT1oZq+LbNtiZo+Fry1VtEc0x5JZQAJp6ppJPHAunIek8ctzR3O2NceMRYnqxFCoymvoY8CfAh9P2P424ILw9QbgRuANZraOYKH7TYAD95nZHnf/bkXtEjWT5MUTJU2vn+axlDZ6P2NDvPBJO9ckyeuUqE4MgUpmBO7+NSBlLMZm4OMecA/wcjM7G/gl4C53Pxp2/ncBl1bRJtEMWcbZLCNqUse9Zl366H0SA3ef3GSFaJKmjMXnAN+OfD8YliWVi56QNgLPo0pJ6tAhXWU0idomsa3e3sppQnSB3gSUmdkCsACwYYMctrvCJTvKGVST8gjdfmV8/ejovajaJq6tLx73QPUBaUL0haYEwSHgvMj3c8OyQ8DFY+VfjTuAu+8CdkHgNVRHI0VxqkgIF9ehJ3khlQnaWtLWmGM//2wggG5/d/B9zSy87QYJBjH9NKUa2gP8Rug99DPAM+7+FHAn8FYzO9PMzgTeGpYNki4s8j4JF84H7qF/+ELwXkXHWVfq6VFbsYQKkSHGc0fgL36zP/+DEJNSyYzAzD5JMLJfb2YHCTyBVgG4+03AF4HLgP3As8B7w21HzeyPgXvDQ13j7mlG56klKa8NDHNEWnfq6SSvo3FOHi++JKYQfUMBZR3h+o0JqpCUYKy+0Xb66PG2pCWQW4IFsx0h+k5SQFlvjMXTzrS7NnZlxhMVRmvWwco1QRCarQhyCMWhZHJi2lGuoY7Qpwygk9CFHP7jkcXPHYETz8Gmq+AlL4/fZ2Z1MbtEX+08YthIEHSEJtbljeuk0jquKju1Lsx4koTR3hsDoTDOmlnYfEv+GcukSfeEaBuphjpCUeNoUX17nGrm8+8Fs8AgOiobqWugWlXOJCkhqqaI0JnENjNJCgshuoAEQYfIGyA1ib49rpN64fnl9aLqmio7taTAsyZz+Of1FILJZipdmPUIMQlSDfWQSfTtRTqjZ56svlPrQibPIkJnkpnKtNt5xPQiQdBDJumki3RGZ2xI79QmtR3UEXhWhAvnA71/FpPOVJqw8whRBxIEPWSSkWdcJ7ViVeAVE2XUcSV1ahdc1m+D6NtuWP67ZlaHAqLkTKULsx4hJkE2gh4yib49yRgdVxbtuMa39d0gWnfEstYvEH1EgqCHTNqZjXdScZ5HEEY5Pxm//u80GETVWQuxFKWYGChxKRZmVgdLQMZ5E0Ew61i5Jt7nfppSYdRNl1JtiGFR65rFIh9dijqNU/GcPJ4sBOBUfRlEJ2eSoLMu3TdiOpEgaIgmo07zdByTqnKeOzosg2jV0dhFXX//aluwRkJfjfOiH0g11BBNZReNU/nErRiW1J4shqQCyqs+G11fyL72H1zBkjUPXiQmw+m+xXCltpj6Q/ofRHVINdQyTRlZk0acn9uydBQZ5x46szpwKU1iaCqgvOqz0Yg+z2i/iOvv3duJFxr0yzgvuo8EQUM0FXWa1EH4yaUqhTif9823wNv/+6myNbPV+NdH6ZO+u45o7CJBZ1UFCAqRhdxHG6KpXDtp+XTG/f2T3ChHZSPvlucqWjOuK2sS5KVIbqJRx5yVWK+I62/a+Y8fC4SpvI5EFWhG0BBNRZ3GjTij5B3l1mHc7sKaBEXIqz7LisaeVNgn/ZcrVoYuvDIei4qoas3iS4EbgBngZne/dmz7R4A3hV/XAj/g7i8Pt50E9oXbnnT3y6toUxdpIpBpdPzPbYlfcSuvSqGOCOK+BaNVFY09HsSXNCuK2/eXdy0tO35seRxHnyK7RTcp7TVkZjPAN4G3AAcJFqJ/l7s/nFD/3wOvdfffDL8fc/fTi5yzj15DVZInICmv91ASRbxb8jKEdZmzSLoGa2aD1dKy/q86/hcxHOr0GroI2O/uj7v7ceA2YHNK/XcBn6zgvJ2nDsNoXpVNWVVUHcZtZedMnv08dySf2kyprkUdVCEIzgG+Hfl+MCxbhpnNAecDX44Uv8TM9prZPWb29qSTmNlCWG/v4cOHK2h2vdQVQFZEz14m7XMdnXbT2Tm76KFUtMMeFxwSpqIOmvYaugL4rPsS7fWcux8ys1cCXzazfe7+rfEd3X0XsAsC1VAzzS1GVGVjK5br6KvQ5TalZ68rS2dTCd+66qGU5D2WmMNpTHDUnT1VDJMqBMEh4LzI93PDsjiuAH4rWuDuh8L3x83sq8BrgWWCoOuMdzxxhloo32E3ufZvn7N0djVddpoBOq97cZ//F9FNqhAE9wIXmNn5BALgCuDXxyuZ2Y8DZwJ/Fyk7E3jW3b9vZuuBnwOuq6BNjRPX8cRRtsPuwtq/faDLHkppHblG+qINSgsCdz9hZh8A7iRwH73F3R8ys2uAve6+J6x6BXCbL3VT+gngz8zsBQJ7xbVJ3kZdJ08HU0WHLdVAPpqcOcWRN9V0FSmpldZalEVJ5yoiyS3QZsBfSHfzrOshHnIHUdZ9tolzV9HGNn+n6B9KOlczSd4c77g12Wsny7OojNdLk2mvu0ib6wfn9eyqItI6zzG66D0luoVyDVXEJCqbrIe4jNdLV42lTdKWUTWvfaIKO0bWMbrqPSW6hQRBhRTteNIe4rIdeZeNpdNOXvtEFXaMrGNoQCDyINVQS+xbDGIN4jhjQ/mOXBGo7ZE36KuK4LCsY2hAIPIgQVABRXWwo+l6XKzB6CEu25ErArU98tonqrBjZB1DAwKRB3kNlWQSr400D6N33BrsV5VHyVC9hkSAvIpElCSvIdkISlJUB7tvMXmxEX9h6aIxo+NP2pErAnX6yRL2ijsReZAgKEkRHexodJZEXF4ZPbAiibweQbqPRBayEZSk6GLkSWkopL8XRclyP1b8gMiLBEFJqlqMXDpbEUdaZ542Gx16QKEohgRBSYp4fiTOHuYkBF5kcRE2boQVK4L3xeH2XFmdedpstG/rQ4t2kSCogLwLwMilM4PFRVhYgAMHwD14X1gYrDDI6sxjF7c3uOAyxQ+IYkgQNEib+W96wfbt8OxYz/fss0H5AMnqzC+ch1dvASyy0eGBW2HNuvh9i8QPyMbQMC3OhuU11DDy4EjhyYSeL6l8ysmTguKxL7JsMfvnnw1WPFu1dvJ1K5SjqGFGs+HRQGg0GwaYr/+Ca0ZQExpN5WB8BLQuYRi7YZhhsHlUiUmzhueOlpt9ysbQMBmz4W3btrFy5UrMjJUrV7Jt27ZKTy9BUAPy2MhBnD3ge9+DVauW1lu7FnbsWL7vAAzKeVSJaQbjvLarOGRjaJiU2fC2bdu48cYbOXkyyElz8uRJbrzxxkqFgVJMlCQusvPu7QlT+rnggRQEHfiBmIs0Owunnx48GBs2BEIgOjUen0JDICx27WpkCt016kohkZQGRfdwTSQ9D3NzrDx48EUhEGVmZoYTJ04UOk2tC9OY2aVm9qiZ7Tezq2O2v8fMDpvZ/eHr/ZFtW8zssfC1pYr2NEXSyD8phYRGUxGSRkBHj8ITT8ALLwTv4527DMpLqMsBQR5uDbNjRzCgiRLOhuOEAJBYPgmljcVmNgN8FHgLcBC418z2xKw9/Cl3/8DYvuuAPwQ2EZi87gv3/W7ZdjVBkh7VZuIziyrjY4QNG+JHQFn2ABmUl1HWASEtX5FyFDXEaMCzffuy2fDMli2JM4KqqGJGcBGw390fd/fjwG3A5pz7/hJwl7sfDTv/u4BLK2hTIySN8P2kRlOZpIyAUkkSFAM1KJclzZ5VxMYg54gKmJ+PnQ0vLMQnKEsqn4QqBME5wLcj3w+GZeP8qpk9aGafNbPzCu7bSdIihRUvkMH8fKDXn5sDs+A9j55/UgEiYqnCO0jOEfWyc+dOtm7d+uIMYGZmhq1bt7Jz587KztGU19AXgI3u/tMEo/5bix7AzBbMbK+Z7T18+HDlDRxRZGSTpkcdjaZ+5RNB+e1XaqS0jIQRUOY+kwgQEUsV3kFyNa2fnTt3cuLECdydEydOVCoEoBpBcAg4L/L93LDsRdz9iLt/P/x6M/D6vPtGjrHL3Te5+6azzjqrgmYvp+jIJstQp5FSTUwiQEQsVaxgJlfT/lOFILgXuMDMzjez1cAVwJ5oBTM7O/L1cuCR8POdwFvN7EwzOxN4a1jWClWPbIoeb7B61oHEBXSRKryDtBxmTjp8n5f2GnL3E2b2AYIOfAa4xd0fMrNrgL3uvgf4D2Z2OXACOAq8J9z3qJn9MYEwAbjG3Y+WbdOkFB3ZZIXhT7JozeBC+lsOre87ZZcjrcI76JId8bEMco6I0PH7XAFlEYoG0WTVL3K8wQbwpATS8MQTTbemV3RpPWKtj51BR+7zWgPKpoWi0+SsEX8Vi9ZMvZ5VcQET0yUjbZl0FoOg4/e5BEGEolGaWal+K1m0Ztr1rIoLmJjBDh76SMfvc6WhHiNvlOa+RTj+veXlK1YtHfHnPd5g9aw7dsTnDlJcQCZ50lSLjtDx+1wzggySPHnu3g4njy+vf9rLJpsWD3bRGsUFTIzyAfWIjt/nMhankGaMu/1Kli0IAoAFelIhmkBGWlEEGYsnIM0YN1idvugUZY20k8SuKN6le3EAZZEgSCHNGKdpueg7k0S+DzZaPm4hpYWFqREGEgQpZK3+NEidfleZ4tFaXUziftoll9VGmfJ1MCQIUsga9ct3ugYm6dCnfLRWBXHqnEncTwfrslo2DqDjAxUJghQ06m+YSTv0KR+tleWvtgXODVF1zu1XLh/kjEizcw3WNjZpHMDiIqxfD+9+d6cHKhIEGWjU3yCTdugdj9psk32LsPcmlnu4OTz/f4O4lyhZdq64WTLA8WNTbifIsw7G+Kh/27agwz9yZPnxOjZQkSAQ3SFvhz7+wK1LCPHuSNRmm9y9nXg355DTXlZsxjuaJa+ZXVr+3JEpNxpnxQHEzWZvumn5wCZKhwYqiiMQzbG4GLsm64skJeY6/XSYnQ32W7cO/uVf4PnnT21ftSp4OI9HIvzWru1UwE5bfHAFqYJg0riXwSZJTCLp3k2jhcSKiiOYkMH6TFdNHv3/jh2wevXyfY8dO7XfkSNLhQAE31/60s5GbbZJlu5+Ut3+YI3GSRQd3XcovQRIEKQyWJ/pOsij/5+fDzr0STh6VKuWxZCk04dycS+DNRonkaSGNFteNjvbuYGKBEEKg/WZroO8+v+jE65LJHtALEs83wAL1j9/0R4Ak814FVA5RpIx+aqrls5Ud++Gp5/ulBAAZR9NRdPfCtmwIV6HOt6BJ9VLo2PT7K6RlAG3zKp4VaxsNlWMOvY0G1iHkbE4BRnEKmR8qT6IN+jG1Rtn9epAhXT0aO8euC6h+3t41GosNrNLzexRM9tvZlfHbP9dM3vYzB40s7vNbC6y7aSZ3R++9ozv2yaa/lZI3jS8cfW2bl36/ZZbgum17AGl0Ix3QjoeJTwJpWcEZjYDfBN4C3CQYCH6d7n7w5E6bwK+7u7PmtlW4GJ3/7Vw2zF3P73IOZt0H1WaXzGtaEYwAXlnth2lzhnBRcB+d3/c3Y8DtwGboxXc/SvuPrpy9wDnVnDeRlBksZhWNOOdgKLR7z2ZPVQhCM4Bvh35fjAsS+J9wB2R7y8xs71mdo+Zvb2C9nQSxSOIrqFcWhNQJJ1Jj5IhNuo+ambvBjYBH44Uz4VTlV8HrjezH07YdyEUGHsPHz5cWxvr6LAVj1AxPRll9QHNeAuwuBjcc3HEuS/3KBliFYLgEHBe5Pu5YdkSzOzNwHbgcnf//qjc3Q+F748DXwVeG3cSd9/l7pvcfdNZZ51VQbOXE9dhf/69cN36coJB8QgVkmeUJUEhqmZ03508uXzbyH15/L5LcoPuUI6hEVUIgnuBC8zsfDNbDVwBLPH+MbPXAn9GIAS+Eyk/08xOCz+vB34OeJiWiOuwX3g+SKhVZiQv74wKyRpl9Wg63hZSU05A3H0HMDMTGIph+X0XF1UMnQx+LC0I3P0E8AHgTuAR4NPu/pCZXWNml4fVPgycDnxmzE30J4C9ZvYA8BXg2qi3UdPk6ZjjRvJZD5bC8SskS0fbo+l4G0hNOSFJ990LLwTeQnH3XZxH5urVnQx+rMRG4O5fdPcfdfcfdvcdYdkfuPue8POb3f0V7v6a8HV5WP637n6hu786fP/zKtozKXk75qjAyPNgyTujQrIWCEl6YA8c0KwAqSknZtL7bpyOBvAq11CEtARdUaICI8+DJe+MCslaICRt2i0VkdSUk7C4GGTAHSfvfRfl+ec7OTuVIIgw3mGvmYWZmKzI0dWY8j5Y8s6oiKwI5ThBMUIqIqkpizKyOY2vMjaeQTTtvhung8Zi5RpKYd8i3PHbobF4jFVrA6Fx93ZFZ3aOxcVgjdg4zAK97gCIi4qHpYnm4NS9rMFJDEneP3GLyowvvHTsWPwylS0sSDNCC9MUZKT7jxMCcEr9I/1/B5mfDx62ODrosVEHSbYrkJqyEEUCyObnl66JccMN2escdwQJggTidP/jjGYCerA6SJ7FxqeYJNvV57bA7VcG33/lE1JTZpJlJE4jb6LFDqD1CBLIazz7wkLQ8UsN1DF6nh++LEn3r4fxUEXWHhg0O3bEJ5nLO6CYn+/FPacZQQJ5jWdyvesw41P1HjyQVZHn/tW9m4MejerLIEGQQF5XUpDrnegeee9f3bs5GMCAQoIggTjf/zWz8XXXrGu0aUJkMrp/R2sUJyG3UQESBKmM+/6/7QZYsWp5vePfU4i+6B4XzoOneMrKu02MkCAowIXzcNrLlpefPC5dq+gmSSN+m5F3mziFBEFBnjsaXy5dq+giSXEu77hVQkCcQoKgIArR7yhagyAW5blqmW3bYOXKwONo5crgewdRiomCjCI2FaLfIeIWFIcgH8wNN0yll4foAdu2wY03Li/fuhV27my+PSSnmJAgmIC4HC4SAi2SthrU2rVT6fddB7qvK2blyvgVzWZm4MSJ5tuDBIFu8mlmxYr0PO8tJvnqC5rp1kDSCmXQ2roEg046p1WZppysvC8dTPvbNbRgTQ3MJARxJJW3yCAEgW7yKSDNGJyVC34gGUfLoAVramBhoVh5i1QiCMzsUjN71Mz2m9nVMdtPM7NPhdu/bmYbI9t+Pyx/1Mx+qYr2jKObvOdkLUg/ygczGxP6PaCMo2WQN1wN7NwZGIZHM4CZmVYNxWmUFgRmNgN8FHgb8CrgXWb2qrFq7wO+6+4/AnwE+FC476uAK4CfBC4FdobHqxTd5D0nz4L08/Pw9NOwe/fUJwirA62rURM7dwaGYffgvYNCAKqZEVwE7Hf3x939OHAbsHmszmbg1vDzZ4FLzMzC8tvc/fvu/o/A/vB4laKbvOeUWRxEQiAXReIN9i3C9RvhgyuCd9naIvQ0nqWK9QjOAb4d+X4QeENSHXc/YWbPALNh+T1j+54TdxIzWwAWADYU1PmObmZ5DfWUDRvi3UOl+6+UC+ezn4lx7yKtaxBhPJ5lpMKEzg9IemMsdvdd7r7J3TedddZZhffX4vE9ZtLVxno6OquSqkfvcrwg+b7Ko8LsKFXMCA4B50W+nxuWxdU5aGYrgTOAIzn3bQ3FHnSESVYb6/HorCrqGL0P3vEi7b4qosLsGFXMCO4FLjCz881sNYHxd89YnT3AlvDzO4EvexDJtge4IvQqOh+4APj7CtpUGsUedIyo7n/HjkAopI30ezw6q4o6Ru+DcbyYZNRfZn3jliktCNz9BPAB4E7gEeDT7v6QmV1jZpeH1f4cmDWz/cDvAleH+z4EfBp4GPhr4LfcPSYmu3k0Be4oWa6kI3o8OquKOkbvg3C8SLvH0u6rSVWYHWAwKSaK8sEVQNylscDOIFoiKa/QeBqJvPWmmOs3hjPaMc6YC+xkkzLVKtPFRdiyJT5H0Nxc8J52Xy0uFlNhNsygU0xMwmCmwH0j70i/x6Ozqqhr9D61jhejmUCcEIB8o/6q3JcbdnSQIEhgEFPgPpJXDzuKNh5wcJnWIihInP4/yoYN2fdVFR14XvVnlbh7716vf/3rvQke3O3+kTn3P7Lg/cHdjZxWpLF7t/vate7BIxK81q4NytP2mZtzNwve0+qK4WK29L6Kvkbb0u6fSe7NOObm4tswN1fyB7oDez2mT229U5/k1ZQgEB1l1LGD+8xM+gNa1cMppp+kDnj8lXT/VNWBJwkks9I/MUkQSDUk+sf8/Cld7UifmzR9LuJGqgC0YTL63w8cWL6GQNyaAkn3T1Weai24oUoQiH6S1sFHO/SklcvGH8429LKifaL/OwT//ajzn5tLXkAmrnMv2oEnDTzacHSImyZ0/SXV0MCI0/Gn6XPHVUF5pus16mVFh8n634vcF0XUkFl1a7JrIRuB6CVJD8zsbPwDOrIZFNXx1qiXFR0m638vamPK24G3NPBIEgRSDYluk6QCgvjpc5IPOKS7kfY4PYAoQdb/XtQNOW8cQcci3yUIRLdJejCOHo1/QEfRn+PMzaU/nApAGyZ5/vc61rjo2MBDgkB0m7QHJu4BnbRDn58PUgtElxXcsmVQAWiDJM+Ivw5vsq4NPOL0RV1/yUYwIJoKIFO8gYijzvuihUBHZCwWvaWJB0ZeQ92iK9HgU3ZfJAkCqYZE92liHeI0450CzZqlSzEdHTPq1oUEgRCQbItYt647ndJQaGNRoSRh3zGjbl1IEAgBycY7GPxKZ43T9Cg8bQbSNaNuTUgQCAHJ3iNHj8bX76tqoG41VxXHb3oUnjYDGUo68zjDQd4XsA64C3gsfD8zps5rgL8DHgIeBH4tsu1jwD8C94ev1+Q5r4zFIpOqjI3TZCys2zOqquM37cE1oKhy6vAaAq4Drg4/Xw18KKbOjwIXhJ9/CHgKeLmfEgTvLHpeCQKRSpUdyTS5ldYt1Ko8fpNeQ9Mk7DOoSxA8Cpwdfj4beDTHPg9EBIMEgaieqh/srrgylqXukW9fR9Z15RPqIEmCoKyN4BXu/lT4+Z+AV6RVNrOLgNXAtyLFO8zsQTP7iJmdVrI9QlRvbGzCfbUJ6ta999XDpogdoEuurRWSKQjM7Etm9o2Y1+ZovVDaeMpxzgY+AbzX3V8Ii38f+HHgXxPYG34vZf8FM9trZnsPHz6c/cvEcGmyQ+pTjEHdHjB99rDJK+zbcG1tgrhpQt4XOVVDwMuAfyBFDQRcDPxlnvNKNSRSaUqv30f7Qd1qjR6rTXLRV/VXCDXZCD7MUmPxdTF1VgN3A78Ts20kRAy4Hrg2z3klCEQmbaalGNkjpq0TbJKuCpSeG5brEgSzYSf/GPAlYF1Yvgm4Ofz8buB5TrmIvugmCnwZ2Ad8A9gNnJ7nvBIEohOkrZLWh9lBV+nyTKvLbctBLYKgrZcEgchNnSPLtBlBnSPFro6Wq6Lro+4eX38JAjE82gigqlt33PMRaS6a0sP3uEOflCRBYMG2frFp0ybfu3dv280QXWfjxsC9b5y5ucAzpAoWF4MFbJKWyKzyXNDMb2qbpv63hYWlHkBr105n+ogIZnafu28aL1euITG9NJG8bH4+cDlMomrXySGkRa7CDTXLrXda3UAnRIJATC9NxRMkHW92drLRZVon1tegrSKUTfSWJ+hrCAK1CHH6oq6/ZCMQuehaPEEenXTWsabFRtCGET9qbO66QbomkLFYDJKmDIJZ52kH/WcAAArwSURBVMnbgefpoNo2cpY9fx3CLNqmPIb7aRGoBZEgEKJN8o5A0zqyLni4VNGB1pEUMMt7K+744wJt69ZuXOMaSRIE8hoSoglWrAi6o3HMlhqbkzxmzJbu35aHSxUePXmvRdk2Rcm6XgPxIpLXkBDjNJkwLsmYu2LF0vPGecyMCwFoz8MlyZia1RFHqdrgnWbgzWtsHrgXkQSBGCZ5PEuqFBRxHTwE8QfR88Z5zCTN2tvwcEnqrM3Sr0/0Wh47BqtWLd1eJktpUpvm5pZmE037P4fuRRSnL+r6SzYCUZosPXUZXXiSMXX3bveZmeL68S55uOzenWzHSGpP3LVcvdp9drYafXye/yqrTpeucY0gY7EQEbLSGEzaMWR1OJOkT2jbw2VcsKV55cQJwSY62TRPpjwCuO1r3BASBEJEyeqcJs13k3XcMgKmiEdLVS6mcR1k0rWZnY3vTNMER91keRSNu5QO1Guo9U59kpcEgShNXaqCLAHSxMizynMkXYfx37l2bSAI8tStU+0y3pkntanONnQYCQIhxslSJ0zSmXYhIKwqVczu3dmdaPQ3ZK3PEH2tWtVMhHfaawpVP1lIEAhRlEk67C7omqtQxWR1qnFCJc/6DFE1UtUUOf/MzOCEgHuyIJD7qBBJ5F3QfHyfMgnTyrK4GJw3jiJ++nF+9SOSXD2TXGTjOHo0f1vyktfVc+1auPXWqQoUK4sEgRBVM4kASaNIPMP27cGYdxyzYn76aZ3qmjVw5ZXL2xInBGdn449RR7bUtCywbQnmvhA3Tcj7AtYBdxGsWXwXcGZCvZOcWq94T6T8fODrwH7gU8DqPOeVakgMhqKqpjQ9/eh4edRdRYzEaSqWumMIss41QDtAGtS0eP11wNXh56uBDyXUO5ZQ/mngivDzTcDWPOeVIBC9ooxxuKjhN61+kY6yqNto3t8/OxsYiuvqrAfgAlqGugTBo8DZ4eezgUcT6i0TBIABTwMrw+8/C9yZ57wSBKI3lB2lFo1nSDtfUaGSN5AM8v+egUTwdpUkQVAq+6iZ/R93f3n42YDvjr6P1TsRqoVOANe6++fNbD1wj7v/SFjnPOAOd/+prPMq+6joDWWzdU6y/+JiYCt48slAb75jR6ATL5v1My3LZ97fU3XmUVGIibOPmtmXzOwbMa/N0XqhtEmSKnPhyX8duN7MfniCH7BgZnvNbO/hw4eL7i5EO5RNZlZ0/d4kIQDls36mGZvz/p4hLLXZR+KmCXlf5FQNje3zMeCdSDUkhkAVqpBxHXuSobWJZS6TInXz/p6mIqtlJ4iFmmwEH2apsfi6mDpnAqeFn9cTeBi9Kvz+GZYai7flOa8EgegNVXZ8VaTF6MIyk1F7xSgZXFUdtjyHUqlLEMwCd4ed+5eAdWH5JuDm8PMbgX3AA+H7+yL7vxL4ewL30c+MBEbWS4JA9IoinW9a3boS5dX5e9KOUUeHLWN0KkmCQEtVCtEVtm2Dm25aakyNLpeYZWitYhnJpqirrTJGp6KlKoXoMouLy4UALF0uMcvQWtSw3CZ1rQgmY/RESBAI0QWSUkPAqc4xq6NvO89REerqsPskDDuEBIEQXSBtJDzqHPN09FXnOaqLujrsPgnDDiFBIEQXSFsUPto59qWjjxKXNK/ODruP16hlVrbdACEEQWe/sLA09bMZXHVVvzuyxcWlv+vAgeA7BL+rz79titCMQIguEDdC/sQnYOfOtltWjrh1DaIGcNEJ5D4qhKgPuXN2CrmPCiGaR+6cvUCCQAhRH3Ln7AUSBEKI+pA7Zy+QIBCiCoqsKzw05M7ZeeQ+KkRZslwkheg4mhEIURa5SIqeI0EgRFnqSqAmRENIEAhRFrlIip4jQSBEWeQiKXqOBIEQZZGLpOg58hoSogqUQE30mFIzAjNbZ2Z3mdlj4fuZMXXeZGb3R17/z8zeHm77mJn9Y2Tba8q0RwghRHHKqoauBu529wsIFrG/eryCu3/F3V/j7q8BfhF4FvibSJX/NNru7veXbI8QQoiClBUEm4Fbw8+3Am/PqP9O4A53fzajnhBCiIYoKwhe4e5PhZ//CXhFRv0rgE+Ole0wswfN7CNmdlrJ9gghhChIprHYzL4E/GDMpiVhk+7uZpa4uIGZnQ1cCNwZKf59AgGyGtgF/B5wTcL+C8ACwAb5ZwshRGWUWpjGzB4FLnb3p8KO/qvu/mMJdX8b+El3X0jYfjHwH9393+Q472HgwMQNL8d64OmWzl2GvrYb+tt2tbt5+tr2pto95+5njReWdR/dA2wBrg3f/yKl7rsIZgAvYmZnh0LECOwL38hz0rgf0hRmtjduhZ+u09d2Q3/brnY3T1/b3na7y9oIrgXeYmaPAW8Ov2Nmm8zs5lElM9sInAf8r7H9F81sH7CPQCL+l5LtEUIIUZBSMwJ3PwJcElO+F3h/5PsTwDkx9X6xzPmFEEKURykmirOr7QZMSF/bDf1tu9rdPH1te6vtLmUsFkII0X80IxBCiIEjQZCBmf1bM3vIzF4ws0SrvpldamaPmtl+M1uWaqNp8uSBCuudjOR62tN0O8faknoNzew0M/tUuP3roRNC6+Ro93vM7HDkOr8/7jhNY2a3mNl3zCzWW88C/iT8XQ+a2euabmMcOdp9sZk9E7nef9B0G+Mws/PM7Ctm9nDYp/x2TJ12rrm765XyAn4C+DHgq8CmhDozwLeAVxIExz0AvKrldl8HXB1+vhr4UEK9Y21f47zXENgG3BR+vgL4VE/a/R7gT9tua0zbfwF4HfCNhO2XAXcABvwM8PW225yz3RcDf9l2O2PadTbwuvDzS4FvxtwrrVxzzQgycPdH3P3RjGoXAfvd/XF3Pw7cRpCHqU2K5oFqmzzXMPqbPgtcEsagtEkX//tcuPvXgKMpVTYDH/eAe4CXh4GjrZKj3Z3E3Z9y938IP38PeITl3pStXHMJgmo4B/h25PtBYtxlGyZvHqiXmNleM7tnlB68JfJcwxfruPsJ4BlgtpHWJZP3v//VcKr/WTM7r5mmlaaL93VeftbMHjCzO8zsJ9tuzDihWvO1wNfHNrVyzbUwDen5lNw9LVq6VSrKAzXn7ofM7JXAl81sn7t/q+q2DpwvAJ909++b2b8jmNUohqY+/oHgvj5mZpcBnwcuaLlNL2JmpwP/E/gdd/+XttsDEgQAuPubSx7iEEHk9Ihzw7JaSWu3mf1zJIXH2cB3Eo5xKHx/3My+SjBKaUMQ5LmGozoHzWwlcAZwpJnmJZLZbg8CL0fcTGC/6QOt3NdliXau7v5FM9tpZuvdvfUcRGa2ikAILLr77TFVWrnmUg1Vw73ABWZ2vpmtJjBktuqBw6k8UJCQB8rMzhyl/jaz9cDPAQ831sKl5LmG0d/0TuDLHlrYWiSz3WM63ssJdMN9YA/wG6Eny88Az0TUjZ3FzH5wZDsys4sI+rm2BwyEbfpz4BF3/28J1dq55m1b0rv+At5BoKf7PvDPwJ1h+Q8BX4zUu4zAC+BbBCqltts9S7Bq3GPAl4B1Yfkm4Obw8xsJ8jw9EL6/r+U2L7uGBGnJLw8/vwT4DLAf+HvglW1f55zt/q/AQ+F1/grw4223OWzXJ4GngOfDe/x9wFXAVeF2Az4a/q59JHjNdbDdH4hc73uAN7bd5rBdPw848CBwf/i6rAvXXJHFQggxcKQaEkKIgSNBIIQQA0eCQAghBo4EgRBCDBwJAiGEGDgSBEIIMXAkCIQQYuBIEAghxMD5/1SENh4utwCVAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "eps = 0.25\n", + "min_pts = 12\n", + "\n", + "db,clusters = dbscan(points,eps,min_pts)\n", + "\n", + "plot_cluster(db,clusters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I encourage you to try with different datasets and playing with the values of eps and min_pts.\n", + "\n", + "Also, try kmeans on this dataset and see how it compares to dbscan. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I hope by now you are convinced about about how cool dbscan is. But it has its pitfalls.\n", + "### When NOT to use ?\n", + "\n", + "1. You have a high dimentional dataset. Euclidean distance will fail thanks to '[curse of dimentionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality#Distance_functions)'.\n", + "2. We have used a dict to store the points. So we can't do anything about the order in which the points will be processed. So it's not entirely deterministic.\n", + "3. Won't work well if there are large differences in density. Finding the min_pts and $ε$ combination will be difficult.\n", + "4. Choosing the $ε$ without understanding the data and its scale, might result is poor clustering performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/machine_learning/dbscan/dbscan.py b/machine_learning/dbscan/dbscan.py new file mode 100644 index 000000000..04fb5f018 --- /dev/null +++ b/machine_learning/dbscan/dbscan.py @@ -0,0 +1,271 @@ +import matplotlib.pyplot as plt +import numpy as np +from sklearn.datasets import make_moons +import warnings + + +def euclidean_distance(q, p): + """ + Calculates the Euclidean distance + between points q and p + + Distance can only be calculated between numeric values + >>> euclidean_distance([1,'a'],[1,2]) + Traceback (most recent call last): + ... + ValueError: Non-numeric input detected + + The dimentions of both the points must be the same + >>> euclidean_distance([1,1,1],[1,2]) + Traceback (most recent call last): + ... + ValueError: expected dimensions to be 2-d, instead got p:3 and q:2 + + Supports only two dimentional points + >>> euclidean_distance([1,1,1],[1,2]) + Traceback (most recent call last): + ... + ValueError: expected dimensions to be 2-d, instead got p:3 and q:2 + + Input should be in the format [x,y] or (x,y) + >>> euclidean_distance(1,2) + Traceback (most recent call last): + ... + TypeError: inputs must be iterable, either list [x,y] or tuple (x,y) + """ + if not hasattr(q, "__iter__") or not hasattr(p, "__iter__"): + raise TypeError("inputs must be iterable, either list [x,y] or tuple (x,y)") + + if isinstance(q, str) or isinstance(p, str): + raise TypeError("inputs cannot be str") + + if len(q) != 2 or len(p) != 2: + raise ValueError( + "expected dimensions to be 2-d, instead got p:{} and q:{}".format( + len(q), len(p) + ) + ) + + for num in q + p: + try: + num = int(num) + except: + raise ValueError("Non-numeric input detected") + + a = pow((q[0] - p[0]), 2) + b = pow((q[1] - p[1]), 2) + return pow((a + b), 0.5) + + +def find_neighbors(db, q, eps): + """ + Finds all points in the db that + are within a distance of eps from Q + + eps value should be a number + >>> find_neighbors({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}}, (2,5),'a') + Traceback (most recent call last): + ... + ValueError: eps should be either int or float + + Q must be a 2-d point as list or tuple + >>> find_neighbors({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}}, 2, 0.5) + Traceback (most recent call last): + ... + TypeError: Q must a 2-dimentional point in the format (x,y) or [x,y] + + Points must be in correct format + >>> find_neighbors([], (2,2) ,0.4) + Traceback (most recent call last): + ... + TypeError: db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}} + """ + + if not isinstance(eps, (int, float)): + raise ValueError("eps should be either int or float") + + if not hasattr(q, "__iter__"): + raise TypeError("Q must a 2-dimentional point in the format (x,y) or [x,y]") + + if not isinstance(db, dict): + raise TypeError( + "db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}}" + ) + + return [p for p in db if euclidean_distance(q, p) <= eps] + + +def plot_cluster(db, clusters, ax): + """ + Extracts all the points in the db and puts them together + as seperate clusters and finally plots them + + db cannot be empty + >>> fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + >>> plot_cluster({},[1,2], axes[1] ) + Traceback (most recent call last): + ... + Exception: db is empty. No points to cluster + + clusters cannot be empty + >>> fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + >>> plot_cluster({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},[],axes[1] ) + Traceback (most recent call last): + ... + Exception: nothing to cluster. Empty clusters + + clusters cannot be empty + >>> fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + >>> plot_cluster({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},[],axes[1] ) + Traceback (most recent call last): + ... + Exception: nothing to cluster. Empty clusters + + ax must be a plotable + >>> plot_cluster({ (1,2):{'label':'1'}, (2,3):{'label':'2'}},[1,2], [] ) + Traceback (most recent call last): + ... + TypeError: ax must be an slot in a matplotlib figure + """ + if len(db) == 0: + raise Exception("db is empty. No points to cluster") + + if len(clusters) == 0: + raise Exception("nothing to cluster. Empty clusters") + + if not hasattr(ax, "plot"): + raise TypeError("ax must be an slot in a matplotlib figure") + + temp = [] + noise = [] + for i in clusters: + stack = [] + for k, v in db.items(): + if v["label"] == i: + stack.append(k) + elif v["label"] == "noise": + noise.append(k) + temp.append(stack) + + color = iter(plt.cm.rainbow(np.linspace(0, 1, len(clusters)))) + for i in range(0, len(temp)): + c = next(color) + x = [l[0] for l in temp[i]] + y = [l[1] for l in temp[i]] + ax.plot(x, y, "ro", c=c) + + x = [l[0] for l in noise] + y = [l[1] for l in noise] + ax.plot(x, y, "ro", c="0") + + +def dbscan(db, eps, min_pts): + """ + Implementation of the DBSCAN algorithm + + Points must be in correct format + >>> dbscan([], (2,2) ,0.4) + Traceback (most recent call last): + ... + TypeError: db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}} + + eps value should be a number + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},'a',20 ) + Traceback (most recent call last): + ... + ValueError: eps should be either int or float + + min_pts value should be an integer + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},0.4,20.0 ) + Traceback (most recent call last): + ... + ValueError: min_pts should be int + + db cannot be empty + >>> dbscan({},0.4,20.0 ) + Traceback (most recent call last): + ... + Exception: db is empty, nothing to cluster + + min_pts cannot be negative + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}}, 0.4, -20) + Traceback (most recent call last): + ... + ValueError: min_pts or eps cannot be negative + + eps cannot be negative + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},-0.4, 20) + Traceback (most recent call last): + ... + ValueError: min_pts or eps cannot be negative + + """ + if not isinstance(db, dict): + raise TypeError( + "db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}}" + ) + + if len(db) == 0: + raise Exception("db is empty, nothing to cluster") + + if not isinstance(eps, (int, float)): + raise ValueError("eps should be either int or float") + + if not isinstance(min_pts, int): + raise ValueError("min_pts should be int") + + if min_pts < 0 or eps < 0: + raise ValueError("min_pts or eps cannot be negative") + + if min_pts == 0: + warnings.warn("min_pts is 0. Are you sure you want this ?") + + if eps == 0: + warnings.warn("eps is 0. Are you sure you want this ?") + + clusters = [] + c = 0 + for p in db: + if db[p]["label"] != "undefined": + continue + neighbors = find_neighbors(db, p, eps) + if len(neighbors) < min_pts: + db[p]["label"] = "noise" + continue + c += 1 + clusters.append(c) + db[p]["label"] = c + neighbors.remove(p) + seed_set = neighbors.copy() + while seed_set != []: + q = seed_set.pop(0) + if db[q]["label"] == "noise": + db[q]["label"] = c + if db[q]["label"] != "undefined": + continue + db[q]["label"] = c + neighbors_n = find_neighbors(db, q, eps) + if len(neighbors_n) >= min_pts: + seed_set = seed_set + neighbors_n + return db, clusters + + +if __name__ == "__main__": + + fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + + x, label = make_moons(n_samples=200, noise=0.1, random_state=19) + + axes[0].plot(x[:, 0], x[:, 1], "ro") + + points = {(point[0], point[1]): {"label": "undefined"} for point in x} + + eps = 0.25 + + min_pts = 12 + + db, clusters = dbscan(points, eps, min_pts) + + plot_cluster(db, clusters, axes[1]) + + plt.show()