Python/machine_learning/k_means_clust.py

205 lines
6.5 KiB
Python
Raw Normal View History

2019-10-05 05:14:13 +00:00
"""README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)
Requirements:
- sklearn
- numpy
- matplotlib
Python:
- 3.6+ (the module uses f-strings)
Inputs:
- X , a 2D numpy array of features.
- k , number of clusters to create.
- initial_centroids , initial centroid values generated by utility function(mentioned
in usage).
- maxiter , maximum number of iterations to process.
- heterogeneity , empty list that will be filled with heterogeneity values if passed
to the kmeans function.
Usage:
1. define 'k' value, 'X' features array and 'heterogeneity' empty list
2. create initial_centroids,
initial_centroids = get_initial_centroids(
X,
k,
seed=0 # seed value for initial centroid generation,
# None for randomness(default=None)
)
3. find centroids and clusters using kmeans function.
centroids, cluster_assignment = kmeans(
X,
k,
initial_centroids,
maxiter=400,
record_heterogeneity=heterogeneity,
verbose=True # whether to print logs in console or not.(default=False)
)
4. Plot the loss function, i.e. the heterogeneity values saved for every iteration in
the heterogeneity list.
plot_heterogeneity(
heterogeneity,
k
)
5. Have fun..
2019-10-05 05:14:13 +00:00
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
2019-10-05 05:14:13 +00:00
# Module tag string; nothing in the visible code references it
# (presumably intended as a log-message prefix -- TODO confirm before removing).
TAG = "K-MEANS-CLUST/ "
def get_initial_centroids(data, k, seed=None):
    """Randomly choose k distinct data points as the initial centroids.

    Args:
        data: 2D array of features, one row per data point.
        k: number of centroids to pick.
        seed: optional value passed to ``np.random.seed`` so that the choice
            is reproducible; ``None`` leaves the global RNG state untouched.

    Returns:
        The ``k`` selected rows of ``data``.

    Note:
        Rows are drawn without replacement, so the indices chosen for a given
        seed differ from what a with-replacement draw would produce.
    """
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)

    n = data.shape[0]  # number of data points

    # Pick k indices from range [0, n) WITHOUT replacement: choosing the same
    # point twice yields duplicate centroids, which leaves one cluster empty
    # and makes its mean undefined. Replacement is only permitted when k > n,
    # where k distinct points simply do not exist.
    rand_indices = np.random.choice(n, k, replace=k > n)

    return data[rand_indices, :]
2019-10-05 05:14:13 +00:00
def centroid_pairwise_dist(X, centroids):
    """Return the Euclidean distances between every row of X and every centroid.

    Delegates to ``sklearn.metrics.pairwise_distances`` with
    ``metric="euclidean"`` and returns its result unchanged.
    """
    distance_matrix = pairwise_distances(X, centroids, metric="euclidean")
    return distance_matrix
def assign_clusters(data, centroids):
    """Assign every data point to its nearest centroid.

    Args:
        data: 2D array of features, one row per data point.
        centroids: the current centroids, one row per cluster.

    Returns:
        For each data point, the index of the centroid at the smallest
        Euclidean distance.
    """
    # One row per data point, one column per centroid.
    distances = centroid_pairwise_dist(data, centroids)
    # The column holding the row minimum is the closest centroid.
    return np.argmin(distances, axis=1)
2019-10-05 05:14:13 +00:00
def revise_centroids(data, k, cluster_assignment):
    """Recompute each centroid as the mean of the points assigned to it.

    Args:
        data: 2D array of features, one row per data point.
        k: number of clusters.
        cluster_assignment: array holding, for each data point, the index of
            the cluster it belongs to.

    Returns:
        A numpy array with one centroid per cluster, in cluster-index order.
        A cluster that received no points gets the mean of the whole data set
        as its centroid, so the result never contains NaN because of an empty
        cluster.
    """
    overall_mean = None  # computed lazily, only if some cluster is empty
    new_centroids = []
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i]
        if member_data_points.shape[0] > 0:
            centroid = member_data_points.mean(axis=0)
        else:
            # The mean of zero rows is NaN (and emits a RuntimeWarning);
            # fall back to a finite, deterministic location instead.
            if overall_mean is None:
                overall_mean = data.mean(axis=0)
            centroid = overall_mean
        new_centroids.append(centroid)
    return np.array(new_centroids)
2019-10-05 05:14:13 +00:00
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    """Sum the squared Euclidean distances of all points to their centroid.

    Args:
        data: 2D array of features, one row per data point.
        k: number of clusters.
        centroids: the centroids, indexable by cluster index.
        cluster_assignment: array holding the cluster index of each data point.

    Returns:
        The accumulated within-cluster sum of squared distances; clusters
        without any member contribute nothing.
    """
    total = 0.0
    for cluster_id in range(k):
        members = data[cluster_assignment == cluster_id, :]
        if members.shape[0] == 0:
            # Nothing to measure for an empty cluster.
            continue
        # Distances from this cluster's centroid to each of its members.
        to_centroid = pairwise_distances(
            members, [centroids[cluster_id]], metric="euclidean"
        )
        total += np.sum(to_centroid ** 2)
    return total
2019-10-05 05:14:13 +00:00
def plot_heterogeneity(heterogeneity, k):
    """Plot the recorded heterogeneity values against the iteration number.

    Args:
        heterogeneity: sequence of heterogeneity values, one per iteration.
        k: number of clusters, shown in the plot title (formatted as integer).
    """
    plt.figure(figsize=(7, 4))
    axes = plt.gca()
    axes.plot(heterogeneity, linewidth=4)
    axes.set_xlabel("# Iterations")
    axes.set_ylabel("Heterogeneity")
    axes.set_title(f"Heterogeneity of clustering over time, K={k:d}")
    # Kept after the labelling calls, exactly where the original sets it.
    plt.rcParams.update({"font.size": 16})
    plt.show()
2019-10-05 05:14:13 +00:00
def kmeans(
    data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
):
    """Run k-means on the given data, starting from the given centroids.

    Args:
        data: 2D array of features, one row per data point.
        k: number of clusters.
        initial_centroids: starting centroids, one row per cluster.
        maxiter: maximum number of iterations to run (default=500).
        record_heterogeneity: optional list; when given, the heterogeneity
            after each completed iteration is appended to it. If None, the
            history is not stored.
        verbose: if True, print the iteration number and how many data points
            changed their cluster label in each iteration.

    Returns:
        A tuple ``(centroids, cluster_assignment)``. When ``maxiter`` is less
        than 1 no iteration runs, so the initial centroids are returned
        together with ``None`` as the assignment.
    """
    centroids = initial_centroids[:]
    # Bound up front so that maxiter < 1 returns cleanly instead of raising
    # UnboundLocalError at the final return.
    cluster_assignment = None
    prev_cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
            print(itr, end="")

        # 1. Make cluster assignments using nearest centroids.
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all
        #    data points assigned to that cluster.
        centroids = revise_centroids(data, k, cluster_assignment)

        if prev_cluster_assignment is None:
            # First pass: there is nothing to compare against yet. Terminate
            # the line so the next iteration number starts on its own line.
            if verbose:
                print()
        else:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
            # Check for convergence: if none of the assignments changed, stop.
            if num_changed == 0:
                if verbose:
                    print()
                break
            if verbose:
                print(
                    " {:5d} elements changed their cluster assignment.".format(
                        num_changed
                    )
                )

        # Record heterogeneity convergence metric.
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        # assign_clusters is called afresh each iteration and the result is
        # never mutated here, so keeping a reference is sufficient.
        prev_cluster_assignment = cluster_assignment

    return centroids, cluster_assignment
2019-10-05 05:14:13 +00:00
# Mock test below
2019-10-05 05:14:13 +00:00
if __name__ == "__main__":
    # Demo run, executed only when the file is run as a script (never on
    # import): cluster the iris data set into three groups and plot the
    # heterogeneity recorded at every iteration.
    import sklearn.datasets as ds

    dataset = ds.load_iris()
    k = 3
    heterogeneity = []
    initial_centroids = get_initial_centroids(dataset["data"], k, seed=0)
    centroids, cluster_assignment = kmeans(
        dataset["data"],
        k,
        initial_centroids,
        maxiter=400,
        record_heterogeneity=heterogeneity,
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)