2019-10-05 05:14:13 +00:00
|
|
|
"""README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)
|
2017-10-02 15:43:43 +00:00
|
|
|
Requirements:
|
|
|
|
- sklearn
|
|
|
|
- numpy
|
|
|
|
- matplotlib
- pandas
|
|
|
|
Python:
|
|
|
|
- 3.6+ (f-strings are used)
|
|
|
|
Inputs:
|
|
|
|
- X , a 2D numpy array of features.
|
|
|
|
- k , number of clusters to create.
|
2020-05-22 06:10:11 +00:00
|
|
|
- initial_centroids , initial centroid values generated by utility function(mentioned
|
|
|
|
in usage).
|
2017-10-02 15:43:43 +00:00
|
|
|
- maxiter , maximum number of iterations to process.
|
2023-09-27 06:31:42 +00:00
|
|
|
- heterogeneity , empty list that will be filled with heterogeneity values if passed
|
2020-05-22 06:10:11 +00:00
|
|
|
to kmeans func.
|
2017-10-02 15:43:43 +00:00
|
|
|
Usage:
|
2023-09-27 06:31:42 +00:00
|
|
|
1. define 'k' value, 'X' features array and 'heterogeneity' empty list
|
2017-10-02 15:43:43 +00:00
|
|
|
2. create initial_centroids,
|
|
|
|
initial_centroids = get_initial_centroids(
|
2019-08-19 13:37:49 +00:00
|
|
|
X,
|
|
|
|
k,
|
2020-05-22 06:10:11 +00:00
|
|
|
seed=0 # seed value for initial centroid generation,
|
|
|
|
# None for randomness(default=None)
|
2017-10-02 15:43:43 +00:00
|
|
|
)
|
|
|
|
3. find centroids and clusters using kmeans function.
|
|
|
|
centroids, cluster_assignment = kmeans(
|
2019-08-19 13:37:49 +00:00
|
|
|
X,
|
|
|
|
k,
|
|
|
|
initial_centroids,
|
2017-10-02 15:43:43 +00:00
|
|
|
maxiter=400,
|
2019-08-19 13:37:49 +00:00
|
|
|
record_heterogeneity=heterogeneity,
|
2017-10-02 15:43:43 +00:00
|
|
|
verbose=True # whether to print logs in console or not.(default=False)
|
|
|
|
)
|
2023-09-27 06:31:42 +00:00
|
|
|
4. Plot the loss function and heterogeneity values for every iteration saved in
|
|
|
|
heterogeneity list.
|
2017-10-02 15:43:43 +00:00
|
|
|
plot_heterogeneity(
|
2019-08-19 13:37:49 +00:00
|
|
|
heterogeneity,
|
2017-10-02 15:43:43 +00:00
|
|
|
k
|
|
|
|
)
|
2020-06-19 15:55:13 +00:00
|
|
|
5. Summarise the clusters with report_generator (the resulting dataframe can then
be exported, e.g. to Excel); the input dataframe must have a column called
'Cluster' with the k-means cluster numbers in it.
|
2019-10-05 05:14:13 +00:00
|
|
|
"""
|
2020-07-06 07:44:19 +00:00
|
|
|
import warnings
|
|
|
|
|
2017-10-02 15:43:43 +00:00
|
|
|
import numpy as np
|
2020-06-19 15:55:13 +00:00
|
|
|
import pandas as pd
|
2020-05-22 06:10:11 +00:00
|
|
|
from matplotlib import pyplot as plt
|
|
|
|
from sklearn.metrics import pairwise_distances
|
2020-06-19 15:55:13 +00:00
|
|
|
|
|
|
|
# NOTE(review): this silences every warning for the whole process as soon as the
# module is imported (including NumPy's warning about averaging an empty cluster).
warnings.filterwarnings("ignore")

# Log prefix constant; it is not referenced by any code visible in this file.
TAG = "K-MEANS-CLUST/ "
|
|
|
|
|
2017-10-02 15:43:43 +00:00
|
|
|
|
|
|
|
def get_initial_centroids(data, k, seed=None):
    """Randomly choose k data points as initial centroids.

    data: 2D array of features, one row per data point.
    k: number of centroids to pick.
    seed: optional seed for NumPy's global random generator; pass a value to get
          reproducible results, None to leave the generator untouched.

    Returns the k selected rows of data. The rows are distinct data points
    whenever k does not exceed the number of data points.
    """
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    n = data.shape[0]  # number of data points

    # Pick K indices from range [0, N).
    rand_indices = np.random.randint(0, n, k)
    # randint samples with replacement, so the same row can be drawn twice.
    # Two identical centroids leave one of their clusters empty, which later
    # turns into a NaN centroid. Redraw without replacement only in that case so
    # that seeded draws which were already valid keep producing the same result.
    if k <= n and np.unique(rand_indices).size < k:
        rand_indices = np.random.choice(n, k, replace=False)

    # Keep centroids as dense format, as many entries will be nonzero due to averaging.
    # As long as at least one document in a cluster contains a word,
    # it will carry a nonzero weight in the TF-IDF vector of the centroid.
    centroids = data[rand_indices, :]

    return centroids
|
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
|
2022-10-12 22:54:20 +00:00
|
|
|
def centroid_pairwise_dist(x, centroids):
    """Return the Euclidean distance from every row of x to every centroid.

    Thin wrapper around sklearn's pairwise_distances with metric="euclidean".
    """
    distance_matrix = pairwise_distances(x, centroids, metric="euclidean")
    return distance_matrix
|
2019-10-05 05:14:13 +00:00
|
|
|
|
2017-10-02 15:43:43 +00:00
|
|
|
|
|
|
|
def assign_clusters(data, centroids):
    """Assign every data point to its nearest centroid.

    Returns an array holding, for each row of data, the index of the centroid
    with the smallest Euclidean distance to that row.
    """
    # Distance from each data point (rows) to each centroid (columns);
    # the closest centroid is the column with the smallest value in each row.
    return np.argmin(centroid_pairwise_dist(data, centroids), axis=1)
|
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
|
2017-10-02 15:43:43 +00:00
|
|
|
def revise_centroids(data, k, cluster_assignment):
    """Recompute each of the k centroids as the mean of its member points.

    data: 2D array of features, one row per data point.
    k: number of clusters.
    cluster_assignment: array with the cluster index of every data point.

    Returns an array with one centroid per cluster, in cluster-index order.
    A cluster without members averages an empty selection.
    """
    # For every cluster index, select its member rows and average them column-wise.
    return np.array(
        [data[cluster_assignment == cluster_id].mean(axis=0) for cluster_id in range(k)]
    )
|
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
|
2017-10-02 15:43:43 +00:00
|
|
|
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    """Sum the squared Euclidean distances of all points to their own centroid.

    data: 2D array of features, one row per data point.
    k: number of clusters.
    centroids: one centroid per cluster, indexed by cluster number.
    cluster_assignment: array with the cluster index of every data point.

    Returns the accumulated within-cluster sum of squared distances.
    """
    heterogeneity = 0.0
    for cluster_id in range(k):
        # All data points that belong to this cluster.
        members = data[cluster_assignment == cluster_id, :]
        if members.shape[0] == 0:
            continue  # an empty cluster contributes nothing

        # Distances from the cluster's centroid to each of its members.
        dist_to_centroid = pairwise_distances(
            members, [centroids[cluster_id]], metric="euclidean"
        )
        heterogeneity += np.sum(dist_to_centroid**2)

    return heterogeneity
|
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
|
2017-10-02 15:43:43 +00:00
|
|
|
def plot_heterogeneity(heterogeneity, k):
    """Plot the recorded heterogeneity values against the iteration number.

    heterogeneity: sequence of heterogeneity values, one per recorded iteration.
    k: number of clusters (an integer), shown in the plot title.

    The figure is displayed with plt.show(); nothing is returned.
    """
    # Set the font size before any figure or text is created: rcParams are read
    # when the artists are built, so updating them afterwards would only affect
    # later figures, not this one.
    plt.rcParams.update({"font.size": 16})
    plt.figure(figsize=(7, 4))
    plt.plot(heterogeneity, linewidth=4)
    plt.xlabel("# Iterations")
    plt.ylabel("Heterogeneity")
    plt.title(f"Heterogeneity of clustering over time, K={k:d}")
    plt.show()
|
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
|
|
|
|
def kmeans(
    data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
):
    """Runs k-means on given data and initial set of centroids.

    data: 2D array of features, one row per data point.
    k: number of clusters.
    initial_centroids: starting centroids, e.g. from get_initial_centroids.
    maxiter: maximum number of iterations to run.(default=500)
    record_heterogeneity: (optional) a list, to store the history of heterogeneity
                          as function of iterations
                          if None, do not store the history.
    verbose: if True, print how many data points changed their cluster labels in
                          each iteration

    Returns the final centroids and the cluster index of every data point.
    """
    centroids = initial_centroids[:]
    prev_cluster_assignment = None
    cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
            print(itr, end="")

        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all data
        #    points assigned to that cluster.
        new_centroids = revise_centroids(data, k, cluster_assignment)

        # A cluster that lost all of its members averages to NaN, which would break
        # the next distance computation. Keep the previous centroid for it instead.
        empty_clusters = np.flatnonzero(~np.isin(np.arange(k), cluster_assignment))
        if empty_clusters.size:
            new_centroids[empty_clusters] = np.asarray(centroids)[empty_clusters]
        centroids = new_centroids

        # Check for convergence: if none of the assignments changed, stop
        if (
            prev_cluster_assignment is not None
            and (prev_cluster_assignment == cluster_assignment).all()
        ):
            break

        # Print number of new assignments
        if prev_cluster_assignment is not None:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
            if verbose:
                print(
                    f"    {num_changed:5d} elements changed their cluster assignment."
                )

        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        prev_cluster_assignment = cluster_assignment[:]

    if cluster_assignment is None:
        # The loop never ran (maxiter <= 0): report the assignment implied by the
        # initial centroids instead of failing on an unbound variable.
        cluster_assignment = assign_clusters(data, centroids)

    return centroids, cluster_assignment
|
|
|
|
|
2019-10-05 05:14:13 +00:00
|
|
|
|
2017-10-02 15:43:43 +00:00
|
|
|
# Mock test below
# Disabled demo: clusters the iris dataset into three groups and plots how the
# heterogeneity evolves over the iterations.
if False:  # change to true to run this test case.
    from sklearn import datasets as ds

    dataset = ds.load_iris()
    k = 3
    # Filled in place by kmeans with one heterogeneity value per recorded iteration.
    heterogeneity = []
    initial_centroids = get_initial_centroids(dataset["data"], k, seed=0)
    centroids, cluster_assignment = kmeans(
        dataset["data"],
        k,
        initial_centroids,
        maxiter=400,
        record_heterogeneity=heterogeneity,
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)
|
2020-06-19 15:55:13 +00:00
|
|
|
|
|
|
|
|
2022-10-12 22:54:20 +00:00
|
|
|
def report_generator(
    predicted: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
) -> pd.DataFrame:
    """
    Generate a clustering report given these arguments:
        predicted - dataframe with predicted cluster column (named 'Cluster');
            it is not modified by this function
        clustering_variables - names of the features that were used for clustering;
            their rows get Mark == True in the report
        fill_missing_report - dictionary of rules on how we are going to fill in
            missing values for final generated report (not included in modelling);
            falsy values (None, 0, empty dict) leave missing values untouched

    Returns a dataframe with one row per (feature, statistic) pair and one column
    per cluster, preceded by the 'Features', 'Type' and 'Mark' columns.

    >>> predicted = pd.DataFrame()
    >>> predicted['numbers'] = [1, 2, 3]
    >>> predicted['col1'] = [0.5, 2.5, 4.5]
    >>> predicted['col2'] = [100, 200, 300]
    >>> predicted['col3'] = [10, 20, 30]
    >>> predicted['Cluster'] = [1, 1, 2]
    >>> report_generator(predicted, ['col1', 'col2'], 0)  # doctest: +NORMALIZE_WHITESPACE
               Features               Type   Mark           1           2
    0    # of Customers        ClusterSize  False    2.000000    1.000000
    1    % of Customers  ClusterProportion  False    0.666667    0.333333
    2              col1    mean_with_zeros   True    1.500000    4.500000
    3              col2    mean_with_zeros   True  150.000000  300.000000
    4           numbers    mean_with_zeros  False    1.500000    3.000000
    ..              ...                ...    ...         ...         ...
    99            dummy                 5%  False    1.000000    1.000000
    100           dummy                95%  False    1.000000    1.000000
    101           dummy              stdev  False    0.000000         NaN
    102           dummy               mode  False    1.000000    1.000000
    103           dummy             median  False    1.000000    1.000000
    <BLANKLINE>
    [104 rows x 5 columns]
    """
    # Fill missing values with given rules
    if fill_missing_report:
        predicted = predicted.fillna(value=fill_missing_report)
    # Helper column used to count the rows of each cluster. assign() returns a new
    # frame, so the caller's dataframe never receives the extra column.
    predicted = predicted.assign(dummy=1)
    numeric_cols = predicted.select_dtypes(np.number).columns
    report = (
        predicted.groupby(["Cluster"])[  # construct report dataframe
            numeric_cols
        ]  # group by cluster number
        .agg(
            [
                ("sum", "sum"),
                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
                # np.nan (not np.NaN): the capitalised alias was removed in NumPy 2.0
                ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()),
                (
                    "mean_25-75",
                    lambda x: np.mean(
                        np.nan_to_num(
                            sorted(x)[
                                round(len(x) * 25 / 100) : round(len(x) * 75 / 100)
                            ]
                        )
                    ),
                ),
                ("mean_with_na", "mean"),
                ("min", lambda x: x.min()),
                ("5%", lambda x: x.quantile(0.05)),
                ("25%", lambda x: x.quantile(0.25)),
                ("50%", lambda x: x.quantile(0.50)),
                ("75%", lambda x: x.quantile(0.75)),
                ("95%", lambda x: x.quantile(0.95)),
                ("max", lambda x: x.max()),
                ("count", lambda x: x.count()),
                ("stdev", lambda x: x.std()),
                ("mode", lambda x: x.mode()[0]),
                ("median", lambda x: x.median()),
                ("# > 0", lambda x: (x > 0).sum()),
            ]
        )
        .T.reset_index()
        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
    )  # rename columns

    # calculate the size of cluster(count of clientID's)
    # copy to avoid SettingWithCopyWarning
    clustersize = report[
        (report["Features"] == "dummy") & (report["Type"] == "count")
    ].copy()
    # relabel the row to match report column names
    clustersize.Type = "ClusterSize"
    clustersize.Features = "# of Customers"
    sizes = clustersize.iloc[:, 2:].to_numpy()

    # calculating the proportion of cluster
    clusterproportion = pd.DataFrame(sizes / sizes.sum())
    clusterproportion.insert(0, "Type", "ClusterProportion")
    clusterproportion.insert(0, "Features", "% of Customers")
    clusterproportion.columns = report.columns  # rename columns to match report

    # generating dataframe with count of nan values: rows in the cluster minus
    # non-missing values of each feature
    count_rows = report[report["Type"] == "count"]
    nan_counts = pd.DataFrame(abs(count_rows.iloc[:, 2:].to_numpy() - sizes))
    nan_counts.insert(0, "Type", "# of nan")
    nan_counts.insert(0, "Features", count_rows.Features.tolist())
    nan_counts.columns = report.columns  # rename columns to match report

    # drop count values except for cluster size
    report = report.drop(report[report.Type == "count"].index)
    # concat report with cluster size and nan values
    report = pd.concat([report, nan_counts, clustersize, clusterproportion], axis=0)
    report["Mark"] = report["Features"].isin(clustering_variables)
    # move the freshly added "Mark" column right after "Features" and "Type"
    cols = report.columns.tolist()
    cols = cols[0:2] + cols[-1:] + cols[2:-1]
    report = report[cols]

    # Display priority of the statistic types (higher comes first); types that
    # are not listed here map to NaN and are sorted to the end.
    sorter1 = {
        "ClusterSize": 9,
        "ClusterProportion": 8,
        "mean_with_zeros": 7,
        "mean_with_na": 6,
        "max": 5,
        "50%": 4,
        "min": 3,
        "25%": 2,
        "75%": 1,
        "# of nan": 0,
        "# > 0": -1,
        "sum_with_na": -2,
    }
    report = (
        report.assign(
            Sorter1=lambda x: x.Type.map(sorter1),
            # descending position, so ties keep their original row order
            Sorter2=lambda x: list(reversed(range(len(x)))),
        )
        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
        .drop(["Sorter1", "Sorter2"], axis=1)
    )
    report.columns.name = ""
    report = report.reset_index(drop=True)
    return report
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    import doctest

    doctest.testmod()
|