add visualization of k means clustering as excel format (#2104)

* add visualization of kmneas clust as excel format

* style changes

* style changes

* Add doctest and typehint!

* style change

* Update machine_learning/k_means_clust.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update machine_learning/k_means_clust.py

Co-authored-by: Christian Clauss <cclauss@me.com>

Co-authored-by: Christian Clauss <cclauss@me.com>
This commit is contained in:
beqakd 2020-06-19 19:55:13 +04:00 committed by GitHub
parent b9e7c891e2
commit d034add61f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -47,12 +47,18 @@ Usage:
k k
) )
5. Have fun.. 5. Transfers Dataframe into excel format it must have feature called
'Clust' with k means clustering numbers in it.
""" """
import numpy as np import numpy as np
import pandas as pd
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances from sklearn.metrics import pairwise_distances
import warnings
warnings.filterwarnings("ignore")
TAG = "K-MEANS-CLUST/ " TAG = "K-MEANS-CLUST/ "
@ -202,3 +208,158 @@ if False: # change to true to run this test case.
verbose=True, verbose=True,
) )
plot_heterogeneity(heterogeneity, k) plot_heterogeneity(heterogeneity, k)
def ReportGenerator(
df: pd.DataFrame, ClusteringVariables: np.array, FillMissingReport=None
) -> pd.DataFrame:
"""
Function generates easy-erading clustering report. It takes 2 arguments as an input:
DataFrame - dataframe with predicted cluester column;
FillMissingReport - dictionary of rules how we are going to fill missing
values of for final report generate (not included in modeling);
in order to run the function following libraries must be imported:
import pandas as pd
import numpy as np
>>> data = pd.DataFrame()
>>> data['numbers'] = [1, 2, 3]
>>> data['col1'] = [0.5, 2.5, 4.5]
>>> data['col2'] = [100, 200, 300]
>>> data['col3'] = [10, 20, 30]
>>> data['Cluster'] = [1, 1, 2]
>>> ReportGenerator(data, ['col1', 'col2'], 0)
Features Type Mark 1 2
0 # of Customers ClusterSize False 2.000000 1.000000
1 % of Customers ClusterProportion False 0.666667 0.333333
2 col1 mean_with_zeros True 1.500000 4.500000
3 col2 mean_with_zeros True 150.000000 300.000000
4 numbers mean_with_zeros False 1.500000 3.000000
.. ... ... ... ... ...
99 dummy 5% False 1.000000 1.000000
100 dummy 95% False 1.000000 1.000000
101 dummy stdev False 0.000000 NaN
102 dummy mode False 1.000000 1.000000
103 dummy median False 1.000000 1.000000
<BLANKLINE>
[104 rows x 5 columns]
"""
# Fill missing values with given rules
if FillMissingReport:
df.fillna(value=FillMissingReport, inplace=True)
df["dummy"] = 1
numeric_cols = df.select_dtypes(np.number).columns
report = (
df.groupby(["Cluster"])[ # constract report dataframe
numeric_cols
] # group by cluster number
.agg(
[
("sum", np.sum),
("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()),
(
"mean_25-75",
lambda x: np.mean(
np.nan_to_num(
sorted(x)[
round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
]
)
),
),
("mean_with_na", np.mean),
("min", lambda x: x.min()),
("5%", lambda x: x.quantile(0.05)),
("25%", lambda x: x.quantile(0.25)),
("50%", lambda x: x.quantile(0.50)),
("75%", lambda x: x.quantile(0.75)),
("95%", lambda x: x.quantile(0.95)),
("max", lambda x: x.max()),
("count", lambda x: x.count()),
("stdev", lambda x: x.std()),
("mode", lambda x: x.mode()[0]),
("median", lambda x: x.median()),
("# > 0", lambda x: (x > 0).sum()),
]
)
.T.reset_index()
.rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
) # rename columns
clustersize = report[
(report["Features"] == "dummy") & (report["Type"] == "count")
] # caclulating size of cluster(count of clientID's)
clustersize.Type = (
"ClusterSize" # rename created cluster df to match report column names
)
clustersize.Features = "# of Customers"
clusterproportion = pd.DataFrame(
clustersize.iloc[:, 2:].values
/ clustersize.iloc[:, 2:].values.sum() # caclulating proportion of cluster
)
clusterproportion[
"Type"
] = "% of Customers" # rename created cluster df to match report column names
clusterproportion["Features"] = "ClusterProportion"
cols = clusterproportion.columns.tolist()
cols = cols[-2:] + cols[:-2]
clusterproportion = clusterproportion[cols] # rearrange columns to match report
clusterproportion.columns = report.columns
a = pd.DataFrame(
abs(
report[report["Type"] == "count"].iloc[:, 2:].values
- clustersize.iloc[:, 2:].values
)
) # generating df with count of nan values
a["Features"] = 0
a["Type"] = "# of nan"
a.Features = report[
report["Type"] == "count"
].Features.tolist() # filling values in order to match report
cols = a.columns.tolist()
cols = cols[-2:] + cols[:-2]
a = a[cols] # rearrange columns to match report
a.columns = report.columns # rename columns to match report
report = report.drop(
report[report.Type == "count"].index
) # drop count values except cluster size
report = pd.concat(
[report, a, clustersize, clusterproportion], axis=0
) # concat report with clustert size and nan values
report["Mark"] = report["Features"].isin(ClusteringVariables)
cols = report.columns.tolist()
cols = cols[0:2] + cols[-1:] + cols[2:-1]
report = report[cols]
sorter1 = {
"ClusterSize": 9,
"ClusterProportion": 8,
"mean_with_zeros": 7,
"mean_with_na": 6,
"max": 5,
"50%": 4,
"min": 3,
"25%": 2,
"75%": 1,
"# of nan": 0,
"# > 0": -1,
"sum_with_na": -2,
}
report = (
report.assign(
Sorter1=lambda x: x.Type.map(sorter1),
Sorter2=lambda x: list(reversed(range(len(x)))),
)
.sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
.drop(["Sorter1", "Sorter2"], axis=1)
)
report.columns.name = ""
report = report.reset_index()
report.drop(columns=["index"], inplace=True)
return report
if __name__ == "__main__":
import doctest
doctest.testmod()