Update k_means_clust.py

This commit is contained in:
thor-harsh 2023-08-18 18:42:32 +05:30 committed by GitHub
parent 945803f65d
commit 7dee330c83
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -10,11 +10,11 @@ Inputs:
- k , number of clusters to create.
- initial_centroids , initial centroid values generated by utility function (mentioned
in usage).
- maxiter , maximum number of iterations to process.
- heterogeneity , empty list that will be filled with hetrogeneity values if passed
- maxiter , the maximum number of iterations to process.
- heterogeneity, empty list that will be filled with heterogeneity values if passed
to kmeans func.
Usage:
1. define 'k' value, 'X' features array and 'hetrogeneity' empty list
1. define 'k' value, 'X' features array and 'heterogeneity' empty list
2. create initial_centroids,
initial_centroids = get_initial_centroids(
X,
@ -31,8 +31,8 @@ Usage:
record_heterogeneity=heterogeneity,
verbose=True # whether to print logs in console or not (default=False)
)
4. Plot the loss function, hetrogeneity values for every iteration saved in
hetrogeneity list.
4. Plot the loss function, heterogeneity values for every iteration saved in
heterogeneity list.
plot_heterogeneity(
heterogeneity,
k
@ -46,6 +46,7 @@ import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
import doctest
warnings.filterwarnings("ignore")
@ -198,10 +199,10 @@ def report_generator(
df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
) -> pd.DataFrame:
"""
Function generates easy-erading clustering report. It takes 2 arguments as an input:
DataFrame - dataframe with predicted cluester column;
FillMissingReport - dictionary of rules how we are going to fill missing
values of for final report generate (not included in modeling);
Function generates an easy-reading clustering report. It takes 3 arguments as input:
DataFrame, predicted cluster column,
FillMissingReport - dictionary of rules on how we are going to fill in missing
values for final report generation (not included in modelling);
in order to run the function following libraries must be imported:
import pandas as pd
import numpy as np
@ -306,10 +307,10 @@ def report_generator(
a.columns = report.columns # rename columns to match report
report = report.drop(
report[report.Type == "count"].index
) # drop count values except cluster size
) # drop count values except for cluster size
report = pd.concat(
[report, a, clustersize, clusterproportion], axis=0
) # concat report with clustert size and nan values
[report, a, cluster size, clusterproportion], axis=0
) # concat report with cluster size and nan values
report["Mark"] = report["Features"].isin(clustering_variables)
cols = report.columns.tolist()
cols = cols[0:2] + cols[-1:] + cols[2:-1]
@ -343,6 +344,6 @@ def report_generator(
if __name__ == "__main__":
import doctest
doctest.testmod()