mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-02-25 18:38:39 +00:00
Update k_means_clust.py
This commit is contained in:
parent
945803f65d
commit
7dee330c83
@ -10,11 +10,11 @@ Inputs:
|
|||||||
- k , number of clusters to create.
|
- k , number of clusters to create.
|
||||||
- initial_centroids , initial centroid values generated by utility function(mentioned
|
- initial_centroids , initial centroid values generated by utility function(mentioned
|
||||||
in usage).
|
in usage).
|
||||||
- maxiter , maximum number of iterations to process.
|
- maxiter , the maximum number of iterations to process.
|
||||||
- heterogeneity , empty list that will be filled with hetrogeneity values if passed
|
- heterogeneity, empty list that will be filled with heterogeneity values if passed
|
||||||
to kmeans func.
|
to kmeans func.
|
||||||
Usage:
|
Usage:
|
||||||
1. define 'k' value, 'X' features array and 'hetrogeneity' empty list
|
1. define 'k' value, 'X' features array and 'heterogeneity' empty list
|
||||||
2. create initial_centroids,
|
2. create initial_centroids,
|
||||||
initial_centroids = get_initial_centroids(
|
initial_centroids = get_initial_centroids(
|
||||||
X,
|
X,
|
||||||
@ -31,8 +31,8 @@ Usage:
|
|||||||
record_heterogeneity=heterogeneity,
|
record_heterogeneity=heterogeneity,
|
||||||
verbose=True # whether to print logs in console or not.(default=False)
|
verbose=True # whether to print logs in console or not.(default=False)
|
||||||
)
|
)
|
||||||
4. Plot the loss function, hetrogeneity values for every iteration saved in
|
4. Plot the loss function, heterogeneity values for every iteration saved in
|
||||||
hetrogeneity list.
|
heterogeneity list.
|
||||||
plot_heterogeneity(
|
plot_heterogeneity(
|
||||||
heterogeneity,
|
heterogeneity,
|
||||||
k
|
k
|
||||||
@ -46,6 +46,7 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from matplotlib import pyplot as plt
|
from matplotlib import pyplot as plt
|
||||||
from sklearn.metrics import pairwise_distances
|
from sklearn.metrics import pairwise_distances
|
||||||
|
import doctest
|
||||||
|
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
|
|
||||||
@ -198,10 +199,10 @@ def report_generator(
|
|||||||
df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
|
df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Function generates easy-erading clustering report. It takes 2 arguments as an input:
|
Function generates an easy-reading clustering report. It takes 3 arguments as input:
|
||||||
DataFrame - dataframe with predicted cluester column;
|
DataFrame - dataframe with predicted cluster column;
|
||||||
FillMissingReport - dictionary of rules how we are going to fill missing
|
FillMissingReport - dictionary of rules on how we are going to fill in missing
|
||||||
values of for final report generate (not included in modeling);
|
values for final report generation (not included in modelling);
|
||||||
in order to run the function following libraries must be imported:
|
in order to run the function following libraries must be imported:
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -306,10 +307,10 @@ def report_generator(
|
|||||||
a.columns = report.columns # rename columns to match report
|
a.columns = report.columns # rename columns to match report
|
||||||
report = report.drop(
|
report = report.drop(
|
||||||
report[report.Type == "count"].index
|
report[report.Type == "count"].index
|
||||||
) # drop count values except cluster size
|
) # drop count values except for cluster size
|
||||||
report = pd.concat(
|
report = pd.concat(
|
||||||
[report, a, clustersize, clusterproportion], axis=0
|
[report, a, clustersize, clusterproportion], axis=0
|
||||||
) # concat report with clustert size and nan values
|
) # concat report with cluster size and nan values
|
||||||
report["Mark"] = report["Features"].isin(clustering_variables)
|
report["Mark"] = report["Features"].isin(clustering_variables)
|
||||||
cols = report.columns.tolist()
|
cols = report.columns.tolist()
|
||||||
cols = cols[0:2] + cols[-1:] + cols[2:-1]
|
cols = cols[0:2] + cols[-1:] + cols[2:-1]
|
||||||
@ -343,6 +344,6 @@ def report_generator(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
|
||||||
|
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user