Update k_means_clust.py

2025-02-25 18:38:39 +00:00 · 2023-08-18 18:42:32 +05:30 · 2023-08-18 18:42:32 +05:30 · 7dee330c83
commit 7dee330c83
parent 945803f65d
1 changed files with 14 additions and 13 deletions
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@ -10,11 +10,11 @@ Inputs:
  - k , number of clusters to create.
  - initial_centroids , initial centroid values generated by utility function(mentioned
    in usage).
-  - maxiter , maximum number of iterations to process.
-  - heterogeneity , empty list that will be filled with hetrogeneity values if passed
+  - maxiter , the maximum number of iterations to process.
+  - heterogeneity, empty list that will be filled with heterogeneity values if passed
    to kmeans func.
 Usage:
-  1. define 'k' value, 'X' features array and 'hetrogeneity' empty list
+  1. define 'k' value, 'X' features array and 'heterogeneity' empty list
  2. create initial_centroids,
        initial_centroids = get_initial_centroids(
            X,
@ -31,8 +31,8 @@ Usage:
            record_heterogeneity=heterogeneity,
            verbose=True # whether to print logs in console or not.(default=False)
            )
-  4. Plot the loss function, hetrogeneity values for every iteration saved in
-     hetrogeneity list.
+  4. Plot the loss function, heterogeneity values for every iteration saved in
+     heterogeneity list.
        plot_heterogeneity(
            heterogeneity,
            k
@ -46,6 +46,7 @@ import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
 from sklearn.metrics import pairwise_distances
+import doctest

 warnings.filterwarnings("ignore")

@ -198,10 +199,10 @@ def report_generator(
    df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
 ) -> pd.DataFrame:
    """
-    Function generates easy-erading clustering report. It takes 2 arguments as an input:
-        DataFrame - dataframe with predicted cluester column;
-        FillMissingReport - dictionary of rules how we are going to fill missing
-        values of for final report generate (not included in modeling);
+    Function generates an easy-reading clustering report. It takes 3 arguments as input:
+        DataFrame with the predicted cluster column, clustering variables,
+        FillMissingReport - dictionary of rules on how we are going to fill in missing
+        values for the final report generation (not included in modelling);
    in order to run the function following libraries must be imported:
        import pandas as pd
        import numpy as np
@ -306,10 +307,10 @@ def report_generator(
    a.columns = report.columns  # rename columns to match report
    report = report.drop(
        report[report.Type == "count"].index
-    )  # drop count values except cluster size
+    )  # drop count values except for cluster size
    report = pd.concat(
-        [report, a, clustersize, clusterproportion], axis=0
-    )  # concat report with clustert size and nan values
+        [report, a, clustersize, clusterproportion], axis=0
+    )  # concat report with cluster size and nan values
    report["Mark"] = report["Features"].isin(clustering_variables)
    cols = report.columns.tolist()
    cols = cols[0:2] + cols[-1:] + cols[2:-1]
@ -343,6 +344,6 @@ def report_generator(


 if __name__ == "__main__":
-    import doctest
+

    doctest.testmod()