add visualization of k means clustering as excel format (#2104)

* add visualization of kmneas clust as excel format * style changes * style changes * Add doctest and typehint! * style change * Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss <cclauss@me.com> Co-authored-by: Christian Clauss <cclauss@me.com>
2025-03-20 05:29:48 +00:00 · 2020-06-19 19:55:13 +04:00 · 2020-06-19 19:55:13 +04:00 · d034add61f
commit d034add61f
parent b9e7c891e2
1 changed files with 162 additions and 1 deletions
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@ -47,12 +47,18 @@ Usage:
            k
        )
-  5. Have fun..
+  5. Transfers Dataframe into excel format it must have feature called
      'Clust' with k means clustering numbers in it.
 """
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
 from sklearn.metrics import pairwise_distances
 import warnings
 warnings.filterwarnings("ignore")
 TAG = "K-MEANS-CLUST/ "
@ -202,3 +208,158 @@ if False:  # change to true to run this test case.
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)
 def ReportGenerator(
    df: pd.DataFrame, ClusteringVariables: np.array, FillMissingReport=None
 ) -> pd.DataFrame:
    """
    Function generates easy-erading clustering report. It takes 2 arguments as an input:
        DataFrame - dataframe with predicted cluester column;
        FillMissingReport - dictionary of rules how we are going to fill missing
        values of for final report generate (not included in modeling);
    in order to run the function following libraries must be imported:
        import pandas as pd
        import numpy as np
    >>> data = pd.DataFrame()
    >>> data['numbers'] = [1, 2, 3]
    >>> data['col1'] = [0.5, 2.5, 4.5]
    >>> data['col2'] = [100, 200, 300]
    >>> data['col3'] = [10, 20, 30]
    >>> data['Cluster'] = [1, 1, 2]
    >>> ReportGenerator(data, ['col1', 'col2'], 0)
               Features               Type   Mark           1           2
    0    # of Customers        ClusterSize  False    2.000000    1.000000
    1    % of Customers  ClusterProportion  False    0.666667    0.333333
    2              col1    mean_with_zeros   True    1.500000    4.500000
    3              col2    mean_with_zeros   True  150.000000  300.000000
    4           numbers    mean_with_zeros  False    1.500000    3.000000
    ..              ...                ...    ...         ...         ...
    99            dummy                 5%  False    1.000000    1.000000
    100           dummy                95%  False    1.000000    1.000000
    101           dummy              stdev  False    0.000000         NaN
    102           dummy               mode  False    1.000000    1.000000
    103           dummy             median  False    1.000000    1.000000
    <BLANKLINE>
    [104 rows x 5 columns]
    """
    # Fill missing values with given rules
    if FillMissingReport:
        df.fillna(value=FillMissingReport, inplace=True)
    df["dummy"] = 1
    numeric_cols = df.select_dtypes(np.number).columns
    report = (
        df.groupby(["Cluster"])[  # constract report dataframe
            numeric_cols
        ]  # group by cluster number
        .agg(
            [
                ("sum", np.sum),
                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
                ("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()),
                (
                    "mean_25-75",
                    lambda x: np.mean(
                        np.nan_to_num(
                            sorted(x)[
                                round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
                            ]
                        )
                    ),
                ),
                ("mean_with_na", np.mean),
                ("min", lambda x: x.min()),
                ("5%", lambda x: x.quantile(0.05)),
                ("25%", lambda x: x.quantile(0.25)),
                ("50%", lambda x: x.quantile(0.50)),
                ("75%", lambda x: x.quantile(0.75)),
                ("95%", lambda x: x.quantile(0.95)),
                ("max", lambda x: x.max()),
                ("count", lambda x: x.count()),
                ("stdev", lambda x: x.std()),
                ("mode", lambda x: x.mode()[0]),
                ("median", lambda x: x.median()),
                ("# > 0", lambda x: (x > 0).sum()),
            ]
        )
        .T.reset_index()
        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
    )  # rename columns
    clustersize = report[
        (report["Features"] == "dummy") & (report["Type"] == "count")
    ]  # caclulating size of cluster(count of clientID's)
    clustersize.Type = (
        "ClusterSize"  # rename created cluster df to match report column names
    )
    clustersize.Features = "# of Customers"
    clusterproportion = pd.DataFrame(
        clustersize.iloc[:, 2:].values
        / clustersize.iloc[:, 2:].values.sum()  # caclulating proportion of cluster
    )
    clusterproportion[
        "Type"
    ] = "% of Customers"  # rename created cluster df to match report column names
    clusterproportion["Features"] = "ClusterProportion"
    cols = clusterproportion.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    clusterproportion = clusterproportion[cols]  # rearrange columns to match report
    clusterproportion.columns = report.columns
    a = pd.DataFrame(
        abs(
            report[report["Type"] == "count"].iloc[:, 2:].values
            - clustersize.iloc[:, 2:].values
        )
    )  # generating df with count of nan values
    a["Features"] = 0
    a["Type"] = "# of nan"
    a.Features = report[
        report["Type"] == "count"
    ].Features.tolist()  # filling values in order to match report
    cols = a.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    a = a[cols]  # rearrange columns to match report
    a.columns = report.columns  # rename columns to match report
    report = report.drop(
        report[report.Type == "count"].index
    )  # drop count values except cluster size
    report = pd.concat(
        [report, a, clustersize, clusterproportion], axis=0
    )  # concat report with clustert size and nan values
    report["Mark"] = report["Features"].isin(ClusteringVariables)
    cols = report.columns.tolist()
    cols = cols[0:2] + cols[-1:] + cols[2:-1]
    report = report[cols]
    sorter1 = {
        "ClusterSize": 9,
        "ClusterProportion": 8,
        "mean_with_zeros": 7,
        "mean_with_na": 6,
        "max": 5,
        "50%": 4,
        "min": 3,
        "25%": 2,
        "75%": 1,
        "# of nan": 0,
        "# > 0": -1,
        "sum_with_na": -2,
    }
    report = (
        report.assign(
            Sorter1=lambda x: x.Type.map(sorter1),
            Sorter2=lambda x: list(reversed(range(len(x)))),
        )
        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
        .drop(["Sorter1", "Sorter2"], axis=1)
    )
    report.columns.name = ""
    report = report.reset_index()
    report.drop(columns=["index"], inplace=True)
    return report
 if __name__ == "__main__":
    import doctest
    doctest.testmod()