Python/machine_learning/k_means_clust.py

"""README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)

Requirements:
  - sklearn
  - numpy
  - matplotlib

Python:
  - 3.5

Inputs:
  - X , a 2D numpy array of features.
  - k , number of clusters to create.
  - initial_centroids , initial centroid values generated by utility function(mentioned
    in usage).
  - maxiter , maximum number of iterations to process.
  - heterogeneity , empty list that will be filled with heterogeneity values if
    passed to kmeans func.

Usage:
  1. define 'k' value, 'X' features array and 'heterogeneity' empty list

  2. create initial_centroids,
        initial_centroids = get_initial_centroids(
            X,
            k,
            seed=0 # seed value for initial centroid generation,
                   # None for randomness(default=None)
            )

  3. find centroids and clusters using kmeans function.

        centroids, cluster_assignment = kmeans(
            X,
            k,
            initial_centroids,
            maxiter=400,
            record_heterogeneity=heterogeneity,
            verbose=True # whether to print logs in console or not.(default=False)
            )


  4. Plot the loss function, heterogeneity values for every iteration saved in
     heterogeneity list.
        plot_heterogeneity(
            heterogeneity,
            k
        )

  5. Have fun..

"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances

# Module tag string; it is not referenced anywhere in the visible part of this file.
TAG = "K-MEANS-CLUST/ "


def get_initial_centroids(data, k, seed=None):
    """Randomly choose k distinct data points as initial centroids.

    data: 2D array whose rows are the data points (indexed as data[rows, :]).
    k: number of centroids to pick; must not exceed the number of rows.
    seed: if not None, it is passed to np.random.seed so the pick is
          reproducible. None leaves the global random state untouched.

    Returns the k selected rows of data.
    Raises ValueError if k is larger than the number of data points, because
    k distinct rows cannot be drawn in that case.
    """
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    n = data.shape[0]  # number of data points

    if k > n:
        raise ValueError(
            f"cannot pick {k} distinct centroids from only {n} data points"
        )

    # Pick k *distinct* indices from range [0, n). Sampling with replacement
    # could select the same row twice, producing duplicate centroids and
    # therefore clusters that never receive any data point.
    rand_indices = np.random.choice(n, size=k, replace=False)

    centroids = data[rand_indices, :]

    return centroids


def centroid_pairwise_dist(X, centroids):
    """Return the Euclidean distances between the rows of X and the centroids.

    Thin wrapper that forwards both arguments to pairwise_distances with
    metric="euclidean" and returns its result unchanged.
    """
    distance_matrix = pairwise_distances(X, centroids, metric="euclidean")
    return distance_matrix


def assign_clusters(data, centroids):
    """Label every data point with the index of its nearest centroid.

    data: the data points, passed on to centroid_pairwise_dist.
    centroids: the current centroids, passed on to centroid_pairwise_dist.

    Returns, for each row of the distance matrix, the column index holding the
    smallest distance (np.argmin along axis 1).
    """
    # Distance from every data point to every centroid.
    point_to_centroid = centroid_pairwise_dist(data, centroids)

    # The closest centroid of a point is the column with the minimum distance.
    return np.argmin(point_to_centroid, axis=1)


def revise_centroids(data, k, cluster_assignment):
    """Recompute the centroid of each of the k clusters.

    data: 2D numpy array whose rows are the data points.
    k: number of clusters.
    cluster_assignment: for every row of data, the index of its cluster.

    Returns an array with one centroid per cluster. A non-empty cluster gets
    the mean of its members. An empty cluster has no mean (it would be NaN and
    break every later distance computation), so it is re-seeded with one of the
    data points that lie farthest from their own centroid. Only when there are
    fewer data points than empty clusters does a NaN row remain.
    """
    assignment = np.asarray(cluster_assignment)
    n_samples = data.shape[0]

    new_centroids = [None] * k
    empty_clusters = []
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[assignment == i]
        if member_data_points.shape[0] > 0:
            new_centroids[i] = member_data_points.mean(axis=0)
        else:
            empty_clusters.append(i)

    if empty_clusters:
        # Squared distance of every point to the centroid of its own cluster.
        sq_dist = np.zeros(n_samples)
        for i in range(k):
            if new_centroids[i] is not None:
                mask = assignment == i
                diff = data[mask] - new_centroids[i]
                sq_dist[mask] = np.sum(diff * diff, axis=1)

        # Farthest points first; the stable sort keeps ties in index order so
        # the result is deterministic.
        donors = np.argsort(-sq_dist, kind="stable")
        for slot, i in enumerate(empty_clusters):
            if slot < n_samples:
                new_centroids[i] = data[donors[slot]]
            else:
                # No data point left to re-seed from.
                new_centroids[i] = np.full(data.shape[1:], np.nan)

    return np.array(new_centroids)


def compute_heterogeneity(data, k, centroids, cluster_assignment):
    """Sum the squared Euclidean distances of all points to their centroid.

    data: 2D array whose rows are the data points.
    k: number of clusters.
    centroids: indexable collection holding the centroid of each cluster.
    cluster_assignment: for every row of data, the index of its cluster.

    Returns the accumulated sum over all k clusters; clusters without any
    member contribute nothing.
    """
    total = 0.0
    for cluster_id in range(k):
        # Rows of data that were assigned to this cluster.
        members = data[cluster_assignment == cluster_id, :]

        if members.shape[0] == 0:  # nothing to add for an empty cluster
            continue

        # Distance of every member to the (single) centroid of the cluster.
        to_centroid = pairwise_distances(
            members, [centroids[cluster_id]], metric="euclidean"
        )
        total += np.sum(to_centroid ** 2)

    return total


def plot_heterogeneity(heterogeneity, k):
    """Plot the recorded heterogeneity values and show the figure.

    heterogeneity: sequence of values to plot, e.g. the list filled by kmeans
                   through its record_heterogeneity argument.
    k: number of clusters; only used in the title, where it is formatted with
       the integer format spec ``:d``.

    Returns None; the figure is displayed through plt.show().
    """
    plt.figure(figsize=(7, 4))
    plt.plot(heterogeneity, linewidth=4)
    plt.xlabel("# Iterations")
    plt.ylabel("Heterogeneity")
    plt.title(f"Heterogeneity of clustering over time, K={k:d}")
    # NOTE(review): the font size is changed in the global plt.rcParams only
    # after the labels and title above were created, and the change stays in
    # effect after this function returns -- confirm this ordering is intended.
    plt.rcParams.update({"font.size": 16})
    plt.show()


def kmeans(
    data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
):
    """Run k-means on the given data, starting from an initial set of centroids.

    data: 2D array whose rows are the data points.
    k: number of clusters.
    initial_centroids: starting centroids, e.g. from get_initial_centroids.
    maxiter: maximum number of iterations to run.(default=500)
    record_heterogeneity: (optional) a list, to store the history of heterogeneity
                          as function of iterations
                          if None, do not store the history.
                          Nothing is appended for the iteration in which
                          convergence is detected.
    verbose: if True, print how many data points changed their cluster labels in
             each iteration

    Returns (centroids, cluster_assignment). When maxiter is 0 no update is
    performed and the assignment is computed against the initial centroids.
    """
    centroids = initial_centroids[:]
    prev_cluster_assignment = None
    # Bound up front so the function still returns cleanly if the loop body
    # never runs (maxiter <= 0).
    cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
            print(itr, end="")

        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all data
        #    points assigned to that cluster.
        centroids = revise_centroids(data, k, cluster_assignment)

        if prev_cluster_assignment is not None:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)

            # Check for convergence: if none of the assignments changed, stop
            if num_changed == 0:
                if verbose:
                    print()  # terminate the line started by print(itr, end="")
                break

            # Print number of new assignments
            if verbose:
                print(
                    "    {:5d} elements changed their cluster assignment.".format(
                        num_changed
                    )
                )
        elif verbose:
            # First iteration: there is nothing to compare against yet, but the
            # line holding the iteration number still has to be terminated.
            print()

        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        prev_cluster_assignment = cluster_assignment[:]

    if cluster_assignment is None:
        # The loop did not execute; label the points with the initial centroids.
        cluster_assignment = assign_clusters(data, centroids)

    return centroids, cluster_assignment


# Mock test below: clusters the iris data set into 3 groups and plots the
# recorded heterogeneity. Disabled by default.
if False:  # change to true to run this test case.
    import sklearn.datasets as ds

    k = 3
    heterogeneity = []
    dataset = ds.load_iris()
    features = dataset["data"]

    # Fixed seed so that the demo picks the same starting centroids every run.
    initial_centroids = get_initial_centroids(features, k, seed=0)
    centroids, cluster_assignment = kmeans(
        features,
        k,
        initial_centroids,
        maxiter=400,
        record_heterogeneity=heterogeneity,
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`"""README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00
			`Requirements:`
			`- sklearn`
			`- numpy`
			`- matplotlib`

			`Python:`
			`- 3.5`

			`Inputs:`
			`- X , a 2D numpy array of features.`
			`- k , number of clusters to create.`
Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`- initial_centroids , initial centroid values generated by utility function(mentioned`
			`in usage).`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`- maxiter , maximum number of iterations to process.`
Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`- heterogeneity , empty list that will be filled with hetrogeneity values if passed`
			`to kmeans func.`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00
			`Usage:`
			`1. define 'k' value, 'X' features array and 'hetrogeneity' empty list`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`2. create initial_centroids,`
			`initial_centroids = get_initial_centroids(`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00			`X,`
			`k,`
Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`seed=0 # seed value for initial centroid generation,`
			`# None for randomness(default=None)`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`)`

			`3. find centroids and clusters using kmeans function.`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`centroids, cluster_assignment = kmeans(`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00			`X,`
			`k,`
			`initial_centroids,`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`maxiter=400,`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00			`record_heterogeneity=heterogeneity,`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`verbose=True # whether to print logs in console or not.(default=False)`
			`)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00

Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`4. Plot the loss function, hetrogeneity values for every iteration saved in`
			`hetrogeneity list.`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`plot_heterogeneity(`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00			`heterogeneity,`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`k`
			`)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`5. Have fun..`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`"""`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`import numpy as np`
Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`from matplotlib import pyplot as plt`
			`from sklearn.metrics import pairwise_distances`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`TAG = "K-MEANS-CLUST/ "`

added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00
			`def get_initial_centroids(data, k, seed=None):`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`"""Randomly choose k data points as initial centroids"""`
			`if seed is not None: # useful for obtaining consistent results`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`np.random.seed(seed)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`n = data.shape[0] # number of data points`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Pick K indices from range [0, N).`
			`rand_indices = np.random.randint(0, n, k)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Keep centroids as dense format, as many entries will be nonzero due to averaging.`
			`# As long as at least one document in a cluster contains a word,`
			`# it will carry a nonzero weight in the TF-IDF vector of the centroid.`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`centroids = data[rand_indices, :]`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`return centroids`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
			`def centroid_pairwise_dist(X, centroids):`
			`return pairwise_distances(X, centroids, metric="euclidean")`

added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00
			`def assign_clusters(data, centroids):`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Compute distances between each data point and the set of centroids:`
			`# Fill in the blank (RHS only)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`distances_from_centroids = centroid_pairwise_dist(data, centroids)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Compute cluster assignments for each data point:`
			`# Fill in the blank (RHS only)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`cluster_assignment = np.argmin(distances_from_centroids, axis=1)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`return cluster_assignment`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`def revise_centroids(data, k, cluster_assignment):`
			`new_centroids = []`
			`for i in range(k):`
			`# Select all data points that belong to cluster i. Fill in the blank (RHS only)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`member_data_points = data[cluster_assignment == i]`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Compute the mean of the data points. Fill in the blank (RHS only)`
			`centroid = member_data_points.mean(axis=0)`
			`new_centroids.append(centroid)`
			`new_centroids = np.array(new_centroids)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`return new_centroids`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`def compute_heterogeneity(data, k, centroids, cluster_assignment):`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`heterogeneity = 0.0`
			`for i in range(k):`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Select all data points that belong to cluster i. Fill in the blank (RHS only)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`member_data_points = data[cluster_assignment == i, :]`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Compute distances from centroid to data points (RHS only)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`distances = pairwise_distances(`
			`member_data_points, [centroids[i]], metric="euclidean"`
			`)`
			`squared_distances = distances ** 2`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`heterogeneity += np.sum(squared_distances)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`return heterogeneity`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`def plot_heterogeneity(heterogeneity, k):`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`plt.figure(figsize=(7, 4))`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`plt.plot(heterogeneity, linewidth=4)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`plt.xlabel("# Iterations")`
			`plt.ylabel("Heterogeneity")`
pyupgrade --py37-plus */.py (#1654) * pyupgrade --py37-plus */.py * fixup! Format Python code with psf/black push 2020-01-03 14:25:36 +00:00			`plt.title(f"Heterogeneity of clustering over time, K={k:d}")`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`plt.rcParams.update({"font.size": 16})`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`plt.show()`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
			`def kmeans(`
			`data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False`
			`):`
			`"""This function runs k-means on given data and initial set of centroids.`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`maxiter: maximum number of iterations to run.(default=500)`
Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`record_heterogeneity: (optional) a list, to store the history of heterogeneity`
			`as function of iterations`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`if None, do not store the history.`
Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`verbose: if True, print how many data points changed their cluster labels in`
			`each iteration"""`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`centroids = initial_centroids[:]`
			`prev_cluster_assignment = None`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
			`for itr in range(maxiter):`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`if verbose:`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`print(itr, end="")`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# 1. Make cluster assignments using nearest centroids`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`cluster_assignment = assign_clusters(data, centroids)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
Tighten up psf/black and flake8 (#2024) * Tighten up psf/black and flake8 * Fix some tests * Fix some E741 * Fix some E741 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2020-05-22 06:10:11 +00:00			`# 2. Compute a new centroid for each of the k clusters, averaging all data`
			`# points assigned to that cluster.`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`centroids = revise_centroids(data, k, cluster_assignment)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Check for convergence: if none of the assignments changed, stop`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`if (`
			`prev_cluster_assignment is not None`
			`and (prev_cluster_assignment == cluster_assignment).all()`
			`):`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`break`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
			`# Print number of new assignments`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`if prev_cluster_assignment is not None:`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`num_changed = np.sum(prev_cluster_assignment != cluster_assignment)`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`if verbose:`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`print(`
pyupgrade --py37-plus */.py (#1654) * pyupgrade --py37-plus */.py * fixup! Format Python code with psf/black push 2020-01-03 14:25:36 +00:00			`" {:5d} elements changed their cluster assignment.".format(`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`num_changed`
			`)`
			`)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Record heterogeneity convergence metric`
			`if record_heterogeneity is not None:`
			`# YOUR CODE HERE`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`score = compute_heterogeneity(data, k, centroids, cluster_assignment)`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`record_heterogeneity.append(score)`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`prev_cluster_assignment = cluster_assignment[:]`
Simplify code by dropping support for legacy Python (#1143) * Simplify code by dropping support for legacy Python * sort() --> sorted() 2019-08-19 13:37:49 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`return centroids, cluster_assignment`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`# Mock test below`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`if False: # change to true to run this test case.`
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`import sklearn.datasets as ds`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
added k means clustering algorithm, usage doc inside. 2017-10-02 15:43:43 +00:00			`dataset = ds.load_iris()`
			`k = 3`
			`heterogeneity = []`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`initial_centroids = get_initial_centroids(dataset["data"], k, seed=0)`
			`centroids, cluster_assignment = kmeans(`
			`dataset["data"],`
			`k,`
			`initial_centroids,`
			`maxiter=400,`
			`record_heterogeneity=heterogeneity,`
			`verbose=True,`
			`)`
Modernize Python 2 code to get ready for Python 3 2017-11-25 11:41:55 +00:00			`plot_heterogeneity(heterogeneity, k)`