# Python/dynamic_programming/k_means_clustering_tensorflow.py

import tensorflow as tf
from random import shuffle
from numpy import array


def TFKMeansCluster(vectors, noofclusters):
    """
    K-Means Clustering using TensorFlow (graph/session style API).

    'vectors' should be a n*k 2-D NumPy array (or anything NumPy can convert
    to one), where n is the number of vectors of dimensionality k.
    'noofclusters' should be an integer with 0 < noofclusters < n.

    The initial centroids are 'noofclusters' data points picked at random
    (via random.shuffle), so results may differ between calls. A fixed number
    of 100 iterations is run; there is no convergence-based stopping
    criterion.

    Returns a tuple (centroids, assignments):
      * centroids   - list of 'noofclusters' arrays of length k, the final
                      centroid locations.
      * assignments - list of n integers; assignments[i] is the index of the
                      cluster that vectors[i] belongs to.

    Raises AssertionError if 'noofclusters' is not in the range (0, n).
    """

    noofclusters = int(noofclusters)
    assert (
        0 < noofclusters < len(vectors)
    ), "noofclusters must be positive and smaller than the number of vectors"

    # Work on a float64 copy of the input. The centroid Variables take their
    # dtype from these rows and must match the float64 placeholder that is
    # used to re-assign them; this also lets plain lists and integer arrays
    # be passed in.
    vectors = array(vectors, dtype="float64")

    # Find out the dimensionality
    dim = len(vectors[0])

    # Will help select random centroids from among the available vectors
    vector_indices = list(range(len(vectors)))
    shuffle(vector_indices)

    # GRAPH OF COMPUTATION
    # We initialize a new graph and set it as the default during each run
    # of this algorithm. This ensures that as this function is called
    # multiple times, the default graph doesn't keep getting crowded with
    # unused ops and Variables from previous function calls.
    graph = tf.Graph()

    # SESSION OF COMPUTATION
    # The session is used as a context manager so that it is closed (and its
    # resources released) when the function returns or raises.
    with graph.as_default(), tf.Session(graph=graph) as sess:

        # CONSTRUCTING THE ELEMENTS OF COMPUTATION

        # One Variable per centroid, initialized to one of the (randomly
        # chosen) data points.
        centroids = [
            tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters)
        ]
        # Nodes that assign a new value to each centroid Variable
        centroid_value = tf.placeholder("float64", [dim])
        cent_assigns = [
            tf.assign(centroid, centroid_value) for centroid in centroids
        ]

        # Variables for cluster assignments of individual vectors
        # (initialized to 0 at first)
        assignments = [tf.Variable(0) for _ in range(len(vectors))]
        # Nodes that assign a new value to each assignment Variable
        assignment_value = tf.placeholder("int32")
        cluster_assigns = [
            tf.assign(assignment, assignment_value) for assignment in assignments
        ]

        # Node that computes the mean of a list of vectors along the 0th
        # dimension, i.e. across the input vectors
        mean_input = tf.placeholder("float", [None, dim])
        mean_op = tf.reduce_mean(mean_input, 0)

        # Node for computing the Euclidean distance between two vectors.
        # tf.subtract replaces tf.sub, which was removed in TensorFlow 1.0.
        v1 = tf.placeholder("float", [dim])
        v2 = tf.placeholder("float", [dim])
        euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2)))

        # Node that figures out which cluster to assign a vector to, based
        # on the Euclidean distances of the vector from the centroids.
        centroid_distances = tf.placeholder("float", [noofclusters])
        cluster_assignment = tf.argmin(centroid_distances, 0)

        # INITIALIZING STATE VARIABLES

        # The Variable-initializer must be created after all the Variables
        # have been constructed, so that each of them is included.
        # tf.global_variables_initializer replaces the deprecated
        # tf.initialize_all_variables.
        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        # CLUSTERING ITERATIONS

        # Now perform the Expectation-Maximization steps of K-Means
        # clustering. To keep things simple, we only do a set number of
        # iterations, instead of using a Stopping Criterion.
        noofiterations = 100
        for _ in range(noofiterations):

            # EXPECTATION STEP
            # Based on the centroid locations till last iteration, compute
            # the _expected_ centroid assignments.
            # The centroids do not change during this step, so their values
            # are fetched once instead of once per (vector, centroid) pair.
            current_centroids = sess.run(centroids)
            for vector_n, vect in enumerate(vectors):
                # Euclidean distance between this vector and each centroid.
                # This list cannot be named 'centroid_distances', since that
                # is the input placeholder of the cluster assignment node.
                distances = [
                    sess.run(euclid_dist, feed_dict={v1: vect, v2: location})
                    for location in current_centroids
                ]
                # Index of the nearest centroid
                assignment = sess.run(
                    cluster_assignment, feed_dict={centroid_distances: distances}
                )
                # Store it in the state variable of this vector
                sess.run(
                    cluster_assigns[vector_n], feed_dict={assignment_value: assignment}
                )

            # MAXIMIZATION STEP
            # Based on the expected state computed from the Expectation Step,
            # compute the locations of the centroids so as to maximize the
            # overall objective of minimizing within-cluster Sum-of-Squares.
            # The assignments do not change during this step, so they are
            # fetched once instead of once per (cluster, vector) pair.
            current_assignments = sess.run(assignments)
            for cluster_n in range(noofclusters):
                # Collect all the vectors assigned to this cluster
                assigned_vects = [
                    vect
                    for vect, label in zip(vectors, current_assignments)
                    if label == cluster_n
                ]
                # A cluster may end up with no vectors (e.g. when two
                # centroids coincide). Its mean is undefined and an empty
                # array does not fit the [None, dim] placeholder, so the
                # centroid is left where it is.
                if not assigned_vects:
                    continue
                # Compute new centroid location
                new_location = sess.run(
                    mean_op, feed_dict={mean_input: array(assigned_vects)}
                )
                # Assign value to appropriate variable
                sess.run(
                    cent_assigns[cluster_n], feed_dict={centroid_value: new_location}
                )

        # Fetch the final values while the session is still open
        final_centroids = sess.run(centroids)
        final_assignments = sess.run(assignments)
        return final_centroids, final_assignments
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`import tensorflow as tf`
Remove Multiple Unused Imports and Variable 2018-10-17 21:28:57 +00:00			`from random import shuffle`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`from numpy import array`


			`def TFKMeansCluster(vectors, noofclusters):`
			`"""`
			`K-Means Clustering using TensorFlow.`
			`'vectors' should be a n*k 2-D NumPy array, where n is the number`
			`of vectors of dimensionality k.`
			`'noofclusters' should be an integer.`
			`"""`

			`noofclusters = int(noofclusters)`
			`assert noofclusters < len(vectors)`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Find out the dimensionality`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`dim = len(vectors[0])`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Will help select random centroids from among the available vectors`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`vector_indices = list(range(len(vectors)))`
			`shuffle(vector_indices)`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# GRAPH OF COMPUTATION`
			`# We initialize a new graph and set it as the default during each run`
			`# of this algorithm. This ensures that as this function is called`
			`# multiple times, the default graph doesn't keep getting crowded with`
			`# unused ops and Variables from previous function calls.`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00
			`graph = tf.Graph()`

			`with graph.as_default():`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# SESSION OF COMPUTATION`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00
			`sess = tf.Session()`

			`##CONSTRUCTING THE ELEMENTS OF COMPUTATION`

			`##First lets ensure we have a Variable vector for each centroid,`
			`##initialized to one of the vectors from the available data points`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`centroids = [`
pyupgrade --py37-plus */.py (#1654) * pyupgrade --py37-plus */.py * fixup! Format Python code with psf/black push 2020-01-03 14:25:36 +00:00			`tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`]`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`##These nodes will assign the centroid Variables the appropriate`
			`##values`
			`centroid_value = tf.placeholder("float64", [dim])`
			`cent_assigns = []`
			`for centroid in centroids:`
			`cent_assigns.append(tf.assign(centroid, centroid_value))`

			`##Variables for cluster assignments of individual vectors(initialized`
			`##to 0 at first)`
			`assignments = [tf.Variable(0) for i in range(len(vectors))]`
			`##These nodes will assign an assignment Variable the appropriate`
			`##value`
			`assignment_value = tf.placeholder("int32")`
			`cluster_assigns = []`
			`for assignment in assignments:`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`cluster_assigns.append(tf.assign(assignment, assignment_value))`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00
			`##Now lets construct the node that will compute the mean`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# The placeholder for the input`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`mean_input = tf.placeholder("float", [None, dim])`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# The Node/op takes the input and computes a mean along the 0th`
			`# dimension, i.e. the list of input vectors`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`mean_op = tf.reduce_mean(mean_input, 0)`

			`##Node for computing Euclidean distances`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Placeholders for input`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`v1 = tf.placeholder("float", [dim])`
			`v2 = tf.placeholder("float", [dim])`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(v1, v2), 2)))`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00
			`##This node will figure out which cluster to assign a vector to,`
			`##based on Euclidean distances of the vector from the centroids.`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Placeholder for input`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`centroid_distances = tf.placeholder("float", [noofclusters])`
			`cluster_assignment = tf.argmin(centroid_distances, 0)`

			`##INITIALIZING STATE VARIABLES`

			`##This will help initialization of all Variables defined with respect`
			`##to the graph. The Variable-initializer should be defined after`
			`##all the Variables have been constructed, so that each of them`
			`##will be included in the initialization.`
			`init_op = tf.initialize_all_variables()`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Initialize all variables`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`sess.run(init_op)`

			`##CLUSTERING ITERATIONS`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Now perform the Expectation-Maximization steps of K-Means clustering`
			`# iterations. To keep things simple, we will only do a set number of`
			`# iterations, instead of using a Stopping Criterion.`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`noofiterations = 100`
			`for iteration_n in range(noofiterations):`

			`##EXPECTATION STEP`
			`##Based on the centroid locations till last iteration, compute`
			`##the _expected_ centroid assignments.`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Iterate over each vector`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`for vector_n in range(len(vectors)):`
			`vect = vectors[vector_n]`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Compute Euclidean distance between this vector and each`
			`# centroid. Remember that this list cannot be named`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`#'centroid_distances', since that is the input to the`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# cluster assignment node.`
			`distances = [`
			`sess.run(euclid_dist, feed_dict={v1: vect, v2: sess.run(centroid)})`
			`for centroid in centroids`
			`]`
			`# Now use the cluster assignment node, with the distances`
			`# as the input`
			`assignment = sess.run(`
			`cluster_assignment, feed_dict={centroid_distances: distances}`
			`)`
			`# Now assign the value to the appropriate state variable`
			`sess.run(`
			`cluster_assigns[vector_n], feed_dict={assignment_value: assignment}`
			`)`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00
			`##MAXIMIZATION STEP`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Based on the expected state computed from the Expectation Step,`
			`# compute the locations of the centroids so as to maximize the`
			`# overall objective of minimizing within-cluster Sum-of-Squares`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`for cluster_n in range(noofclusters):`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`# Collect all the vectors assigned to this cluster`
			`assigned_vects = [`
			`vectors[i]`
			`for i in range(len(vectors))`
			`if sess.run(assignments[i]) == cluster_n`
			`]`
			`# Compute new centroid location`
			`new_location = sess.run(`
			`mean_op, feed_dict={mean_input: array(assigned_vects)}`
			`)`
			`# Assign value to appropriate variable`
			`sess.run(`
			`cent_assigns[cluster_n], feed_dict={centroid_value: new_location}`
			`)`

			`# Return centroids and assignments`
Added one of the most important machine learning algorithm The k-means clustering is done by using tensorflow which is the vital and growing machine learning library of google. 2017-07-29 19:12:32 +00:00			`centroids = sess.run(centroids)`
			`assignments = sess.run(assignments)`
			`return centroids, assignments`