How come clustering gives a 99% result in just two epochs?

please check this

and this is my code


from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Draw 3 random 2-D cluster centers in [1, 50] and sample 1000 points around them.
centers = np.random.randint(1, 51, (3, 2))
data = make_blobs(n_samples=1000, centers=centers)
X, y = data

def get_initial_centroids(X, y):
    """Return one random starting centroid per distinct label in *y*.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features) — only the feature count is used.
    y : ndarray of labels — only the number of distinct classes is used.

    Returns
    -------
    ndarray of shape (n_classes, n_features), float dtype.

    NOTE(review): the [20, 60) range is hard-coded and unrelated to the
    actual spread of X — centroids may start far from any data.
    """
    classes = np.unique(y)
    # Cast to float so that later mean-based updates are not silently
    # truncated to integers (randint returns an int array).
    return np.random.randint(20, 60, size=(classes.shape[0], X.shape[1])).astype(float)
   

def associate_centroids(X, centroids):
    """Assign every sample to its nearest centroid (Euclidean distance).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features).
    centroids : ndarray of shape (k, n_features).

    Returns
    -------
    ndarray of shape (n_samples, 2): column 0 is the index of the nearest
    centroid (stored as a float, as in the original), column 1 is the
    distance to it.
    """
    # Vectorized pairwise distances: (n, 1, d) - (1, k, d) -> (n, k, d),
    # replacing the original O(n*k) Python double loop.
    diffs = X[:, None, :] - centroids[None, :, :]
    distances = np.sqrt(np.sum(diffs ** 2, axis=2))  # shape (n, k)

    assignments = distances.argmin(axis=1)
    result = np.zeros((X.shape[0], 2))
    result[:, 0] = assignments
    result[:, 1] = distances[np.arange(X.shape[0]), assignments]
    return result


def update_centroids(X, y, centroids):
    """Recompute each centroid as the mean of the samples labelled with it.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features).
    y : 1-D ndarray of cluster indices, one per sample.
    centroids : ndarray of shape (k, n_features); current positions.

    Returns
    -------
    New float ndarray of shape (k, n_features). The input array is not
    mutated (the original wrote into it in place; the caller in train()
    already passes a copy, so no caller relied on that mutation).
    """
    # Float copy: writing float means into an int array would truncate.
    centroids = centroids.astype(float)
    for i in range(centroids.shape[0]):
        members = X[y == i]
        # Guard: np.mean over an empty cluster yields NaN (plus a
        # RuntimeWarning); keep the previous position instead.
        if members.shape[0] > 0:
            centroids[i] = members.mean(axis=0)
    return centroids

def train(X, y, centroids):
    """Run up to 50 k-means iterations, printing a score each epoch.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features).
    y : ground-truth labels — used ONLY to report the score; note the
        score assumes cluster index i corresponds to true label i, which
        only holds if the label/cluster numbering happens to line up.
    centroids : initial centroid positions, shape (k, n_features).

    Returns
    -------
    Final centroid positions.
    """
    for epoch in range(50):
        y_pred = associate_centroids(X, centroids)
        score = round((y_pred[:, 0] == y).mean() * 100, 3)
        print(f"EPOCH: {epoch}, SCORE={score}%")
        # BUG FIX: the original updated centroids from the ground-truth
        # labels `y`, making the procedure supervised (and explaining the
        # near-instant 99% score). k-means must update from the
        # *predicted* assignments.
        centroids = update_centroids(X, y_pred[:, 0], centroids.copy())
        if score >= 95:
            break

    return centroids
    

def plot_clusters(centroids, X, y, title):
    """Scatter-plot each cluster's points and its centroid on the current axes.

    Parameters
    ----------
    centroids : ndarray of shape (k, n_features); first two features plotted.
    X : ndarray of shape (n_samples, n_features).
    y : 1-D array of cluster/label indices, one per sample.
    title : plot title string.
    """
    colors = [("gold", "black"), ("navy", "aqua"), ("green", "lime")]

    for i in range(centroids.shape[0]):
        # Cycle the palette so k > 3 no longer raises IndexError
        # (the original indexed colors[i] directly).
        blob_color, centroid_color = colors[i % len(colors)]
        blob = X[np.where(y == i)]
        plt.scatter(blob[:, 0], blob[:, 1], c=blob_color)
        plt.scatter(centroids[i, 0], centroids[i, 1], label=f"Centroid {i+1}", c=centroid_color)
    plt.title(title)
    plt.legend()

# Visualise the random starting centroids, run k-means, then plot the result.
centroids = get_initial_centroids(X, y)
plot_clusters(centroids, X, y, "Before Training")

centroids = train(X, y, centroids)
plot_clusters(centroids, X, y, "After Train")

Is this because, unlike in supervised learning, here the cost will either decrease or stay the same after each iteration?

I also checked with the same formula, and I am getting 9 iterations in the class assignment. Now I am wondering where I went wrong.

Your initial clusters are already clustered, so there’s nothing to learn.

1 Like

So you mean if the data was scattered it would have taken more iterations?

That’s a great experiment you can try. Please report your results.

1 Like