K-means聚类算法的简单numpy实现

K-means算法作为一种无监督的学习算法，其实现思路比较简单，就是根据当前设定的$K$个聚类点$p_1,p_2…p_i…p_k$，按照给定的距离公式将每一个点归类到对应的簇$C_i$中，然后对每一个簇重新计算它目前的新聚类点坐标$p_i’$，然后看这个新聚类点是否与聚类之前设定的聚类点$p_i$相同，如果不相同，那么更新这个聚类点，然后继续处理下个簇，直到每个簇的聚类点都不会再发生变化。

代码实现的时候参考了网上的伪代码：

代码：

import numpy as np
from matplotlib import pyplot as plt


def dis(a, b):
    return np.sqrt(np.sum((a - b) ** 2))


def func(points, k=5, disthre=1e-5):
    n_points = len(points)
    centroid_indexs = np.random.choice(range(n_points), k)
    centroid_coords = points[centroid_indexs]
    labels = [-1 for i in range(n_points)]
    max_proc_cnt = 1000  # max process count
    proc_cnt = 0
    while True and (proc_cnt < max_proc_cnt):
        for i in range(n_points):
            point_rep = np.repeat(np.expand_dims(points[i], 0), repeats=k, axis=0)
            point_rep = np.sqrt(np.sum((point_rep - centroid_coords) ** 2, axis=1, keepdims=True))
            centroid_min_oriIndex = np.argmin(point_rep, axis=0)[0]
            labels[i] = centroid_min_oriIndex

        centroid_updated = False
        # for i, centroids_index in enumerate(centroids_indexs):
        for i, centroid_coord in enumerate(centroid_coords):
            # get centroids_index's sub_points
            cluster_points = []
            for j in range(n_points):
                if labels[j] == i:
                    cluster_points.append(points[j])
            cluster_points = np.concatenate([cluster_points], axis=1)
            # cal new_centroid_coord
            new_centroid_coord = np.sum(cluster_points, axis=0, keepdims=True) / len(cluster_points)
            if dis(centroid_coord, new_centroid_coord) > disthre:
                centroid_coords[i] = new_centroid_coord
                centroid_updated = True
        if not centroid_updated:
            break
        proc_cnt += 1
    print('fnished with {0}(s) loop'.format(proc_cnt))
    return labels


def kmeans_demo():
    n_points = 100
    n_clusters = 5
    X = np.random.rand(n_points, 2) * n_points
    colors = ['pink', 'red', 'blue', 'green', 'cyan']

    plt.figure()
    ax1 = plt.subplot(1, 2, 1)
    ax2 = plt.subplot(1, 2, 2)
    plt.title('K-means demo(k={0})'.format(n_clusters), fontsize='large', fontweight='bold')

    plt.sca(ax1)
    plt.scatter(X[:, 0], X[:, 1], marker='.', color='red', s=20, label='First')

    # process begin
    labels = func(points=X, k=n_clusters, disthre=1e-7)
    # process end

    # show the results with pyplot and output my simple statics
    plt.sca(ax2)
    for i in range(n_points):
        plt.scatter(X[i, 0], X[i, 1], marker='.', color=colors[labels[i]], s=20, label='Second')
    print(labels)

    for cid in range(n_clusters):
        print('{0} cluster(colored {1}) has {2} points'.format(cid, colors[cid], labels.count(cid)))
    plt.show()


if __name__ == '__main__':
    kmeans_demo()

效果：