跳转至

Clustering

K-Means

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


def normalize(data: np.ndarray):
    mean_ = data.mean(0)
    std_ = data.std(0)
    return (data - mean_) / std_


def get_labels(data, centroids):
    # (n_samples, n_classes)
    dist = euclidean_distances(data, centroids)
    labels = np.argmin(dist, axis=1)
    return labels  # (n_samples,)


def get_centroids(data, labels, n_classes):
    n_features = data.shape[1]
    new_centroids = np.empty((n_classes, n_features))
    for i in range(n_classes):
        cls = data[labels == i]  # (n_c_samples, n_features)
        if cls.size != 0:
            new_centroids[i] = np.mean(cls, axis=0)
        else:
            new_centroids[i] = 0
    return new_centroids


def k_means(data: np.ndarray, n_classes):
    n_samples, n_features = data.shape
    data = normalize(data)
    centroids = np.random.rand(n_classes, n_features)
    n_iteration_ = 0

    while True:
        old_centroids = centroids
        n_iteration_ += 1
        labels = get_labels(data, centroids)
        centroids = get_centroids(data, labels, n_classes)
        if np.allclose(centroids, old_centroids):
            print(f"Converged after {n_iteration_} iterations.")
            break
    return labels