For this assignment I implement the k-means algorithm with GPU acceleration. First, a simple k-means kernel is built to accelerate 2D distance computation and cluster assignment. This is then extended to 3D, tested on an image, and compared with the OpenCV implementation to assess performance.
import matplotlib.pyplot as plt
import numpy
import numpy as np  # both spellings are used below
import torch
import cv2 as cv
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from torch import tensor
from torch.distributions.multivariate_normal import MultivariateNormal
torch.set_printoptions(precision=3, linewidth=140, sci_mode=False)
I know it is not necessary, and consistency would be better, but I will deliberately use both torch tensors and NumPy arrays to practice the conversions between them; the snippet below summarizes the conversions that come up. After that, a set of toy data is generated with PyTorch.
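As a quick reference, here is a minimal sketch of moving between the two representations (the variable names are purely illustrative):

t = torch.rand(4, 2)      # a small PyTorch tensor
a = t.numpy()             # zero-copy NumPy view (CPU tensors only)
t2 = torch.from_numpy(a)  # back to a tensor, still sharing memory
b = a.copy(order='C')     # independent C-contiguous copy, as PyCUDA wants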
def sample(m): return MultivariateNormal(m, torch.diag(tensor([5.,5.]))).sample((n_samples,))
n_clusters = 6
n_samples = 100
centroids = torch.rand(n_clusters, 2)*70 - 35  # true centers in [-35, 35)^2
slices = [sample(c) for c in centroids]        # n_samples points around each center
data = torch.cat(slices)
plt.scatter(data[:,0], data[:,1], s=2)
Now I define two ways to initialize the k centers: a completely random point picker, and a k-means++ initializer that spreads the initial centers apart by sampling each new center with probability proportional to its squared distance from the centers already chosen. Both are exercised in a short demo after the two definitions.
def plus_plus(ds, k, random_state=42):
    """
    Create cluster centroids using the k-means++ algorithm.

    Parameters
    ----------
    ds : numpy array
        The dataset to be used for centroid initialization.
    k : int
        The desired number of clusters for which centroids are required.

    Returns
    -------
    centroids : numpy array
        Collection of k centroids as a numpy array.

    Inspiration from here: https://stackoverflow.com/questions/5466323/how-could-one-implement-the-k-means-algorithm
    """
    np.random.seed(random_state)
    centroids = [ds[0]]  # first centroid: simply the first point (k-means++ proper picks it uniformly at random)
    for _ in range(1, k):
        # squared distance from every point to its nearest existing centroid
        dist_sq = np.array([min([np.inner(c-x, c-x) for c in centroids]) for x in ds])
        probs = dist_sq/dist_sq.sum()
        cumulative_probs = probs.cumsum()
        # sample the next centroid with probability proportional to dist_sq
        r = np.random.rand()
        for j, p in enumerate(cumulative_probs):
            if r < p:
                i = j
                break
        centroids.append(ds[i])
    return np.array(centroids)
def random(ds, k, random_state=42):
    """
    Create random cluster centroids.

    Parameters
    ----------
    ds : numpy array
        The dataset to be used for centroid initialization.
    k : int
        The desired number of clusters for which centroids are required.

    Returns
    -------
    centroids : numpy array
        Collection of k centroids as a numpy array.
    """
    np.random.seed(random_state)
    centroids = []
    m = np.shape(ds)[0]
    for _ in range(k):
        r = np.random.randint(0, m)  # randint's upper bound is exclusive
        centroids.append(ds[r])
    return np.array(centroids)
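A quick, illustrative sanity check of the two initializers on the toy data (not part of the assignment output); both should return six rows drawn from `data`:

ds = data.numpy()
print(random(ds, 6))     # 6 points sampled uniformly from the dataset
print(plus_plus(ds, 6))  # 6 points spread out by the k-means++ rule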
def plot_data(centroids, data, n_samples):
    fig = plt.figure()
    ax = fig.add_subplot()
    for i, centroid in enumerate(centroids):
        samples = data[i*n_samples:(i+1)*n_samples]
        ax.scatter(samples[:,0], samples[:,1], s=1)
        ax.plot(*centroid, markersize=10, marker="x", color='k', mew=5)
        ax.plot(*centroid, markersize=5, marker="x", color='m', mew=2)

def plot_data3D(centroids, data, n_samples):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(projection='3d')
    for i, centroid in enumerate(centroids):
        samples = data[i*n_samples:(i+1)*n_samples]
        ax.scatter(samples[:,0], samples[:,1], samples[:,2], s=1)
        ax.plot(*centroid, markersize=10, marker="x", color='k', mew=5)
        ax.plot(*centroid, markersize=5, marker="x", color='m', mew=2)
plot_data(centroids, data, 100)
The kernel itself is simple. I define a one-dimensional grid with a one-dimensional block of N threads, where N is the total number of samples to cluster. Each thread computes the distance from its point to every center and keeps the index of the nearest one. The centroid update is done on the CPU.
mod = SourceModule("""
__global__ void cluster_sub(float *dest, float *x, float *y, float *c_x, float *c_y, int N, int K)
{
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    int index = 0;
    if (threadId < N) {
        // distance to centroid 0 initializes the running minimum
        float dist = sqrt(pow(x[threadId] - c_x[0], 2) + pow(y[threadId] - c_y[0], 2));
        for (int j = 1; j < K; j++) {
            float d = sqrt(pow(x[threadId] - c_x[j], 2) + pow(y[threadId] - c_y[j], 2));
            if (d < dist) {
                dist = d;
                index = j;
            }
        }
        // write the index of the nearest centroid once, after the loop
        dest[threadId] = index;
    }
}
""")
cluster_sub = mod.get_function("cluster_sub")
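As a sanity check, the assignment step the kernel performs can be reproduced with NumPy broadcasting. This reference version is my addition, not part of the GPU pipeline, and should agree with `dest`:

def assign_cpu(xs, ys, cxs, cys):
    # (N, K) matrix of squared distances from every point to every centroid;
    # sqrt is omitted since it does not change the argmin
    d2 = (xs[:, None] - cxs[None, :])**2 + (ys[:, None] - cys[None, :])**2
    return d2.argmin(axis=1).astype(np.float32)  # same dtype as dest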
# Random centroids
K = numpy.int32(6)
subset_x = data[:,0].numpy().copy(order='C')
subset_y = data[:,1].numpy().copy(order='C')
N = numpy.int32(subset_x.shape[0])
centroids = random(data.numpy(), K, random_state=42)
centroids_x = centroids[:,0].copy(order='C')
centroids_y = centroids[:,1].copy(order='C')
dest = numpy.zeros_like(subset_x)
for i in range(20):
    cluster_1 = dest                  # previous assignment, for the convergence test
    dest = numpy.zeros_like(subset_x)
    c = numpy.stack([centroids_x, centroids_y], axis=1)
    plot_data(c, data, 100)
    cluster_sub(drv.Out(dest), drv.In(subset_x), drv.In(subset_y), drv.In(centroids_x), drv.In(centroids_y), N, K, block=(int(N),1,1), grid=(1,1))
    if numpy.all(dest == cluster_1):  # assignments unchanged: converged
        break
    else:
        centroids_x = numpy.array([numpy.mean(subset_x[dest==i]) for i in range(K)])
        centroids_y = numpy.array([numpy.mean(subset_y[dest==i]) for i in range(K)])
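One caveat of the plain mean update above: if a cluster ends up empty, `numpy.mean` of an empty slice returns `nan` and that centroid is lost. A guarded per-axis update, sketched here as an alternative (not what the loops in this notebook use), keeps the previous centroid instead:

def update_axis(coords, labels, old, k):
    new = old.copy()
    for i in range(k):
        members = coords[labels == i]
        if len(members) > 0:  # only move centroids that still own points
            new[i] = members.mean()
    return new

# e.g. centroids_x = update_axis(subset_x, dest, centroids_x, int(K))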
# With k-means++
K = numpy.int32(6)
subset_x = data[:,0].numpy().copy(order='C')
subset_y = data[:,1].numpy().copy(order='C')
N = numpy.int32(subset_x.shape[0])
centroids = plus_plus(data.numpy(), K, random_state=5258)
centroids_x = centroids[:,0].copy(order='C')
centroids_y = centroids[:,1].copy(order='C')
dest = numpy.zeros_like(subset_x)
for i in range(20):
    cluster_1 = dest
    dest = numpy.zeros_like(subset_x)
    c = numpy.stack([centroids_x, centroids_y], axis=1)
    plot_data(c, data, 100)
    cluster_sub(drv.Out(dest), drv.In(subset_x), drv.In(subset_y), drv.In(centroids_x), drv.In(centroids_y), N, K, block=(int(N),1,1), grid=(1,1))
    if numpy.all(dest == cluster_1):
        break
    else:
        centroids_x = numpy.array([numpy.mean(subset_x[dest==i]) for i in range(K)])
        centroids_y = numpy.array([numpy.mean(subset_y[dest==i]) for i in range(K)])
The k-means++ initialization performs better, converging in fewer iterations and to better clusters, although random initialization still gives decent results. The main trouble is that two randomly chosen centroids often start too close together, which prevents the algorithm from finding all of the clusters.
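To make "performs better" measurable, the within-cluster sum of squares (inertia) can be computed from the final assignment; lower is better. A minimal sketch for the 2D case, using the variables above (my addition):

def inertia(xs, ys, labels, cxs, cys):
    idx = labels.astype(np.int64)
    # total squared distance from each point to its assigned centroid
    return float(((xs - cxs[idx])**2 + (ys - cys[idx])**2).sum())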
Now I define the kernel for 3D data, and shape the grid so that the input is processed in chunks of 512 threads with as many blocks as needed.
mod = SourceModule("""
__global__ void cluster_sub(float *dest, float *x, float *y, float *z, float *c_x, float *c_y, float *c_z, int N, int K)
{
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    int index = 0;
    if (threadId < N) {
        // distance to centroid 0 initializes the running minimum
        float dist = sqrt(pow(x[threadId] - c_x[0], 2) + pow(y[threadId] - c_y[0], 2) + pow(z[threadId] - c_z[0], 2));
        for (int j = 1; j < K; j++) {
            float d = sqrt(pow(x[threadId] - c_x[j], 2) + pow(y[threadId] - c_y[j], 2) + pow(z[threadId] - c_z[j], 2));
            if (d < dist) {
                dist = d;
                index = j;
            }
        }
        dest[threadId] = index;  // index of the nearest centroid
    }
}
""")
cluster_sub = mod.get_function("cluster_sub")
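The launch configuration follows the usual ceil-division pattern: enough 512-thread blocks to cover all N points, with the `if (threadId < N)` guard masking out the excess threads. A small helper for reference (my addition; the code below computes `grid_x` inline instead):

def grid_1d(n, block=512):
    # smallest number of blocks such that blocks * block >= n
    return ((n + block - 1) // block, 1)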
def sample(m): return MultivariateNormal(m, torch.diag(tensor([5.,7.,10.]))).sample((n_samples,))
n_clusters = 6
n_samples = 200
centroids = torch.rand(n_clusters, 3)*70 - 35
slices = [sample(c) for c in centroids]
data = torch.cat(slices)
plot_data3D(centroids, data, n_samples)
# With k-means++
K = numpy.int32(6)
subset_x = data[:,0].numpy().copy(order='C')
subset_y = data[:,1].numpy().copy(order='C')
subset_z = data[:,2].numpy().copy(order='C')
N = numpy.int32(subset_x.shape[0])
centroids = plus_plus(data.numpy(), K, random_state=5258)
centroids_x = centroids[:,0].copy(order='C')
centroids_y = centroids[:,1].copy(order='C')
centroids_z = centroids[:,2].copy(order='C')
dest = numpy.zeros_like(subset_x)
grid_x = int(N/512) + 1  # enough 512-thread blocks to cover all N points
for i in range(10):
    cluster_1 = dest
    dest = numpy.zeros_like(subset_x)
    c = numpy.stack([centroids_x, centroids_y, centroids_z], axis=1)
    plot_data3D(c, data, 200)
    cluster_sub(drv.Out(dest), drv.In(subset_x), drv.In(subset_y), drv.In(subset_z), drv.In(centroids_x), drv.In(centroids_y), drv.In(centroids_z), N, K, block=(512,1,1), grid=(grid_x,1))
    if numpy.all(dest == cluster_1):
        break
    else:
        centroids_x = numpy.array([numpy.mean(subset_x[dest==i]) for i in range(K)])
        centroids_y = numpy.array([numpy.mean(subset_y[dest==i]) for i in range(K)])
        centroids_z = numpy.array([numpy.mean(subset_z[dest==i]) for i in range(K)])
img = cv.imread('/notebooks/delta.jpg')  # note: OpenCV loads channels in BGR order
img = cv.resize(img, (1000,1000))
plt.imshow(img)
data = img.reshape((-1,3))  # one row per pixel, columns are the B, G, R values
# With k-means++
K = numpy.int32(6)
subset_x = data[:,0].astype(np.float32).copy(order='C')
subset_y = data[:,1].astype(np.float32).copy(order='C')
subset_z = data[:,2].astype(np.float32).copy(order='C')
N = numpy.int32(subset_x.shape[0])
# cast to float first so the distance arithmetic in plus_plus does not wrap around in uint8
centroids = plus_plus(data.astype(np.float32), K, random_state=5258)
centroids_x = centroids[:,0].copy(order='C')
centroids_y = centroids[:,1].copy(order='C')
centroids_z = centroids[:,2].copy(order='C')
dest = numpy.zeros_like(subset_x)
grid_x = int(N/512) + 1
%%time
while True:
    cluster_1 = dest
    dest = numpy.zeros_like(subset_x)
    cluster_sub(drv.Out(dest), drv.In(subset_x), drv.In(subset_y), drv.In(subset_z), drv.In(centroids_x), drv.In(centroids_y), drv.In(centroids_z), N, K, block=(512,1,1), grid=(grid_x,1))
    if numpy.all(dest == cluster_1):
        break
    else:
        centroids_x = numpy.array([numpy.mean(subset_x[dest==i]) for i in range(K)])
        centroids_y = numpy.array([numpy.mean(subset_y[dest==i]) for i in range(K)])
        centroids_z = numpy.array([numpy.mean(subset_z[dest==i]) for i in range(K)])
Below we can see the resulting label image, with each pixel colored according to its assigned cluster.
plt.imshow(dest.reshape(1000,1000))
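Beyond the label image, the labels can be mapped back to the cluster means to obtain the color-quantized image itself; a short sketch using the converged centroids (my addition, values still in BGR order):

palette = np.stack([centroids_x, centroids_y, centroids_z], axis=1).astype(np.uint8)
quantized = palette[dest.astype(np.int64)].reshape(img.shape)  # one mean colour per pixel
plt.imshow(quantized)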
Z = img.reshape((-1,3))
Z = np.float32(Z)
# stop after 10 iterations or when the centers move by less than epsilon = 1.0
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 1.0)
K = 6
%%time
ret, label, center = cv.kmeans(Z, K, None, criteria, 10, cv.KMEANS_RANDOM_CENTERS)
plt.imshow(label.reshape(1000,1000))