Gần đây tôi cần điều này cho một bộ dữ liệu không lớn. Câu trả lời của tôi, mặc dù nó có thời gian chạy tương đối dài, được đảm bảo để hội tụ đến mức tối ưu cục bộ.
def eqsc(X, K=None, G=None):
"equal-size clustering based on data exchanges between pairs of clusters"
from scipy.spatial.distance import pdist, squareform
from matplotlib import pyplot as plt
from matplotlib import animation as ani
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
def error(K, m, D):
"""return average distances between data in one cluster, averaged over all clusters"""
E = 0
for k in range(K):
i = numpy.where(m == k)[0] # indeces of datapoints belonging to class k
E += numpy.mean(D[numpy.meshgrid(i,i)])
return E / K
numpy.random.seed(0) # repeatability
N, n = X.shape
if G is None and K is not None:
G = N // K # group size
elif K is None and G is not None:
K = N // G # number of clusters
else:
raise Exception('must specify either K or G')
D = squareform(pdist(X)) # distance matrix
m = numpy.random.permutation(N) % K # initial membership
E = error(K, m, D)
# visualization
#FFMpegWriter = ani.writers['ffmpeg']
#writer = FFMpegWriter(fps=15)
#fig = plt.figure()
#with writer.saving(fig, "ec.mp4", 100):
t = 1
while True:
E_p = E
for a in range(N): # systematically
for b in range(a):
m[a], m[b] = m[b], m[a] # exchange membership
E_t = error(K, m, D)
if E_t < E:
E = E_t
print("{}: {}<->{} E={}".format(t, a, b, E))
#plt.clf()
#for i in range(N):
#plt.text(X[i,0], X[i,1], m[i])
#writer.grab_frame()
else:
m[a], m[b] = m[b], m[a] # put them back
if E_p == E:
break
t += 1
fig, ax = plt.subplots()
patches = []
for k in range(K):
i = numpy.where(m == k)[0] # indeces of datapoints belonging to class k
x = X[i]
patches.append(Polygon(x[:,:2], True)) # how to draw this clock-wise?
u = numpy.mean(x, 0)
plt.text(u[0], u[1], k)
p = PatchCollection(patches, alpha=0.5)
ax.add_collection(p)
plt.show()
if __name__ == "__main__":
N, n = 100, 2
X = numpy.random.rand(N, n)
eqsc(X, G=3)