Slide 21
Slide 21 text
// seedsError is the error type seeds returns when its arguments are unusable.
// It is a plain string so that no extra package import is required.
type seedsError string

// Error implements the error interface.
func (e seedsError) Error() string { return string(e) }

// seeds returns the initial centroids for k-means, chosen k-means++ style.
// numCluster is the number of clusters and data is the set of entities to be
// classified. Spreading the initial centroids apart helps avoid poor local
// optima and speeds up convergence.
// SEE ALSO: http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf
//
// The first centroid is drawn uniformly at random from data. Each following
// centroid is drawn from data with probability proportional to the second
// value returned by nearest for that entity against the centroids chosen so
// far.
// NOTE(review): k-means++ weights by the squared distance to the closest
// centroid; confirm that nearest returns a squared distance.
//
// The returned slices alias the Vector fields of the selected entities; they
// are not copied. An error is returned when numCluster is zero or data is
// empty.
func seeds(numCluster uint64, data []*entityIdeaVector.Entity) ([][]float64, error) {
	if numCluster == 0 {
		return nil, seedsError("seeds: numCluster must be greater than zero")
	}
	if len(data) == 0 {
		return nil, seedsError("seeds: data must not be empty")
	}

	centers := make([][]float64, numCluster)
	centers[0] = data[rand.Intn(len(data))].Vector

	// Every element is overwritten on each round, so one buffer is enough.
	dist := make([]float64, len(data))
	for i := uint64(1); i < numCluster; i++ {
		// Only the first i centroids have been chosen; the remaining slots are
		// still nil and must not take part in the nearest-centroid search.
		chosen := centers[:i]

		var sum float64
		for k, v := range data {
			_, dist[k] = nearest(chosen, v.Vector)
			sum += dist[k]
		}

		// Roulette-wheel selection: pick the first entity whose cumulative
		// weight exceeds a random threshold in [0, sum).
		threshold := rand.Float64() * sum

		// Fall back to the last entity if the scan never crosses the threshold
		// (for example when floating-point rounding makes threshold == sum, or
		// when every weight is zero).
		picked := len(data) - 1
		var acc float64
		for k, d := range dist {
			acc += d
			if threshold < acc {
				picked = k
				break
			}
		}
		centers[i] = data[picked].Vector
	}
	return centers, nil
}