1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5// By downloading, copying, installing or using the software you agree to this license.
6// If you do not agree to this license, do not download, install,
7// copy or use the software.
8//
9//
10// License Agreement
11// For Open Source Computer Vision Library
12//
13// Copyright (C) 2000-2018, Intel Corporation, all rights reserved.
14// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16// Third party copyrights are property of their respective owners.
17//
18// Redistribution and use in source and binary forms, with or without modification,
19// are permitted provided that the following conditions are met:
20//
21// * Redistribution's of source code must retain the above copyright notice,
22// this list of conditions and the following disclaimer.
23//
24// * Redistribution's in binary form must reproduce the above copyright notice,
25// this list of conditions and the following disclaimer in the documentation
26// and/or other materials provided with the distribution.
27//
28// * The name of the copyright holders may not be used to endorse or promote products
29// derived from this software without specific prior written permission.
30//
31// This software is provided by the copyright holders and contributors "as is" and
32// any express or implied warranties, including, but not limited to, the implied
33// warranties of merchantability and fitness for a particular purpose are disclaimed.
34// In no event shall the Intel Corporation or contributors be liable for any direct,
35// indirect, incidental, special, exemplary, or consequential damages
36// (including, but not limited to, procurement of substitute goods or services;
37// loss of use, data, or profits; or business interruption) however caused
38// and on any theory of liability, whether in contract, strict liability,
39// or tort (including negligence or otherwise) arising in any way out of
40// the use of this software, even if advised of the possibility of such damage.
41//
42//M*/
43
44#include "precomp.hpp"
45#include <opencv2/core/utils/configuration.private.hpp>
46#include <opencv2/core/hal/hal.hpp>
47
48////////////////////////////////////////// kmeans ////////////////////////////////////////////
49
50namespace cv
51{
52
53static int CV_KMEANS_PARALLEL_GRANULARITY = (int)utils::getConfigurationParameterSizeT(name: "OPENCV_KMEANS_PARALLEL_GRANULARITY", defaultValue: 1000);
54
55static void generateRandomCenter(int dims, const Vec2f* box, float* center, RNG& rng)
56{
57 float margin = 1.f/dims;
58 for (int j = 0; j < dims; j++)
59 center[j] = ((float)rng*(1.f+margin*2.f)-margin)*(box[j][1] - box[j][0]) + box[j][0];
60}
61
62class KMeansPPDistanceComputer : public ParallelLoopBody
63{
64public:
65 KMeansPPDistanceComputer(float *tdist2_, const Mat& data_, const float *dist_, int ci_) :
66 tdist2(tdist2_), data(data_), dist(dist_), ci(ci_)
67 { }
68
69 void operator()( const cv::Range& range ) const CV_OVERRIDE
70 {
71 CV_TRACE_FUNCTION();
72 const int begin = range.start;
73 const int end = range.end;
74 const int dims = data.cols;
75
76 for (int i = begin; i<end; i++)
77 {
78 tdist2[i] = std::min(a: hal::normL2Sqr_(a: data.ptr<float>(y: i), b: data.ptr<float>(y: ci), n: dims), b: dist[i]);
79 }
80 }
81
82private:
83 KMeansPPDistanceComputer& operator=(const KMeansPPDistanceComputer&); // = delete
84
85 float *tdist2;
86 const Mat& data;
87 const float *dist;
88 const int ci;
89};
90
91/*
92k-means center initialization using the following algorithm:
93Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
94*/
95static void generateCentersPP(const Mat& data, Mat& _out_centers,
96 int K, RNG& rng, int trials)
97{
98 CV_TRACE_FUNCTION();
99 const int dims = data.cols, N = data.rows;
100 cv::AutoBuffer<int, 64> _centers(K);
101 int* centers = &_centers[0];
102 cv::AutoBuffer<float, 0> _dist(N*3);
103 float* dist = &_dist[0], *tdist = dist + N, *tdist2 = tdist + N;
104 double sum0 = 0;
105
106 centers[0] = (unsigned)rng % N;
107
108 for (int i = 0; i < N; i++)
109 {
110 dist[i] = hal::normL2Sqr_(a: data.ptr<float>(y: i), b: data.ptr<float>(y: centers[0]), n: dims);
111 sum0 += dist[i];
112 }
113
114 for (int k = 1; k < K; k++)
115 {
116 double bestSum = DBL_MAX;
117 int bestCenter = -1;
118
119 for (int j = 0; j < trials; j++)
120 {
121 double p = (double)rng*sum0;
122 int ci = 0;
123 for (; ci < N - 1; ci++)
124 {
125 p -= dist[ci];
126 if (p <= 0)
127 break;
128 }
129
130 parallel_for_(range: Range(0, N),
131 body: KMeansPPDistanceComputer(tdist2, data, dist, ci),
132 nstripes: (double)divUp(a: (size_t)(dims * N), b: CV_KMEANS_PARALLEL_GRANULARITY));
133 double s = 0;
134 for (int i = 0; i < N; i++)
135 {
136 s += tdist2[i];
137 }
138
139 if (s < bestSum)
140 {
141 bestSum = s;
142 bestCenter = ci;
143 std::swap(a&: tdist, b&: tdist2);
144 }
145 }
146 if (bestCenter < 0)
147 CV_Error(Error::StsNoConv, "kmeans: can't update cluster center (check input for huge or NaN values)");
148 centers[k] = bestCenter;
149 sum0 = bestSum;
150 std::swap(a&: dist, b&: tdist);
151 }
152
153 for (int k = 0; k < K; k++)
154 {
155 const float* src = data.ptr<float>(y: centers[k]);
156 float* dst = _out_centers.ptr<float>(y: k);
157 for (int j = 0; j < dims; j++)
158 dst[j] = src[j];
159 }
160}
161
162template<bool onlyDistance>
163class KMeansDistanceComputer : public ParallelLoopBody
164{
165public:
166 KMeansDistanceComputer( double *distances_,
167 int *labels_,
168 const Mat& data_,
169 const Mat& centers_)
170 : distances(distances_),
171 labels(labels_),
172 data(data_),
173 centers(centers_)
174 {
175 }
176
177 void operator()(const Range& range) const CV_OVERRIDE
178 {
179 CV_TRACE_FUNCTION();
180 const int begin = range.start;
181 const int end = range.end;
182 const int K = centers.rows;
183 const int dims = centers.cols;
184
185 for (int i = begin; i < end; ++i)
186 {
187 const float *sample = data.ptr<float>(y: i);
188 if (onlyDistance)
189 {
190 const float* center = centers.ptr<float>(y: labels[i]);
191 distances[i] = hal::normL2Sqr_(a: sample, b: center, n: dims);
192 continue;
193 }
194 else
195 {
196 int k_best = 0;
197 double min_dist = DBL_MAX;
198
199 for (int k = 0; k < K; k++)
200 {
201 const float* center = centers.ptr<float>(y: k);
202 const double dist = hal::normL2Sqr_(a: sample, b: center, n: dims);
203
204 if (min_dist > dist)
205 {
206 min_dist = dist;
207 k_best = k;
208 }
209 }
210
211 distances[i] = min_dist;
212 labels[i] = k_best;
213 }
214 }
215 }
216
217private:
218 KMeansDistanceComputer& operator=(const KMeansDistanceComputer&); // = delete
219
220 double *distances;
221 int *labels;
222 const Mat& data;
223 const Mat& centers;
224};
225
226}
227
228double cv::kmeans( InputArray _data, int K,
229 InputOutputArray _bestLabels,
230 TermCriteria criteria, int attempts,
231 int flags, OutputArray _centers )
232{
233 CV_INSTRUMENT_REGION();
234 const int SPP_TRIALS = 3;
235 Mat data0 = _data.getMat();
236 const bool isrow = data0.rows == 1;
237 const int N = isrow ? data0.cols : data0.rows;
238 const int dims = (isrow ? 1 : data0.cols)*data0.channels();
239 const int type = data0.depth();
240
241 attempts = std::max(a: attempts, b: 1);
242 CV_Assert( data0.dims <= 2 && type == CV_32F && K > 0 );
243 CV_CheckGE(N, K, "There can't be more clusters than elements");
244
245 Mat data(N, dims, CV_32F, data0.ptr(), isrow ? dims * sizeof(float) : static_cast<size_t>(data0.step));
246
247 _bestLabels.create(rows: N, cols: 1, CV_32S, i: -1, allowTransposed: true);
248
249 Mat _labels, best_labels = _bestLabels.getMat();
250 if (flags & CV_KMEANS_USE_INITIAL_LABELS)
251 {
252 CV_Assert( (best_labels.cols == 1 || best_labels.rows == 1) &&
253 best_labels.cols*best_labels.rows == N &&
254 best_labels.type() == CV_32S &&
255 best_labels.isContinuous());
256 best_labels.reshape(cn: 1, rows: N).copyTo(m: _labels);
257 for (int i = 0; i < N; i++)
258 {
259 CV_Assert((unsigned)_labels.at<int>(i) < (unsigned)K);
260 }
261 }
262 else
263 {
264 if (!((best_labels.cols == 1 || best_labels.rows == 1) &&
265 best_labels.cols*best_labels.rows == N &&
266 best_labels.type() == CV_32S &&
267 best_labels.isContinuous()))
268 {
269 _bestLabels.create(rows: N, cols: 1, CV_32S);
270 best_labels = _bestLabels.getMat();
271 }
272 _labels.create(size: best_labels.size(), type: best_labels.type());
273 }
274 int* labels = _labels.ptr<int>();
275
276 Mat centers(K, dims, type), old_centers(K, dims, type), temp(1, dims, type);
277 cv::AutoBuffer<int, 64> counters(K);
278 cv::AutoBuffer<double, 64> dists(N);
279 RNG& rng = theRNG();
280
281 if (criteria.type & TermCriteria::EPS)
282 criteria.epsilon = std::max(a: criteria.epsilon, b: 0.);
283 else
284 criteria.epsilon = FLT_EPSILON;
285 criteria.epsilon *= criteria.epsilon;
286
287 if (criteria.type & TermCriteria::COUNT)
288 criteria.maxCount = std::min(a: std::max(a: criteria.maxCount, b: 2), b: 100);
289 else
290 criteria.maxCount = 100;
291
292 if (K == 1)
293 {
294 attempts = 1;
295 criteria.maxCount = 2;
296 }
297
298 cv::AutoBuffer<Vec2f, 64> box(dims);
299 if (!(flags & KMEANS_PP_CENTERS))
300 {
301 {
302 const float* sample = data.ptr<float>(y: 0);
303 for (int j = 0; j < dims; j++)
304 box[j] = Vec2f(sample[j], sample[j]);
305 }
306 for (int i = 1; i < N; i++)
307 {
308 const float* sample = data.ptr<float>(y: i);
309 for (int j = 0; j < dims; j++)
310 {
311 float v = sample[j];
312 box[j][0] = std::min(a: box[j][0], b: v);
313 box[j][1] = std::max(a: box[j][1], b: v);
314 }
315 }
316 }
317
318 double best_compactness = DBL_MAX;
319 for (int a = 0; a < attempts; a++)
320 {
321 double compactness = 0;
322
323 for (int iter = 0; ;)
324 {
325 double max_center_shift = iter == 0 ? DBL_MAX : 0.0;
326
327 swap(a&: centers, b&: old_centers);
328
329 if (iter == 0 && (a > 0 || !(flags & KMEANS_USE_INITIAL_LABELS)))
330 {
331 if (flags & KMEANS_PP_CENTERS)
332 generateCentersPP(data, out_centers&: centers, K, rng, trials: SPP_TRIALS);
333 else
334 {
335 for (int k = 0; k < K; k++)
336 generateRandomCenter(dims, box: box.data(), center: centers.ptr<float>(y: k), rng);
337 }
338 }
339 else
340 {
341 // compute centers
342 centers = Scalar(0);
343 for (int k = 0; k < K; k++)
344 counters[k] = 0;
345
346 for (int i = 0; i < N; i++)
347 {
348 const float* sample = data.ptr<float>(y: i);
349 int k = labels[i];
350 float* center = centers.ptr<float>(y: k);
351 for (int j = 0; j < dims; j++)
352 center[j] += sample[j];
353 counters[k]++;
354 }
355
356 for (int k = 0; k < K; k++)
357 {
358 if (counters[k] != 0)
359 continue;
360
361 // if some cluster appeared to be empty then:
362 // 1. find the biggest cluster
363 // 2. find the farthest from the center point in the biggest cluster
364 // 3. exclude the farthest point from the biggest cluster and form a new 1-point cluster.
365 int max_k = 0;
366 for (int k1 = 1; k1 < K; k1++)
367 {
368 if (counters[max_k] < counters[k1])
369 max_k = k1;
370 }
371
372 double max_dist = 0;
373 int farthest_i = -1;
374 float* base_center = centers.ptr<float>(y: max_k);
375 float* _base_center = temp.ptr<float>(); // normalized
376 float scale = 1.f/counters[max_k];
377 for (int j = 0; j < dims; j++)
378 _base_center[j] = base_center[j]*scale;
379
380 for (int i = 0; i < N; i++)
381 {
382 if (labels[i] != max_k)
383 continue;
384 const float* sample = data.ptr<float>(y: i);
385 double dist = hal::normL2Sqr_(a: sample, b: _base_center, n: dims);
386
387 if (max_dist <= dist)
388 {
389 max_dist = dist;
390 farthest_i = i;
391 }
392 }
393
394 counters[max_k]--;
395 counters[k]++;
396 labels[farthest_i] = k;
397
398 const float* sample = data.ptr<float>(y: farthest_i);
399 float* cur_center = centers.ptr<float>(y: k);
400 for (int j = 0; j < dims; j++)
401 {
402 base_center[j] -= sample[j];
403 cur_center[j] += sample[j];
404 }
405 }
406
407 for (int k = 0; k < K; k++)
408 {
409 float* center = centers.ptr<float>(y: k);
410 CV_Assert( counters[k] != 0 );
411
412 float scale = 1.f/counters[k];
413 for (int j = 0; j < dims; j++)
414 center[j] *= scale;
415
416 if (iter > 0)
417 {
418 double dist = 0;
419 const float* old_center = old_centers.ptr<float>(y: k);
420 for (int j = 0; j < dims; j++)
421 {
422 double t = center[j] - old_center[j];
423 dist += t*t;
424 }
425 max_center_shift = std::max(a: max_center_shift, b: dist);
426 }
427 }
428 }
429
430 bool isLastIter = (++iter == MAX(criteria.maxCount, 2) || max_center_shift <= criteria.epsilon);
431
432 if (isLastIter)
433 {
434 // don't re-assign labels to avoid creation of empty clusters
435 parallel_for_(range: Range(0, N), body: KMeansDistanceComputer<true>(dists.data(), labels, data, centers), nstripes: (double)divUp(a: (size_t)(dims * N), b: CV_KMEANS_PARALLEL_GRANULARITY));
436 compactness = sum(src: Mat(Size(N, 1), CV_64F, &dists[0]))[0];
437 break;
438 }
439 else
440 {
441 // assign labels
442 parallel_for_(range: Range(0, N), body: KMeansDistanceComputer<false>(dists.data(), labels, data, centers), nstripes: (double)divUp(a: (size_t)(dims * N * K), b: CV_KMEANS_PARALLEL_GRANULARITY));
443 }
444 }
445
446 if (compactness < best_compactness)
447 {
448 best_compactness = compactness;
449 if (_centers.needed())
450 {
451 if (_centers.fixedType() && _centers.channels() == dims)
452 centers.reshape(cn: dims).copyTo(m: _centers);
453 else
454 centers.copyTo(m: _centers);
455 }
456 _labels.copyTo(m: best_labels);
457 }
458 }
459
460 return best_compactness;
461}
462

source code of opencv/modules/core/src/kmeans.cpp