kmeans.cpp source code [opencv/modules/core/src/kmeans.cpp]

1	/M///////////////////////////////////////////////////////////////////////////////////////*
2	//
3	// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4	//
5	// By downloading, copying, installing or using the software you agree to this license.
6	// If you do not agree to this license, do not download, install,
7	// copy or use the software.
8	//
9	//
10	// License Agreement
11	// For Open Source Computer Vision Library
12	//
13	// Copyright (C) 2000-2018, Intel Corporation, all rights reserved.
14	// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15	// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16	// Third party copyrights are property of their respective owners.
17	//
18	// Redistribution and use in source and binary forms, with or without modification,
19	// are permitted provided that the following conditions are met:
20	//
21	// Redistribution's of source code must retain the above copyright notice,*
22	// this list of conditions and the following disclaimer.
23	//
24	// Redistribution's in binary form must reproduce the above copyright notice,*
25	// this list of conditions and the following disclaimer in the documentation
26	// and/or other materials provided with the distribution.
27	//
28	// The name of the copyright holders may not be used to endorse or promote products*
29	// derived from this software without specific prior written permission.
30	//
31	// This software is provided by the copyright holders and contributors "as is" and
32	// any express or implied warranties, including, but not limited to, the implied
33	// warranties of merchantability and fitness for a particular purpose are disclaimed.
34	// In no event shall the Intel Corporation or contributors be liable for any direct,
35	// indirect, incidental, special, exemplary, or consequential damages
36	// (including, but not limited to, procurement of substitute goods or services;
37	// loss of use, data, or profits; or business interruption) however caused
38	// and on any theory of liability, whether in contract, strict liability,
39	// or tort (including negligence or otherwise) arising in any way out of
40	// the use of this software, even if advised of the possibility of such damage.
41	//
42	//M/*
43
44	#include "precomp.hpp"
45	#include <opencv2/core/utils/configuration.private.hpp>
46	#include <opencv2/core/hal/hal.hpp>
47
48	////////////////////////////////////////// kmeans ////////////////////////////////////////////
49
50	namespace cv
51	{
52
53	static int CV_KMEANS_PARALLEL_GRANULARITY = (int)utils::getConfigurationParameterSizeT(name: "OPENCV_KMEANS_PARALLEL_GRANULARITY", defaultValue: `1000`);
54
55	static void generateRandomCenter(int dims, const Vec2f* box, float* center, RNG& rng)
56	{
57	float margin = `1.f`/dims;
58	for (int j = `0`; j < dims; j++)
59	center[j] = ((float)rng(`1.f`+margin`2.f`)-margin)*(box[j][`1`] - box[j][`0`]) + box[j][`0`];
60	}
61
62	class KMeansPPDistanceComputer : public ParallelLoopBody
63	{
64	public:
65	KMeansPPDistanceComputer(float tdist2_, const* Mat& data_, const float dist_, int* ci_) :
66	tdist2(tdist2_), data(data_), dist(dist_), ci(ci_)
67	{ }
68
69	void operator()( const cv::Range& range ) const CV_OVERRIDE
70	{
71	CV_TRACE_FUNCTION();
72	const int begin = range.start;
73	const int end = range.end;
74	const int dims = data.cols;
75
76	for (int i = begin; i<end; i++)
77	{
78	tdist2[i] = std::min(a: hal::normL2Sqr_(a: data.ptr<float>(y: i), b: data.ptr<float>(y: ci), n: dims), b: dist[i]);
79	}
80	}
81
82	private:
83	KMeansPPDistanceComputer& operator=(const KMeansPPDistanceComputer&); // = delete
84
85	float *tdist2;
86	const Mat& data;
87	const float *dist;
88	const int ci;
89	};
90
91	/*
92	k-means center initialization using the following algorithm:
93	Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
94	*/
95	static void generateCentersPP(const Mat& data, Mat& _out_centers,
96	int K, RNG& rng, int trials)
97	{
98	CV_TRACE_FUNCTION();
99	const int dims = data.cols, N = data.rows;
100	cv::AutoBuffer<int, `64`> _centers(K);
101	int* centers = &_centers [`0`];
102	cv::AutoBuffer<float, `0`> _dist(N*`3`);
103	float* dist = &_dist [`0`], tdist = dist + N, tdist2 = tdist + N;
104	double sum0 = `0`;
105
106	centers[`0`] = (unsigned)rng % N;
107
108	for (int i = `0`; i < N; i++)
109	{
110	dist[i] = hal::normL2Sqr_(a: data.ptr<float>(y: i), b: data.ptr<float>(y: centers[`0`]), n: dims);
111	sum0 += dist[i];
112	}
113
114	for (int k = `1`; k < K; k++)
115	{
116	double bestSum = DBL_MAX;
117	int bestCenter = -`1`;
118
119	for (int j = `0`; j < trials; j++)
120	{
121	double p = (double)rng*sum0;
122	int ci = `0`;
123	for (; ci < N - `1`; ci++)
124	{
125	p -= dist[ci];
126	if (p <= `0`)
127	break;
128	}
129
130	parallel_for_(range: Range (`0`, N),
131	body: KMeansPPDistanceComputer (tdist2, data, dist, ci),
132	nstripes: (double)divUp(a: (size_t)(dims * N), b: CV_KMEANS_PARALLEL_GRANULARITY));
133	double s = `0`;
134	for (int i = `0`; i < N; i++)
135	{
136	s += tdist2[i];
137	}
138
139	if (s < bestSum)
140	{
141	bestSum = s;
142	bestCenter = ci;
143	std::swap(a&: tdist, b&: tdist2);
144	}
145	}
146	if (bestCenter < `0`)
147	CV_Error(Error::StsNoConv, "kmeans: can't update cluster center (check input for huge or NaN values)");
148	centers[k] = bestCenter;
149	sum0 = bestSum;
150	std::swap(a&: dist, b&: tdist);
151	}
152
153	for (int k = `0`; k < K; k++)
154	{
155	const float* src = data.ptr<float>(y: centers[k]);
156	float* dst = _out_centers.ptr<float>(y: k);
157	for (int j = `0`; j < dims; j++)
158	dst[j] = src[j];
159	}
160	}
161
162	template<bool onlyDistance>
163	class KMeansDistanceComputer : public ParallelLoopBody
164	{
165	public:
166	KMeansDistanceComputer( double *distances_,
167	int *labels_,
168	const Mat& data_,
169	const Mat& centers_)
170	: distances(distances_),
171	labels(labels_),
172	data(data_),
173	centers(centers_)
174	{
175	}
176
177	void operator()(const Range& range) const CV_OVERRIDE
178	{
179	CV_TRACE_FUNCTION();
180	const int begin = range.start;
181	const int end = range.end;
182	const int K = centers.rows;
183	const int dims = centers.cols;
184
185	for (int i = begin; i < end; ++i)
186	{
187	const float sample = data.ptr<float*>(y: i);
188	if (onlyDistance)
189	{
190	const float* center = centers.ptr<float>(y: labels[i]);
191	distances[i] = hal::normL2Sqr_(a: sample, b: center, n: dims);
192	continue;
193	}
194	else
195	{
196	int k_best = `0`;
197	double min_dist = DBL_MAX;
198
199	for (int k = `0`; k < K; k++)
200	{
201	const float* center = centers.ptr<float>(y: k);
202	const double dist = hal::normL2Sqr_(a: sample, b: center, n: dims);
203
204	if (min_dist > dist)
205	{
206	min_dist = dist;
207	k_best = k;
208	}
209	}
210
211	distances[i] = min_dist;
212	labels[i] = k_best;
213	}
214	}
215	}
216
217	private:
218	KMeansDistanceComputer& operator=(const KMeansDistanceComputer&); // = delete
219
220	double *distances;
221	int *labels;
222	const Mat& data;
223	const Mat& centers;
224	};
225
226	}
227
228	double cv::kmeans( InputArray _data, int K,
229	InputOutputArray _bestLabels,
230	TermCriteria criteria, int attempts,
231	int flags, OutputArray _centers )
232	{
233	CV_INSTRUMENT_REGION();
234	const int SPP_TRIALS = `3`;
235	Mat data0 = _data.getMat();
236	const bool isrow = data0.rows == `1`;
237	const int N = isrow ? data0.cols : data0.rows;
238	const int dims = (isrow ? `1` : data0.cols)*data0.channels();
239	const int type = data0.depth();
240
241	attempts = std::max(a: attempts, b: `1`);
242	CV_Assert( data0.dims <= `2` && type == CV_32F && K > `0` );
243	CV_CheckGE(N, K, "There can't be more clusters than elements");
244
245	Mat data(N, dims, CV_32F, data0.ptr(), isrow ? dims * sizeof(float) : static_cast<size_t>(data0.step));
246
247	_bestLabels.create(rows: N, cols: `1`, CV_32S, i: -`1`, allowTransposed: true);
248
249	Mat _labels, best_labels = _bestLabels.getMat();
250	if (flags & CV_KMEANS_USE_INITIAL_LABELS)
251	{
252	CV_Assert( (best_labels.cols == `1` \|\| best_labels.rows == `1`) &&
253	best_labels.cols*best_labels.rows == N &&
254	best_labels.type() == CV_32S &&
255	best_labels.isContinuous());
256	best_labels.reshape(cn: `1`, rows: N).copyTo(m: _labels);
257	for (int i = `0`; i < N; i++)
258	{
259	CV_Assert((unsigned)_labels.at<int>(i) < (unsigned)K);
260	}
261	}
262	else
263	{
264	if (!((best_labels.cols == `1` \|\| best_labels.rows == `1`) &&
265	best_labels.cols*best_labels.rows == N &&
266	best_labels.type() == CV_32S &&
267	best_labels.isContinuous()))
268	{
269	_bestLabels.create(rows: N, cols: `1`, CV_32S);
270	best_labels = _bestLabels.getMat();
271	}
272	_labels.create(size: best_labels.size (), type: best_labels.type());
273	}
274	int* labels = _labels.ptr<int>();
275
276	Mat centers(K, dims, type), old_centers(K, dims, type), temp(`1`, dims, type);
277	cv::AutoBuffer<int, `64`> counters(K);
278	cv::AutoBuffer<double, `64`> dists(N);
279	RNG& rng = theRNG();
280
281	if (criteria.type & TermCriteria::EPS)
282	criteria.epsilon = std::max(a: criteria.epsilon, b: `0.`);
283	else
284	criteria.epsilon = FLT_EPSILON;
285	criteria.epsilon *= criteria.epsilon;
286
287	if (criteria.type & TermCriteria::COUNT)
288	criteria.maxCount = std::min(a: std::max(a: criteria.maxCount, b: `2`), b: `100`);
289	else
290	criteria.maxCount = `100`;
291
292	if (K == `1`)
293	{
294	attempts = `1`;
295	criteria.maxCount = `2`;
296	}
297
298	cv::AutoBuffer<Vec2f, `64`> box(dims);
299	if (!(flags & KMEANS_PP_CENTERS))
300	{
301	{
302	const float* sample = data.ptr<float>(y: `0`);
303	for (int j = `0`; j < dims; j++)
304	box [j] = Vec2f (sample[j], sample[j]);
305	}
306	for (int i = `1`; i < N; i++)
307	{
308	const float* sample = data.ptr<float>(y: i);
309	for (int j = `0`; j < dims; j++)
310	{
311	float v = sample[j];
312	box [j][`0`] = std::min(a: box [j][`0`], b: v);
313	box [j][`1`] = std::max(a: box [j][`1`], b: v);
314	}
315	}
316	}
317
318	double best_compactness = DBL_MAX;
319	for (int a = `0`; a < attempts; a++)
320	{
321	double compactness = `0`;
322
323	for (int iter = `0`; ;)
324	{
325	double max_center_shift = iter == `0` ? DBL_MAX : `0.0`;
326
327	swap(a&: centers, b&: old_centers);
328
329	if (iter == `0` && (a > `0` \|\| !(flags & KMEANS_USE_INITIAL_LABELS)))
330	{
331	if (flags & KMEANS_PP_CENTERS)
332	generateCentersPP(data, out_centers&: centers, K, rng, trials: SPP_TRIALS);
333	else
334	{
335	for (int k = `0`; k < K; k++)
336	generateRandomCenter(dims, box: box.data(), center: centers.ptr<float>(y: k), rng);
337	}
338	}
339	else
340	{
341	// compute centers
342	centers = Scalar (`0`);
343	for (int k = `0`; k < K; k++)
344	counters [k] = `0`;
345
346	for (int i = `0`; i < N; i++)
347	{
348	const float* sample = data.ptr<float>(y: i);
349	int k = labels[i];
350	float* center = centers.ptr<float>(y: k);
351	for (int j = `0`; j < dims; j++)
352	center[j] += sample[j];
353	counters [k]++;
354	}
355
356	for (int k = `0`; k < K; k++)
357	{
358	if (counters [k] != `0`)
359	continue;
360
361	// if some cluster appeared to be empty then:
362	// 1. find the biggest cluster
363	// 2. find the farthest from the center point in the biggest cluster
364	// 3. exclude the farthest point from the biggest cluster and form a new 1-point cluster.
365	int max_k = `0`;
366	for (int k1 = `1`; k1 < K; k1++)
367	{
368	if (counters [max_k] < counters [k1])
369	max_k = k1;
370	}
371
372	double max_dist = `0`;
373	int farthest_i = -`1`;
374	float* base_center = centers.ptr<float>(y: max_k);
375	float* _base_center = temp.ptr<float>(); // normalized
376	float scale = `1.f`/counters [max_k];
377	for (int j = `0`; j < dims; j++)
378	_base_center[j] = base_center[j]*scale;
379
380	for (int i = `0`; i < N; i++)
381	{
382	if (labels[i] != max_k)
383	continue;
384	const float* sample = data.ptr<float>(y: i);
385	double dist = hal::normL2Sqr_(a: sample, b: _base_center, n: dims);
386
387	if (max_dist <= dist)
388	{
389	max_dist = dist;
390	farthest_i = i;
391	}
392	}
393
394	counters [max_k]--;
395	counters [k]++;
396	labels[farthest_i] = k;
397
398	const float* sample = data.ptr<float>(y: farthest_i);
399	float* cur_center = centers.ptr<float>(y: k);
400	for (int j = `0`; j < dims; j++)
401	{
402	base_center[j] -= sample[j];
403	cur_center[j] += sample[j];
404	}
405	}
406
407	for (int k = `0`; k < K; k++)
408	{
409	float* center = centers.ptr<float>(y: k);
410	CV_Assert( counters[k] != `0` );
411
412	float scale = `1.f`/counters [k];
413	for (int j = `0`; j < dims; j++)
414	center[j] *= scale;
415
416	if (iter > `0`)
417	{
418	double dist = `0`;
419	const float* old_center = old_centers.ptr<float>(y: k);
420	for (int j = `0`; j < dims; j++)
421	{
422	double t = center[j] - old_center[j];
423	dist += t*t;
424	}
425	max_center_shift = std::max(a: max_center_shift, b: dist);
426	}
427	}
428	}
429
430	bool isLastIter = (++iter == MAX(criteria.maxCount, `2`) \|\| max_center_shift <= criteria.epsilon);
431
432	if (isLastIter)
433	{
434	// don't re-assign labels to avoid creation of empty clusters
435	parallel_for_(range: Range (`0`, N), body: KMeansDistanceComputer<true>(dists.data(), labels, data, centers), nstripes: (double)divUp(a: (size_t)(dims * N), b: CV_KMEANS_PARALLEL_GRANULARITY));
436	compactness = sum(src: Mat (Size (N, `1`), CV_64F, &dists [`0`]))[`0`];
437	break;
438	}
439	else
440	{
441	// assign labels
442	parallel_for_(range: Range (`0`, N), body: KMeansDistanceComputer<false>(dists.data(), labels, data, centers), nstripes: (double)divUp(a: (size_t)(dims * N * K), b: CV_KMEANS_PARALLEL_GRANULARITY));
443	}
444	}
445
446	if (compactness < best_compactness)
447	{
448	best_compactness = compactness;
449	if (_centers.needed())
450	{
451	if (_centers.fixedType() && _centers.channels() == dims)
452	centers.reshape(cn: dims).copyTo(m: _centers);
453	else
454	centers.copyTo(m: _centers);
455	}
456	_labels.copyTo(m: best_labels);
457	}
458	}
459
460	return best_compactness;
461	}
462

source code of opencv/modules/core/src/kmeans.cpp