1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html.
4//
5// Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
6// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
7// Copyright (C) 2020, Intel Corporation, all rights reserved.
8
9/**********************************************************************************************\
10 Implementation of SIFT is based on the code from http://blogs.oregonstate.edu/hess/code/sift/
11 Below is the original copyright.
12 Patent US6711293 expired in March 2020.
13
14// Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
15// All rights reserved.
16
17// The following patent has been issued for methods embodied in this
18// software: "Method and apparatus for identifying scale invariant features
19// in an image and use of same for locating an object in an image," David
20// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
21// filed March 8, 1999. Asignee: The University of British Columbia. For
22// further details, contact David Lowe (lowe@cs.ubc.ca) or the
23// University-Industry Liaison Office of the University of British
24// Columbia.
25
26// Note that restrictions imposed by this patent (and possibly others)
27// exist independently of and may be in conflict with the freedoms granted
28// in this license, which refers to copyright of the program, not patents
29// for any methods that it implements. Both copyright and patent law must
30// be obeyed to legally use and redistribute this program and it is not the
31// purpose of this license to induce you to infringe any patents or other
32// property right claims or to contest validity of any such claims. If you
33// redistribute or use the program, then this license merely protects you
34// from committing copyright infringement. It does not protect you from
35// committing patent infringement. So, before you do anything with this
36// program, make sure that you have permission to do so not merely in terms
37// of copyright, but also in terms of patent law.
38
39// Please note that this license is not to be understood as a guarantee
40// either. If you use the program according to this license, but in
41// conflict with patent law, it does not mean that the licensor will refund
42// you for any losses that you incur if you are sued for your patent
43// infringement.
44
45// Redistribution and use in source and binary forms, with or without
46// modification, are permitted provided that the following conditions are
47// met:
48// * Redistributions of source code must retain the above copyright and
49// patent notices, this list of conditions and the following
50// disclaimer.
51// * Redistributions in binary form must reproduce the above copyright
52// notice, this list of conditions and the following disclaimer in
53// the documentation and/or other materials provided with the
54// distribution.
55// * Neither the name of Oregon State University nor the names of its
56// contributors may be used to endorse or promote products derived
57// from this software without specific prior written permission.
58
59// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
60// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
61// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
62// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
63// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
64// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
65// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
66// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
67// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
68// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
69// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
70\**********************************************************************************************/
71
72#include "precomp.hpp"
73
74#include <opencv2/core/hal/hal.hpp>
75#include "opencv2/core/hal/intrin.hpp"
76#include <opencv2/core/utils/buffer_area.private.hpp>
77
78namespace cv {
79
#if !defined(CV_CPU_DISPATCH_MODE) || !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY)
/******************************* Defs and macros *****************************/

// default width of descriptor histogram array
static const int SIFT_DESCR_WIDTH = 4;

// default number of bins per histogram in descriptor array
static const int SIFT_DESCR_HIST_BINS = 8;

// assumed gaussian blur for input image
static const float SIFT_INIT_SIGMA = 0.5f;

// width of border in which to ignore keypoints
static const int SIFT_IMG_BORDER = 5;

// maximum steps of keypoint interpolation before failure
static const int SIFT_MAX_INTERP_STEPS = 5;

// default number of bins in histogram for orientation assignment
static const int SIFT_ORI_HIST_BINS = 36;

// determines gaussian sigma for orientation assignment
static const float SIFT_ORI_SIG_FCTR = 1.5f;

// determines the radius of the region used in orientation assignment
static const float SIFT_ORI_RADIUS = 4.5f; // 3 * SIFT_ORI_SIG_FCTR;

// orientation magnitude relative to max that results in new feature
static const float SIFT_ORI_PEAK_RATIO = 0.8f;

// determines the size of a single descriptor orientation histogram
static const float SIFT_DESCR_SCL_FCTR = 3.f;

// threshold on magnitude of elements of descriptor vector
static const float SIFT_DESCR_MAG_THR = 0.2f;

// factor used to convert floating-point descriptor to unsigned char
static const float SIFT_INT_DESCR_FCTR = 512.f;

#define DoG_TYPE_SHORT 0
#if DoG_TYPE_SHORT
// intermediate type used for DoG pyramids
typedef short sift_wt;
static const int SIFT_FIXPT_SCALE = 48;
#else
// intermediate type used for DoG pyramids
typedef float sift_wt;
static const int SIFT_FIXPT_SCALE = 1;
#endif

#endif // definitions and macros
131
132
133CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
134
135void findScaleSpaceExtrema(
136 int octave,
137 int layer,
138 int threshold,
139 int idx,
140 int step,
141 int cols,
142 int nOctaveLayers,
143 double contrastThreshold,
144 double edgeThreshold,
145 double sigma,
146 const std::vector<Mat>& gauss_pyr,
147 const std::vector<Mat>& dog_pyr,
148 std::vector<KeyPoint>& kpts,
149 const cv::Range& range);
150
151void calcSIFTDescriptor(
152 const Mat& img, Point2f ptf, float ori, float scl,
153 const int d, const int n, Mat& dst, int row
154);
155
156
157#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
158
159// Computes a gradient orientation histogram at a specified pixel
160static
161float calcOrientationHist(
162 const Mat& img, Point pt, int radius,
163 float sigma, float* hist, int n
164)
165{
166 CV_TRACE_FUNCTION();
167
168 int i, j, k, len = (radius*2+1)*(radius*2+1);
169
170 float expf_scale = -1.f/(2.f * sigma * sigma);
171
172 cv::utils::BufferArea area;
173 float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *temphist = 0;
174 area.allocate(ptr&: X, count: len, CV_SIMD_WIDTH);
175 area.allocate(ptr&: Y, count: len, CV_SIMD_WIDTH);
176 area.allocate(ptr&: Ori, count: len, CV_SIMD_WIDTH);
177 area.allocate(ptr&: W, count: len, CV_SIMD_WIDTH);
178 area.allocate(ptr&: temphist, count: n+4, CV_SIMD_WIDTH);
179 area.commit();
180 temphist += 2;
181 Mag = X;
182
183 for( i = 0; i < n; i++ )
184 temphist[i] = 0.f;
185
186 for( i = -radius, k = 0; i <= radius; i++ )
187 {
188 int y = pt.y + i;
189 if( y <= 0 || y >= img.rows - 1 )
190 continue;
191 for( j = -radius; j <= radius; j++ )
192 {
193 int x = pt.x + j;
194 if( x <= 0 || x >= img.cols - 1 )
195 continue;
196
197 float dx = (float)(img.at<sift_wt>(i0: y, i1: x+1) - img.at<sift_wt>(i0: y, i1: x-1));
198 float dy = (float)(img.at<sift_wt>(i0: y-1, i1: x) - img.at<sift_wt>(i0: y+1, i1: x));
199
200 X[k] = dx; Y[k] = dy; W[k] = (i*i + j*j)*expf_scale;
201 k++;
202 }
203 }
204
205 len = k;
206
207 // compute gradient values, orientations and the weights over the pixel neighborhood
208 cv::hal::exp32f(src: W, dst: W, n: len);
209 cv::hal::fastAtan2(y: Y, x: X, dst: Ori, n: len, angleInDegrees: true);
210 cv::hal::magnitude32f(x: X, y: Y, dst: Mag, n: len);
211
212 k = 0;
213#if (CV_SIMD || CV_SIMD_SCALABLE)
214 const int vecsize = VTraits<v_float32>::vlanes();
215 v_float32 nd360 = vx_setall_f32(v: n/360.f);
216 v_int32 __n = vx_setall_s32(v: n);
217 int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[VTraits<v_float32>::max_nlanes];
218 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[VTraits<v_float32>::max_nlanes];
219
220 for( ; k <= len - vecsize; k += vecsize )
221 {
222 v_float32 w = vx_load_aligned( ptr: W + k );
223 v_float32 mag = vx_load_aligned( ptr: Mag + k );
224 v_float32 ori = vx_load_aligned( ptr: Ori + k );
225 v_int32 bin = v_round( a: v_mul(a: nd360, b: ori) );
226
227 bin = v_select(mask: v_ge(a: bin, b: __n), a: v_sub(a: bin, b: __n), b: bin);
228 bin = v_select(mask: v_lt(a: bin, b: vx_setzero_s32()), a: v_add(a: bin, b: __n), b: bin);
229
230 w = v_mul(a: w, b: mag);
231 v_store_aligned(ptr: bin_buf, a: bin);
232 v_store_aligned(ptr: w_mul_mag_buf, a: w);
233 for(int vi = 0; vi < vecsize; vi++)
234 {
235 temphist[bin_buf[vi]] += w_mul_mag_buf[vi];
236 }
237 }
238#endif
239 for( ; k < len; k++ )
240 {
241 int bin = cvRound(value: (n/360.f)*Ori[k]);
242 if( bin >= n )
243 bin -= n;
244 if( bin < 0 )
245 bin += n;
246 temphist[bin] += W[k]*Mag[k];
247 }
248
249 // smooth the histogram
250 temphist[-1] = temphist[n-1];
251 temphist[-2] = temphist[n-2];
252 temphist[n] = temphist[0];
253 temphist[n+1] = temphist[1];
254
255 i = 0;
256#if (CV_SIMD || CV_SIMD_SCALABLE)
257 v_float32 d_1_16 = vx_setall_f32(v: 1.f/16.f);
258 v_float32 d_4_16 = vx_setall_f32(v: 4.f/16.f);
259 v_float32 d_6_16 = vx_setall_f32(v: 6.f/16.f);
260 for( ; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
261 {
262 v_float32 tn2 = vx_load_aligned(ptr: temphist + i-2);
263 v_float32 tn1 = vx_load(ptr: temphist + i-1);
264 v_float32 t0 = vx_load(ptr: temphist + i);
265 v_float32 t1 = vx_load(ptr: temphist + i+1);
266 v_float32 t2 = vx_load(ptr: temphist + i+2);
267 v_float32 _hist = v_fma(a: v_add(a: tn2, b: t2), b: d_1_16,
268 c: v_fma(a: v_add(a: tn1, b: t1), b: d_4_16, c: v_mul(a: t0, b: d_6_16)));
269 v_store(ptr: hist + i, a: _hist);
270 }
271#endif
272 for( ; i < n; i++ )
273 {
274 hist[i] = (temphist[i-2] + temphist[i+2])*(1.f/16.f) +
275 (temphist[i-1] + temphist[i+1])*(4.f/16.f) +
276 temphist[i]*(6.f/16.f);
277 }
278
279 float maxval = hist[0];
280 for( i = 1; i < n; i++ )
281 maxval = std::max(a: maxval, b: hist[i]);
282
283 return maxval;
284}
285
286
287//
288// Interpolates a scale-space extremum's location and scale to subpixel
289// accuracy to form an image feature. Rejects features with low contrast.
290// Based on Section 4 of Lowe's paper.
291static
292bool adjustLocalExtrema(
293 const std::vector<Mat>& dog_pyr, KeyPoint& kpt, int octv,
294 int& layer, int& r, int& c, int nOctaveLayers,
295 float contrastThreshold, float edgeThreshold, float sigma
296)
297{
298 CV_TRACE_FUNCTION();
299
300 const float img_scale = 1.f/(255*SIFT_FIXPT_SCALE);
301 const float deriv_scale = img_scale*0.5f;
302 const float second_deriv_scale = img_scale;
303 const float cross_deriv_scale = img_scale*0.25f;
304
305 float xi=0, xr=0, xc=0, contr=0;
306 int i = 0;
307
308 for( ; i < SIFT_MAX_INTERP_STEPS; i++ )
309 {
310 int idx = octv*(nOctaveLayers+2) + layer;
311 const Mat& img = dog_pyr[idx];
312 const Mat& prev = dog_pyr[idx-1];
313 const Mat& next = dog_pyr[idx+1];
314
315 Vec3f dD((img.at<sift_wt>(i0: r, i1: c+1) - img.at<sift_wt>(i0: r, i1: c-1))*deriv_scale,
316 (img.at<sift_wt>(i0: r+1, i1: c) - img.at<sift_wt>(i0: r-1, i1: c))*deriv_scale,
317 (next.at<sift_wt>(i0: r, i1: c) - prev.at<sift_wt>(i0: r, i1: c))*deriv_scale);
318
319 float v2 = (float)img.at<sift_wt>(i0: r, i1: c)*2;
320 float dxx = (img.at<sift_wt>(i0: r, i1: c+1) + img.at<sift_wt>(i0: r, i1: c-1) - v2)*second_deriv_scale;
321 float dyy = (img.at<sift_wt>(i0: r+1, i1: c) + img.at<sift_wt>(i0: r-1, i1: c) - v2)*second_deriv_scale;
322 float dss = (next.at<sift_wt>(i0: r, i1: c) + prev.at<sift_wt>(i0: r, i1: c) - v2)*second_deriv_scale;
323 float dxy = (img.at<sift_wt>(i0: r+1, i1: c+1) - img.at<sift_wt>(i0: r+1, i1: c-1) -
324 img.at<sift_wt>(i0: r-1, i1: c+1) + img.at<sift_wt>(i0: r-1, i1: c-1))*cross_deriv_scale;
325 float dxs = (next.at<sift_wt>(i0: r, i1: c+1) - next.at<sift_wt>(i0: r, i1: c-1) -
326 prev.at<sift_wt>(i0: r, i1: c+1) + prev.at<sift_wt>(i0: r, i1: c-1))*cross_deriv_scale;
327 float dys = (next.at<sift_wt>(i0: r+1, i1: c) - next.at<sift_wt>(i0: r-1, i1: c) -
328 prev.at<sift_wt>(i0: r+1, i1: c) + prev.at<sift_wt>(i0: r-1, i1: c))*cross_deriv_scale;
329
330 Matx33f H(dxx, dxy, dxs,
331 dxy, dyy, dys,
332 dxs, dys, dss);
333
334 Vec3f X = H.solve(rhs: dD, method: DECOMP_LU);
335
336 xi = -X[2];
337 xr = -X[1];
338 xc = -X[0];
339
340 if( std::abs(x: xi) < 0.5f && std::abs(x: xr) < 0.5f && std::abs(x: xc) < 0.5f )
341 break;
342
343 if( std::abs(x: xi) > (float)(INT_MAX/3) ||
344 std::abs(x: xr) > (float)(INT_MAX/3) ||
345 std::abs(x: xc) > (float)(INT_MAX/3) )
346 return false;
347
348 c += cvRound(value: xc);
349 r += cvRound(value: xr);
350 layer += cvRound(value: xi);
351
352 if( layer < 1 || layer > nOctaveLayers ||
353 c < SIFT_IMG_BORDER || c >= img.cols - SIFT_IMG_BORDER ||
354 r < SIFT_IMG_BORDER || r >= img.rows - SIFT_IMG_BORDER )
355 return false;
356 }
357
358 // ensure convergence of interpolation
359 if( i >= SIFT_MAX_INTERP_STEPS )
360 return false;
361
362 {
363 int idx = octv*(nOctaveLayers+2) + layer;
364 const Mat& img = dog_pyr[idx];
365 const Mat& prev = dog_pyr[idx-1];
366 const Mat& next = dog_pyr[idx+1];
367 Matx31f dD((img.at<sift_wt>(i0: r, i1: c+1) - img.at<sift_wt>(i0: r, i1: c-1))*deriv_scale,
368 (img.at<sift_wt>(i0: r+1, i1: c) - img.at<sift_wt>(i0: r-1, i1: c))*deriv_scale,
369 (next.at<sift_wt>(i0: r, i1: c) - prev.at<sift_wt>(i0: r, i1: c))*deriv_scale);
370 float t = dD.dot(M: Matx31f(xc, xr, xi));
371
372 contr = img.at<sift_wt>(i0: r, i1: c)*img_scale + t * 0.5f;
373 if( std::abs( x: contr ) * nOctaveLayers < contrastThreshold )
374 return false;
375
376 // principal curvatures are computed using the trace and det of Hessian
377 float v2 = img.at<sift_wt>(i0: r, i1: c)*2.f;
378 float dxx = (img.at<sift_wt>(i0: r, i1: c+1) + img.at<sift_wt>(i0: r, i1: c-1) - v2)*second_deriv_scale;
379 float dyy = (img.at<sift_wt>(i0: r+1, i1: c) + img.at<sift_wt>(i0: r-1, i1: c) - v2)*second_deriv_scale;
380 float dxy = (img.at<sift_wt>(i0: r+1, i1: c+1) - img.at<sift_wt>(i0: r+1, i1: c-1) -
381 img.at<sift_wt>(i0: r-1, i1: c+1) + img.at<sift_wt>(i0: r-1, i1: c-1)) * cross_deriv_scale;
382 float tr = dxx + dyy;
383 float det = dxx * dyy - dxy * dxy;
384
385 if( det <= 0 || tr*tr*edgeThreshold >= (edgeThreshold + 1)*(edgeThreshold + 1)*det )
386 return false;
387 }
388
389 kpt.pt.x = (c + xc) * (1 << octv);
390 kpt.pt.y = (r + xr) * (1 << octv);
391 kpt.octave = octv + (layer << 8) + (cvRound(value: (xi + 0.5)*255) << 16);
392 kpt.size = sigma*powf(x: 2.f, y: (layer + xi) / nOctaveLayers)*(1 << octv)*2;
393 kpt.response = std::abs(x: contr);
394
395 return true;
396}
397
398namespace {
399
400class findScaleSpaceExtremaT
401{
402public:
403 findScaleSpaceExtremaT(
404 int _o,
405 int _i,
406 int _threshold,
407 int _idx,
408 int _step,
409 int _cols,
410 int _nOctaveLayers,
411 double _contrastThreshold,
412 double _edgeThreshold,
413 double _sigma,
414 const std::vector<Mat>& _gauss_pyr,
415 const std::vector<Mat>& _dog_pyr,
416 std::vector<KeyPoint>& kpts)
417
418 : o(_o),
419 i(_i),
420 threshold(_threshold),
421 idx(_idx),
422 step(_step),
423 cols(_cols),
424 nOctaveLayers(_nOctaveLayers),
425 contrastThreshold(_contrastThreshold),
426 edgeThreshold(_edgeThreshold),
427 sigma(_sigma),
428 gauss_pyr(_gauss_pyr),
429 dog_pyr(_dog_pyr),
430 kpts_(kpts)
431 {
432 // nothing
433 }
434 void process(const cv::Range& range)
435 {
436 CV_TRACE_FUNCTION();
437
438 const int begin = range.start;
439 const int end = range.end;
440
441 static const int n = SIFT_ORI_HIST_BINS;
442 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) hist[n];
443
444 const Mat& img = dog_pyr[idx];
445 const Mat& prev = dog_pyr[idx-1];
446 const Mat& next = dog_pyr[idx+1];
447
448 for( int r = begin; r < end; r++)
449 {
450 const sift_wt* currptr = img.ptr<sift_wt>(y: r);
451 const sift_wt* prevptr = prev.ptr<sift_wt>(y: r);
452 const sift_wt* nextptr = next.ptr<sift_wt>(y: r);
453 int c = SIFT_IMG_BORDER;
454
455#if (CV_SIMD || CV_SIMD_SCALABLE) && !(DoG_TYPE_SHORT)
456 const int vecsize = VTraits<v_float32>::vlanes();
457 for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
458 {
459 v_float32 val = vx_load(ptr: &currptr[c]);
460 v_float32 _00,_01,_02;
461 v_float32 _10, _12;
462 v_float32 _20,_21,_22;
463
464 v_float32 vmin,vmax;
465
466
467 v_float32 cond = v_gt(a: v_abs(x: val), b: vx_setall_f32(v: (float)this->threshold));
468 if (!v_check_any(a: cond))
469 {
470 continue;
471 }
472
473 _00 = vx_load(ptr: &currptr[c-step-1]); _01 = vx_load(ptr: &currptr[c-step]); _02 = vx_load(ptr: &currptr[c-step+1]);
474 _10 = vx_load(ptr: &currptr[c -1]); _12 = vx_load(ptr: &currptr[c +1]);
475 _20 = vx_load(ptr: &currptr[c+step-1]); _21 = vx_load(ptr: &currptr[c+step]); _22 = vx_load(ptr: &currptr[c+step+1]);
476
477 vmax = v_max(a: v_max(a: v_max(a: _00,b: _01),b: v_max(a: _02,b: _10)),b: v_max(a: v_max(a: _12,b: _20),b: v_max(a: _21,b: _22)));
478 vmin = v_min(a: v_min(a: v_min(a: _00,b: _01),b: v_min(a: _02,b: _10)),b: v_min(a: v_min(a: _12,b: _20),b: v_min(a: _21,b: _22)));
479
480 v_float32 condp = v_and(a: v_and(a: cond, b: v_gt(a: val, b: vx_setall_f32(v: 0))), b: v_ge(a: val, b: vmax));
481 v_float32 condm = v_and(a: v_and(a: cond, b: v_lt(a: val, b: vx_setall_f32(v: 0))), b: v_le(a: val, b: vmin));
482
483 cond = v_or(a: condp, b: condm);
484 if (!v_check_any(a: cond))
485 {
486 continue;
487 }
488
489 _00 = vx_load(ptr: &prevptr[c-step-1]); _01 = vx_load(ptr: &prevptr[c-step]); _02 = vx_load(ptr: &prevptr[c-step+1]);
490 _10 = vx_load(ptr: &prevptr[c -1]); _12 = vx_load(ptr: &prevptr[c +1]);
491 _20 = vx_load(ptr: &prevptr[c+step-1]); _21 = vx_load(ptr: &prevptr[c+step]); _22 = vx_load(ptr: &prevptr[c+step+1]);
492
493 vmax = v_max(a: v_max(a: v_max(a: _00,b: _01),b: v_max(a: _02,b: _10)),b: v_max(a: v_max(a: _12,b: _20),b: v_max(a: _21,b: _22)));
494 vmin = v_min(a: v_min(a: v_min(a: _00,b: _01),b: v_min(a: _02,b: _10)),b: v_min(a: v_min(a: _12,b: _20),b: v_min(a: _21,b: _22)));
495
496 condp = v_and(a: condp, b: v_ge(a: val, b: vmax));
497 condm = v_and(a: condm, b: v_le(a: val, b: vmin));
498
499 cond = v_or(a: condp, b: condm);
500 if (!v_check_any(a: cond))
501 {
502 continue;
503 }
504
505 v_float32 _11p = vx_load(ptr: &prevptr[c]);
506 v_float32 _11n = vx_load(ptr: &nextptr[c]);
507
508 v_float32 max_middle = v_max(a: _11n,b: _11p);
509 v_float32 min_middle = v_min(a: _11n,b: _11p);
510
511 _00 = vx_load(ptr: &nextptr[c-step-1]); _01 = vx_load(ptr: &nextptr[c-step]); _02 = vx_load(ptr: &nextptr[c-step+1]);
512 _10 = vx_load(ptr: &nextptr[c -1]); _12 = vx_load(ptr: &nextptr[c +1]);
513 _20 = vx_load(ptr: &nextptr[c+step-1]); _21 = vx_load(ptr: &nextptr[c+step]); _22 = vx_load(ptr: &nextptr[c+step+1]);
514
515 vmax = v_max(a: v_max(a: v_max(a: _00,b: _01),b: v_max(a: _02,b: _10)),b: v_max(a: v_max(a: _12,b: _20),b: v_max(a: _21,b: _22)));
516 vmin = v_min(a: v_min(a: v_min(a: _00,b: _01),b: v_min(a: _02,b: _10)),b: v_min(a: v_min(a: _12,b: _20),b: v_min(a: _21,b: _22)));
517
518 condp = v_and(a: condp, b: v_ge(a: val, b: v_max(a: vmax, b: max_middle)));
519 condm = v_and(a: condm, b: v_le(a: val, b: v_min(a: vmin, b: min_middle)));
520
521 cond = v_or(a: condp, b: condm);
522 if (!v_check_any(a: cond))
523 {
524 continue;
525 }
526
527 int mask = v_signmask(a: cond);
528 for (int k = 0; k<vecsize;k++)
529 {
530 if ((mask & (1<<k)) == 0)
531 continue;
532
533 CV_TRACE_REGION("pixel_candidate_simd");
534
535 KeyPoint kpt;
536 int r1 = r, c1 = c+k, layer = i;
537 if( !adjustLocalExtrema(dog_pyr, kpt, octv: o, layer, r&: r1, c&: c1,
538 nOctaveLayers, contrastThreshold: (float)contrastThreshold,
539 edgeThreshold: (float)edgeThreshold, sigma: (float)sigma) )
540 continue;
541 float scl_octv = kpt.size*0.5f/(1 << o);
542 float omax = calcOrientationHist(img: gauss_pyr[o*(nOctaveLayers+3) + layer],
543 pt: Point(c1, r1),
544 radius: cvRound(value: SIFT_ORI_RADIUS * scl_octv),
545 sigma: SIFT_ORI_SIG_FCTR * scl_octv,
546 hist, n);
547 float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
548 for( int j = 0; j < n; j++ )
549 {
550 int l = j > 0 ? j - 1 : n - 1;
551 int r2 = j < n-1 ? j + 1 : 0;
552
553 if( hist[j] > hist[l] && hist[j] > hist[r2] && hist[j] >= mag_thr )
554 {
555 float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]);
556 bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
557 kpt.angle = 360.f - (float)((360.f/n) * bin);
558 if(std::abs(x: kpt.angle - 360.f) < FLT_EPSILON)
559 kpt.angle = 0.f;
560
561 kpts_.push_back(x: kpt);
562 }
563 }
564 }
565 }
566
567#endif //CV_SIMD && !(DoG_TYPE_SHORT)
568
569 // vector loop reminder, better predictibility and less branch density
570 for( ; c < cols-SIFT_IMG_BORDER; c++)
571 {
572 sift_wt val = currptr[c];
573 if (std::abs(x: val) <= threshold)
574 continue;
575
576 sift_wt _00,_01,_02;
577 sift_wt _10, _12;
578 sift_wt _20,_21,_22;
579 _00 = currptr[c-step-1]; _01 = currptr[c-step]; _02 = currptr[c-step+1];
580 _10 = currptr[c -1]; _12 = currptr[c +1];
581 _20 = currptr[c+step-1]; _21 = currptr[c+step]; _22 = currptr[c+step+1];
582
583 bool calculate = false;
584 if (val > 0)
585 {
586 sift_wt vmax = std::max(a: std::max(a: std::max(a: _00,b: _01),b: std::max(a: _02,b: _10)),b: std::max(a: std::max(a: _12,b: _20),b: std::max(a: _21,b: _22)));
587 if (val >= vmax)
588 {
589 _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
590 _10 = prevptr[c -1]; _12 = prevptr[c +1];
591 _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
592 vmax = std::max(a: std::max(a: std::max(a: _00,b: _01),b: std::max(a: _02,b: _10)),b: std::max(a: std::max(a: _12,b: _20),b: std::max(a: _21,b: _22)));
593 if (val >= vmax)
594 {
595 _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
596 _10 = nextptr[c -1]; _12 = nextptr[c +1];
597 _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
598 vmax = std::max(a: std::max(a: std::max(a: _00,b: _01),b: std::max(a: _02,b: _10)),b: std::max(a: std::max(a: _12,b: _20),b: std::max(a: _21,b: _22)));
599 if (val >= vmax)
600 {
601 sift_wt _11p = prevptr[c], _11n = nextptr[c];
602 calculate = (val >= std::max(a: _11p,b: _11n));
603 }
604 }
605 }
606
607 } else { // val cant be zero here (first abs took care of zero), must be negative
608 sift_wt vmin = std::min(a: std::min(a: std::min(a: _00,b: _01),b: std::min(a: _02,b: _10)),b: std::min(a: std::min(a: _12,b: _20),b: std::min(a: _21,b: _22)));
609 if (val <= vmin)
610 {
611 _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
612 _10 = prevptr[c -1]; _12 = prevptr[c +1];
613 _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
614 vmin = std::min(a: std::min(a: std::min(a: _00,b: _01),b: std::min(a: _02,b: _10)),b: std::min(a: std::min(a: _12,b: _20),b: std::min(a: _21,b: _22)));
615 if (val <= vmin)
616 {
617 _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
618 _10 = nextptr[c -1]; _12 = nextptr[c +1];
619 _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
620 vmin = std::min(a: std::min(a: std::min(a: _00,b: _01),b: std::min(a: _02,b: _10)),b: std::min(a: std::min(a: _12,b: _20),b: std::min(a: _21,b: _22)));
621 if (val <= vmin)
622 {
623 sift_wt _11p = prevptr[c], _11n = nextptr[c];
624 calculate = (val <= std::min(a: _11p,b: _11n));
625 }
626 }
627 }
628 }
629
630 if (calculate)
631 {
632 CV_TRACE_REGION("pixel_candidate");
633
634 KeyPoint kpt;
635 int r1 = r, c1 = c, layer = i;
636 if( !adjustLocalExtrema(dog_pyr, kpt, octv: o, layer, r&: r1, c&: c1,
637 nOctaveLayers, contrastThreshold: (float)contrastThreshold,
638 edgeThreshold: (float)edgeThreshold, sigma: (float)sigma) )
639 continue;
640 float scl_octv = kpt.size*0.5f/(1 << o);
641 float omax = calcOrientationHist(img: gauss_pyr[o*(nOctaveLayers+3) + layer],
642 pt: Point(c1, r1),
643 radius: cvRound(value: SIFT_ORI_RADIUS * scl_octv),
644 sigma: SIFT_ORI_SIG_FCTR * scl_octv,
645 hist, n);
646 float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
647 for( int j = 0; j < n; j++ )
648 {
649 int l = j > 0 ? j - 1 : n - 1;
650 int r2 = j < n-1 ? j + 1 : 0;
651
652 if( hist[j] > hist[l] && hist[j] > hist[r2] && hist[j] >= mag_thr )
653 {
654 float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]);
655 bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
656 kpt.angle = 360.f - (float)((360.f/n) * bin);
657 if(std::abs(x: kpt.angle - 360.f) < FLT_EPSILON)
658 kpt.angle = 0.f;
659
660 kpts_.push_back(x: kpt);
661 }
662 }
663 }
664 }
665 }
666 }
667private:
668 int o, i;
669 int threshold;
670 int idx, step, cols;
671 int nOctaveLayers;
672 double contrastThreshold;
673 double edgeThreshold;
674 double sigma;
675 const std::vector<Mat>& gauss_pyr;
676 const std::vector<Mat>& dog_pyr;
677 std::vector<KeyPoint>& kpts_;
678};
679
680} // namespace
681
682
683void findScaleSpaceExtrema(
684 int octave,
685 int layer,
686 int threshold,
687 int idx,
688 int step,
689 int cols,
690 int nOctaveLayers,
691 double contrastThreshold,
692 double edgeThreshold,
693 double sigma,
694 const std::vector<Mat>& gauss_pyr,
695 const std::vector<Mat>& dog_pyr,
696 std::vector<KeyPoint>& kpts,
697 const cv::Range& range)
698{
699 CV_TRACE_FUNCTION();
700
701 findScaleSpaceExtremaT(octave, layer, threshold, idx,
702 step, cols,
703 nOctaveLayers, contrastThreshold, edgeThreshold, sigma,
704 gauss_pyr, dog_pyr,
705 kpts)
706 .process(range);
707}
708
709void calcSIFTDescriptor(
710 const Mat& img, Point2f ptf, float ori, float scl,
711 const int d, const int n, Mat& dstMat, int row
712)
713{
714 CV_TRACE_FUNCTION();
715
716 Point pt(cvRound(value: ptf.x), cvRound(value: ptf.y));
717 float cos_t = cosf(x: ori*(float)(CV_PI/180));
718 float sin_t = sinf(x: ori*(float)(CV_PI/180));
719 float bins_per_rad = n / 360.f;
720 float exp_scale = -1.f/(d * d * 0.5f);
721 float hist_width = SIFT_DESCR_SCL_FCTR * scl;
722 int radius = cvRound(value: hist_width * 1.4142135623730951f * (d + 1) * 0.5f);
723 // Clip the radius to the diagonal of the image to avoid autobuffer too large exception
724 radius = std::min(a: radius, b: (int)std::sqrt(x: ((double) img.cols)*img.cols + ((double) img.rows)*img.rows));
725 cos_t /= hist_width;
726 sin_t /= hist_width;
727
728 int i, j, k;
729 const int len = (radius*2+1)*(radius*2+1);
730 const int len_hist = (d+2)*(d+2)*(n+2);
731 const int len_ddn = d * d * n;
732 int rows = img.rows, cols = img.cols;
733
734 cv::utils::BufferArea area;
735 float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *RBin = 0, *CBin = 0, *hist = 0, *rawDst = 0;
736 area.allocate(ptr&: X, count: len, CV_SIMD_WIDTH);
737 area.allocate(ptr&: Y, count: len, CV_SIMD_WIDTH);
738 area.allocate(ptr&: Ori, count: len, CV_SIMD_WIDTH);
739 area.allocate(ptr&: W, count: len, CV_SIMD_WIDTH);
740 area.allocate(ptr&: RBin, count: len, CV_SIMD_WIDTH);
741 area.allocate(ptr&: CBin, count: len, CV_SIMD_WIDTH);
742 area.allocate(ptr&: hist, count: len_hist, CV_SIMD_WIDTH);
743 area.allocate(ptr&: rawDst, count: len_ddn, CV_SIMD_WIDTH);
744 area.commit();
745 Mag = Y;
746
747 for( i = 0; i < d+2; i++ )
748 {
749 for( j = 0; j < d+2; j++ )
750 for( k = 0; k < n+2; k++ )
751 hist[(i*(d+2) + j)*(n+2) + k] = 0.;
752 }
753
754 for( i = -radius, k = 0; i <= radius; i++ )
755 for( j = -radius; j <= radius; j++ )
756 {
757 // Calculate sample's histogram array coords rotated relative to ori.
758 // Subtract 0.5 so samples that fall e.g. in the center of row 1 (i.e.
759 // r_rot = 1.5) have full weight placed in row 1 after interpolation.
760 float c_rot = j * cos_t - i * sin_t;
761 float r_rot = j * sin_t + i * cos_t;
762 float rbin = r_rot + d/2 - 0.5f;
763 float cbin = c_rot + d/2 - 0.5f;
764 int r = pt.y + i, c = pt.x + j;
765
766 if( rbin > -1 && rbin < d && cbin > -1 && cbin < d &&
767 r > 0 && r < rows - 1 && c > 0 && c < cols - 1 )
768 {
769 float dx = (float)(img.at<sift_wt>(i0: r, i1: c+1) - img.at<sift_wt>(i0: r, i1: c-1));
770 float dy = (float)(img.at<sift_wt>(i0: r-1, i1: c) - img.at<sift_wt>(i0: r+1, i1: c));
771 X[k] = dx; Y[k] = dy; RBin[k] = rbin; CBin[k] = cbin;
772 W[k] = (c_rot * c_rot + r_rot * r_rot)*exp_scale;
773 k++;
774 }
775 }
776
777 const int len_left = k;
778 cv::hal::fastAtan2(y: Y, x: X, dst: Ori, n: len_left, angleInDegrees: true);
779 cv::hal::magnitude32f(x: X, y: Y, dst: Mag, n: len_left);
780 cv::hal::exp32f(src: W, dst: W, n: len_left);
781
782 k = 0;
783#if (CV_SIMD || CV_SIMD_SCALABLE)
784 {
785 const int vecsize = VTraits<v_float32>::vlanes();
786 int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[VTraits<v_float32>::max_nlanes];
787 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*VTraits<v_float32>::max_nlanes];
788 const v_float32 __ori = vx_setall_f32(v: ori);
789 const v_float32 __bins_per_rad = vx_setall_f32(v: bins_per_rad);
790 const v_int32 __n = vx_setall_s32(v: n);
791 const v_int32 __1 = vx_setall_s32(v: 1);
792 const v_int32 __d_plus_2 = vx_setall_s32(v: d+2);
793 const v_int32 __n_plus_2 = vx_setall_s32(v: n+2);
794 for( ; k <= len_left - vecsize; k += vecsize )
795 {
796 v_float32 rbin = vx_load_aligned(ptr: RBin + k);
797 v_float32 cbin = vx_load_aligned(ptr: CBin + k);
798 v_float32 obin = v_mul(a: v_sub(a: vx_load_aligned(ptr: Ori + k), b: __ori), b: __bins_per_rad);
799 v_float32 mag = v_mul(a: vx_load_aligned(ptr: Mag + k), b: vx_load_aligned(ptr: W + k));
800
801 v_int32 r0 = v_floor(a: rbin);
802 v_int32 c0 = v_floor(a: cbin);
803 v_int32 o0 = v_floor(a: obin);
804 rbin = v_sub(a: rbin, b: v_cvt_f32(a: r0));
805 cbin = v_sub(a: cbin, b: v_cvt_f32(a: c0));
806 obin = v_sub(a: obin, b: v_cvt_f32(a: o0));
807
808 o0 = v_select(mask: v_lt(a: o0, b: vx_setzero_s32()), a: v_add(a: o0, b: __n), b: o0);
809 o0 = v_select(mask: v_ge(a: o0, b: __n), a: v_sub(a: o0, b: __n), b: o0);
810
811 v_float32 v_r1 = v_mul(a: mag, b: rbin), v_r0 = v_sub(a: mag, b: v_r1);
812 v_float32 v_rc11 = v_mul(a: v_r1, b: cbin), v_rc10 = v_sub(a: v_r1, b: v_rc11);
813 v_float32 v_rc01 = v_mul(a: v_r0, b: cbin), v_rc00 = v_sub(a: v_r0, b: v_rc01);
814 v_float32 v_rco111 = v_mul(a: v_rc11, b: obin), v_rco110 = v_sub(a: v_rc11, b: v_rco111);
815 v_float32 v_rco101 = v_mul(a: v_rc10, b: obin), v_rco100 = v_sub(a: v_rc10, b: v_rco101);
816 v_float32 v_rco011 = v_mul(a: v_rc01, b: obin), v_rco010 = v_sub(a: v_rc01, b: v_rco011);
817 v_float32 v_rco001 = v_mul(a: v_rc00, b: obin), v_rco000 = v_sub(a: v_rc00, b: v_rco001);
818
819 v_int32 idx = v_muladd(a: v_muladd(a: v_add(a: r0, b: __1), b: __d_plus_2, c: v_add(a: c0, b: __1)), b: __n_plus_2, c: o0);
820 v_store_aligned(ptr: idx_buf, a: idx);
821
822 v_store_aligned(ptr: rco_buf, a: v_rco000);
823 v_store_aligned(ptr: rco_buf+vecsize, a: v_rco001);
824 v_store_aligned(ptr: rco_buf+vecsize*2, a: v_rco010);
825 v_store_aligned(ptr: rco_buf+vecsize*3, a: v_rco011);
826 v_store_aligned(ptr: rco_buf+vecsize*4, a: v_rco100);
827 v_store_aligned(ptr: rco_buf+vecsize*5, a: v_rco101);
828 v_store_aligned(ptr: rco_buf+vecsize*6, a: v_rco110);
829 v_store_aligned(ptr: rco_buf+vecsize*7, a: v_rco111);
830
831 for(int id = 0; id < vecsize; id++)
832 {
833 hist[idx_buf[id]] += rco_buf[id];
834 hist[idx_buf[id]+1] += rco_buf[vecsize + id];
835 hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id];
836 hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id];
837 hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id];
838 hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id];
839 hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id];
840 hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id];
841 }
842 }
843 }
844#endif
845 for( ; k < len_left; k++ )
846 {
847 float rbin = RBin[k], cbin = CBin[k];
848 float obin = (Ori[k] - ori)*bins_per_rad;
849 float mag = Mag[k]*W[k];
850
851 int r0 = cvFloor( value: rbin );
852 int c0 = cvFloor( value: cbin );
853 int o0 = cvFloor( value: obin );
854 rbin -= r0;
855 cbin -= c0;
856 obin -= o0;
857
858 if( o0 < 0 )
859 o0 += n;
860 if( o0 >= n )
861 o0 -= n;
862
863 // histogram update using tri-linear interpolation
864 float v_r1 = mag*rbin, v_r0 = mag - v_r1;
865 float v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
866 float v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
867 float v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
868 float v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
869 float v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
870 float v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
871
872 int idx = ((r0+1)*(d+2) + c0+1)*(n+2) + o0;
873 hist[idx] += v_rco000;
874 hist[idx+1] += v_rco001;
875 hist[idx+(n+2)] += v_rco010;
876 hist[idx+(n+3)] += v_rco011;
877 hist[idx+(d+2)*(n+2)] += v_rco100;
878 hist[idx+(d+2)*(n+2)+1] += v_rco101;
879 hist[idx+(d+3)*(n+2)] += v_rco110;
880 hist[idx+(d+3)*(n+2)+1] += v_rco111;
881 }
882
883 // finalize histogram, since the orientation histograms are circular
884 for( i = 0; i < d; i++ )
885 for( j = 0; j < d; j++ )
886 {
887 int idx = ((i+1)*(d+2) + (j+1))*(n+2);
888 hist[idx] += hist[idx+n];
889 hist[idx+1] += hist[idx+n+1];
890 for( k = 0; k < n; k++ )
891 rawDst[(i*d + j)*n + k] = hist[idx+k];
892 }
893 // copy histogram to the descriptor,
894 // apply hysteresis thresholding
895 // and scale the result, so that it can be easily converted
896 // to byte array
897 float nrm2 = 0;
898 k = 0;
899#if (CV_SIMD || CV_SIMD_SCALABLE)
900 {
901 v_float32 __nrm2 = vx_setzero_f32();
902 v_float32 __rawDst;
903 for( ; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
904 {
905 __rawDst = vx_load_aligned(ptr: rawDst + k);
906 __nrm2 = v_fma(a: __rawDst, b: __rawDst, c: __nrm2);
907 }
908 nrm2 = (float)v_reduce_sum(a: __nrm2);
909 }
910#endif
911 for( ; k < len_ddn; k++ )
912 nrm2 += rawDst[k]*rawDst[k];
913
914 const float thr = std::sqrt(x: nrm2)*SIFT_DESCR_MAG_THR;
915
916 i = 0, nrm2 = 0;
917#if 0 //CV_AVX2
918 // This code cannot be enabled because it sums nrm2 in a different order,
919 // thus producing slightly different results
920 {
921 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8];
922 __m256 __dst;
923 __m256 __nrm2 = _mm256_setzero_ps();
924 __m256 __thr = _mm256_set1_ps(thr);
925 for( ; i <= len_ddn - 8; i += 8 )
926 {
927 __dst = _mm256_loadu_ps(&rawDst[i]);
928 __dst = _mm256_min_ps(__dst, __thr);
929 _mm256_storeu_ps(&rawDst[i], __dst);
930#if CV_FMA3
931 __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
932#else
933 __nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
934#endif
935 }
936 _mm256_store_ps(nrm2_buf, __nrm2);
937 nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
938 nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
939 }
940#endif
941 for( ; i < len_ddn; i++ )
942 {
943 float val = std::min(a: rawDst[i], b: thr);
944 rawDst[i] = val;
945 nrm2 += val*val;
946 }
947 nrm2 = SIFT_INT_DESCR_FCTR/std::max(a: std::sqrt(x: nrm2), FLT_EPSILON);
948
949#if 1
950 k = 0;
951if( dstMat.type() == CV_32F )
952{
953 float* dst = dstMat.ptr<float>(y: row);
954#if (CV_SIMD || CV_SIMD_SCALABLE)
955 v_float32 __dst;
956 v_float32 __min = vx_setzero_f32();
957 v_float32 __max = vx_setall_f32(v: 255.0f); // max of uchar
958 v_float32 __nrm2 = vx_setall_f32(v: nrm2);
959 for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
960 {
961 __dst = vx_load_aligned(ptr: rawDst + k);
962 __dst = v_min(a: v_max(a: v_cvt_f32(a: v_round(a: v_mul(a: __dst, b: __nrm2))), b: __min), b: __max);
963 v_store(ptr: dst + k, a: __dst);
964 }
965#endif
966#if defined(__GNUC__) && __GNUC__ >= 9
967#pragma GCC diagnostic push
968#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior
969#endif
970 for( ; k < len_ddn; k++ )
971 {
972 dst[k] = saturate_cast<uchar>(v: rawDst[k]*nrm2);
973 }
974#if defined(__GNUC__) && __GNUC__ >= 9
975#pragma GCC diagnostic pop
976#endif
977}
978else // CV_8U
979{
980 uint8_t* dst = dstMat.ptr<uint8_t>(y: row);
981#if (CV_SIMD || CV_SIMD_SCALABLE)
982 v_float32 __dst0, __dst1;
983 v_uint16 __pack01;
984 v_float32 __nrm2 = vx_setall_f32(v: nrm2);
985 for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 )
986 {
987 __dst0 = vx_load_aligned(ptr: rawDst + k);
988 __dst1 = vx_load_aligned(ptr: rawDst + k + VTraits<v_float32>::vlanes());
989
990 __pack01 = v_pack_u(a: v_round(a: v_mul(a: __dst0, b: __nrm2)), b: v_round(a: v_mul(a: __dst1, b: __nrm2)));
991 v_pack_store(ptr: dst + k, a: __pack01);
992 }
993#endif
994
995#if defined(__GNUC__) && __GNUC__ >= 9
996#pragma GCC diagnostic push
997#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior
998#endif
999 for( ; k < len_ddn; k++ )
1000 {
1001 dst[k] = saturate_cast<uchar>(v: rawDst[k]*nrm2);
1002 }
1003#if defined(__GNUC__) && __GNUC__ >= 9
1004#pragma GCC diagnostic pop
1005#endif
1006}
1007#else
1008 float nrm1 = 0;
1009 for( k = 0; k < len_ddn; k++ )
1010 {
1011 rawDst[k] *= nrm2;
1012 nrm1 += rawDst[k];
1013 }
1014 nrm1 = 1.f/std::max(nrm1, FLT_EPSILON);
1015 if( dstMat.type() == CV_32F )
1016 {
1017 float *dst = dstMat.ptr<float>(row);
1018 for( k = 0; k < len_ddn; k++ )
1019 {
1020 dst[k] = std::sqrt(rawDst[k] * nrm1);
1021 }
1022 }
1023 else // CV_8U
1024 {
1025 uint8_t *dst = dstMat.ptr<uint8_t>(row);
1026 for( k = 0; k < len_ddn; k++ )
1027 {
1028 dst[k] = saturate_cast<uchar>(std::sqrt(rawDst[k] * nrm1)*SIFT_INT_DESCR_FCTR);
1029 }
1030 }
1031#endif
1032}
1033
1034#endif
1035CV_CPU_OPTIMIZATION_NAMESPACE_END
1036} // namespace
1037

// NOTE(review): the following lines are web-page footer residue from the KDAB
// code-browser capture of this file, not part of the original source; they are
// preserved here as a comment so the translation unit remains valid C++.
//   Provided by KDAB
//   Privacy Policy
//   Improve your Profiling and Debugging skills
//   Find out more
//   source code of opencv/modules/features2d/src/sift.simd.hpp