1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html.
4//
5// Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
6// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
7// Copyright (C) 2020, Intel Corporation, all rights reserved.
8
9/**********************************************************************************************\
10 Implementation of SIFT is based on the code from http://blogs.oregonstate.edu/hess/code/sift/
11 Below is the original copyright.
12 Patent US6711293 expired in March 2020.
13
14// Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
15// All rights reserved.
16
17// The following patent has been issued for methods embodied in this
18// software: "Method and apparatus for identifying scale invariant features
19// in an image and use of same for locating an object in an image," David
20// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
21// filed March 8, 1999. Asignee: The University of British Columbia. For
22// further details, contact David Lowe (lowe@cs.ubc.ca) or the
23// University-Industry Liaison Office of the University of British
24// Columbia.
25
26// Note that restrictions imposed by this patent (and possibly others)
27// exist independently of and may be in conflict with the freedoms granted
28// in this license, which refers to copyright of the program, not patents
29// for any methods that it implements. Both copyright and patent law must
30// be obeyed to legally use and redistribute this program and it is not the
31// purpose of this license to induce you to infringe any patents or other
32// property right claims or to contest validity of any such claims. If you
33// redistribute or use the program, then this license merely protects you
34// from committing copyright infringement. It does not protect you from
35// committing patent infringement. So, before you do anything with this
36// program, make sure that you have permission to do so not merely in terms
37// of copyright, but also in terms of patent law.
38
39// Please note that this license is not to be understood as a guarantee
40// either. If you use the program according to this license, but in
41// conflict with patent law, it does not mean that the licensor will refund
42// you for any losses that you incur if you are sued for your patent
43// infringement.
44
45// Redistribution and use in source and binary forms, with or without
46// modification, are permitted provided that the following conditions are
47// met:
48// * Redistributions of source code must retain the above copyright and
49// patent notices, this list of conditions and the following
50// disclaimer.
51// * Redistributions in binary form must reproduce the above copyright
52// notice, this list of conditions and the following disclaimer in
53// the documentation and/or other materials provided with the
54// distribution.
55// * Neither the name of Oregon State University nor the names of its
56// contributors may be used to endorse or promote products derived
57// from this software without specific prior written permission.
58
59// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
60// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
61// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
62// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
63// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
64// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
65// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
66// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
67// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
68// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
69// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
70\**********************************************************************************************/
71
72#include "precomp.hpp"
73
74#include <opencv2/core/hal/hal.hpp>
75#include "opencv2/core/hal/intrin.hpp"
76#include <opencv2/core/utils/buffer_area.private.hpp>
77
78namespace cv {
79
#if !defined(CV_CPU_DISPATCH_MODE) || !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY)
/******************************* Defs and macros *****************************/

// default width of descriptor histogram array
static const int SIFT_DESCR_WIDTH = 4;

// default number of bins per histogram in descriptor array
static const int SIFT_DESCR_HIST_BINS = 8;

// assumed gaussian blur for input image
static const float SIFT_INIT_SIGMA = 0.5f;

// width of border in which to ignore keypoints
static const int SIFT_IMG_BORDER = 5;

// maximum steps of keypoint interpolation before failure
static const int SIFT_MAX_INTERP_STEPS = 5;

// default number of bins in histogram for orientation assignment
static const int SIFT_ORI_HIST_BINS = 36;

// determines gaussian sigma for orientation assignment
static const float SIFT_ORI_SIG_FCTR = 1.5f;

// determines the radius of the region used in orientation assignment
static const float SIFT_ORI_RADIUS = 4.5f; // 3 * SIFT_ORI_SIG_FCTR;

// orientation magnitude relative to max that results in new feature
static const float SIFT_ORI_PEAK_RATIO = 0.8f;

// determines the size of a single descriptor orientation histogram
static const float SIFT_DESCR_SCL_FCTR = 3.f;

// threshold on magnitude of elements of descriptor vector
static const float SIFT_DESCR_MAG_THR = 0.2f;

// factor used to convert floating-point descriptor to unsigned char
static const float SIFT_INT_DESCR_FCTR = 512.f;

#define DoG_TYPE_SHORT 0
#if DoG_TYPE_SHORT
// intermediate type used for DoG pyramids
typedef short sift_wt;
static const int SIFT_FIXPT_SCALE = 48;
#else
// intermediate type used for DoG pyramids
typedef float sift_wt;
static const int SIFT_FIXPT_SCALE = 1;
#endif

#endif // definitions and macros
131
132
133CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
134
135void findScaleSpaceExtrema(
136 int octave,
137 int layer,
138 int threshold,
139 int idx,
140 int step,
141 int cols,
142 int nOctaveLayers,
143 double contrastThreshold,
144 double edgeThreshold,
145 double sigma,
146 const std::vector<Mat>& gauss_pyr,
147 const std::vector<Mat>& dog_pyr,
148 std::vector<KeyPoint>& kpts,
149 const cv::Range& range);
150
151void calcSIFTDescriptor(
152 const Mat& img, Point2f ptf, float ori, float scl,
153 const int d, const int n, Mat& dst, int row
154);
155
156
157#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
158
159// Computes a gradient orientation histogram at a specified pixel
160static
161float calcOrientationHist(
162 const Mat& img, Point pt, int radius,
163 float sigma, float* hist, int n
164)
165{
166 CV_TRACE_FUNCTION();
167
168 int i, j, k, len = (radius*2+1)*(radius*2+1);
169
170 float expf_scale = -1.f/(2.f * sigma * sigma);
171
172 cv::utils::BufferArea area;
173 float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *temphist = 0;
174 area.allocate(ptr&: X, count: len, CV_SIMD_WIDTH);
175 area.allocate(ptr&: Y, count: len, CV_SIMD_WIDTH);
176 area.allocate(ptr&: Ori, count: len, CV_SIMD_WIDTH);
177 area.allocate(ptr&: W, count: len, CV_SIMD_WIDTH);
178 area.allocate(ptr&: temphist, count: n+4, CV_SIMD_WIDTH);
179 area.commit();
180 temphist += 2;
181 Mag = X;
182
183 for( i = 0; i < n; i++ )
184 temphist[i] = 0.f;
185
186 for( i = -radius, k = 0; i <= radius; i++ )
187 {
188 int y = pt.y + i;
189 if( y <= 0 || y >= img.rows - 1 )
190 continue;
191 for( j = -radius; j <= radius; j++ )
192 {
193 int x = pt.x + j;
194 if( x <= 0 || x >= img.cols - 1 )
195 continue;
196
197 float dx = (float)(img.at<sift_wt>(i0: y, i1: x+1) - img.at<sift_wt>(i0: y, i1: x-1));
198 float dy = (float)(img.at<sift_wt>(i0: y-1, i1: x) - img.at<sift_wt>(i0: y+1, i1: x));
199
200 X[k] = dx; Y[k] = dy; W[k] = (i*i + j*j)*expf_scale;
201 k++;
202 }
203 }
204
205 len = k;
206
207 // compute gradient values, orientations and the weights over the pixel neighborhood
208 cv::hal::exp32f(src: W, dst: W, n: len);
209 cv::hal::fastAtan2(y: Y, x: X, dst: Ori, n: len, angleInDegrees: true);
210 cv::hal::magnitude32f(x: X, y: Y, dst: Mag, n: len);
211
212 k = 0;
213#if (CV_SIMD || CV_SIMD_SCALABLE)
214 const int vecsize = VTraits<v_float32>::vlanes();
215 v_float32 nd360 = vx_setall_f32(v: n/360.f);
216 v_int32 __n = vx_setall_s32(v: n);
217 int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[VTraits<v_float32>::max_nlanes];
218 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[VTraits<v_float32>::max_nlanes];
219
220 for( ; k <= len - vecsize; k += vecsize )
221 {
222 v_float32 w = vx_load_aligned( ptr: W + k );
223 v_float32 mag = vx_load_aligned( ptr: Mag + k );
224 v_float32 ori = vx_load_aligned( ptr: Ori + k );
225 v_int32 bin = v_round( a: v_mul(a: nd360, b: ori) );
226
227 bin = v_select(mask: v_ge(a: bin, b: __n), a: v_sub(a: bin, b: __n), b: bin);
228 bin = v_select(mask: v_lt(a: bin, b: vx_setzero_s32()), a: v_add(a: bin, b: __n), b: bin);
229
230 w = v_mul(a: w, b: mag);
231 v_store_aligned(ptr: bin_buf, a: bin);
232 v_store_aligned(ptr: w_mul_mag_buf, a: w);
233 for(int vi = 0; vi < vecsize; vi++)
234 {
235 temphist[bin_buf[vi]] += w_mul_mag_buf[vi];
236 }
237 }
238#endif
239 for( ; k < len; k++ )
240 {
241 int bin = cvRound(value: (n/360.f)*Ori[k]);
242 if( bin >= n )
243 bin -= n;
244 if( bin < 0 )
245 bin += n;
246 temphist[bin] += W[k]*Mag[k];
247 }
248
249 // smooth the histogram
250 temphist[-1] = temphist[n-1];
251 temphist[-2] = temphist[n-2];
252 temphist[n] = temphist[0];
253 temphist[n+1] = temphist[1];
254
255 i = 0;
256#if (CV_SIMD || CV_SIMD_SCALABLE)
257 v_float32 d_1_16 = vx_setall_f32(v: 1.f/16.f);
258 v_float32 d_4_16 = vx_setall_f32(v: 4.f/16.f);
259 v_float32 d_6_16 = vx_setall_f32(v: 6.f/16.f);
260 for( ; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
261 {
262 v_float32 tn2 = vx_load_aligned(ptr: temphist + i-2);
263 v_float32 tn1 = vx_load(ptr: temphist + i-1);
264 v_float32 t0 = vx_load(ptr: temphist + i);
265 v_float32 t1 = vx_load(ptr: temphist + i+1);
266 v_float32 t2 = vx_load(ptr: temphist + i+2);
267 v_float32 _hist = v_fma(a: v_add(a: tn2, b: t2), b: d_1_16,
268 c: v_fma(a: v_add(a: tn1, b: t1), b: d_4_16, c: v_mul(a: t0, b: d_6_16)));
269 v_store(ptr: hist + i, a: _hist);
270 }
271#endif
272 for( ; i < n; i++ )
273 {
274 hist[i] = (temphist[i-2] + temphist[i+2])*(1.f/16.f) +
275 (temphist[i-1] + temphist[i+1])*(4.f/16.f) +
276 temphist[i]*(6.f/16.f);
277 }
278
279 float maxval = hist[0];
280 for( i = 1; i < n; i++ )
281 maxval = std::max(a: maxval, b: hist[i]);
282
283 return maxval;
284}
285
286
287//
288// Interpolates a scale-space extremum's location and scale to subpixel
289// accuracy to form an image feature. Rejects features with low contrast.
290// Based on Section 4 of Lowe's paper.
291static
292bool adjustLocalExtrema(
293 const std::vector<Mat>& dog_pyr, KeyPoint& kpt, int octv,
294 int& layer, int& r, int& c, int nOctaveLayers,
295 float contrastThreshold, float edgeThreshold, float sigma
296)
297{
298 CV_TRACE_FUNCTION();
299
300 const float img_scale = 1.f/(255*SIFT_FIXPT_SCALE);
301 const float deriv_scale = img_scale*0.5f;
302 const float second_deriv_scale = img_scale;
303 const float cross_deriv_scale = img_scale*0.25f;
304
305 float xi=0, xr=0, xc=0, contr=0;
306 int i = 0;
307
308 for( ; i < SIFT_MAX_INTERP_STEPS; i++ )
309 {
310 int idx = octv*(nOctaveLayers+2) + layer;
311 const Mat& img = dog_pyr[idx];
312 const Mat& prev = dog_pyr[idx-1];
313 const Mat& next = dog_pyr[idx+1];
314
315 Vec3f dD((img.at<sift_wt>(i0: r, i1: c+1) - img.at<sift_wt>(i0: r, i1: c-1))*deriv_scale,
316 (img.at<sift_wt>(i0: r+1, i1: c) - img.at<sift_wt>(i0: r-1, i1: c))*deriv_scale,
317 (next.at<sift_wt>(i0: r, i1: c) - prev.at<sift_wt>(i0: r, i1: c))*deriv_scale);
318
319 float v2 = (float)img.at<sift_wt>(i0: r, i1: c)*2;
320 float dxx = (img.at<sift_wt>(i0: r, i1: c+1) + img.at<sift_wt>(i0: r, i1: c-1) - v2)*second_deriv_scale;
321 float dyy = (img.at<sift_wt>(i0: r+1, i1: c) + img.at<sift_wt>(i0: r-1, i1: c) - v2)*second_deriv_scale;
322 float dss = (next.at<sift_wt>(i0: r, i1: c) + prev.at<sift_wt>(i0: r, i1: c) - v2)*second_deriv_scale;
323 float dxy = (img.at<sift_wt>(i0: r+1, i1: c+1) - img.at<sift_wt>(i0: r+1, i1: c-1) -
324 img.at<sift_wt>(i0: r-1, i1: c+1) + img.at<sift_wt>(i0: r-1, i1: c-1))*cross_deriv_scale;
325 float dxs = (next.at<sift_wt>(i0: r, i1: c+1) - next.at<sift_wt>(i0: r, i1: c-1) -
326 prev.at<sift_wt>(i0: r, i1: c+1) + prev.at<sift_wt>(i0: r, i1: c-1))*cross_deriv_scale;
327 float dys = (next.at<sift_wt>(i0: r+1, i1: c) - next.at<sift_wt>(i0: r-1, i1: c) -
328 prev.at<sift_wt>(i0: r+1, i1: c) + prev.at<sift_wt>(i0: r-1, i1: c))*cross_deriv_scale;
329
330 Matx33f H(dxx, dxy, dxs,
331 dxy, dyy, dys,
332 dxs, dys, dss);
333
334 Vec3f X = H.solve(rhs: dD, method: DECOMP_LU);
335
336 xi = -X[2];
337 xr = -X[1];
338 xc = -X[0];
339
340 if( std::abs(x: xi) < 0.5f && std::abs(x: xr) < 0.5f && std::abs(x: xc) < 0.5f )
341 break;
342
343 if( std::abs(x: xi) > (float)(INT_MAX/3) ||
344 std::abs(x: xr) > (float)(INT_MAX/3) ||
345 std::abs(x: xc) > (float)(INT_MAX/3) )
346 return false;
347
348 c += cvRound(value: xc);
349 r += cvRound(value: xr);
350 layer += cvRound(value: xi);
351
352 if( layer < 1 || layer > nOctaveLayers ||
353 c < SIFT_IMG_BORDER || c >= img.cols - SIFT_IMG_BORDER ||
354 r < SIFT_IMG_BORDER || r >= img.rows - SIFT_IMG_BORDER )
355 return false;
356 }
357
358 // ensure convergence of interpolation
359 if( i >= SIFT_MAX_INTERP_STEPS )
360 return false;
361
362 {
363 int idx = octv*(nOctaveLayers+2) + layer;
364 const Mat& img = dog_pyr[idx];
365 const Mat& prev = dog_pyr[idx-1];
366 const Mat& next = dog_pyr[idx+1];
367 Matx31f dD((img.at<sift_wt>(i0: r, i1: c+1) - img.at<sift_wt>(i0: r, i1: c-1))*deriv_scale,
368 (img.at<sift_wt>(i0: r+1, i1: c) - img.at<sift_wt>(i0: r-1, i1: c))*deriv_scale,
369 (next.at<sift_wt>(i0: r, i1: c) - prev.at<sift_wt>(i0: r, i1: c))*deriv_scale);
370 float t = dD.dot(M: Matx31f(xc, xr, xi));
371
372 contr = img.at<sift_wt>(i0: r, i1: c)*img_scale + t * 0.5f;
373 if( std::abs( x: contr ) * nOctaveLayers < contrastThreshold )
374 return false;
375
376 // principal curvatures are computed using the trace and det of Hessian
377 float v2 = img.at<sift_wt>(i0: r, i1: c)*2.f;
378 float dxx = (img.at<sift_wt>(i0: r, i1: c+1) + img.at<sift_wt>(i0: r, i1: c-1) - v2)*second_deriv_scale;
379 float dyy = (img.at<sift_wt>(i0: r+1, i1: c) + img.at<sift_wt>(i0: r-1, i1: c) - v2)*second_deriv_scale;
380 float dxy = (img.at<sift_wt>(i0: r+1, i1: c+1) - img.at<sift_wt>(i0: r+1, i1: c-1) -
381 img.at<sift_wt>(i0: r-1, i1: c+1) + img.at<sift_wt>(i0: r-1, i1: c-1)) * cross_deriv_scale;
382 float tr = dxx + dyy;
383 float det = dxx * dyy - dxy * dxy;
384
385 if( det <= 0 || tr*tr*edgeThreshold >= (edgeThreshold + 1)*(edgeThreshold + 1)*det )
386 return false;
387 }
388
389 kpt.pt.x = (c + xc) * (1 << octv);
390 kpt.pt.y = (r + xr) * (1 << octv);
391 kpt.octave = octv + (layer << 8) + (cvRound(value: (xi + 0.5)*255) << 16);
392 kpt.size = sigma*powf(x: 2.f, y: (layer + xi) / nOctaveLayers)*(1 << octv)*2;
393 kpt.response = std::abs(x: contr);
394
395 return true;
396}
397
398namespace {
399
400class findScaleSpaceExtremaT
401{
402public:
403 findScaleSpaceExtremaT(
404 int _o,
405 int _i,
406 int _threshold,
407 int _idx,
408 int _step,
409 int _cols,
410 int _nOctaveLayers,
411 double _contrastThreshold,
412 double _edgeThreshold,
413 double _sigma,
414 const std::vector<Mat>& _gauss_pyr,
415 const std::vector<Mat>& _dog_pyr,
416 std::vector<KeyPoint>& kpts)
417
418 : o(_o),
419 i(_i),
420 threshold(_threshold),
421 idx(_idx),
422 step(_step),
423 cols(_cols),
424 nOctaveLayers(_nOctaveLayers),
425 contrastThreshold(_contrastThreshold),
426 edgeThreshold(_edgeThreshold),
427 sigma(_sigma),
428 gauss_pyr(_gauss_pyr),
429 dog_pyr(_dog_pyr),
430 kpts_(kpts)
431 {
432 // nothing
433 }
434 void process(const cv::Range& range)
435 {
436 CV_TRACE_FUNCTION();
437
438 const int begin = range.start;
439 const int end = range.end;
440
441 static const int n = SIFT_ORI_HIST_BINS;
442 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) hist[n];
443
444 const Mat& img = dog_pyr[idx];
445 const Mat& prev = dog_pyr[idx-1];
446 const Mat& next = dog_pyr[idx+1];
447
448 for( int r = begin; r < end; r++)
449 {
450 const sift_wt* currptr = img.ptr<sift_wt>(y: r);
451 const sift_wt* prevptr = prev.ptr<sift_wt>(y: r);
452 const sift_wt* nextptr = next.ptr<sift_wt>(y: r);
453 int c = SIFT_IMG_BORDER;
454
455#if (CV_SIMD || CV_SIMD_SCALABLE) && !(DoG_TYPE_SHORT)
456 const int vecsize = VTraits<v_float32>::vlanes();
457 for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
458 {
459 v_float32 val = vx_load(ptr: &currptr[c]);
460 v_float32 _00,_01,_02;
461 v_float32 _10, _12;
462 v_float32 _20,_21,_22;
463
464 v_float32 vmin,vmax;
465
466
467 v_float32 cond = v_gt(a: v_abs(x: val), b: vx_setall_f32(v: (float)this->threshold));
468 if (!v_check_any(a: cond))
469 {
470 continue;
471 }
472
473 _00 = vx_load(ptr: &currptr[c-step-1]); _01 = vx_load(ptr: &currptr[c-step]); _02 = vx_load(ptr: &currptr[c-step+1]);
474 _10 = vx_load(ptr: &currptr[c -1]); _12 = vx_load(ptr: &currptr[c +1]);
475 _20 = vx_load(ptr: &currptr[c+step-1]); _21 = vx_load(ptr: &currptr[c+step]); _22 = vx_load(ptr: &currptr[c+step+1]);
476
477 vmax = v_max(a: v_max(a: v_max(a: _00,b: _01),b: v_max(a: _02,b: _10)),b: v_max(a: v_max(a: _12,b: _20),b: v_max(a: _21,b: _22)));
478 vmin = v_min(a: v_min(a: v_min(a: _00,b: _01),b: v_min(a: _02,b: _10)),b: v_min(a: v_min(a: _12,b: _20),b: v_min(a: _21,b: _22)));
479
480 v_float32 condp = v_and(a: v_and(a: cond, b: v_gt(a: val, b: vx_setall_f32(v: 0))), b: v_ge(a: val, b: vmax));
481 v_float32 condm = v_and(a: v_and(a: cond, b: v_lt(a: val, b: vx_setall_f32(v: 0))), b: v_le(a: val, b: vmin));
482
483 cond = v_or(a: condp, b: condm);
484 if (!v_check_any(a: cond))
485 {
486 continue;
487 }
488
489 _00 = vx_load(ptr: &prevptr[c-step-1]); _01 = vx_load(ptr: &prevptr[c-step]); _02 = vx_load(ptr: &prevptr[c-step+1]);
490 _10 = vx_load(ptr: &prevptr[c -1]); _12 = vx_load(ptr: &prevptr[c +1]);
491 _20 = vx_load(ptr: &prevptr[c+step-1]); _21 = vx_load(ptr: &prevptr[c+step]); _22 = vx_load(ptr: &prevptr[c+step+1]);
492
493 vmax = v_max(a: v_max(a: v_max(a: _00,b: _01),b: v_max(a: _02,b: _10)),b: v_max(a: v_max(a: _12,b: _20),b: v_max(a: _21,b: _22)));
494 vmin = v_min(a: v_min(a: v_min(a: _00,b: _01),b: v_min(a: _02,b: _10)),b: v_min(a: v_min(a: _12,b: _20),b: v_min(a: _21,b: _22)));
495
496 condp = v_and(a: condp, b: v_ge(a: val, b: vmax));
497 condm = v_and(a: condm, b: v_le(a: val, b: vmin));
498
499 cond = v_or(a: condp, b: condm);
500 if (!v_check_any(a: cond))
501 {
502 continue;
503 }
504
505 v_float32 _11p = vx_load(ptr: &prevptr[c]);
506 v_float32 _11n = vx_load(ptr: &nextptr[c]);
507
508 v_float32 max_middle = v_max(a: _11n,b: _11p);
509 v_float32 min_middle = v_min(a: _11n,b: _11p);
510
511 _00 = vx_load(ptr: &nextptr[c-step-1]); _01 = vx_load(ptr: &nextptr[c-step]); _02 = vx_load(ptr: &nextptr[c-step+1]);
512 _10 = vx_load(ptr: &nextptr[c -1]); _12 = vx_load(ptr: &nextptr[c +1]);
513 _20 = vx_load(ptr: &nextptr[c+step-1]); _21 = vx_load(ptr: &nextptr[c+step]); _22 = vx_load(ptr: &nextptr[c+step+1]);
514
515 vmax = v_max(a: v_max(a: v_max(a: _00,b: _01),b: v_max(a: _02,b: _10)),b: v_max(a: v_max(a: _12,b: _20),b: v_max(a: _21,b: _22)));
516 vmin = v_min(a: v_min(a: v_min(a: _00,b: _01),b: v_min(a: _02,b: _10)),b: v_min(a: v_min(a: _12,b: _20),b: v_min(a: _21,b: _22)));
517
518 condp = v_and(a: condp, b: v_ge(a: val, b: v_max(a: vmax, b: max_middle)));
519 condm = v_and(a: condm, b: v_le(a: val, b: v_min(a: vmin, b: min_middle)));
520
521 cond = v_or(a: condp, b: condm);
522 if (!v_check_any(a: cond))
523 {
524 continue;
525 }
526
527 int mask = v_signmask(a: cond);
528 for (int k = 0; k<vecsize;k++)
529 {
530 if ((mask & (1<<k)) == 0)
531 continue;
532
533 CV_TRACE_REGION("pixel_candidate_simd");
534
535 KeyPoint kpt;
536 int r1 = r, c1 = c+k, layer = i;
537 if( !adjustLocalExtrema(dog_pyr, kpt, octv: o, layer, r&: r1, c&: c1,
538 nOctaveLayers, contrastThreshold: (float)contrastThreshold,
539 edgeThreshold: (float)edgeThreshold, sigma: (float)sigma) )
540 continue;
541 float scl_octv = kpt.size*0.5f/(1 << o);
542 float omax = calcOrientationHist(img: gauss_pyr[o*(nOctaveLayers+3) + layer],
543 pt: Point(c1, r1),
544 radius: cvRound(value: SIFT_ORI_RADIUS * scl_octv),
545 sigma: SIFT_ORI_SIG_FCTR * scl_octv,
546 hist, n);
547 float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
548 for( int j = 0; j < n; j++ )
549 {
550 int l = j > 0 ? j - 1 : n - 1;
551 int r2 = j < n-1 ? j + 1 : 0;
552
553 if( hist[j] > hist[l] && hist[j] > hist[r2] && hist[j] >= mag_thr )
554 {
555 float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]);
556 bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
557 kpt.angle = 360.f - (float)((360.f/n) * bin);
558 if(std::abs(x: kpt.angle - 360.f) < FLT_EPSILON)
559 kpt.angle = 0.f;
560
561 kpts_.push_back(x: kpt);
562 }
563 }
564 }
565 }
566
567#endif //CV_SIMD && !(DoG_TYPE_SHORT)
568
569 // vector loop reminder, better predictibility and less branch density
570 for( ; c < cols-SIFT_IMG_BORDER; c++)
571 {
572 sift_wt val = currptr[c];
573 if (std::abs(x: val) <= threshold)
574 continue;
575
576 sift_wt _00,_01,_02;
577 sift_wt _10, _12;
578 sift_wt _20,_21,_22;
579 _00 = currptr[c-step-1]; _01 = currptr[c-step]; _02 = currptr[c-step+1];
580 _10 = currptr[c -1]; _12 = currptr[c +1];
581 _20 = currptr[c+step-1]; _21 = currptr[c+step]; _22 = currptr[c+step+1];
582
583 bool calculate = false;
584 if (val > 0)
585 {
586 sift_wt vmax = std::max(a: std::max(a: std::max(a: _00,b: _01),b: std::max(a: _02,b: _10)),b: std::max(a: std::max(a: _12,b: _20),b: std::max(a: _21,b: _22)));
587 if (val >= vmax)
588 {
589 _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
590 _10 = prevptr[c -1]; _12 = prevptr[c +1];
591 _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
592 vmax = std::max(a: std::max(a: std::max(a: _00,b: _01),b: std::max(a: _02,b: _10)),b: std::max(a: std::max(a: _12,b: _20),b: std::max(a: _21,b: _22)));
593 if (val >= vmax)
594 {
595 _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
596 _10 = nextptr[c -1]; _12 = nextptr[c +1];
597 _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
598 vmax = std::max(a: std::max(a: std::max(a: _00,b: _01),b: std::max(a: _02,b: _10)),b: std::max(a: std::max(a: _12,b: _20),b: std::max(a: _21,b: _22)));
599 if (val >= vmax)
600 {
601 sift_wt _11p = prevptr[c], _11n = nextptr[c];
602 calculate = (val >= std::max(a: _11p,b: _11n));
603 }
604 }
605 }
606
607 } else { // val cant be zero here (first abs took care of zero), must be negative
608 sift_wt vmin = std::min(a: std::min(a: std::min(a: _00,b: _01),b: std::min(a: _02,b: _10)),b: std::min(a: std::min(a: _12,b: _20),b: std::min(a: _21,b: _22)));
609 if (val <= vmin)
610 {
611 _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
612 _10 = prevptr[c -1]; _12 = prevptr[c +1];
613 _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
614 vmin = std::min(a: std::min(a: std::min(a: _00,b: _01),b: std::min(a: _02,b: _10)),b: std::min(a: std::min(a: _12,b: _20),b: std::min(a: _21,b: _22)));
615 if (val <= vmin)
616 {
617 _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
618 _10 = nextptr[c -1]; _12 = nextptr[c +1];
619 _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
620 vmin = std::min(a: std::min(a: std::min(a: _00,b: _01),b: std::min(a: _02,b: _10)),b: std::min(a: std::min(a: _12,b: _20),b: std::min(a: _21,b: _22)));
621 if (val <= vmin)
622 {
623 sift_wt _11p = prevptr[c], _11n = nextptr[c];
624 calculate = (val <= std::min(a: _11p,b: _11n));
625 }
626 }
627 }
628 }
629
630 if (calculate)
631 {
632 CV_TRACE_REGION("pixel_candidate");
633
634 KeyPoint kpt;
635 int r1 = r, c1 = c, layer = i;
636 if( !adjustLocalExtrema(dog_pyr, kpt, octv: o, layer, r&: r1, c&: c1,
637 nOctaveLayers, contrastThreshold: (float)contrastThreshold,
638 edgeThreshold: (float)edgeThreshold, sigma: (float)sigma) )
639 continue;
640 float scl_octv = kpt.size*0.5f/(1 << o);
641 float omax = calcOrientationHist(img: gauss_pyr[o*(nOctaveLayers+3) + layer],
642 pt: Point(c1, r1),
643 radius: cvRound(value: SIFT_ORI_RADIUS * scl_octv),
644 sigma: SIFT_ORI_SIG_FCTR * scl_octv,
645 hist, n);
646 float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
647 for( int j = 0; j < n; j++ )
648 {
649 int l = j > 0 ? j - 1 : n - 1;
650 int r2 = j < n-1 ? j + 1 : 0;
651
652 if( hist[j] > hist[l] && hist[j] > hist[r2] && hist[j] >= mag_thr )
653 {
654 float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]);
655 bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
656 kpt.angle = 360.f - (float)((360.f/n) * bin);
657 if(std::abs(x: kpt.angle - 360.f) < FLT_EPSILON)
658 kpt.angle = 0.f;
659
660 kpts_.push_back(x: kpt);
661 }
662 }
663 }
664 }
665 }
666 }
667private:
668 int o, i;
669 int threshold;
670 int idx, step, cols;
671 int nOctaveLayers;
672 double contrastThreshold;
673 double edgeThreshold;
674 double sigma;
675 const std::vector<Mat>& gauss_pyr;
676 const std::vector<Mat>& dog_pyr;
677 std::vector<KeyPoint>& kpts_;
678};
679
680} // namespace
681
682
683void findScaleSpaceExtrema(
684 int octave,
685 int layer,
686 int threshold,
687 int idx,
688 int step,
689 int cols,
690 int nOctaveLayers,
691 double contrastThreshold,
692 double edgeThreshold,
693 double sigma,
694 const std::vector<Mat>& gauss_pyr,
695 const std::vector<Mat>& dog_pyr,
696 std::vector<KeyPoint>& kpts,
697 const cv::Range& range)
698{
699 CV_TRACE_FUNCTION();
700
701 findScaleSpaceExtremaT(octave, layer, threshold, idx,
702 step, cols,
703 nOctaveLayers, contrastThreshold, edgeThreshold, sigma,
704 gauss_pyr, dog_pyr,
705 kpts)
706 .process(range);
707}
708
709void calcSIFTDescriptor(
710 const Mat& img, Point2f ptf, float ori, float scl,
711 const int d, const int n, Mat& dstMat, int row
712)
713{
714 CV_TRACE_FUNCTION();
715
716 Point pt(cvRound(value: ptf.x), cvRound(value: ptf.y));
717 float cos_t = cosf(x: ori*(float)(CV_PI/180));
718 float sin_t = sinf(x: ori*(float)(CV_PI/180));
719 float bins_per_rad = n / 360.f;
720 float exp_scale = -1.f/(d * d * 0.5f);
721 float hist_width = SIFT_DESCR_SCL_FCTR * scl;
722 int radius = cvRound(value: hist_width * 1.4142135623730951f * (d + 1) * 0.5f);
723 // Clip the radius to the diagonal of the image to avoid autobuffer too large exception
724 radius = std::min(a: radius, b: (int)std::sqrt(x: ((double) img.cols)*img.cols + ((double) img.rows)*img.rows));
725 cos_t /= hist_width;
726 sin_t /= hist_width;
727
728 int i, j, k;
729 const int len = (radius*2+1)*(radius*2+1);
730 const int len_hist = (d+2)*(d+2)*(n+2);
731 const int len_ddn = d * d * n;
732 int rows = img.rows, cols = img.cols;
733
734 cv::utils::BufferArea area;
735 float *X = 0, *Y = 0, *Mag, *Ori = 0, *W = 0, *RBin = 0, *CBin = 0, *hist = 0, *rawDst = 0;
736 area.allocate(ptr&: X, count: len, CV_SIMD_WIDTH);
737 area.allocate(ptr&: Y, count: len, CV_SIMD_WIDTH);
738 area.allocate(ptr&: Ori, count: len, CV_SIMD_WIDTH);
739 area.allocate(ptr&: W, count: len, CV_SIMD_WIDTH);
740 area.allocate(ptr&: RBin, count: len, CV_SIMD_WIDTH);
741 area.allocate(ptr&: CBin, count: len, CV_SIMD_WIDTH);
742 area.allocate(ptr&: hist, count: len_hist, CV_SIMD_WIDTH);
743 area.allocate(ptr&: rawDst, count: len_ddn, CV_SIMD_WIDTH);
744 area.commit();
745 Mag = Y;
746
747 for( i = 0; i < d+2; i++ )
748 {
749 for( j = 0; j < d+2; j++ )
750 for( k = 0; k < n+2; k++ )
751 hist[(i*(d+2) + j)*(n+2) + k] = 0.;
752 }
753
754 for( i = -radius, k = 0; i <= radius; i++ )
755 for( j = -radius; j <= radius; j++ )
756 {
757 // Calculate sample's histogram array coords rotated relative to ori.
758 // Subtract 0.5 so samples that fall e.g. in the center of row 1 (i.e.
759 // r_rot = 1.5) have full weight placed in row 1 after interpolation.
760 float c_rot = j * cos_t - i * sin_t;
761 float r_rot = j * sin_t + i * cos_t;
762 float rbin = r_rot + d/2 - 0.5f;
763 float cbin = c_rot + d/2 - 0.5f;
764 int r = pt.y + i, c = pt.x + j;
765
766 if( rbin > -1 && rbin < d && cbin > -1 && cbin < d &&
767 r > 0 && r < rows - 1 && c > 0 && c < cols - 1 )
768 {
769 float dx = (float)(img.at<sift_wt>(i0: r, i1: c+1) - img.at<sift_wt>(i0: r, i1: c-1));
770 float dy = (float)(img.at<sift_wt>(i0: r-1, i1: c) - img.at<sift_wt>(i0: r+1, i1: c));
771 X[k] = dx; Y[k] = dy; RBin[k] = rbin; CBin[k] = cbin;
772 W[k] = (c_rot * c_rot + r_rot * r_rot)*exp_scale;
773 k++;
774 }
775 }
776
777 const int len_left = k;
778 cv::hal::fastAtan2(y: Y, x: X, dst: Ori, n: len_left, angleInDegrees: true);
779 cv::hal::magnitude32f(x: X, y: Y, dst: Mag, n: len_left);
780 cv::hal::exp32f(src: W, dst: W, n: len_left);
781
782 k = 0;
783#if (CV_SIMD || CV_SIMD_SCALABLE)
784 {
785 const int vecsize = VTraits<v_float32>::vlanes();
786 int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[VTraits<v_float32>::max_nlanes];
787 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*VTraits<v_float32>::max_nlanes];
788 const v_float32 __ori = vx_setall_f32(v: ori);
789 const v_float32 __bins_per_rad = vx_setall_f32(v: bins_per_rad);
790 const v_int32 __n = vx_setall_s32(v: n);
791 const v_int32 __1 = vx_setall_s32(v: 1);
792 const v_int32 __d_plus_2 = vx_setall_s32(v: d+2);
793 const v_int32 __n_plus_2 = vx_setall_s32(v: n+2);
794 for( ; k <= len_left - vecsize; k += vecsize )
795 {
796 v_float32 rbin = vx_load_aligned(ptr: RBin + k);
797 v_float32 cbin = vx_load_aligned(ptr: CBin + k);
798 v_float32 obin = v_mul(a: v_sub(a: vx_load_aligned(ptr: Ori + k), b: __ori), b: __bins_per_rad);
799 v_float32 mag = v_mul(a: vx_load_aligned(ptr: Mag + k), b: vx_load_aligned(ptr: W + k));
800
801 v_int32 r0 = v_floor(a: rbin);
802 v_int32 c0 = v_floor(a: cbin);
803 v_int32 o0 = v_floor(a: obin);
804 rbin = v_sub(a: rbin, b: v_cvt_f32(a: r0));
805 cbin = v_sub(a: cbin, b: v_cvt_f32(a: c0));
806 obin = v_sub(a: obin, b: v_cvt_f32(a: o0));
807
808 o0 = v_select(mask: v_lt(a: o0, b: vx_setzero_s32()), a: v_add(a: o0, b: __n), b: o0);
809 o0 = v_select(mask: v_ge(a: o0, b: __n), a: v_sub(a: o0, b: __n), b: o0);
810
811 v_float32 v_r1 = v_mul(a: mag, b: rbin), v_r0 = v_sub(a: mag, b: v_r1);
812 v_float32 v_rc11 = v_mul(a: v_r1, b: cbin), v_rc10 = v_sub(a: v_r1, b: v_rc11);
813 v_float32 v_rc01 = v_mul(a: v_r0, b: cbin), v_rc00 = v_sub(a: v_r0, b: v_rc01);
814 v_float32 v_rco111 = v_mul(a: v_rc11, b: obin), v_rco110 = v_sub(a: v_rc11, b: v_rco111);
815 v_float32 v_rco101 = v_mul(a: v_rc10, b: obin), v_rco100 = v_sub(a: v_rc10, b: v_rco101);
816 v_float32 v_rco011 = v_mul(a: v_rc01, b: obin), v_rco010 = v_sub(a: v_rc01, b: v_rco011);
817 v_float32 v_rco001 = v_mul(a: v_rc00, b: obin), v_rco000 = v_sub(a: v_rc00, b: v_rco001);
818
819 v_int32 idx = v_muladd(a: v_muladd(a: v_add(a: r0, b: __1), b: __d_plus_2, c: v_add(a: c0, b: __1)), b: __n_plus_2, c: o0);
820 v_store_aligned(ptr: idx_buf, a: idx);
821
822 v_store_aligned(ptr: rco_buf, a: v_rco000);
823 v_store_aligned(ptr: rco_buf+vecsize, a: v_rco001);
824 v_store_aligned(ptr: rco_buf+vecsize*2, a: v_rco010);
825 v_store_aligned(ptr: rco_buf+vecsize*3, a: v_rco011);
826 v_store_aligned(ptr: rco_buf+vecsize*4, a: v_rco100);
827 v_store_aligned(ptr: rco_buf+vecsize*5, a: v_rco101);
828 v_store_aligned(ptr: rco_buf+vecsize*6, a: v_rco110);
829 v_store_aligned(ptr: rco_buf+vecsize*7, a: v_rco111);
830
831 for(int id = 0; id < vecsize; id++)
832 {
833 hist[idx_buf[id]] += rco_buf[id];
834 hist[idx_buf[id]+1] += rco_buf[vecsize + id];
835 hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id];
836 hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id];
837 hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id];
838 hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id];
839 hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id];
840 hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id];
841 }
842 }
843 }
844#endif
845 for( ; k < len_left; k++ )
846 {
847 float rbin = RBin[k], cbin = CBin[k];
848 float obin = (Ori[k] - ori)*bins_per_rad;
849 float mag = Mag[k]*W[k];
850
851 int r0 = cvFloor( value: rbin );
852 int c0 = cvFloor( value: cbin );
853 int o0 = cvFloor( value: obin );
854 rbin -= r0;
855 cbin -= c0;
856 obin -= o0;
857
858 if( o0 < 0 )
859 o0 += n;
860 if( o0 >= n )
861 o0 -= n;
862
863 // histogram update using tri-linear interpolation
864 float v_r1 = mag*rbin, v_r0 = mag - v_r1;
865 float v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
866 float v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
867 float v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
868 float v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
869 float v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
870 float v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
871
872 int idx = ((r0+1)*(d+2) + c0+1)*(n+2) + o0;
873 hist[idx] += v_rco000;
874 hist[idx+1] += v_rco001;
875 hist[idx+(n+2)] += v_rco010;
876 hist[idx+(n+3)] += v_rco011;
877 hist[idx+(d+2)*(n+2)] += v_rco100;
878 hist[idx+(d+2)*(n+2)+1] += v_rco101;
879 hist[idx+(d+3)*(n+2)] += v_rco110;
880 hist[idx+(d+3)*(n+2)+1] += v_rco111;
881 }
882
883 // finalize histogram, since the orientation histograms are circular
884 for( i = 0; i < d; i++ )
885 for( j = 0; j < d; j++ )
886 {
887 int idx = ((i+1)*(d+2) + (j+1))*(n+2);
888 hist[idx] += hist[idx+n];
889 hist[idx+1] += hist[idx+n+1];
890 for( k = 0; k < n; k++ )
891 rawDst[(i*d + j)*n + k] = hist[idx+k];
892 }
893 // copy histogram to the descriptor,
894 // apply hysteresis thresholding
895 // and scale the result, so that it can be easily converted
896 // to byte array
897 float nrm2 = 0;
898 k = 0;
899#if (CV_SIMD || CV_SIMD_SCALABLE)
900 {
901 v_float32 __nrm2 = vx_setzero_f32();
902 v_float32 __rawDst;
903 for( ; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
904 {
905 __rawDst = vx_load_aligned(ptr: rawDst + k);
906 __nrm2 = v_fma(a: __rawDst, b: __rawDst, c: __nrm2);
907 }
908 nrm2 = (float)v_reduce_sum(a: __nrm2);
909 }
910#endif
911 for( ; k < len_ddn; k++ )
912 nrm2 += rawDst[k]*rawDst[k];
913
914 const float thr = std::sqrt(x: nrm2)*SIFT_DESCR_MAG_THR;
915
916 i = 0, nrm2 = 0;
917#if 0 //CV_AVX2
918 // This code cannot be enabled because it sums nrm2 in a different order,
919 // thus producing slightly different results
920 {
921 float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8];
922 __m256 __dst;
923 __m256 __nrm2 = _mm256_setzero_ps();
924 __m256 __thr = _mm256_set1_ps(thr);
925 for( ; i <= len_ddn - 8; i += 8 )
926 {
927 __dst = _mm256_loadu_ps(&rawDst[i]);
928 __dst = _mm256_min_ps(__dst, __thr);
929 _mm256_storeu_ps(&rawDst[i], __dst);
930#if CV_FMA3
931 __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
932#else
933 __nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
934#endif
935 }
936 _mm256_store_ps(nrm2_buf, __nrm2);
937 nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
938 nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
939 }
940#endif
941 for( ; i < len_ddn; i++ )
942 {
943 float val = std::min(a: rawDst[i], b: thr);
944 rawDst[i] = val;
945 nrm2 += val*val;
946 }
947 nrm2 = SIFT_INT_DESCR_FCTR/std::max(a: std::sqrt(x: nrm2), FLT_EPSILON);
948
949#if 1
950 k = 0;
951if( dstMat.type() == CV_32F )
952{
953 float* dst = dstMat.ptr<float>(y: row);
954#if (CV_SIMD || CV_SIMD_SCALABLE)
955 v_float32 __dst;
956 v_float32 __min = vx_setzero_f32();
957 v_float32 __max = vx_setall_f32(v: 255.0f); // max of uchar
958 v_float32 __nrm2 = vx_setall_f32(v: nrm2);
959 for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
960 {
961 __dst = vx_load_aligned(ptr: rawDst + k);
962 __dst = v_min(a: v_max(a: v_cvt_f32(a: v_round(a: v_mul(a: __dst, b: __nrm2))), b: __min), b: __max);
963 v_store(ptr: dst + k, a: __dst);
964 }
965#endif
966#if defined(__GNUC__) && __GNUC__ >= 9
967#pragma GCC diagnostic push
968#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior
969#endif
970 for( ; k < len_ddn; k++ )
971 {
972 dst[k] = saturate_cast<uchar>(v: rawDst[k]*nrm2);
973 }
974#if defined(__GNUC__) && __GNUC__ >= 9
975#pragma GCC diagnostic pop
976#endif
977}
978else // CV_8U
979{
980 uint8_t* dst = dstMat.ptr<uint8_t>(y: row);
981#if (CV_SIMD || CV_SIMD_SCALABLE)
982 v_float32 __dst0, __dst1;
983 v_uint16 __pack01;
984 v_float32 __nrm2 = vx_setall_f32(v: nrm2);
985 for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 )
986 {
987 __dst0 = vx_load_aligned(ptr: rawDst + k);
988 __dst1 = vx_load_aligned(ptr: rawDst + k + VTraits<v_float32>::vlanes());
989
990 __pack01 = v_pack_u(a: v_round(a: v_mul(a: __dst0, b: __nrm2)), b: v_round(a: v_mul(a: __dst1, b: __nrm2)));
991 v_pack_store(ptr: dst + k, a: __pack01);
992 }
993#endif
994
995#if defined(__GNUC__) && __GNUC__ >= 9
996#pragma GCC diagnostic push
997#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior
998#endif
999 for( ; k < len_ddn; k++ )
1000 {
1001 dst[k] = saturate_cast<uchar>(v: rawDst[k]*nrm2);
1002 }
1003#if defined(__GNUC__) && __GNUC__ >= 9
1004#pragma GCC diagnostic pop
1005#endif
1006}
1007#else
1008 float nrm1 = 0;
1009 for( k = 0; k < len_ddn; k++ )
1010 {
1011 rawDst[k] *= nrm2;
1012 nrm1 += rawDst[k];
1013 }
1014 nrm1 = 1.f/std::max(nrm1, FLT_EPSILON);
1015 if( dstMat.type() == CV_32F )
1016 {
1017 float *dst = dstMat.ptr<float>(row);
1018 for( k = 0; k < len_ddn; k++ )
1019 {
1020 dst[k] = std::sqrt(rawDst[k] * nrm1);
1021 }
1022 }
1023 else // CV_8U
1024 {
1025 uint8_t *dst = dstMat.ptr<uint8_t>(row);
1026 for( k = 0; k < len_ddn; k++ )
1027 {
1028 dst[k] = saturate_cast<uchar>(std::sqrt(rawDst[k] * nrm1)*SIFT_INT_DESCR_FCTR);
1029 }
1030 }
1031#endif
1032}
1033
1034#endif
1035CV_CPU_OPTIMIZATION_NAMESPACE_END
1036} // namespace
1037

// NOTE(review): the following lines are web-page footer residue from the KDAB
// code-browser capture of this file, not part of the original source; they are
// preserved here as a comment so the translation unit remains valid C++.
//   Provided by KDAB
//   Privacy Policy
//   Improve your Profiling and Debugging skills
//   Find out more
//   source code of opencv/modules/features2d/src/sift.simd.hpp