/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//               For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencl_kernels_video.hpp"

using namespace std;
#define EPS 0.001F
#define INF 1E+10F

namespace cv {

class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
{
  public:
    DISOpticalFlowImpl();

    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
    void collectGarbage() CV_OVERRIDE;

  protected: //!< algorithm parameters
    int finest_scale, coarsest_scale;
    int patch_size;
    int patch_stride;
    int grad_descent_iter;
    int variational_refinement_iter;
    float variational_refinement_alpha;
    float variational_refinement_gamma;
    float variational_refinement_delta;
    float variational_refinement_epsilon;
    bool use_mean_normalization;
    bool use_spatial_propagation;

  protected: //!< some auxiliary variables
    int border_size;
    int w, h;   //!< flow buffer width and height on the current scale
    int ws, hs; //!< sparse flow buffer width and height on the current scale

  public:
    int getFinestScale() const CV_OVERRIDE { return finest_scale; }
    void setFinestScale(int val) CV_OVERRIDE { finest_scale = val; }
    int getPatchSize() const CV_OVERRIDE { return patch_size; }
    void setPatchSize(int val) CV_OVERRIDE { patch_size = val; }
    int getPatchStride() const CV_OVERRIDE { return patch_stride; }
    void setPatchStride(int val) CV_OVERRIDE { patch_stride = val; }
    int getGradientDescentIterations() const CV_OVERRIDE { return grad_descent_iter; }
    void setGradientDescentIterations(int val) CV_OVERRIDE { grad_descent_iter = val; }
    int getVariationalRefinementIterations() const CV_OVERRIDE { return variational_refinement_iter; }
    void setVariationalRefinementIterations(int val) CV_OVERRIDE { variational_refinement_iter = val; }
    float getVariationalRefinementAlpha() const CV_OVERRIDE { return variational_refinement_alpha; }
    void setVariationalRefinementAlpha(float val) CV_OVERRIDE { variational_refinement_alpha = val; }
    float getVariationalRefinementDelta() const CV_OVERRIDE { return variational_refinement_delta; }
    void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
    float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
    void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }
    float getVariationalRefinementEpsilon() const CV_OVERRIDE { return variational_refinement_epsilon; }
    void setVariationalRefinementEpsilon(float val) CV_OVERRIDE { variational_refinement_epsilon = val; }

    bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
    void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
    bool getUseSpatialPropagation() const CV_OVERRIDE { return use_spatial_propagation; }
    void setUseSpatialPropagation(bool val) CV_OVERRIDE { use_spatial_propagation = val; }

  protected: //!< internal buffers
    vector<Mat_<uchar> > I0s;     //!< Gaussian pyramid for the current frame
    vector<Mat_<uchar> > I1s;     //!< Gaussian pyramid for the next frame
    vector<Mat_<uchar> > I1s_ext; //!< I1s with borders

    vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<Mat_<float> > Ux; //!< x component of the flow vectors
    vector<Mat_<float> > Uy; //!< y component of the flow vectors

    vector<Mat_<float> > initial_Ux; //!< x component of the initial flow field, if one was passed as an input
    vector<Mat_<float> > initial_Uy; //!< y component of the initial flow field, if one was passed as an input

    Mat_<Vec2f> U; //!< a buffer for the merged flow

    Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
    Mat_<float> Sy; //!< intermediate sparse flow representation (y component)

    /* Structure tensor components: */
    Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
    Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
    Mat_<float> I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    Mat_<float> I0x_buf; //!< sum of x gradient values
    Mat_<float> I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation: */
    Mat_<float> I0xx_buf_aux;
    Mat_<float> I0yy_buf_aux;
    Mat_<float> I0xy_buf_aux;
    Mat_<float> I0x_buf_aux;
    Mat_<float> I0y_buf_aux;

    vector<Ptr<VariationalRefinement> > variational_refinement_processors;

  private: //!< private methods and parallel sections
    void prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow);
    void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
                                   Mat &I0y);
    int autoSelectCoarsestScale(int img_width);
    void autoSelectPatchSizeAndScales(int img_width);

    struct PatchInverseSearch_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int hs;
        Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
        int num_iter, pyr_level;

        PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                   Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter,
                                   int _pyr_level);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    struct Densification_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int h;
        Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;

        Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
                              Mat &src_Sy, Mat &_I0, Mat &_I1);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

#ifdef HAVE_OPENCL
    vector<UMat> u_I0s;     //!< Gaussian pyramid for the current frame
    vector<UMat> u_I1s;     //!< Gaussian pyramid for the next frame
    vector<UMat> u_I1s_ext; //!< I1s with borders

    vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<UMat> u_U;         //!< (x,y) components of the flow vectors (CV_32FC2)
    vector<UMat> u_initial_U; //!< (x,y) components of the initial flow field, if one was passed as an input (CV_32FC2)

    UMat u_S; //!< intermediate sparse flow representation (x,y components - CV_32FC2)

    /* Structure tensor components: */
    UMat u_I0xx_buf; //!< sum of squares of x gradient values
    UMat u_I0yy_buf; //!< sum of squares of y gradient values
    UMat u_I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    UMat u_I0x_buf; //!< sum of x gradient values
    UMat u_I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation: */
    UMat u_I0xx_buf_aux;
    UMat u_I0yy_buf_aux;
    UMat u_I0xy_buf_aux;
    UMat u_I0x_buf_aux;
    UMat u_I0y_buf_aux;

    bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
    void ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow);
    bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
    bool ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1);
    bool ocl_PatchInverseSearch(UMat &src_U,
                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
#endif
};

DISOpticalFlowImpl::DISOpticalFlowImpl()
{
    CV_INSTRUMENT_REGION();

    finest_scale = 2;
    patch_size = 8;
    patch_stride = 4;
    grad_descent_iter = 16;
    variational_refinement_iter = 5;
    variational_refinement_alpha = 20.f;
    variational_refinement_gamma = 10.f;
    variational_refinement_delta = 5.f;
    variational_refinement_epsilon = 0.01f;

    border_size = 16;
    use_mean_normalization = true;
    use_spatial_propagation = true;
    coarsest_scale = 10;

    /* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
    int max_possible_scales = 10;
    ws = hs = w = h = 0;
    for (int i = 0; i < max_possible_scales; i++)
        variational_refinement_processors.push_back(VariationalRefinement::create());
}
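
/* Note: these defaults match what DISOpticalFlow::create() at the end of this file configures
 * for DISOpticalFlow::PRESET_FAST. */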

void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
{
    CV_INSTRUMENT_REGION();

    I0s.resize(coarsest_scale + 1);
    I1s.resize(coarsest_scale + 1);
    I1s_ext.resize(coarsest_scale + 1);
    I0xs.resize(coarsest_scale + 1);
    I0ys.resize(coarsest_scale + 1);
    Ux.resize(coarsest_scale + 1);
    Uy.resize(coarsest_scale + 1);

    Mat flow_uv[2];
    if (use_flow)
    {
        split(flow, flow_uv);
        initial_Ux.resize(coarsest_scale + 1);
        initial_Uy.resize(coarsest_scale + 1);
    }

    int fraction = 1;
    int cur_rows = 0, cur_cols = 0;

    for (int i = 0; i <= coarsest_scale; i++)
    {
        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
        if (i == finest_scale)
        {
            cur_rows = I0.rows / fraction;
            cur_cols = I0.cols / fraction;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0, I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);

            /* These buffers are reused on each scale, so we initialize them once on the finest scale: */
            Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
            Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);

            I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);

            U.create(cur_rows, cur_cols);
        }
        else if (i > finest_scale)
        {
            cur_rows = I0s[i - 1].rows / 2;
            cur_cols = I0s[i - 1].cols / 2;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0s[i - 1], I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1s[i - 1], I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
        }

        if (i >= finest_scale)
        {
            I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
            copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
            I0xs[i].create(cur_rows, cur_cols);
            I0ys[i].create(cur_rows, cur_cols);
            spatialGradient(I0s[i], I0xs[i], I0ys[i]);
            Ux[i].create(cur_rows, cur_cols);
            Uy[i].create(cur_rows, cur_cols);
            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
            variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon);
            variational_refinement_processors[i]->setSorIterations(5);
            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);

            if (use_flow)
            {
                resize(flow_uv[0], initial_Ux[i], Size(cur_cols, cur_rows));
                initial_Ux[i] /= fraction;
                resize(flow_uv[1], initial_Uy[i], Size(cur_cols, cur_rows));
                initial_Uy[i] /= fraction;
            }
        }

        fraction *= 2;
    }
}

/* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
 * A simple box filter is not used here because these sums need to be computed on a sparse grid
 * and stored densely in the output buffers.
 */
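/* For reference, the five per-patch sums produced here form the 2x2 structure tensor
 *
 *     H = | sum(I0x*I0x)  sum(I0x*I0y) |
 *         | sum(I0x*I0y)  sum(I0y*I0y) |
 *
 * (each sum taken over one patch_size x patch_size patch), plus the plain gradient sums
 * sum(I0x) and sum(I0y) that are only needed by the mean-normalized patch distance.
 * H^-1 drives the Gauss-Newton steps in PatchInverseSearch_ParBody::operator() below.
 */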
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
                                                   Mat &dst_I0y, Mat &I0x, Mat &I0y)
{
    CV_INSTRUMENT_REGION();

    float *I0xx_ptr = dst_I0xx.ptr<float>();
    float *I0yy_ptr = dst_I0yy.ptr<float>();
    float *I0xy_ptr = dst_I0xy.ptr<float>();
    float *I0x_ptr = dst_I0x.ptr<float>();
    float *I0y_ptr = dst_I0y.ptr<float>();

    float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
    float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
    float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
    float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
    float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();

    /* Separable box filter: horizontal pass */
    for (int i = 0; i < h; i++)
    {
        float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
        short *x_row = I0x.ptr<short>(i);
        short *y_row = I0y.ptr<short>(i);
        for (int j = 0; j < patch_size; j++)
        {
            sum_xx += x_row[j] * x_row[j];
            sum_yy += y_row[j] * y_row[j];
            sum_xy += x_row[j] * y_row[j];
            sum_x += x_row[j];
            sum_y += y_row[j];
        }
        I0xx_aux_ptr[i * ws] = sum_xx;
        I0yy_aux_ptr[i * ws] = sum_yy;
        I0xy_aux_ptr[i * ws] = sum_xy;
        I0x_aux_ptr[i * ws] = sum_x;
        I0y_aux_ptr[i * ws] = sum_y;
        int js = 1;
        for (int j = patch_size; j < w; j++)
        {
            sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
            sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
            sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
            sum_x += (x_row[j] - x_row[j - patch_size]);
            sum_y += (y_row[j] - y_row[j - patch_size]);
            if ((j - patch_size + 1) % patch_stride == 0)
            {
                I0xx_aux_ptr[i * ws + js] = sum_xx;
                I0yy_aux_ptr[i * ws + js] = sum_yy;
                I0xy_aux_ptr[i * ws + js] = sum_xy;
                I0x_aux_ptr[i * ws + js] = sum_x;
                I0y_aux_ptr[i * ws + js] = sum_y;
                js++;
            }
        }
    }

    AutoBuffer<float> sum_xx(ws), sum_yy(ws), sum_xy(ws), sum_x(ws), sum_y(ws);
    for (int j = 0; j < ws; j++)
    {
        sum_xx[j] = 0.0f;
        sum_yy[j] = 0.0f;
        sum_xy[j] = 0.0f;
        sum_x[j] = 0.0f;
        sum_y[j] = 0.0f;
    }

    /* Separable box filter: vertical pass */
    for (int i = 0; i < patch_size; i++)
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += I0xx_aux_ptr[i * ws + j];
            sum_yy[j] += I0yy_aux_ptr[i * ws + j];
            sum_xy[j] += I0xy_aux_ptr[i * ws + j];
            sum_x[j] += I0x_aux_ptr[i * ws + j];
            sum_y[j] += I0y_aux_ptr[i * ws + j];
        }
    for (int j = 0; j < ws; j++)
    {
        I0xx_ptr[j] = sum_xx[j];
        I0yy_ptr[j] = sum_yy[j];
        I0xy_ptr[j] = sum_xy[j];
        I0x_ptr[j] = sum_x[j];
        I0y_ptr[j] = sum_y[j];
    }
    int is = 1;
    for (int i = patch_size; i < h; i++)
    {
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
            sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
            sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
            sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
            sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
        }
        if ((i - patch_size + 1) % patch_stride == 0)
        {
            for (int j = 0; j < ws; j++)
            {
                I0xx_ptr[is * ws + j] = sum_xx[j];
                I0yy_ptr[is * ws + j] = sum_yy[j];
                I0xy_ptr[is * ws + j] = sum_xy[j];
                I0x_ptr[is * ws + j] = sum_x[j];
                I0y_ptr[is * ws + j] = sum_y[j];
            }
            is++;
        }
    }
}

int DISOpticalFlowImpl::autoSelectCoarsestScale(int img_width)
{
    const int fratio = 5;
    return std::max(0, (int)std::floor(log2((2.0f * (float)img_width) / ((float)fratio * (float)patch_size))));
}
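
/* Worked example (values assumed for illustration, not taken from the source): with
 * img_width = 1024 and the default patch_size = 8 this returns
 * floor(log2((2 * 1024) / (5 * 8))) = floor(log2(51.2)) = 5, i.e. pyramid levels 0..5. */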

void DISOpticalFlowImpl::autoSelectPatchSizeAndScales(int img_width)
{
    switch (finest_scale)
    {
    case 1:
        patch_size = 8;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 2, 0);
        break;

    case 3:
        patch_size = 12;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 4, 0);
        break;

    case 4:
        patch_size = 12;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 5, 0);
        break;

    // default case, fall-through.
    case 2:
    default:
        patch_size = 8;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 2, 0);
        break;
    }
}

DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
                                                                           int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                                                           Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
                                                                           Mat &_I0x, Mat &_I0y, int _num_iter,
                                                                           int _pyr_level)
    : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
      I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter), pyr_level(_pyr_level)
{
    stripe_sz = (int)ceil(hs / (double)nstripes);
}

/////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////

/* Some auxiliary macros */
#define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION \
    v_float32x4 w00v = v_setall_f32(w00); \
    v_float32x4 w01v = v_setall_f32(w01); \
    v_float32x4 w10v = v_setall_f32(w10); \
    v_float32x4 w11v = v_setall_f32(w11); \
    \
    v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp; \
    v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left; \
    v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right, \
        I1_row_next_shifted_4_right; \
    v_float32x4 I_diff_left, I_diff_right; \
    \
    /* Preload and expand the first row of I1: */ \
    I1_row_8 = v_load_expand(I1_ptr); \
    I1_row_shifted_8 = v_load_expand(I1_ptr + 1); \
    v_expand(I1_row_8, I1_row_4_left, I1_row_4_right); \
    v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right); \
    I1_ptr += I1_stride;

#define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION \
    /* Load the next row of I1: */ \
    I1_row_next_8 = v_load_expand(I1_ptr); \
    I1_row_next_shifted_8 = v_load_expand(I1_ptr + 1); \
    /* Separate the left and right halves: */ \
    v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right); \
    v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right); \
    \
    /* Load the current row of I0: */ \
    I0_row_8 = v_load_expand(I0_ptr); \
    v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \
    \
    /* Compute diffs between I0 and bilinearly interpolated I1: */ \
    I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))), \
                              v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))), \
                              v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))), \
                              v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))), \
                        v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left))); \
    I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))), \
                               v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))), \
                               v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))), \
                               v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))), \
                         v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)));

#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \
    I0_ptr += I0_stride; \
    I1_ptr += I1_stride; \
    \
    I1_row_4_left = I1_row_next_4_left; \
    I1_row_4_right = I1_row_next_4_right; \
    I1_row_shifted_4_left = I1_row_next_shifted_4_left; \
    I1_row_shifted_4_right = I1_row_next_shifted_4_right;

/* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
 * given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and that w00, w01, w10,
 * w11 are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these
 * patches and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is
 * implemented only for the default patch size (8x8). Everything is processed in floats, as using fixed-point
 * approximations harms the quality significantly.
 */
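/* In equation form (a restatement of the code, not additional behaviour): with
 * d = bilinear(I1) - I0 evaluated over the patch, this computes
 *
 *     SSD = sum(d^2),  dst_dUx = sum(d * I0x),  dst_dUy = sum(d * I0y),
 *
 * and the caller applies the Gauss-Newton step (dx, dy)^T = H^-1 * (dst_dUx, dst_dUy)^T
 * with H the structure tensor from precomputeStructureTensor().
 */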
inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
                          int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
{
    float SSD = 0.0f;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 Ux_vec = v_setall_f32(0);
        v_float32x4 Uy_vec = v_setall_f32(0);
        v_float32x4 SSD_vec = v_setall_f32(0);

        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;

        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);

            /* Update the sums: */
            Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
            Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));

            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }

        /* Final reduce operations: */
        dst_dUx = v_reduce_sum(Ux_vec);
        dst_dUy = v_reduce_sum(Uy_vec);
        SSD = v_reduce_sum(SSD_vec);
    }
    else
#endif
    {
        dst_dUx = 0.0f;
        dst_dUy = 0.0f;
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                SSD += diff * diff;
                dst_dUx += diff * I0x_ptr[i * I0_stride + j];
                dst_dUy += diff * I0y_ptr[i * I0_stride + j];
            }
    }
    return SSD;
}

/* Same as processPatch, but with patch mean normalization, which improves robustness under changing
 * lighting conditions
 */
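/* Mathematically (restating the return value below): with s = sum(d), q = sum(d^2) and
 * n = patch_sz^2, the mean-normalized patch distance is q - s^2 / n, i.e. the SSD between
 * mean-subtracted patches, and the gradient sums are corrected by -s * {x,y}_grad_sum / n.
 */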
inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
                                  short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                                  float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
{
    float sum_diff = 0.0, sum_diff_sq = 0.0;
    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
    float n = (float)patch_sz * patch_sz;

#if CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
        v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);

        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;

        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);

            /* Update the sums: */
            sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
            sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));

            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }

        /* Final reduce operations: */
        sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
        sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
#endif
    {
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                sum_diff += diff;
                sum_diff_sq += diff * diff;

                sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
                sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
            }
    }
    dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
    dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
    return sum_diff_sq - sum_diff * sum_diff / n;
}

/* Similar to processPatch, but computes only the sum of squared differences (SSD) between the patches */
inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                        float w11, int patch_sz)
{
    float SSD = 0.0f;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 SSD_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        SSD = v_reduce_sum(SSD_vec);
    }
    else
#endif
    {
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                SSD += diff * diff;
            }
    }
    return SSD;
}

/* Same as computeSSD, but with patch mean normalization */
inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
                                float w10, float w11, int patch_sz)
{
    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
    float n = (float)patch_sz * patch_sz;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
    {
#endif
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                sum_diff += diff;
                sum_diff_sq += diff * diff;
            }
#if CV_SIMD128
    }
#endif
    return sum_diff_sq - sum_diff * sum_diff / n;
}

#undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
{
    CV_INSTRUMENT_REGION();

    // Force separate processing of stripes if we are using spatial propagation:
    if (dis->use_spatial_propagation && range.end > range.start + 1)
    {
        for (int n = range.start; n < range.end; n++)
            (*this)(Range(n, n + 1));
        return;
    }
    int psz = dis->patch_size;
    int psz2 = psz / 2;
    int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
    int bsz = dis->border_size;

    /* Input dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();

    /* Output sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();

    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();
    short *I0x_ptr = I0x->ptr<short>();
    short *I0y_ptr = I0y->ptr<short>();

    /* Precomputed structure tensor */
    float *xx_ptr = dis->I0xx_buf.ptr<float>();
    float *yy_ptr = dis->I0yy_buf.ptr<float>();
    float *xy_ptr = dis->I0xy_buf.ptr<float>();
    /* And extra buffers for mean-normalization: */
    float *x_ptr = dis->I0x_buf.ptr<float>();
    float *y_ptr = dis->I0y_buf.ptr<float>();

    bool use_temporal_candidates = false;
    float *initial_Ux_ptr = NULL, *initial_Uy_ptr = NULL;
    if (!dis->initial_Ux.empty())
    {
        initial_Ux_ptr = dis->initial_Ux[pyr_level].ptr<float>();
        initial_Uy_ptr = dis->initial_Uy[pyr_level].ptr<float>();
        use_temporal_candidates = true;
    }

    int i, j, dir;
    int start_is, end_is, start_js, end_js;
    int start_i, start_j;
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + dis->h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + dis->w - 1.0f;
    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;

#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
    { \
        float di = i_I1 - floor(i_I1); \
        float dj = j_I1 - floor(j_I1); \
        w11 = di * dj; \
        w10 = di * (1 - dj); \
        w01 = (1 - di) * dj; \
        w00 = (1 - di) * (1 - dj); \
    }

#define COMPUTE_SSD(dst, Ux, Uy) \
    INIT_BILINEAR_WEIGHTS(Ux, Uy); \
    if (dis->use_mean_normalization) \
        dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, \
                                 w01, w10, w11, psz); \
    else \
        dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01, \
                         w10, w11, psz);

    int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
    for (int iter = 0; iter < num_iter; iter++)
    {
        if (iter % 2 == 0)
        {
            dir = 1;
            start_is = min(range.start * stripe_sz, hs);
            end_is = min(range.end * stripe_sz, hs);
            start_js = 0;
            end_js = dis->ws;
            start_i = start_is * dis->patch_stride;
            start_j = 0;
        }
        else
        {
            dir = -1;
            start_is = min(range.end * stripe_sz, hs) - 1;
            end_is = min(range.start * stripe_sz, hs) - 1;
            start_js = dis->ws - 1;
            end_js = -1;
            start_i = start_is * dis->patch_stride;
            start_j = (dis->ws - 1) * dis->patch_stride;
        }

        i = start_i;
        for (int is = start_is; dir * is < dir * end_is; is += dir)
        {
            j = start_j;
            for (int js = start_js; dir * js < dir * end_js; js += dir)
            {
                if (iter == 0)
                {
                    /* Use the result from the previous pyramid level as the very first approximation: */
                    Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
                    Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
                }

                float min_SSD = INF, cur_SSD;
                if (use_temporal_candidates || dis->use_spatial_propagation)
                {
                    COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
                }

                if (use_temporal_candidates)
                {
                    /* Try temporal candidates (vectors from the initial flow field that was passed to the function) */
                    COMPUTE_SSD(cur_SSD, initial_Ux_ptr[(i + psz2) * dis->w + j + psz2],
                                initial_Uy_ptr[(i + psz2) * dis->w + j + psz2]);
                    if (cur_SSD < min_SSD)
                    {
                        min_SSD = cur_SSD;
                        Sx_ptr[is * dis->ws + js] = initial_Ux_ptr[(i + psz2) * dis->w + j + psz2];
                        Sy_ptr[is * dis->ws + js] = initial_Uy_ptr[(i + psz2) * dis->w + j + psz2];
                    }
                }

                if (dis->use_spatial_propagation)
                {
                    /* Try spatial candidates: */
                    if (dir * js > dir * start_js)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
                        }
                    }
                    /* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
                     * the number of stripes constant. It works well enough in practice and doesn't introduce any
                     * visible seams.
                     */
                    if (dir * is > dir * start_is)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
                        }
                    }
                }

                /* Use the best candidate as a starting point for the gradient descent: */
                float cur_Ux = Sx_ptr[is * dis->ws + js];
                float cur_Uy = Sy_ptr[is * dis->ws + js];

                /* Compute the inverse of the structure tensor: */
                float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
                             xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
                if (abs(detH) < EPS)
                    detH = EPS;
                float invH11 = yy_ptr[is * dis->ws + js] / detH;
                float invH12 = -xy_ptr[is * dis->ws + js] / detH;
                float invH22 = xx_ptr[is * dis->ws + js] / detH;
                float prev_SSD = INF, SSD;
                float x_grad_sum = x_ptr[is * dis->ws + js];
                float y_grad_sum = y_ptr[is * dis->ws + js];

                for (int t = 0; t < num_inner_iter; t++)
                {
                    INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
                    if (dis->use_mean_normalization)
                        SSD = processPatchMeanNorm(dUx, dUy,
                                                   I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                                   I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
                                                   dis->w, w_ext, w00, w01, w10, w11, psz,
                                                   x_grad_sum, y_grad_sum);
                    else
                        SSD = processPatch(dUx, dUy,
                                           I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                           I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
                                           dis->w, w_ext, w00, w01, w10, w11, psz);

                    dx = invH11 * dUx + invH12 * dUy;
                    dy = invH12 * dUx + invH22 * dUy;
                    cur_Ux -= dx;
                    cur_Uy -= dy;

                    /* Break when the patch distance stops decreasing */
                    if (SSD >= prev_SSD)
                        break;
                    prev_SSD = SSD;
                }

                /* If gradient descent converged to a flow vector that is very far from the initial approximation
                 * (more than the patch size) then we don't use it. Noticeably improves the robustness.
                 */
                if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
                {
                    Sx_ptr[is * dis->ws + js] = cur_Ux;
                    Sy_ptr[is * dis->ws + js] = cur_Uy;
                }
                j += dir * dis->patch_stride;
            }
            i += dir * dis->patch_stride;
        }
    }
#undef INIT_BILINEAR_WEIGHTS
#undef COMPUTE_SSD
}

DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
                                                                 Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
                                                                 Mat &_I0, Mat &_I1)
    : dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
{
    stripe_sz = (int)ceil(h / (double)nstripes);
}

/* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
 * on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of the values from
 * the overlapping patches.
 */
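/* In formula form (a restatement of the loop below): for every dense pixel (i, j),
 *
 *     U(i, j) = sum_p(coef_p * S_p) / sum_p(coef_p),  coef_p = 1 / max(1, |diff_p|),
 *
 * where p runs over all patches containing (i, j), S_p is the sparse flow vector of patch p,
 * and diff_p is the bilinearly interpolated photometric error of that vector at (i, j).
 */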
void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
{
    CV_INSTRUMENT_REGION();

    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);

    /* Input sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();

    /* Output dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();

    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();

    int psz = dis->patch_size;
    int pstr = dis->patch_stride;
    int i_l, i_u;
    int j_l, j_u;
    float i_m, j_m, diff;

    /* These values define the set of sparse grid locations that contain patches overlapping with the current dense
     * flow location */
    int start_is, end_is;
    int start_js, end_js;

/* Some helper macros for updating this set of sparse grid locations */
#define UPDATE_SPARSE_I_COORDINATES \
    if (i % pstr == 0 && i + psz <= h) \
        end_is++; \
    if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is) \
        start_is++;

#define UPDATE_SPARSE_J_COORDINATES \
    if (j % pstr == 0 && j + psz <= dis->w) \
        end_js++; \
    if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js) \
        start_js++;

    start_is = 0;
    end_is = -1;
    for (int i = 0; i < start_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
    }
    for (int i = start_i; i < end_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
        start_js = 0;
        end_js = -1;
        for (int j = 0; j < dis->w; j++)
        {
            UPDATE_SPARSE_J_COORDINATES;
            float coef, sum_coef = 0.0f;
            float sum_Ux = 0.0f;
            float sum_Uy = 0.0f;

            /* Iterate through all the patches that overlap the current location (i,j) */
            for (int is = start_is; is <= end_is; is++)
                for (int js = start_js; js <= end_js; js++)
                {
                    j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
                    i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
                    j_l = (int)j_m;
                    j_u = j_l + 1;
                    i_l = (int)i_m;
                    i_u = i_l + 1;
                    diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
                           (j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
                           (j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
                           (j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
                    coef = 1 / max(1.0f, abs(diff));
                    sum_Ux += coef * Sx_ptr[is * dis->ws + js];
                    sum_Uy += coef * Sy_ptr[is * dis->ws + js];
                    sum_coef += coef;
                }
            CV_DbgAssert(sum_coef != 0);
            Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
            Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
        }
    }
#undef UPDATE_SPARSE_I_COORDINATES
#undef UPDATE_SPARSE_J_COORDINATES
}

#ifdef HAVE_OPENCL
bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_U,
                                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int /*pyr_level*/)
{
    CV_INSTRUMENT_REGION();
    CV_INSTRUMENT_REGION_OPENCL();

    size_t globalSize[] = {(size_t)ws, (size_t)hs};
    size_t localSize[] = {16, 16};
    int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);

    String subgroups_build_options;
    if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
        subgroups_build_options = " -DCV_USE_SUBGROUPS=1";

    String build_options = cv::format(
        "-DDIS_BORDER_SIZE=%d -DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
        border_size, patch_size, patch_stride
    ) + subgroups_build_options;

#if 0 // OpenCL debug
    u_Sx = Scalar::all(0);
    u_Sy = Scalar::all(0);
#endif

    CV_Assert(num_iter == 2);
    for (int iter = 0; iter < num_iter; iter++)
    {
        if (iter == 0)
        {
            ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, build_options);
            size_t global_sz[] = {(size_t)hs * 8};
            size_t local_sz[] = {8};

            k1.args(
                ocl::KernelArg::PtrReadOnly(src_U),
                ocl::KernelArg::PtrReadOnly(I0),
                ocl::KernelArg::PtrReadOnly(I1),
                (int)w, (int)h, (int)ws, (int)hs,
                ocl::KernelArg::PtrWriteOnly(u_S)
            );
            if (!k1.run(1, global_sz, local_sz, false))
                return false;

            ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc, build_options);

            k2.args(
                ocl::KernelArg::PtrReadOnly(src_U),
                ocl::KernelArg::PtrReadOnly(I0),
                ocl::KernelArg::PtrReadOnly(I1),
                ocl::KernelArg::PtrReadOnly(I0x),
                ocl::KernelArg::PtrReadOnly(I0y),
                ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
                ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
                ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
                ocl::KernelArg::PtrReadOnly(u_I0x_buf),
                ocl::KernelArg::PtrReadOnly(u_I0y_buf),
                (int)w, (int)h, (int)ws, (int)hs,
                (int)num_inner_iter,
                ocl::KernelArg::PtrReadWrite(u_S)
            );
            if (!k2.run(2, globalSize, localSize, false))
                return false;
        }
        else
        {
            ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, build_options);
            size_t global_sz[] = {(size_t)hs * 8};
            size_t local_sz[] = {8};

            k3.args(
                ocl::KernelArg::PtrReadOnly(I0),
                ocl::KernelArg::PtrReadOnly(I1),
                (int)w, (int)h, (int)ws, (int)hs,
                ocl::KernelArg::PtrReadWrite(u_S)
            );
            if (!k3.run(1, global_sz, local_sz, false))
                return false;

            ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc, build_options);

            k4.args(
                ocl::KernelArg::PtrReadOnly(I0),
                ocl::KernelArg::PtrReadOnly(I1),
                ocl::KernelArg::PtrReadOnly(I0x),
                ocl::KernelArg::PtrReadOnly(I0y),
                ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
                ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
                ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
                ocl::KernelArg::PtrReadOnly(u_I0x_buf),
                ocl::KernelArg::PtrReadOnly(u_I0y_buf),
                (int)w, (int)h, (int)ws, (int)hs,
                (int)num_inner_iter,
                ocl::KernelArg::PtrReadWrite(u_S)
            );
            if (!k4.run(2, globalSize, localSize, false))
                return false;
        }
    }
    return true;
}

bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1)
{
    CV_INSTRUMENT_REGION();
    CV_INSTRUMENT_REGION_OPENCL();

    size_t globalSize[] = {(size_t)w, (size_t)h};
    size_t localSize[] = {16, 16};

    String build_options = cv::format(
        "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
        patch_size, patch_stride
    );

    ocl::Kernel kernel("dis_densification", ocl::video::dis_flow_oclsrc, build_options);
    kernel.args(
        ocl::KernelArg::PtrReadOnly(src_S),
        ocl::KernelArg::PtrReadOnly(_I0),
        ocl::KernelArg::PtrReadOnly(_I1),
        (int)w, (int)h, (int)ws,
        ocl::KernelArg::PtrWriteOnly(dst_U)
    );
    return kernel.run(2, globalSize, localSize, false);
}

void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow)
{
    CV_INSTRUMENT_REGION();
    // not pure OpenCL code: CV_INSTRUMENT_REGION_OPENCL();

    u_I0s.resize(coarsest_scale + 1);
    u_I1s.resize(coarsest_scale + 1);
    u_I1s_ext.resize(coarsest_scale + 1);
    u_I0xs.resize(coarsest_scale + 1);
    u_I0ys.resize(coarsest_scale + 1);
    u_U.resize(coarsest_scale + 1);

    if (use_flow)
    {
        u_initial_U.resize(coarsest_scale + 1);
    }

    int fraction = 1;
    int cur_rows = 0, cur_cols = 0;

    for (int i = 0; i <= coarsest_scale; i++)
    {
        CV_TRACE_REGION("coarsest_scale_iteration");
        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
        if (i == finest_scale)
        {
            cur_rows = I0.rows / fraction;
            cur_cols = I0.cols / fraction;
            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);

            /* These buffers are reused on each scale, so we initialize them once on the finest scale: */
            u_S.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC2);
            u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
            u_I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);

            u_I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
            u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
        }
        else if (i > finest_scale)
        {
            cur_rows = u_I0s[i - 1].rows / 2;
            cur_cols = u_I0s[i - 1].cols / 2;
            u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
            u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
            resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
        }

        if (i >= finest_scale)
        {
            u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1);
            copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size,
                           BORDER_REPLICATE);
            u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1);
            u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1);
            spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]);
            u_U[i].create(cur_rows, cur_cols, CV_32FC2);
            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
            variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon);
            variational_refinement_processors[i]->setSorIterations(5);
            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);

            if (use_flow)
            {
                UMat resized_flow;
                resize(flow, resized_flow, Size(cur_cols, cur_rows));
                float scale = 1.0f / fraction;
                resized_flow.convertTo(u_initial_U[i], CV_32FC2, scale, 0.0f);
            }
        }

        fraction *= 2;
    }
}

bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
{
    CV_INSTRUMENT_REGION();
    CV_INSTRUMENT_REGION_OPENCL();

    size_t globalSizeX[] = {(size_t)h};
    size_t localSizeX[] = {16};

#if 0 // OpenCL debug
    u_I0xx_buf_aux = Scalar::all(0);
    u_I0yy_buf_aux = Scalar::all(0);
    u_I0xy_buf_aux = Scalar::all(0);
    u_I0x_buf_aux = Scalar::all(0);
    u_I0y_buf_aux = Scalar::all(0);
    dst_I0xx = Scalar::all(0);
    dst_I0yy = Scalar::all(0);
    dst_I0xy = Scalar::all(0);
    dst_I0x = Scalar::all(0);
    dst_I0y = Scalar::all(0);
#endif

    String build_options = cv::format(
        "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
        patch_size, patch_stride
    );

    ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc, build_options);
    kernelX.args(
        ocl::KernelArg::PtrReadOnly(I0x),
        ocl::KernelArg::PtrReadOnly(I0y),
        (int)w, (int)h, (int)ws,
        ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux)
    );
    if (!kernelX.run(1, globalSizeX, localSizeX, false))
        return false;

    size_t globalSizeY[] = {(size_t)ws};
    size_t localSizeY[] = {16};

    ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc, build_options);
    kernelY.args(
        ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
        (int)w, (int)h, (int)ws,
        ocl::KernelArg::PtrWriteOnly(dst_I0xx),
        ocl::KernelArg::PtrWriteOnly(dst_I0yy),
        ocl::KernelArg::PtrWriteOnly(dst_I0xy),
        ocl::KernelArg::PtrWriteOnly(dst_I0x),
        ocl::KernelArg::PtrWriteOnly(dst_I0y)
    );
    return kernelY.run(1, globalSizeY, localSizeY, false);
}

bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_INSTRUMENT_REGION();
    // not pure OpenCL code: CV_INSTRUMENT_REGION_OPENCL();

    UMat I0Mat = I0.getUMat();
    UMat I1Mat = I1.getUMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* The original code searches for a maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));             /* Deepest pyramid level still greater than or equal to the patch size */

    if (coarsest_scale < 0)
        CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");

    if (coarsest_scale < finest_scale)
    {
        // Choose the finest level based on the coarsest level.
        // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
        int original_img_width = I0.size().width;
        autoSelectPatchSizeAndScales(original_img_width);
    }

    ocl_prepareBuffers(I0Mat, I1Mat, flow, use_input_flow);
    u_U[coarsest_scale].setTo(0.0f);

    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        CV_TRACE_REGION("coarsest_scale_iteration");
        w = u_I0s[i].cols;
        h = u_I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;

        if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
                                           u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
            return false;

        if (!ocl_PatchInverseSearch(u_U[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
            return false;

        if (!ocl_Densification(u_U[i], u_S, u_I0s[i], u_I1s[i]))
            return false;

        if (variational_refinement_iter > 0)
        {
            std::vector<Mat> U_channels;
            split(u_U[i], U_channels);
            CV_Assert(U_channels.size() == 2);
            variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
                                                         U_channels[0], U_channels[1]);
            merge(U_channels, u_U[i]);
        }

        if (i > finest_scale)
        {
            UMat resized;
            resize(u_U[i], resized, u_U[i - 1].size());
            multiply(resized, 2, u_U[i - 1]);
        }
    }

    UMat resized_flow;
    resize(u_U[finest_scale], resized_flow, I1Mat.size());
    multiply(resized_flow, 1 << finest_scale, flow);

    return true;
}
#endif

void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_INSTRUMENT_REGION();

    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert(I0.isContinuous());
    CV_Assert(I1.isContinuous());

    CV_OCL_RUN(flow.isUMat() &&
               (patch_size == 8) && (use_spatial_propagation == true),
               ocl_calc(I0, I1, flow));

    Mat I0Mat = I0.getMat();
    Mat I1Mat = I1.getMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    else
        flow.create(I1Mat.size(), CV_32FC2);
    Mat flowMat = flow.getMat();
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* The original code searches for a maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));             /* Deepest pyramid level still greater than or equal to the patch size */
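    /* Worked example (values assumed for illustration): a 640x480 frame with the default
     * patch_size = 8 gives min((int)(log2(640 / 32.0) + 0.5), (int)log2(480 / 8.0)) = min(4, 5) = 4. */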

    if (coarsest_scale < 0)
        CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");

    if (coarsest_scale < finest_scale)
    {
        // Choose the finest level based on the coarsest level.
        // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
        int original_img_width = I0.size().width;
        autoSelectPatchSizeAndScales(original_img_width);
    }

    int num_stripes = getNumThreads();

    prepareBuffers(I0Mat, I1Mat, flowMat, use_input_flow);
    Ux[coarsest_scale].setTo(0.0f);
    Uy[coarsest_scale].setTo(0.0f);

    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        CV_TRACE_REGION("coarsest_scale_iteration");
        w = I0s[i].cols;
        h = I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;

        precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
        if (use_spatial_propagation)
        {
            /* Use a fixed number of stripes regardless of the number of threads to make inverse search
             * with spatial propagation reproducible
             */
            parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
                                                                  I1s_ext[i], I0xs[i], I0ys[i], 2, i));
        }
        else
        {
            parallel_for_(Range(0, num_stripes),
                          PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
                                                     I0xs[i], I0ys[i], 1, i));
        }

        parallel_for_(Range(0, num_stripes),
                      Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
        if (variational_refinement_iter > 0)
            variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);

        if (i > finest_scale)
        {
            resize(Ux[i], Ux[i - 1], Ux[i - 1].size());
            resize(Uy[i], Uy[i - 1], Uy[i - 1].size());
            Ux[i - 1] *= 2;
            Uy[i - 1] *= 2;
        }
    }
    Mat uxy[] = {Ux[finest_scale], Uy[finest_scale]};
    merge(uxy, 2, U);
    resize(U, flowMat, flowMat.size());
    flowMat *= 1 << finest_scale;
}

void DISOpticalFlowImpl::collectGarbage()
{
    CV_INSTRUMENT_REGION();

    I0s.clear();
    I1s.clear();
    I1s_ext.clear();
    I0xs.clear();
    I0ys.clear();
    Ux.clear();
    Uy.clear();
    U.release();
    Sx.release();
    Sy.release();
    I0xx_buf.release();
    I0yy_buf.release();
    I0xy_buf.release();
    I0xx_buf_aux.release();
    I0yy_buf_aux.release();
    I0xy_buf_aux.release();

#ifdef HAVE_OPENCL
    u_I0s.clear();
    u_I1s.clear();
    u_I1s_ext.clear();
    u_I0xs.clear();
    u_I0ys.clear();
    u_U.clear();
    u_S.release();
    u_I0xx_buf.release();
    u_I0yy_buf.release();
    u_I0xy_buf.release();
    u_I0xx_buf_aux.release();
    u_I0yy_buf_aux.release();
    u_I0xy_buf_aux.release();
#endif

    for (int i = finest_scale; i <= coarsest_scale; i++)
        variational_refinement_processors[i]->collectGarbage();
    variational_refinement_processors.clear();
}

Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
{
    CV_INSTRUMENT_REGION();

    Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
    dis->setPatchSize(8);
    if (preset == DISOpticalFlow::PRESET_ULTRAFAST)
    {
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(12);
        dis->setVariationalRefinementIterations(0);
    }
    else if (preset == DISOpticalFlow::PRESET_FAST)
    {
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(16);
        dis->setVariationalRefinementIterations(5);
    }
    else if (preset == DISOpticalFlow::PRESET_MEDIUM)
    {
        dis->setFinestScale(1);
        dis->setPatchStride(3);
        dis->setGradientDescentIterations(25);
        dis->setVariationalRefinementIterations(5);
    }

    return dis;
}

} // namespace
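
/* Minimal usage sketch (illustrative only, not part of the library; the file names are
 * hypothetical):
 *
 *     cv::Mat prev = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
 *     cv::Mat next = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);
 *     cv::Mat flow; // created as CV_32FC2 by calc()
 *     cv::Ptr<cv::DISOpticalFlow> dis = cv::DISOpticalFlow::create(cv::DISOpticalFlow::PRESET_MEDIUM);
 *     dis->calc(prev, next, flow);
 */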