/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencl_kernels_video.hpp"

using namespace std;
#define EPS 0.001F
#define INF 1E+10F

namespace cv {

class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
{
  public:
    DISOpticalFlowImpl();

    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
    void collectGarbage() CV_OVERRIDE;

  protected: //!< algorithm parameters
    int finest_scale, coarsest_scale;
    int patch_size;
    int patch_stride;
    int grad_descent_iter;
    int variational_refinement_iter;
    float variational_refinement_alpha;
    float variational_refinement_gamma;
    float variational_refinement_delta;
    float variational_refinement_epsilon;
    bool use_mean_normalization;
    bool use_spatial_propagation;

  protected: //!< some auxiliary variables
    int border_size;
    int w, h;   //!< flow buffer width and height on the current scale
    int ws, hs; //!< sparse flow buffer width and height on the current scale

  public:
    int getFinestScale() const CV_OVERRIDE { return finest_scale; }
    void setFinestScale(int val) CV_OVERRIDE { finest_scale = val; }
    int getPatchSize() const CV_OVERRIDE { return patch_size; }
    void setPatchSize(int val) CV_OVERRIDE { patch_size = val; }
    int getPatchStride() const CV_OVERRIDE { return patch_stride; }
    void setPatchStride(int val) CV_OVERRIDE { patch_stride = val; }
    int getGradientDescentIterations() const CV_OVERRIDE { return grad_descent_iter; }
    void setGradientDescentIterations(int val) CV_OVERRIDE { grad_descent_iter = val; }
    int getVariationalRefinementIterations() const CV_OVERRIDE { return variational_refinement_iter; }
    void setVariationalRefinementIterations(int val) CV_OVERRIDE { variational_refinement_iter = val; }
    float getVariationalRefinementAlpha() const CV_OVERRIDE { return variational_refinement_alpha; }
    void setVariationalRefinementAlpha(float val) CV_OVERRIDE { variational_refinement_alpha = val; }
    float getVariationalRefinementDelta() const CV_OVERRIDE { return variational_refinement_delta; }
    void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
    float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
    void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }
    float getVariationalRefinementEpsilon() const CV_OVERRIDE { return variational_refinement_epsilon; }
    void setVariationalRefinementEpsilon(float val) CV_OVERRIDE { variational_refinement_epsilon = val; }

    bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
    void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
    bool getUseSpatialPropagation() const CV_OVERRIDE { return use_spatial_propagation; }
    void setUseSpatialPropagation(bool val) CV_OVERRIDE { use_spatial_propagation = val; }

  protected: //!< internal buffers
    vector<Mat_<uchar> > I0s;     //!< Gaussian pyramid for the current frame
    vector<Mat_<uchar> > I1s;     //!< Gaussian pyramid for the next frame
    vector<Mat_<uchar> > I1s_ext; //!< I1s with borders

    vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<Mat_<float> > Ux; //!< x component of the flow vectors
    vector<Mat_<float> > Uy; //!< y component of the flow vectors

    vector<Mat_<float> > initial_Ux; //!< x component of the initial flow field, if one was passed as an input
    vector<Mat_<float> > initial_Uy; //!< y component of the initial flow field, if one was passed as an input

    Mat_<Vec2f> U; //!< a buffer for the merged flow

    Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
    Mat_<float> Sy; //!< intermediate sparse flow representation (y component)

    /* Structure tensor components: */
    Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
    Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
    Mat_<float> I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    Mat_<float> I0x_buf; //!< sum of x gradient values
    Mat_<float> I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation: */
    Mat_<float> I0xx_buf_aux;
    Mat_<float> I0yy_buf_aux;
    Mat_<float> I0xy_buf_aux;
    Mat_<float> I0x_buf_aux;
    Mat_<float> I0y_buf_aux;

    vector<Ptr<VariationalRefinement> > variational_refinement_processors;

  private: //!< private methods and parallel sections
    void prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow);
    void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
                                   Mat &I0y);
    int autoSelectCoarsestScale(int img_width);
    void autoSelectPatchSizeAndScales(int img_width);

    struct PatchInverseSearch_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int hs;
        Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
        int num_iter, pyr_level;

        PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                   Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter,
                                   int _pyr_level);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    struct Densification_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int h;
        Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;

        Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
                              Mat &src_Sy, Mat &_I0, Mat &_I1);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

#ifdef HAVE_OPENCL
    vector<UMat> u_I0s;     //!< Gaussian pyramid for the current frame
    vector<UMat> u_I1s;     //!< Gaussian pyramid for the next frame
    vector<UMat> u_I1s_ext; //!< I1s with borders

    vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<UMat> u_U;         //!< (x,y) components of the flow vectors (CV_32FC2)
    vector<UMat> u_initial_U; //!< (x,y) components of the initial flow field, if one was passed as an input (CV_32FC2)

    UMat u_S; //!< intermediate sparse flow representation (x,y components - CV_32FC2)

    /* Structure tensor components: */
    UMat u_I0xx_buf; //!< sum of squares of x gradient values
    UMat u_I0yy_buf; //!< sum of squares of y gradient values
    UMat u_I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    UMat u_I0x_buf; //!< sum of x gradient values
    UMat u_I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation: */
    UMat u_I0xx_buf_aux;
    UMat u_I0yy_buf_aux;
    UMat u_I0xy_buf_aux;
    UMat u_I0x_buf_aux;
    UMat u_I0y_buf_aux;

    bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
    void ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow);
    bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
    bool ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1);
    bool ocl_PatchInverseSearch(UMat &src_U,
                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
#endif
};

DISOpticalFlowImpl::DISOpticalFlowImpl()
{
    CV_INSTRUMENT_REGION();

    finest_scale = 2;
    patch_size = 8;
    patch_stride = 4;
    grad_descent_iter = 16;
    variational_refinement_iter = 5;
    variational_refinement_alpha = 20.f;
    variational_refinement_gamma = 10.f;
    variational_refinement_delta = 5.f;
    variational_refinement_epsilon = 0.01f;

    border_size = 16;
    use_mean_normalization = true;
    use_spatial_propagation = true;
    coarsest_scale = 10;

    /* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
    int max_possible_scales = 10;
    ws = hs = w = h = 0;
    for (int i = 0; i < max_possible_scales; i++)
        variational_refinement_processors.push_back(VariationalRefinement::create());
}
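
/* Usage sketch (editorial note; `prev` and `next` are hypothetical CV_8UC1 frames):
 *
 *     Ptr<DISOpticalFlow> dis = DISOpticalFlow::create(DISOpticalFlow::PRESET_MEDIUM);
 *     Mat flow; // becomes CV_32FC2, same size as the input frames
 *     dis->calc(prev, next, flow);
 *
 * If flow already holds a CV_32FC2 field of matching size, it is used as the source of initial
 * (temporal) flow candidates, as handled in prepareBuffers() below.
 */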

void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
{
    CV_INSTRUMENT_REGION();

    I0s.resize(coarsest_scale + 1);
    I1s.resize(coarsest_scale + 1);
    I1s_ext.resize(coarsest_scale + 1);
    I0xs.resize(coarsest_scale + 1);
    I0ys.resize(coarsest_scale + 1);
    Ux.resize(coarsest_scale + 1);
    Uy.resize(coarsest_scale + 1);

    Mat flow_uv[2];
    if (use_flow)
    {
        split(flow, flow_uv);
        initial_Ux.resize(coarsest_scale + 1);
        initial_Uy.resize(coarsest_scale + 1);
    }

    int fraction = 1;
    int cur_rows = 0, cur_cols = 0;

    for (int i = 0; i <= coarsest_scale; i++)
    {
        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
        if (i == finest_scale)
        {
            cur_rows = I0.rows / fraction;
            cur_cols = I0.cols / fraction;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0, I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);

            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
            Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
            Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);

            I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);

            U.create(cur_rows, cur_cols);
        }
        else if (i > finest_scale)
        {
            cur_rows = I0s[i - 1].rows / 2;
            cur_cols = I0s[i - 1].cols / 2;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0s[i - 1], I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1s[i - 1], I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
        }

        if (i >= finest_scale)
        {
            I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
            copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
            I0xs[i].create(cur_rows, cur_cols);
            I0ys[i].create(cur_rows, cur_cols);
            spatialGradient(I0s[i], I0xs[i], I0ys[i]);
            Ux[i].create(cur_rows, cur_cols);
            Uy[i].create(cur_rows, cur_cols);
            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
            variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon);
            variational_refinement_processors[i]->setSorIterations(5);
            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);

            if (use_flow)
            {
                resize(flow_uv[0], initial_Ux[i], Size(cur_cols, cur_rows));
                initial_Ux[i] /= fraction;
                resize(flow_uv[1], initial_Uy[i], Size(cur_cols, cur_rows));
                initial_Uy[i] /= fraction;
            }
        }

        fraction *= 2;
    }
}

/* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
 * A plain box filter cannot be used here because these sums have to be computed on a sparse grid
 * (one value per patch_stride pixels) and stored densely in the output buffers.
 */
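/* Implementation note (editorial): the local sums below are obtained with a separable
 * sliding-window (running-sum) scheme. The horizontal pass keeps per-row running sums over the
 * last patch_size columns, adding the entering sample and subtracting the leaving one, and emits
 * one value every patch_stride columns; the vertical pass repeats this over rows. Each output
 * value therefore costs O(1) operations instead of O(patch_size^2).
 */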
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
                                                   Mat &dst_I0y, Mat &I0x, Mat &I0y)
{
    CV_INSTRUMENT_REGION();

    float *I0xx_ptr = dst_I0xx.ptr<float>();
    float *I0yy_ptr = dst_I0yy.ptr<float>();
    float *I0xy_ptr = dst_I0xy.ptr<float>();
    float *I0x_ptr = dst_I0x.ptr<float>();
    float *I0y_ptr = dst_I0y.ptr<float>();

    float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
    float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
    float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
    float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
    float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();

    /* Separable box filter: horizontal pass */
    for (int i = 0; i < h; i++)
    {
        float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
        short *x_row = I0x.ptr<short>(i);
        short *y_row = I0y.ptr<short>(i);
        for (int j = 0; j < patch_size; j++)
        {
            sum_xx += x_row[j] * x_row[j];
            sum_yy += y_row[j] * y_row[j];
            sum_xy += x_row[j] * y_row[j];
            sum_x += x_row[j];
            sum_y += y_row[j];
        }
        I0xx_aux_ptr[i * ws] = sum_xx;
        I0yy_aux_ptr[i * ws] = sum_yy;
        I0xy_aux_ptr[i * ws] = sum_xy;
        I0x_aux_ptr[i * ws] = sum_x;
        I0y_aux_ptr[i * ws] = sum_y;
        int js = 1;
        for (int j = patch_size; j < w; j++)
        {
            sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
            sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
            sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
            sum_x += (x_row[j] - x_row[j - patch_size]);
            sum_y += (y_row[j] - y_row[j - patch_size]);
            if ((j - patch_size + 1) % patch_stride == 0)
            {
                I0xx_aux_ptr[i * ws + js] = sum_xx;
                I0yy_aux_ptr[i * ws + js] = sum_yy;
                I0xy_aux_ptr[i * ws + js] = sum_xy;
                I0x_aux_ptr[i * ws + js] = sum_x;
                I0y_aux_ptr[i * ws + js] = sum_y;
                js++;
            }
        }
    }

    AutoBuffer<float> sum_xx(ws), sum_yy(ws), sum_xy(ws), sum_x(ws), sum_y(ws);
    for (int j = 0; j < ws; j++)
    {
        sum_xx[j] = 0.0f;
        sum_yy[j] = 0.0f;
        sum_xy[j] = 0.0f;
        sum_x[j] = 0.0f;
        sum_y[j] = 0.0f;
    }

    /* Separable box filter: vertical pass */
    for (int i = 0; i < patch_size; i++)
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += I0xx_aux_ptr[i * ws + j];
            sum_yy[j] += I0yy_aux_ptr[i * ws + j];
            sum_xy[j] += I0xy_aux_ptr[i * ws + j];
            sum_x[j] += I0x_aux_ptr[i * ws + j];
            sum_y[j] += I0y_aux_ptr[i * ws + j];
        }
    for (int j = 0; j < ws; j++)
    {
        I0xx_ptr[j] = sum_xx[j];
        I0yy_ptr[j] = sum_yy[j];
        I0xy_ptr[j] = sum_xy[j];
        I0x_ptr[j] = sum_x[j];
        I0y_ptr[j] = sum_y[j];
    }
    int is = 1;
    for (int i = patch_size; i < h; i++)
    {
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
            sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
            sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
            sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
            sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
        }
        if ((i - patch_size + 1) % patch_stride == 0)
        {
            for (int j = 0; j < ws; j++)
            {
                I0xx_ptr[is * ws + j] = sum_xx[j];
                I0yy_ptr[is * ws + j] = sum_yy[j];
                I0xy_ptr[is * ws + j] = sum_xy[j];
                I0x_ptr[is * ws + j] = sum_x[j];
                I0y_ptr[is * ws + j] = sum_y[j];
            }
            is++;
        }
    }
}

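/* Editorial note: the coarsest scale is picked so that a patch still spans a sizeable fraction of
 * the image, i.e. the largest k with (2 * img_width) / 2^k >= fratio * patch_size, which gives
 * k = floor(log2(2 * img_width / (fratio * patch_size))). For instance, img_width = 1024 and
 * patch_size = 8 yield k = floor(log2(2048 / 40)) = 5.
 */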
int DISOpticalFlowImpl::autoSelectCoarsestScale(int img_width)
{
    const int fratio = 5;
    return std::max(0, (int)std::floor(log2((2.0f * (float)img_width) / ((float)fratio * (float)patch_size))));
}

void DISOpticalFlowImpl::autoSelectPatchSizeAndScales(int img_width)
{
    switch (finest_scale)
    {
    case 1:
        patch_size = 8;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 2, 0);
        break;

    case 3:
        patch_size = 12;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 4, 0);
        break;

    case 4:
        patch_size = 12;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 5, 0);
        break;

    // default case, fall-through.
    case 2:
    default:
        patch_size = 8;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 2, 0);
        break;
    }
}

DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
                                                                           int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                                                           Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
                                                                           Mat &_I0x, Mat &_I0y, int _num_iter,
                                                                           int _pyr_level)
    : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
      I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter), pyr_level(_pyr_level)
{
    stripe_sz = (int)ceil(hs / (double)nstripes);
}

/////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////

/* Some auxiliary macros */
#define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION \
    v_float32x4 w00v = v_setall_f32(w00); \
    v_float32x4 w01v = v_setall_f32(w01); \
    v_float32x4 w10v = v_setall_f32(w10); \
    v_float32x4 w11v = v_setall_f32(w11); \
    \
    v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp; \
    v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left; \
    v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right, \
        I1_row_next_shifted_4_right; \
    v_float32x4 I_diff_left, I_diff_right; \
    \
    /* Preload and expand the first row of I1: */ \
    I1_row_8 = v_load_expand(I1_ptr); \
    I1_row_shifted_8 = v_load_expand(I1_ptr + 1); \
    v_expand(I1_row_8, I1_row_4_left, I1_row_4_right); \
    v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right); \
    I1_ptr += I1_stride;

#define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION \
    /* Load the next row of I1: */ \
    I1_row_next_8 = v_load_expand(I1_ptr); \
    I1_row_next_shifted_8 = v_load_expand(I1_ptr + 1); \
    /* Separate the left and right halves: */ \
    v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right); \
    v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right); \
    \
    /* Load current row of I0: */ \
    I0_row_8 = v_load_expand(I0_ptr); \
    v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \
    \
    /* Compute diffs between I0 and bilinearly interpolated I1: */ \
    I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))), \
                              v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))), \
                              v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))), \
                              v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))), \
                        v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left))); \
    I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))), \
                               v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))), \
                               v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))), \
                               v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))), \
                         v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)));

#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \
    I0_ptr += I0_stride; \
    I1_ptr += I1_stride; \
    \
    I1_row_4_left = I1_row_next_4_left; \
    I1_row_4_right = I1_row_next_4_right; \
    I1_row_shifted_4_left = I1_row_next_shifted_4_left; \
    I1_row_shifted_4_right = I1_row_next_shifted_4_right;

/* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
 * given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and w00, w01, w10, w11
 * are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these patches
 * and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is implemented
 * only for the default patch size (8x8). Everything is processed in floats as using fixed-point approximations harms
 * the quality significantly.
 */
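/* Editorial note on the math: with r denoting the per-pixel residual between the bilinearly
 * sampled I1 patch and the I0 patch, this function returns sum(r^2) and accumulates
 * dst_dUx = sum(r * I0x), dst_dUy = sum(r * I0y), i.e. J^T r for the Jacobian J = (I0x, I0y).
 * The caller multiplies this by the inverse of the precomputed structure tensor (the Gauss-Newton
 * approximation J^T J of the Hessian), which is the inverse search step of Kroeger et al.,
 * "Fast Optical Flow using Dense Inverse Search" (ECCV 2016), the algorithm this file implements.
 */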
inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
                          int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
{
    float SSD = 0.0f;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 Ux_vec = v_setall_f32(0);
        v_float32x4 Uy_vec = v_setall_f32(0);
        v_float32x4 SSD_vec = v_setall_f32(0);

        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;

        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);

            /* Update the sums: */
            Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
            Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));

            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }

        /* Final reduce operations: */
        dst_dUx = v_reduce_sum(Ux_vec);
        dst_dUy = v_reduce_sum(Uy_vec);
        SSD = v_reduce_sum(SSD_vec);
    }
    else
#endif
    {
        dst_dUx = 0.0f;
        dst_dUy = 0.0f;
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                SSD += diff * diff;
                dst_dUx += diff * I0x_ptr[i * I0_stride + j];
                dst_dUy += diff * I0y_ptr[i * I0_stride + j];
            }
    }
    return SSD;
}

/* Same as processPatch, but with patch mean normalization, which improves robustness under changing
 * lighting conditions
 */
inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
                                  short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                                  float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
{
    float sum_diff = 0.0, sum_diff_sq = 0.0;
    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
    float n = (float)patch_sz * patch_sz;

#if CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
        v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);

        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;

        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);

            /* Update the sums: */
            sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
            sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));

            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }

        /* Final reduce operations: */
        sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
        sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
#endif
    {
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                sum_diff += diff;
                sum_diff_sq += diff * diff;

                sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
                sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
            }
    }
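    /* The returned cost below is the SSD of mean-normalized patches: expanding
     * sum((diff - mean(diff))^2) gives sum_diff_sq - sum_diff * sum_diff / n. The same centering
     * applied to the gradient correlations yields the corrections to dst_dUx and dst_dUy, with
     * x_grad_sum and y_grad_sum being the precomputed patch sums of I0x and I0y. */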
    dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
    dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
    return sum_diff_sq - sum_diff * sum_diff / n;
}

/* Similar to processPatch, but computes only the sum of squared differences (SSD) between the patches */
inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                        float w11, int patch_sz)
{
    float SSD = 0.0f;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 SSD_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        SSD = v_reduce_sum(SSD_vec);
    }
    else
#endif
    {
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                SSD += diff * diff;
            }
    }
    return SSD;
}

/* Same as computeSSD, but with patch mean normalization */
inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
                                float w10, float w11, int patch_sz)
{
    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
    float n = (float)patch_sz * patch_sz;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
    {
#endif
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                sum_diff += diff;
                sum_diff_sq += diff * diff;
            }
#if CV_SIMD128
    }
#endif
    return sum_diff_sq - sum_diff * sum_diff / n;
}

#undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
{
    CV_INSTRUMENT_REGION();

    // force separate processing of stripes if we are using spatial propagation:
    if (dis->use_spatial_propagation && range.end > range.start + 1)
    {
        for (int n = range.start; n < range.end; n++)
            (*this)(Range(n, n + 1));
        return;
    }
    int psz = dis->patch_size;
    int psz2 = psz / 2;
    int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
    int bsz = dis->border_size;

    /* Input dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();

    /* Output sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();

    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();
    short *I0x_ptr = I0x->ptr<short>();
    short *I0y_ptr = I0y->ptr<short>();

    /* Precomputed structure tensor */
    float *xx_ptr = dis->I0xx_buf.ptr<float>();
    float *yy_ptr = dis->I0yy_buf.ptr<float>();
    float *xy_ptr = dis->I0xy_buf.ptr<float>();
    /* And extra buffers for mean-normalization: */
    float *x_ptr = dis->I0x_buf.ptr<float>();
    float *y_ptr = dis->I0y_buf.ptr<float>();

    bool use_temporal_candidates = false;
    float *initial_Ux_ptr = NULL, *initial_Uy_ptr = NULL;
    if (!dis->initial_Ux.empty())
    {
        initial_Ux_ptr = dis->initial_Ux[pyr_level].ptr<float>();
        initial_Uy_ptr = dis->initial_Uy[pyr_level].ptr<float>();
        use_temporal_candidates = true;
    }

    int i, j, dir;
    int start_is, end_is, start_js, end_js;
    int start_i, start_j;
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + dis->h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + dis->w - 1.0f;
    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;

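    /* (i_I1, j_I1) is the sub-pixel position of the patch inside the bordered I1; the limits above
     * clamp it so that the whole patch window stays readable. w00..w11 are the standard bilinear
     * interpolation weights of the four integer neighbors (w00 for the top-left sample), built
     * from the fractional parts di and dj; they always sum to 1. */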
#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
    { \
        float di = i_I1 - floor(i_I1); \
        float dj = j_I1 - floor(j_I1); \
        w11 = di * dj; \
        w10 = di * (1 - dj); \
        w01 = (1 - di) * dj; \
        w00 = (1 - di) * (1 - dj); \
    }

#define COMPUTE_SSD(dst, Ux, Uy) \
    INIT_BILINEAR_WEIGHTS(Ux, Uy); \
    if (dis->use_mean_normalization) \
        dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, \
                                 w01, w10, w11, psz); \
    else \
        dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01, \
                         w10, w11, psz);

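    /* The gradient descent budget is split evenly across the num_iter alternating sweeps, so each
     * patch receives up to grad_descent_iter descent steps in total (the inner loop further below
     * may terminate early once the SSD stops decreasing). */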
    int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
    for (int iter = 0; iter < num_iter; iter++)
    {
        if (iter % 2 == 0)
        {
            dir = 1;
            start_is = min(range.start * stripe_sz, hs);
            end_is = min(range.end * stripe_sz, hs);
            start_js = 0;
            end_js = dis->ws;
            start_i = start_is * dis->patch_stride;
            start_j = 0;
        }
        else
        {
            dir = -1;
            start_is = min(range.end * stripe_sz, hs) - 1;
            end_is = min(range.start * stripe_sz, hs) - 1;
            start_js = dis->ws - 1;
            end_js = -1;
            start_i = start_is * dis->patch_stride;
            start_j = (dis->ws - 1) * dis->patch_stride;
        }

        i = start_i;
        for (int is = start_is; dir * is < dir * end_is; is += dir)
        {
            j = start_j;
            for (int js = start_js; dir * js < dir * end_js; js += dir)
            {
                if (iter == 0)
                {
                    /* Using the result from the previous pyramid level as the very first approximation: */
                    Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
                    Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
                }

                float min_SSD = INF, cur_SSD;
                if (use_temporal_candidates || dis->use_spatial_propagation)
                {
                    COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
                }

                if (use_temporal_candidates)
                {
                    /* Try temporal candidates (vectors from the initial flow field that was passed to the function) */
                    COMPUTE_SSD(cur_SSD, initial_Ux_ptr[(i + psz2) * dis->w + j + psz2],
                                initial_Uy_ptr[(i + psz2) * dis->w + j + psz2]);
                    if (cur_SSD < min_SSD)
                    {
                        min_SSD = cur_SSD;
                        Sx_ptr[is * dis->ws + js] = initial_Ux_ptr[(i + psz2) * dis->w + j + psz2];
                        Sy_ptr[is * dis->ws + js] = initial_Uy_ptr[(i + psz2) * dis->w + j + psz2];
                    }
                }

                if (dis->use_spatial_propagation)
                {
                    /* Try spatial candidates: */
                    if (dir * js > dir * start_js)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
                        }
                    }
                    /* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
                     * the number of stripes constant. It works well enough in practice and doesn't introduce any
                     * visible seams.
                     */
                    if (dir * is > dir * start_is)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
                        }
                    }
                }

                /* Use the best candidate as a starting point for the gradient descent: */
                float cur_Ux = Sx_ptr[is * dis->ws + js];
                float cur_Uy = Sy_ptr[is * dis->ws + js];

                /* Computing the inverse of the structure tensor: */
                float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
                             xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
                if (abs(detH) < EPS)
                    detH = EPS;
                float invH11 = yy_ptr[is * dis->ws + js] / detH;
                float invH12 = -xy_ptr[is * dis->ws + js] / detH;
                float invH22 = xx_ptr[is * dis->ws + js] / detH;
                float prev_SSD = INF, SSD;
                float x_grad_sum = x_ptr[is * dis->ws + js];
                float y_grad_sum = y_ptr[is * dis->ws + js];

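                /* Gauss-Newton iterations: every step evaluates the patch residual together with
                 * its gradient correlations (dUx, dUy) and applies the fixed inverse Hessian:
                 * (dx, dy)^T = H^-1 * (dUx, dUy)^T, where H is the 2x2 structure tensor inverted
                 * above. H can be precomputed and reused because the template patch in I0 never
                 * changes, which is the "inverse" part of the dense inverse search. */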
                for (int t = 0; t < num_inner_iter; t++)
                {
                    INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
                    if (dis->use_mean_normalization)
                        SSD = processPatchMeanNorm(dUx, dUy,
                                                   I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                                   I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
                                                   dis->w, w_ext, w00, w01, w10, w11, psz,
                                                   x_grad_sum, y_grad_sum);
                    else
                        SSD = processPatch(dUx, dUy,
                                           I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                           I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
                                           dis->w, w_ext, w00, w01, w10, w11, psz);

                    dx = invH11 * dUx + invH12 * dUy;
                    dy = invH12 * dUx + invH22 * dUy;
                    cur_Ux -= dx;
                    cur_Uy -= dy;

                    /* Break when patch distance stops decreasing */
                    if (SSD >= prev_SSD)
                        break;
                    prev_SSD = SSD;
                }

                /* If gradient descent converged to a flow vector that is very far from the initial approximation
                 * (more than patch size) then we don't use it. Noticeably improves the robustness.
                 */
                if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
                {
                    Sx_ptr[is * dis->ws + js] = cur_Ux;
                    Sy_ptr[is * dis->ws + js] = cur_Uy;
                }
                j += dir * dis->patch_stride;
            }
            i += dir * dis->patch_stride;
        }
    }
#undef INIT_BILINEAR_WEIGHTS
#undef COMPUTE_SSD
}

DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
                                                                 Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
                                                                 Mat &_I0, Mat &_I1)
    : dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
{
    stripe_sz = (int)ceil(h / (double)nstripes);
}

/* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
 * on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of values from the
 * overlapping patches.
 */
void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
{
    CV_INSTRUMENT_REGION();

    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);

    /* Input sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();

    /* Output dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();

    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();

    int psz = dis->patch_size;
    int pstr = dis->patch_stride;
    int i_l, i_u;
    int j_l, j_u;
    float i_m, j_m, diff;

    /* These values define the set of sparse grid locations that contain patches overlapping with the current dense flow
     * location */
    int start_is, end_is;
    int start_js, end_js;

    /* Some helper macros for updating this set of sparse grid locations */
#define UPDATE_SPARSE_I_COORDINATES \
    if (i % pstr == 0 && i + psz <= h) \
        end_is++; \
    if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is) \
        start_is++;

#define UPDATE_SPARSE_J_COORDINATES \
    if (j % pstr == 0 && j + psz <= dis->w) \
        end_js++; \
    if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js) \
        start_js++;

    start_is = 0;
    end_is = -1;
    for (int i = 0; i < start_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
    }
    for (int i = start_i; i < end_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
        start_js = 0;
        end_js = -1;
        for (int j = 0; j < dis->w; j++)
        {
            UPDATE_SPARSE_J_COORDINATES;
            float coef, sum_coef = 0.0f;
            float sum_Ux = 0.0f;
            float sum_Uy = 0.0f;

            /* Iterate through all the patches that overlap the current location (i,j) */
            for (int is = start_is; is <= end_is; is++)
                for (int js = start_js; js <= end_js; js++)
                {
                    j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
                    i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
                    j_l = (int)j_m;
                    j_u = j_l + 1;
                    i_l = (int)i_m;
                    i_u = i_l + 1;
                    diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
                           (j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
                           (j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
                           (j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
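                    /* Weight each overlapping patch by the inverse of its photometric error at
                     * this pixel, so patches that are consistent with the image evidence
                     * dominate the average: */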
                    coef = 1 / max(1.0f, abs(diff));
                    sum_Ux += coef * Sx_ptr[is * dis->ws + js];
                    sum_Uy += coef * Sy_ptr[is * dis->ws + js];
                    sum_coef += coef;
                }
            CV_DbgAssert(sum_coef != 0);
            Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
            Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
        }
    }
#undef UPDATE_SPARSE_I_COORDINATES
#undef UPDATE_SPARSE_J_COORDINATES
}

#ifdef HAVE_OPENCL
bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_U,
                                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int /*pyr_level*/)
{
    CV_INSTRUMENT_REGION();
    CV_INSTRUMENT_REGION_OPENCL();

    size_t globalSize[] = {(size_t)ws, (size_t)hs};
    size_t localSize[] = {16, 16};
    int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);

    String subgroups_build_options;
    if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
        subgroups_build_options = " -DCV_USE_SUBGROUPS=1";

    String build_options = cv::format(
        "-DDIS_BORDER_SIZE=%d -DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
        border_size, patch_size, patch_stride
    ) + subgroups_build_options;

#if 0 // OpenCL debug
    u_Sx = Scalar::all(0);
    u_Sy = Scalar::all(0);
#endif

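    /* The OpenCL path always performs exactly two sweeps (hence the assertion below): a forward
     * pass (iter == 0) and a backward pass, each split into a propagation kernel and a patch
     * refinement kernel, mirroring the alternating sweep directions of the CPU implementation. */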
1113 | CV_Assert(num_iter == 2); |
1114 | for (int iter = 0; iter < num_iter; iter++) |
1115 | { |
1116 | if (iter == 0) |
1117 | { |
1118 | ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, build_options); |
1119 | size_t global_sz[] = {(size_t)hs * 8}; |
1120 | size_t local_sz[] = {8}; |
1121 | |
1122 | k1.args( |
1123 | ocl::KernelArg::PtrReadOnly(m: src_U), |
1124 | ocl::KernelArg::PtrReadOnly(m: I0), |
1125 | ocl::KernelArg::PtrReadOnly(m: I1), |
1126 | (int)w, (int)h, (int)ws, (int)hs, |
1127 | ocl::KernelArg::PtrWriteOnly(m: u_S) |
1128 | ); |
1129 | if (!k1.run(dims: 1, globalsize: global_sz, localsize: local_sz, sync: false)) |
1130 | return false; |
1131 | |
1132 | ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc, build_options); |
1133 | |
1134 | k2.args( |
1135 | ocl::KernelArg::PtrReadOnly(m: src_U), |
1136 | ocl::KernelArg::PtrReadOnly(m: I0), |
1137 | ocl::KernelArg::PtrReadOnly(m: I1), |
1138 | ocl::KernelArg::PtrReadOnly(m: I0x), |
1139 | ocl::KernelArg::PtrReadOnly(m: I0y), |
1140 | ocl::KernelArg::PtrReadOnly(m: u_I0xx_buf), |
1141 | ocl::KernelArg::PtrReadOnly(m: u_I0yy_buf), |
1142 | ocl::KernelArg::PtrReadOnly(m: u_I0xy_buf), |
1143 | ocl::KernelArg::PtrReadOnly(m: u_I0x_buf), |
1144 | ocl::KernelArg::PtrReadOnly(m: u_I0y_buf), |
1145 | (int)w, (int)h, (int)ws, (int)hs, |
1146 | (int)num_inner_iter, |
1147 | ocl::KernelArg::PtrReadWrite(m: u_S) |
1148 | ); |
1149 | if (!k2.run(dims: 2, globalsize: globalSize, localsize: localSize, sync: false)) |
1150 | return false; |
1151 | } |
1152 | else |
1153 | { |
1154 | ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, build_options); |
1155 | size_t global_sz[] = {(size_t)hs * 8}; |
1156 | size_t local_sz[] = {8}; |
1157 | |
1158 | k3.args( |
1159 | ocl::KernelArg::PtrReadOnly(m: I0), |
1160 | ocl::KernelArg::PtrReadOnly(m: I1), |
1161 | (int)w, (int)h, (int)ws, (int)hs, |
1162 | ocl::KernelArg::PtrReadWrite(m: u_S) |
1163 | ); |
1164 | if (!k3.run(dims: 1, globalsize: global_sz, localsize: local_sz, sync: false)) |
1165 | return false; |
1166 | |
1167 | ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc, build_options); |
1168 | |
1169 | k4.args( |
1170 | ocl::KernelArg::PtrReadOnly(m: I0), |
1171 | ocl::KernelArg::PtrReadOnly(m: I1), |
1172 | ocl::KernelArg::PtrReadOnly(m: I0x), |
1173 | ocl::KernelArg::PtrReadOnly(m: I0y), |
1174 | ocl::KernelArg::PtrReadOnly(m: u_I0xx_buf), |
1175 | ocl::KernelArg::PtrReadOnly(m: u_I0yy_buf), |
1176 | ocl::KernelArg::PtrReadOnly(m: u_I0xy_buf), |
1177 | ocl::KernelArg::PtrReadOnly(m: u_I0x_buf), |
1178 | ocl::KernelArg::PtrReadOnly(m: u_I0y_buf), |
1179 | (int)w, (int)h,(int)ws, (int)hs, |
1180 | (int)num_inner_iter, |
1181 | ocl::KernelArg::PtrReadWrite(m: u_S) |
1182 | ); |
1183 | if (!k4.run(dims: 2, globalsize: globalSize, localsize: localSize, sync: false)) |
1184 | return false; |
1185 | } |
1186 | } |
1187 | return true; |
1188 | } |
1189 | |
1190 | bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1) |
1191 | { |
1192 | CV_INSTRUMENT_REGION(); |
1193 | CV_INSTRUMENT_REGION_OPENCL(); |
1194 | |
1195 | size_t globalSize[] = {(size_t)w, (size_t)h}; |
1196 | size_t localSize[] = {16, 16}; |
1197 | |
1198 | String build_options = cv::format( |
1199 | fmt: "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d", |
1200 | patch_size, patch_stride |
1201 | ); |
1202 | |
1203 | ocl::Kernel kernel("dis_densification", ocl::video::dis_flow_oclsrc, build_options); |
1204 | kernel.args( |
1205 | ocl::KernelArg::PtrReadOnly(m: src_S), |
1206 | ocl::KernelArg::PtrReadOnly(m: _I0), |
1207 | ocl::KernelArg::PtrReadOnly(m: _I1), |
1208 | (int)w, (int)h, (int)ws, |
1209 | ocl::KernelArg::PtrWriteOnly(m: dst_U) |
1210 | ); |
1211 | return kernel.run(dims: 2, globalsize: globalSize, localsize: localSize, sync: false); |
1212 | } |
1213 | |
1214 | void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow) |
1215 | { |
1216 | CV_INSTRUMENT_REGION(); |
1217 | // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL(); |
1218 | |
1219 | u_I0s.resize(coarsest_scale + 1); |
1220 | u_I1s.resize(coarsest_scale + 1); |
1221 | u_I1s_ext.resize(coarsest_scale + 1); |
1222 | u_I0xs.resize(coarsest_scale + 1); |
1223 | u_I0ys.resize(coarsest_scale + 1); |
1224 | u_U.resize(coarsest_scale + 1); |
1225 | |
1226 | if (use_flow) |
1227 | { |
1228 | u_initial_U.resize(coarsest_scale + 1); |
1229 | } |
1230 | |
1231 | int fraction = 1; |
1232 | int cur_rows = 0, cur_cols = 0; |
1233 | |
1234 | for (int i = 0; i <= coarsest_scale; i++) |
1235 | { |
1236 | CV_TRACE_REGION("coarsest_scale_iteration"); |
1237 | /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */ |
1238 | if (i == finest_scale) |
1239 | { |
1240 | cur_rows = I0.rows / fraction; |
1241 | cur_cols = I0.cols / fraction; |
1242 | u_I0s[i].create(cur_rows, cur_cols, CV_8UC1); |
1243 | resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA); |
1244 | u_I1s[i].create(cur_rows, cur_cols, CV_8UC1); |
1245 | resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA); |
1246 | |
1247 | /* These buffers are reused in each scale so we initialize them once on the finest scale: */ |
1248 | u_S.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC2); |
1249 | u_I0xx_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1250 | u_I0yy_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1251 | u_I0xy_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1252 | u_I0x_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1253 | u_I0y_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1254 | |
1255 | u_I0xx_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1256 | u_I0yy_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1257 | u_I0xy_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1258 | u_I0x_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1259 | u_I0y_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1260 | } |
1261 | else if (i > finest_scale) |
1262 | { |
1263 | cur_rows = u_I0s[i - 1].rows / 2; |
1264 | cur_cols = u_I0s[i - 1].cols / 2; |
1265 | u_I0s[i].create(cur_rows, cur_cols, CV_8UC1); |
1266 | resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA); |
1267 | u_I1s[i].create(cur_rows, cur_cols, CV_8UC1); |
1268 | resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA); |
1269 | } |
1270 | |
1271 | if (i >= finest_scale) |
1272 | { |
1273 | u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1); |
1274 | copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE); |
1275 | u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1); |
1276 | u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1); |
1277 | spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]); |
1278 | u_U[i].create(cur_rows, cur_cols, CV_32FC2); |
1279 | variational_refinement_processors[i]->setAlpha(variational_refinement_alpha); |
1280 | variational_refinement_processors[i]->setDelta(variational_refinement_delta); |
1281 | variational_refinement_processors[i]->setGamma(variational_refinement_gamma); |
1282 | variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon); |
1283 | variational_refinement_processors[i]->setSorIterations(5); |
1284 | variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter); |
1285 | |
1286 | if (use_flow) |
1287 | { |
1288 | UMat resized_flow; |
1289 | resize(src: flow, dst: resized_flow, dsize: Size(cur_cols, cur_rows)); |
1290 | float scale = 1.0f / fraction; |
1291 | resized_flow.convertTo(u_initial_U[i], CV_32FC2, scale, 0.0f); |
1292 | } |
1293 | } |
1294 | |
1295 | fraction *= 2; |
1296 | } |
1297 | } |

bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
{
    CV_INSTRUMENT_REGION();
    CV_INSTRUMENT_REGION_OPENCL();

    size_t globalSizeX[] = {(size_t)h};
    size_t localSizeX[] = {16};

#if 0 // OpenCL debug
    u_I0xx_buf_aux = Scalar::all(0);
    u_I0yy_buf_aux = Scalar::all(0);
    u_I0xy_buf_aux = Scalar::all(0);
    u_I0x_buf_aux = Scalar::all(0);
    u_I0y_buf_aux = Scalar::all(0);
    dst_I0xx = Scalar::all(0);
    dst_I0yy = Scalar::all(0);
    dst_I0xy = Scalar::all(0);
    dst_I0x = Scalar::all(0);
    dst_I0y = Scalar::all(0);
#endif

    String build_options = cv::format(
        "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
        patch_size, patch_stride
    );

    ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc, build_options);
    kernelX.args(
        ocl::KernelArg::PtrReadOnly(I0x),
        ocl::KernelArg::PtrReadOnly(I0y),
        (int)w, (int)h, (int)ws,
        ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux)
    );
    if (!kernelX.run(1, globalSizeX, localSizeX, false))
        return false;

    size_t globalSizeY[] = {(size_t)ws};
    size_t localSizeY[] = {16};

    ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc, build_options);
    kernelY.args(
        ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
        (int)w, (int)h, (int)ws,
        ocl::KernelArg::PtrWriteOnly(dst_I0xx),
        ocl::KernelArg::PtrWriteOnly(dst_I0yy),
        ocl::KernelArg::PtrWriteOnly(dst_I0xy),
        ocl::KernelArg::PtrWriteOnly(dst_I0x),
        ocl::KernelArg::PtrWriteOnly(dst_I0y)
    );
    return kernelY.run(1, globalSizeY, localSizeY, false);
}
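
/* Reference sketch (illustrative, not the actual kernel code): the two kernels above
 * compute, for every patch position, the summed structure tensor entries. A scalar
 * equivalent for a single patch with top-left corner (x, y), using Mat access for
 * clarity, would be:
 *
 *     float sxx = 0.f, syy = 0.f, sxy = 0.f, sx = 0.f, sy = 0.f;
 *     for (int i = 0; i < patch_size; i++)
 *         for (int j = 0; j < patch_size; j++)
 *         {
 *             float gx = I0x.at<short>(y + i, x + j);
 *             float gy = I0y.at<short>(y + i, x + j);
 *             sxx += gx * gx; syy += gy * gy; sxy += gx * gy;
 *             sx += gx; sy += gy;
 *         }
 *
 * The "_hor" kernel accumulates these sums along each row into the *_aux buffers and
 * the "_ver" kernel then sums over patch_size rows, which avoids recomputing the full
 * 2D sum independently for every patch position.
 */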

bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_INSTRUMENT_REGION();
    // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();

    UMat I0Mat = I0.getUMat();
    UMat I1Mat = I1.getUMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code searches for a maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater than or equal to the patch size */
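
    /* Worked example (illustrative): for a 1024 x 436 input with patch_size = 8, the
     * first term is (int)(log2(1024 / 32.0) + 0.5) = (int)5.5 = 5 and the second is
     * (int)log2(436 / 8) = (int)log2(54) = (int)5.75 = 5, so coarsest_scale = 5.
     */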

    if (coarsest_scale < 0)
        CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");

    if (coarsest_scale < finest_scale)
    {
        // choose the finest level based on the coarsest level.
        // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
        int original_img_width = I0.size().width;
        autoSelectPatchSizeAndScales(original_img_width);
    }

    ocl_prepareBuffers(I0Mat, I1Mat, flow, use_input_flow);
    u_U[coarsest_scale].setTo(0.0f);

    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        CV_TRACE_REGION("coarsest_scale_iteration");
        w = u_I0s[i].cols;
        h = u_I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;
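        /* Illustrative: with w = 1024, patch_size = 8 and patch_stride = 4 this gives
         * ws = 1 + (1024 - 8) / 4 = 255 sparse flow positions per row. */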

        if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
                                           u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
            return false;

        if (!ocl_PatchInverseSearch(u_U[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
            return false;

        if (!ocl_Densification(u_U[i], u_S, u_I0s[i], u_I1s[i]))
            return false;

        if (variational_refinement_iter > 0)
        {
            std::vector<Mat> U_channels;
            split(u_U[i], U_channels); CV_Assert(U_channels.size() == 2);
            variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
                                                         U_channels[0], U_channels[1]);
            merge(U_channels, u_U[i]);
        }

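        /* Flow vectors are expressed in pixels of the current level; the next finer
         * level has twice the resolution, so the upsampled field is scaled by 2. */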
        if (i > finest_scale)
        {
            UMat resized;
            resize(u_U[i], resized, u_U[i - 1].size());
            multiply(resized, 2, u_U[i - 1]);
        }
    }

    UMat resized_flow;
    resize(u_U[finest_scale], resized_flow, I1Mat.size());
    multiply(resized_flow, 1 << finest_scale, flow);

    return true;
}
#endif

void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_INSTRUMENT_REGION();

    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert(I0.isContinuous());
    CV_Assert(I1.isContinuous());

    CV_OCL_RUN(flow.isUMat() &&
               (patch_size == 8) && (use_spatial_propagation == true),
               ocl_calc(I0, I1, flow));
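
    /* The OpenCL branch above is only taken when the output is a UMat, patch_size is 8
     * and spatial propagation is enabled; other configurations fall through to the CPU
     * path below. */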

    Mat I0Mat = I0.getMat();
    Mat I1Mat = I1.getMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    else
        flow.create(I1Mat.size(), CV_32FC2);
    Mat flowMat = flow.getMat();
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code searches for a maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater than or equal to the patch size */

    if (coarsest_scale < 0)
        CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");

    if (coarsest_scale < finest_scale)
    {
        // choose the finest level based on the coarsest level.
        // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
        int original_img_width = I0.size().width;
        autoSelectPatchSizeAndScales(original_img_width);
    }

    int num_stripes = getNumThreads();

    prepareBuffers(I0Mat, I1Mat, flowMat, use_input_flow);
    Ux[coarsest_scale].setTo(0.0f);
    Uy[coarsest_scale].setTo(0.0f);

    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        CV_TRACE_REGION("coarsest_scale_iteration");
        w = I0s[i].cols;
        h = I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;

        precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
        if (use_spatial_propagation)
        {
            /* Use a fixed number of stripes regardless of the number of threads to make
             * the inverse search with spatial propagation reproducible
             */
            parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
                                                                  I1s_ext[i], I0xs[i], I0ys[i], 2, i));
        }
        else
        {
            parallel_for_(Range(0, num_stripes),
                          PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
                                                     I0xs[i], I0ys[i], 1, i));
        }

        parallel_for_(Range(0, num_stripes),
                      Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
        if (variational_refinement_iter > 0)
            variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);

        if (i > finest_scale)
        {
            resize(Ux[i], Ux[i - 1], Ux[i - 1].size());
            resize(Uy[i], Uy[i - 1], Uy[i - 1].size());
            Ux[i - 1] *= 2;
            Uy[i - 1] *= 2;
        }
    }
    Mat uxy[] = {Ux[finest_scale], Uy[finest_scale]};
    merge(uxy, 2, U);
    resize(U, flowMat, flowMat.size());
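    /* U was computed on pyramid level finest_scale, where one pixel spans
     * 2^finest_scale input pixels, so the resized field is scaled accordingly. */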
    flowMat *= 1 << finest_scale;
}

void DISOpticalFlowImpl::collectGarbage()
{
    CV_INSTRUMENT_REGION();

    I0s.clear();
    I1s.clear();
    I1s_ext.clear();
    I0xs.clear();
    I0ys.clear();
    Ux.clear();
    Uy.clear();
    U.release();
    Sx.release();
    Sy.release();
    I0xx_buf.release();
    I0yy_buf.release();
    I0xy_buf.release();
    I0xx_buf_aux.release();
    I0yy_buf_aux.release();
    I0xy_buf_aux.release();

#ifdef HAVE_OPENCL
    u_I0s.clear();
    u_I1s.clear();
    u_I1s_ext.clear();
    u_I0xs.clear();
    u_I0ys.clear();
    u_U.clear();
    u_S.release();
    u_I0xx_buf.release();
    u_I0yy_buf.release();
    u_I0xy_buf.release();
    u_I0xx_buf_aux.release();
    u_I0yy_buf_aux.release();
    u_I0xy_buf_aux.release();
#endif

    for (int i = finest_scale; i <= coarsest_scale; i++)
        variational_refinement_processors[i]->collectGarbage();
    variational_refinement_processors.clear();
}

Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
{
    CV_INSTRUMENT_REGION();

    Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
    dis->setPatchSize(8);
    if (preset == DISOpticalFlow::PRESET_ULTRAFAST)
    {
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(12);
        dis->setVariationalRefinementIterations(0);
    }
    else if (preset == DISOpticalFlow::PRESET_FAST)
    {
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(16);
        dis->setVariationalRefinementIterations(5);
    }
    else if (preset == DISOpticalFlow::PRESET_MEDIUM)
    {
        dis->setFinestScale(1);
        dis->setPatchStride(3);
        dis->setGradientDescentIterations(25);
        dis->setVariationalRefinementIterations(5);
    }

    return dis;
}
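
/* Typical usage (illustrative sketch; prev_gray and next_gray are assumed to be 8-bit
 * single-channel frames, as required by the asserts in calc() above):
 *
 *     Ptr<DISOpticalFlow> dis = DISOpticalFlow::create(DISOpticalFlow::PRESET_MEDIUM);
 *     Mat flow;
 *     dis->calc(prev_gray, next_gray, flow); // flow becomes CV_32FC2
 *
 * Passing a CV_32FC2 flow of the same size as the input seeds the coarse-to-fine
 * search with an initial estimate (see use_input_flow in calc()).
 */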

} // namespace