/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencl_kernels_video.hpp"

using namespace std;
#define EPS 0.001F
#define INF 1E+10F

namespace cv {

class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
{
  public:
    DISOpticalFlowImpl();

    void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
    void collectGarbage() CV_OVERRIDE;

  protected: //!< algorithm parameters
    int finest_scale, coarsest_scale;
    int patch_size;
    int patch_stride;
    int grad_descent_iter;
    int variational_refinement_iter;
    float variational_refinement_alpha;
    float variational_refinement_gamma;
    float variational_refinement_delta;
    float variational_refinement_epsilon;
    bool use_mean_normalization;
    bool use_spatial_propagation;

  protected: //!< some auxiliary variables
    int border_size;
    int w, h;   //!< flow buffer width and height on the current scale
    int ws, hs; //!< sparse flow buffer width and height on the current scale

  public:
    int getFinestScale() const CV_OVERRIDE { return finest_scale; }
    void setFinestScale(int val) CV_OVERRIDE { finest_scale = val; }
    int getPatchSize() const CV_OVERRIDE { return patch_size; }
    void setPatchSize(int val) CV_OVERRIDE { patch_size = val; }
    int getPatchStride() const CV_OVERRIDE { return patch_stride; }
    void setPatchStride(int val) CV_OVERRIDE { patch_stride = val; }
    int getGradientDescentIterations() const CV_OVERRIDE { return grad_descent_iter; }
    void setGradientDescentIterations(int val) CV_OVERRIDE { grad_descent_iter = val; }
    int getVariationalRefinementIterations() const CV_OVERRIDE { return variational_refinement_iter; }
    void setVariationalRefinementIterations(int val) CV_OVERRIDE { variational_refinement_iter = val; }
    float getVariationalRefinementAlpha() const CV_OVERRIDE { return variational_refinement_alpha; }
    void setVariationalRefinementAlpha(float val) CV_OVERRIDE { variational_refinement_alpha = val; }
    float getVariationalRefinementDelta() const CV_OVERRIDE { return variational_refinement_delta; }
    void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
    float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
    void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }
    float getVariationalRefinementEpsilon() const CV_OVERRIDE { return variational_refinement_epsilon; }
    void setVariationalRefinementEpsilon(float val) CV_OVERRIDE { variational_refinement_epsilon = val; }

    bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
    void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
    bool getUseSpatialPropagation() const CV_OVERRIDE { return use_spatial_propagation; }
    void setUseSpatialPropagation(bool val) CV_OVERRIDE { use_spatial_propagation = val; }

  protected: //!< internal buffers
    vector<Mat_<uchar> > I0s;     //!< Gaussian pyramid for the current frame
    vector<Mat_<uchar> > I1s;     //!< Gaussian pyramid for the next frame
    vector<Mat_<uchar> > I1s_ext; //!< I1s with borders

    vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<Mat_<float> > Ux; //!< x component of the flow vectors
    vector<Mat_<float> > Uy; //!< y component of the flow vectors

    vector<Mat_<float> > initial_Ux; //!< x component of the initial flow field, if one was passed as an input
    vector<Mat_<float> > initial_Uy; //!< y component of the initial flow field, if one was passed as an input

    Mat_<Vec2f> U; //!< a buffer for the merged flow

    Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
    Mat_<float> Sy; //!< intermediate sparse flow representation (y component)

    /* Structure tensor components: */
    Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
    Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
    Mat_<float> I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    Mat_<float> I0x_buf; //!< sum of x gradient values
    Mat_<float> I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation: */
    Mat_<float> I0xx_buf_aux;
    Mat_<float> I0yy_buf_aux;
    Mat_<float> I0xy_buf_aux;
    Mat_<float> I0x_buf_aux;
    Mat_<float> I0y_buf_aux;

    vector<Ptr<VariationalRefinement> > variational_refinement_processors;

  private: //!< private methods and parallel sections
    void prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow);
    void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
                                   Mat &I0y);
    int autoSelectCoarsestScale(int img_width);
    void autoSelectPatchSizeAndScales(int img_width);

    struct PatchInverseSearch_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int hs;
        Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
        int num_iter, pyr_level;

        PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                   Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter,
                                   int _pyr_level);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

    struct Densification_ParBody : public ParallelLoopBody
    {
        DISOpticalFlowImpl *dis;
        int nstripes, stripe_sz;
        int h;
        Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;

        Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
                              Mat &src_Sy, Mat &_I0, Mat &_I1);
        void operator()(const Range &range) const CV_OVERRIDE;
    };

#ifdef HAVE_OPENCL
    vector<UMat> u_I0s;     //!< Gaussian pyramid for the current frame
    vector<UMat> u_I1s;     //!< Gaussian pyramid for the next frame
    vector<UMat> u_I1s_ext; //!< I1s with borders

    vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
    vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame

    vector<UMat> u_U;         //!< (x,y) components of the flow vectors (CV_32FC2)
    vector<UMat> u_initial_U; //!< (x,y) components of the initial flow field, if one was passed as an input (CV_32FC2)

    UMat u_S; //!< intermediate sparse flow representation (x,y components - CV_32FC2)

    /* Structure tensor components: */
    UMat u_I0xx_buf; //!< sum of squares of x gradient values
    UMat u_I0yy_buf; //!< sum of squares of y gradient values
    UMat u_I0xy_buf; //!< sum of x and y gradient products

    /* Extra buffers that are useful if patch mean-normalization is used: */
    UMat u_I0x_buf; //!< sum of x gradient values
    UMat u_I0y_buf; //!< sum of y gradient values

    /* Auxiliary buffers used in structure tensor computation: */
    UMat u_I0xx_buf_aux;
    UMat u_I0yy_buf_aux;
    UMat u_I0xy_buf_aux;
    UMat u_I0x_buf_aux;
    UMat u_I0y_buf_aux;

    bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
    void ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow);
    bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
    bool ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1);
    bool ocl_PatchInverseSearch(UMat &src_U,
                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
#endif
};

DISOpticalFlowImpl::DISOpticalFlowImpl()
{
    CV_INSTRUMENT_REGION();

    finest_scale = 2;
    patch_size = 8;
    patch_stride = 4;
    grad_descent_iter = 16;
    variational_refinement_iter = 5;
    variational_refinement_alpha = 20.f;
    variational_refinement_gamma = 10.f;
    variational_refinement_delta = 5.f;
    variational_refinement_epsilon = 0.01f;

    border_size = 16;
    use_mean_normalization = true;
    use_spatial_propagation = true;
    coarsest_scale = 10;

    /* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
    int max_possible_scales = 10;
    ws = hs = w = h = 0;
    for (int i = 0; i < max_possible_scales; i++)
        variational_refinement_processors.push_back(VariationalRefinement::create());
}
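
/* Usage sketch (editorial note; `prev` and `next` are hypothetical CV_8UC1 frames):
 *
 *     Ptr<DISOpticalFlow> dis = DISOpticalFlow::create(DISOpticalFlow::PRESET_MEDIUM);
 *     Mat flow; // becomes CV_32FC2, same size as the input frames
 *     dis->calc(prev, next, flow);
 *
 * If flow already holds a CV_32FC2 field of matching size, it is used as the source of initial
 * (temporal) flow candidates, as handled in prepareBuffers() below.
 */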

void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
{
    CV_INSTRUMENT_REGION();

    I0s.resize(coarsest_scale + 1);
    I1s.resize(coarsest_scale + 1);
    I1s_ext.resize(coarsest_scale + 1);
    I0xs.resize(coarsest_scale + 1);
    I0ys.resize(coarsest_scale + 1);
    Ux.resize(coarsest_scale + 1);
    Uy.resize(coarsest_scale + 1);

    Mat flow_uv[2];
    if (use_flow)
    {
        split(flow, flow_uv);
        initial_Ux.resize(coarsest_scale + 1);
        initial_Uy.resize(coarsest_scale + 1);
    }

    int fraction = 1;
    int cur_rows = 0, cur_cols = 0;

    for (int i = 0; i <= coarsest_scale; i++)
    {
        /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
        if (i == finest_scale)
        {
            cur_rows = I0.rows / fraction;
            cur_cols = I0.cols / fraction;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0, I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);

            /* These buffers are reused in each scale so we initialize them once on the finest scale: */
            Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
            Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
            I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);

            I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
            I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);

            U.create(cur_rows, cur_cols);
        }
        else if (i > finest_scale)
        {
            cur_rows = I0s[i - 1].rows / 2;
            cur_cols = I0s[i - 1].cols / 2;
            I0s[i].create(cur_rows, cur_cols);
            resize(I0s[i - 1], I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
            I1s[i].create(cur_rows, cur_cols);
            resize(I1s[i - 1], I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
        }

        if (i >= finest_scale)
        {
            I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
            copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
            I0xs[i].create(cur_rows, cur_cols);
            I0ys[i].create(cur_rows, cur_cols);
            spatialGradient(I0s[i], I0xs[i], I0ys[i]);
            Ux[i].create(cur_rows, cur_cols);
            Uy[i].create(cur_rows, cur_cols);
            variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
            variational_refinement_processors[i]->setDelta(variational_refinement_delta);
            variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
            variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon);
            variational_refinement_processors[i]->setSorIterations(5);
            variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);

            if (use_flow)
            {
                resize(flow_uv[0], initial_Ux[i], Size(cur_cols, cur_rows));
                initial_Ux[i] /= fraction;
                resize(flow_uv[1], initial_Uy[i], Size(cur_cols, cur_rows));
                initial_Uy[i] /= fraction;
            }
        }

        fraction *= 2;
    }
}

/* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
 * A plain box filter cannot be used here because these sums have to be computed on a sparse grid
 * (one value per patch_stride pixels) and stored densely in the output buffers.
 */
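/* Implementation note (editorial): the local sums below are obtained with a separable
 * sliding-window (running-sum) scheme. The horizontal pass keeps per-row running sums over the
 * last patch_size columns, adding the entering sample and subtracting the leaving one, and emits
 * one value every patch_stride columns; the vertical pass repeats this over rows. Each output
 * value therefore costs O(1) operations instead of O(patch_size^2).
 */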
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
                                                   Mat &dst_I0y, Mat &I0x, Mat &I0y)
{
    CV_INSTRUMENT_REGION();

    float *I0xx_ptr = dst_I0xx.ptr<float>();
    float *I0yy_ptr = dst_I0yy.ptr<float>();
    float *I0xy_ptr = dst_I0xy.ptr<float>();
    float *I0x_ptr = dst_I0x.ptr<float>();
    float *I0y_ptr = dst_I0y.ptr<float>();

    float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
    float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
    float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
    float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
    float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();

    /* Separable box filter: horizontal pass */
    for (int i = 0; i < h; i++)
    {
        float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
        short *x_row = I0x.ptr<short>(i);
        short *y_row = I0y.ptr<short>(i);
        for (int j = 0; j < patch_size; j++)
        {
            sum_xx += x_row[j] * x_row[j];
            sum_yy += y_row[j] * y_row[j];
            sum_xy += x_row[j] * y_row[j];
            sum_x += x_row[j];
            sum_y += y_row[j];
        }
        I0xx_aux_ptr[i * ws] = sum_xx;
        I0yy_aux_ptr[i * ws] = sum_yy;
        I0xy_aux_ptr[i * ws] = sum_xy;
        I0x_aux_ptr[i * ws] = sum_x;
        I0y_aux_ptr[i * ws] = sum_y;
        int js = 1;
        for (int j = patch_size; j < w; j++)
        {
            sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
            sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
            sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
            sum_x += (x_row[j] - x_row[j - patch_size]);
            sum_y += (y_row[j] - y_row[j - patch_size]);
            if ((j - patch_size + 1) % patch_stride == 0)
            {
                I0xx_aux_ptr[i * ws + js] = sum_xx;
                I0yy_aux_ptr[i * ws + js] = sum_yy;
                I0xy_aux_ptr[i * ws + js] = sum_xy;
                I0x_aux_ptr[i * ws + js] = sum_x;
                I0y_aux_ptr[i * ws + js] = sum_y;
                js++;
            }
        }
    }

    AutoBuffer<float> sum_xx(ws), sum_yy(ws), sum_xy(ws), sum_x(ws), sum_y(ws);
    for (int j = 0; j < ws; j++)
    {
        sum_xx[j] = 0.0f;
        sum_yy[j] = 0.0f;
        sum_xy[j] = 0.0f;
        sum_x[j] = 0.0f;
        sum_y[j] = 0.0f;
    }

    /* Separable box filter: vertical pass */
    for (int i = 0; i < patch_size; i++)
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += I0xx_aux_ptr[i * ws + j];
            sum_yy[j] += I0yy_aux_ptr[i * ws + j];
            sum_xy[j] += I0xy_aux_ptr[i * ws + j];
            sum_x[j] += I0x_aux_ptr[i * ws + j];
            sum_y[j] += I0y_aux_ptr[i * ws + j];
        }
    for (int j = 0; j < ws; j++)
    {
        I0xx_ptr[j] = sum_xx[j];
        I0yy_ptr[j] = sum_yy[j];
        I0xy_ptr[j] = sum_xy[j];
        I0x_ptr[j] = sum_x[j];
        I0y_ptr[j] = sum_y[j];
    }
    int is = 1;
    for (int i = patch_size; i < h; i++)
    {
        for (int j = 0; j < ws; j++)
        {
            sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
            sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
            sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
            sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
            sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
        }
        if ((i - patch_size + 1) % patch_stride == 0)
        {
            for (int j = 0; j < ws; j++)
            {
                I0xx_ptr[is * ws + j] = sum_xx[j];
                I0yy_ptr[is * ws + j] = sum_yy[j];
                I0xy_ptr[is * ws + j] = sum_xy[j];
                I0x_ptr[is * ws + j] = sum_x[j];
                I0y_ptr[is * ws + j] = sum_y[j];
            }
            is++;
        }
    }
}

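/* Editorial note: the coarsest scale is picked so that a patch still spans a sizeable fraction of
 * the image, i.e. the largest k with (2 * img_width) / 2^k >= fratio * patch_size, which gives
 * k = floor(log2(2 * img_width / (fratio * patch_size))). For instance, img_width = 1024 and
 * patch_size = 8 yield k = floor(log2(2048 / 40)) = 5.
 */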
int DISOpticalFlowImpl::autoSelectCoarsestScale(int img_width)
{
    const int fratio = 5;
    return std::max(0, (int)std::floor(log2((2.0f * (float)img_width) / ((float)fratio * (float)patch_size))));
}

void DISOpticalFlowImpl::autoSelectPatchSizeAndScales(int img_width)
{
    switch (finest_scale)
    {
    case 1:
        patch_size = 8;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 2, 0);
        break;

    case 3:
        patch_size = 12;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 4, 0);
        break;

    case 4:
        patch_size = 12;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 5, 0);
        break;

    // default case, fall-through.
    case 2:
    default:
        patch_size = 8;
        coarsest_scale = autoSelectCoarsestScale(img_width);
        finest_scale = std::max(coarsest_scale - 2, 0);
        break;
    }
}

DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
                                                                           int _hs, Mat &dst_Sx, Mat &dst_Sy,
                                                                           Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
                                                                           Mat &_I0x, Mat &_I0y, int _num_iter,
                                                                           int _pyr_level)
    : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
      I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter), pyr_level(_pyr_level)
{
    stripe_sz = (int)ceil(hs / (double)nstripes);
}

/////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////

/* Some auxiliary macros */
#define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION \
    v_float32x4 w00v = v_setall_f32(w00); \
    v_float32x4 w01v = v_setall_f32(w01); \
    v_float32x4 w10v = v_setall_f32(w10); \
    v_float32x4 w11v = v_setall_f32(w11); \
    \
    v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp; \
    v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left; \
    v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right, \
        I1_row_next_shifted_4_right; \
    v_float32x4 I_diff_left, I_diff_right; \
    \
    /* Preload and expand the first row of I1: */ \
    I1_row_8 = v_load_expand(I1_ptr); \
    I1_row_shifted_8 = v_load_expand(I1_ptr + 1); \
    v_expand(I1_row_8, I1_row_4_left, I1_row_4_right); \
    v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right); \
    I1_ptr += I1_stride;

#define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION \
    /* Load the next row of I1: */ \
    I1_row_next_8 = v_load_expand(I1_ptr); \
    I1_row_next_shifted_8 = v_load_expand(I1_ptr + 1); \
    /* Separate the left and right halves: */ \
    v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right); \
    v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right); \
    \
    /* Load current row of I0: */ \
    I0_row_8 = v_load_expand(I0_ptr); \
    v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \
    \
    /* Compute diffs between I0 and bilinearly interpolated I1: */ \
    I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))), \
                              v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))), \
                              v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))), \
                              v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))), \
                        v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left))); \
    I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))), \
                               v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))), \
                               v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))), \
                               v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))), \
                         v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)));

#define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \
    I0_ptr += I0_stride; \
    I1_ptr += I1_stride; \
    \
    I1_row_4_left = I1_row_next_4_left; \
    I1_row_4_right = I1_row_next_4_right; \
    I1_row_shifted_4_left = I1_row_next_shifted_4_left; \
    I1_row_shifted_4_right = I1_row_next_shifted_4_right;

/* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
 * given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and w00, w01, w10, w11
 * are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these patches
 * and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is implemented
 * only for the default patch size (8x8). Everything is processed in floats as using fixed-point approximations harms
 * the quality significantly.
 */
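/* Editorial note on the math: with r denoting the per-pixel residual between the bilinearly
 * sampled I1 patch and the I0 patch, this function returns sum(r^2) and accumulates
 * dst_dUx = sum(r * I0x), dst_dUy = sum(r * I0y), i.e. J^T r for the Jacobian J = (I0x, I0y).
 * The caller multiplies this by the inverse of the precomputed structure tensor (the Gauss-Newton
 * approximation J^T J of the Hessian), which is the inverse search step of Kroeger et al.,
 * "Fast Optical Flow using Dense Inverse Search" (ECCV 2016), the algorithm this file implements.
 */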
inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
                          int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
{
    float SSD = 0.0f;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 Ux_vec = v_setall_f32(0);
        v_float32x4 Uy_vec = v_setall_f32(0);
        v_float32x4 SSD_vec = v_setall_f32(0);

        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;

        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);

            /* Update the sums: */
            Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
            Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));

            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }

        /* Final reduce operations: */
        dst_dUx = v_reduce_sum(Ux_vec);
        dst_dUy = v_reduce_sum(Uy_vec);
        SSD = v_reduce_sum(SSD_vec);
    }
    else
#endif
    {
        dst_dUx = 0.0f;
        dst_dUy = 0.0f;
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                SSD += diff * diff;
                dst_dUx += diff * I0x_ptr[i * I0_stride + j];
                dst_dUy += diff * I0y_ptr[i * I0_stride + j];
            }
    }
    return SSD;
}

/* Same as processPatch, but with patch mean normalization, which improves robustness under changing
 * lighting conditions
 */
inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
                                  short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                                  float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
{
    float sum_diff = 0.0, sum_diff_sq = 0.0;
    float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
    float n = (float)patch_sz * patch_sz;

#if CV_SIMD128
    if (patch_sz == 8)
    {
        /* Variables to accumulate the sums */
        v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
        v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);

        v_int16x8 I0x_row, I0y_row;
        v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;

        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            I0x_row = v_load(I0x_ptr);
            v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
            I0y_row = v_load(I0y_ptr);
            v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);

            /* Update the sums: */
            sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
            sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));

            I0x_ptr += I0_stride;
            I0y_ptr += I0_stride;
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }

        /* Final reduce operations: */
        sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
        sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
#endif
    {
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                sum_diff += diff;
                sum_diff_sq += diff * diff;

                sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
                sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
            }
    }
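    /* The returned cost below is the SSD of mean-normalized patches: expanding
     * sum((diff - mean(diff))^2) gives sum_diff_sq - sum_diff * sum_diff / n. The same centering
     * applied to the gradient correlations yields the corrections to dst_dUx and dst_dUy, with
     * x_grad_sum and y_grad_sum being the precomputed patch sums of I0x and I0y. */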
    dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
    dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
    return sum_diff_sq - sum_diff * sum_diff / n;
}

/* Similar to processPatch, but computes only the sum of squared differences (SSD) between the patches */
inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
                        float w11, int patch_sz)
{
    float SSD = 0.0f;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 SSD_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        SSD = v_reduce_sum(SSD_vec);
    }
    else
#endif
    {
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];
                SSD += diff * diff;
            }
    }
    return SSD;
}

/* Same as computeSSD, but with patch mean normalization */
inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
                                float w10, float w11, int patch_sz)
{
    float sum_diff = 0.0f, sum_diff_sq = 0.0f;
    float n = (float)patch_sz * patch_sz;
#if CV_SIMD128
    if (patch_sz == 8)
    {
        v_float32x4 sum_diff_vec = v_setall_f32(0);
        v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
        HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
        for (int row = 0; row < 8; row++)
        {
            HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
            HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
        }
        sum_diff = v_reduce_sum(sum_diff_vec);
        sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
    }
    else
    {
#endif
        float diff;
        for (int i = 0; i < patch_sz; i++)
            for (int j = 0; j < patch_sz; j++)
            {
                diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
                       w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
                       I0_ptr[i * I0_stride + j];

                sum_diff += diff;
                sum_diff_sq += diff * diff;
            }
#if CV_SIMD128
    }
#endif
    return sum_diff_sq - sum_diff * sum_diff / n;
}

#undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
#undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
{
    CV_INSTRUMENT_REGION();

    // force separate processing of stripes if we are using spatial propagation:
    if (dis->use_spatial_propagation && range.end > range.start + 1)
    {
        for (int n = range.start; n < range.end; n++)
            (*this)(Range(n, n + 1));
        return;
    }
    int psz = dis->patch_size;
    int psz2 = psz / 2;
    int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
    int bsz = dis->border_size;

    /* Input dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();

    /* Output sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();

    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();
    short *I0x_ptr = I0x->ptr<short>();
    short *I0y_ptr = I0y->ptr<short>();

    /* Precomputed structure tensor */
    float *xx_ptr = dis->I0xx_buf.ptr<float>();
    float *yy_ptr = dis->I0yy_buf.ptr<float>();
    float *xy_ptr = dis->I0xy_buf.ptr<float>();
    /* And extra buffers for mean-normalization: */
    float *x_ptr = dis->I0x_buf.ptr<float>();
    float *y_ptr = dis->I0y_buf.ptr<float>();

    bool use_temporal_candidates = false;
    float *initial_Ux_ptr = NULL, *initial_Uy_ptr = NULL;
    if (!dis->initial_Ux.empty())
    {
        initial_Ux_ptr = dis->initial_Ux[pyr_level].ptr<float>();
        initial_Uy_ptr = dis->initial_Uy[pyr_level].ptr<float>();
        use_temporal_candidates = true;
    }

    int i, j, dir;
    int start_is, end_is, start_js, end_js;
    int start_i, start_j;
    float i_lower_limit = bsz - psz + 1.0f;
    float i_upper_limit = bsz + dis->h - 1.0f;
    float j_lower_limit = bsz - psz + 1.0f;
    float j_upper_limit = bsz + dis->w - 1.0f;
    float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;

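    /* (i_I1, j_I1) is the sub-pixel position of the patch inside the bordered I1; the limits above
     * clamp it so that the whole patch window stays readable. w00..w11 are the standard bilinear
     * interpolation weights of the four integer neighbors (w00 for the top-left sample), built
     * from the fractional parts di and dj; they always sum to 1. */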
#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
    i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
    j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
    { \
        float di = i_I1 - floor(i_I1); \
        float dj = j_I1 - floor(j_I1); \
        w11 = di * dj; \
        w10 = di * (1 - dj); \
        w01 = (1 - di) * dj; \
        w00 = (1 - di) * (1 - dj); \
    }

#define COMPUTE_SSD(dst, Ux, Uy) \
    INIT_BILINEAR_WEIGHTS(Ux, Uy); \
    if (dis->use_mean_normalization) \
        dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, \
                                 w01, w10, w11, psz); \
    else \
        dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01, \
                         w10, w11, psz);

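    /* The gradient descent budget is split evenly across the num_iter alternating sweeps, so each
     * patch receives up to grad_descent_iter descent steps in total (the inner loop further below
     * may terminate early once the SSD stops decreasing). */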
    int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
    for (int iter = 0; iter < num_iter; iter++)
    {
        if (iter % 2 == 0)
        {
            dir = 1;
            start_is = min(range.start * stripe_sz, hs);
            end_is = min(range.end * stripe_sz, hs);
            start_js = 0;
            end_js = dis->ws;
            start_i = start_is * dis->patch_stride;
            start_j = 0;
        }
        else
        {
            dir = -1;
            start_is = min(range.end * stripe_sz, hs) - 1;
            end_is = min(range.start * stripe_sz, hs) - 1;
            start_js = dis->ws - 1;
            end_js = -1;
            start_i = start_is * dis->patch_stride;
            start_j = (dis->ws - 1) * dis->patch_stride;
        }

        i = start_i;
        for (int is = start_is; dir * is < dir * end_is; is += dir)
        {
            j = start_j;
            for (int js = start_js; dir * js < dir * end_js; js += dir)
            {
                if (iter == 0)
                {
                    /* Using the result from the previous pyramid level as the very first approximation: */
                    Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
                    Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
                }

                float min_SSD = INF, cur_SSD;
                if (use_temporal_candidates || dis->use_spatial_propagation)
                {
                    COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
                }

                if (use_temporal_candidates)
                {
                    /* Try temporal candidates (vectors from the initial flow field that was passed to the function) */
                    COMPUTE_SSD(cur_SSD, initial_Ux_ptr[(i + psz2) * dis->w + j + psz2],
                                initial_Uy_ptr[(i + psz2) * dis->w + j + psz2]);
                    if (cur_SSD < min_SSD)
                    {
                        min_SSD = cur_SSD;
                        Sx_ptr[is * dis->ws + js] = initial_Ux_ptr[(i + psz2) * dis->w + j + psz2];
                        Sy_ptr[is * dis->ws + js] = initial_Uy_ptr[(i + psz2) * dis->w + j + psz2];
                    }
                }

                if (dis->use_spatial_propagation)
                {
                    /* Try spatial candidates: */
                    if (dir * js > dir * start_js)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
                        }
                    }
                    /* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
                     * the number of stripes constant. It works well enough in practice and doesn't introduce any
                     * visible seams.
                     */
                    if (dir * is > dir * start_is)
                    {
                        COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
                        if (cur_SSD < min_SSD)
                        {
                            min_SSD = cur_SSD;
                            Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
                            Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
                        }
                    }
                }

                /* Use the best candidate as a starting point for the gradient descent: */
                float cur_Ux = Sx_ptr[is * dis->ws + js];
                float cur_Uy = Sy_ptr[is * dis->ws + js];

                /* Computing the inverse of the structure tensor: */
                float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
                             xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
                if (abs(detH) < EPS)
                    detH = EPS;
                float invH11 = yy_ptr[is * dis->ws + js] / detH;
                float invH12 = -xy_ptr[is * dis->ws + js] / detH;
                float invH22 = xx_ptr[is * dis->ws + js] / detH;
                float prev_SSD = INF, SSD;
                float x_grad_sum = x_ptr[is * dis->ws + js];
                float y_grad_sum = y_ptr[is * dis->ws + js];

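                /* Gauss-Newton iterations: every step evaluates the patch residual together with
                 * its gradient correlations (dUx, dUy) and applies the fixed inverse Hessian:
                 * (dx, dy)^T = H^-1 * (dUx, dUy)^T, where H is the 2x2 structure tensor inverted
                 * above. H can be precomputed and reused because the template patch in I0 never
                 * changes, which is the "inverse" part of the dense inverse search. */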
                for (int t = 0; t < num_inner_iter; t++)
                {
                    INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
                    if (dis->use_mean_normalization)
                        SSD = processPatchMeanNorm(dUx, dUy,
                                                   I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                                   I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
                                                   dis->w, w_ext, w00, w01, w10, w11, psz,
                                                   x_grad_sum, y_grad_sum);
                    else
                        SSD = processPatch(dUx, dUy,
                                           I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
                                           I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
                                           dis->w, w_ext, w00, w01, w10, w11, psz);

                    dx = invH11 * dUx + invH12 * dUy;
                    dy = invH12 * dUx + invH22 * dUy;
                    cur_Ux -= dx;
                    cur_Uy -= dy;

                    /* Break when patch distance stops decreasing */
                    if (SSD >= prev_SSD)
                        break;
                    prev_SSD = SSD;
                }

                /* If gradient descent converged to a flow vector that is very far from the initial approximation
                 * (more than patch size) then we don't use it. Noticeably improves the robustness.
                 */
                if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
                {
                    Sx_ptr[is * dis->ws + js] = cur_Ux;
                    Sy_ptr[is * dis->ws + js] = cur_Uy;
                }
                j += dir * dis->patch_stride;
            }
            i += dir * dis->patch_stride;
        }
    }
#undef INIT_BILINEAR_WEIGHTS
#undef COMPUTE_SSD
}

DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
                                                                 Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
                                                                 Mat &_I0, Mat &_I1)
    : dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
{
    stripe_sz = (int)ceil(h / (double)nstripes);
}

/* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
 * on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of values from the
 * overlapping patches.
 */
void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
{
    CV_INSTRUMENT_REGION();

    int start_i = min(range.start * stripe_sz, h);
    int end_i = min(range.end * stripe_sz, h);

    /* Input sparse flow */
    float *Sx_ptr = Sx->ptr<float>();
    float *Sy_ptr = Sy->ptr<float>();

    /* Output dense flow */
    float *Ux_ptr = Ux->ptr<float>();
    float *Uy_ptr = Uy->ptr<float>();

    uchar *I0_ptr = I0->ptr<uchar>();
    uchar *I1_ptr = I1->ptr<uchar>();

    int psz = dis->patch_size;
    int pstr = dis->patch_stride;
    int i_l, i_u;
    int j_l, j_u;
    float i_m, j_m, diff;

    /* These values define the set of sparse grid locations that contain patches overlapping with the current dense flow
     * location */
    int start_is, end_is;
    int start_js, end_js;

    /* Some helper macros for updating this set of sparse grid locations */
#define UPDATE_SPARSE_I_COORDINATES \
    if (i % pstr == 0 && i + psz <= h) \
        end_is++; \
    if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is) \
        start_is++;

#define UPDATE_SPARSE_J_COORDINATES \
    if (j % pstr == 0 && j + psz <= dis->w) \
        end_js++; \
    if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js) \
        start_js++;

    start_is = 0;
    end_is = -1;
    for (int i = 0; i < start_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
    }
    for (int i = start_i; i < end_i; i++)
    {
        UPDATE_SPARSE_I_COORDINATES;
        start_js = 0;
        end_js = -1;
        for (int j = 0; j < dis->w; j++)
        {
            UPDATE_SPARSE_J_COORDINATES;
            float coef, sum_coef = 0.0f;
            float sum_Ux = 0.0f;
            float sum_Uy = 0.0f;

            /* Iterate through all the patches that overlap the current location (i,j) */
            for (int is = start_is; is <= end_is; is++)
                for (int js = start_js; js <= end_js; js++)
                {
                    j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
                    i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
                    j_l = (int)j_m;
                    j_u = j_l + 1;
                    i_l = (int)i_m;
                    i_u = i_l + 1;
                    diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
                           (j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
                           (j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
                           (j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
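                    /* Weight each overlapping patch by the inverse of its photometric error at
                     * this pixel, so patches that are consistent with the image evidence
                     * dominate the average: */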
                    coef = 1 / max(1.0f, abs(diff));
                    sum_Ux += coef * Sx_ptr[is * dis->ws + js];
                    sum_Uy += coef * Sy_ptr[is * dis->ws + js];
                    sum_coef += coef;
                }
            CV_DbgAssert(sum_coef != 0);
            Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
            Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
        }
    }
#undef UPDATE_SPARSE_I_COORDINATES
#undef UPDATE_SPARSE_J_COORDINATES
}

#ifdef HAVE_OPENCL
bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_U,
                                                UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int /*pyr_level*/)
{
    CV_INSTRUMENT_REGION();
    CV_INSTRUMENT_REGION_OPENCL();

    size_t globalSize[] = {(size_t)ws, (size_t)hs};
    size_t localSize[] = {16, 16};
    int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);

    String subgroups_build_options;
    if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
        subgroups_build_options = " -DCV_USE_SUBGROUPS=1";

    String build_options = cv::format(
        "-DDIS_BORDER_SIZE=%d -DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
        border_size, patch_size, patch_stride
    ) + subgroups_build_options;

#if 0 // OpenCL debug
    u_Sx = Scalar::all(0);
    u_Sy = Scalar::all(0);
#endif

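    /* The OpenCL path always performs exactly two sweeps (hence the assertion below): a forward
     * pass (iter == 0) and a backward pass, each split into a propagation kernel and a patch
     * refinement kernel, mirroring the alternating sweep directions of the CPU implementation. */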
1113 | CV_Assert(num_iter == 2); |
1114 | for (int iter = 0; iter < num_iter; iter++) |
1115 | { |
1116 | if (iter == 0) |
1117 | { |
1118 | ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, build_options); |
1119 | size_t global_sz[] = {(size_t)hs * 8}; |
1120 | size_t local_sz[] = {8}; |
1121 | |
1122 | k1.args( |
1123 | ocl::KernelArg::PtrReadOnly(m: src_U), |
1124 | ocl::KernelArg::PtrReadOnly(m: I0), |
1125 | ocl::KernelArg::PtrReadOnly(m: I1), |
1126 | (int)w, (int)h, (int)ws, (int)hs, |
1127 | ocl::KernelArg::PtrWriteOnly(m: u_S) |
1128 | ); |
1129 | if (!k1.run(dims: 1, globalsize: global_sz, localsize: local_sz, sync: false)) |
1130 | return false; |
1131 | |
1132 | ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc, build_options); |
1133 | |
1134 | k2.args( |
1135 | ocl::KernelArg::PtrReadOnly(m: src_U), |
1136 | ocl::KernelArg::PtrReadOnly(m: I0), |
1137 | ocl::KernelArg::PtrReadOnly(m: I1), |
1138 | ocl::KernelArg::PtrReadOnly(m: I0x), |
1139 | ocl::KernelArg::PtrReadOnly(m: I0y), |
1140 | ocl::KernelArg::PtrReadOnly(m: u_I0xx_buf), |
1141 | ocl::KernelArg::PtrReadOnly(m: u_I0yy_buf), |
1142 | ocl::KernelArg::PtrReadOnly(m: u_I0xy_buf), |
1143 | ocl::KernelArg::PtrReadOnly(m: u_I0x_buf), |
1144 | ocl::KernelArg::PtrReadOnly(m: u_I0y_buf), |
1145 | (int)w, (int)h, (int)ws, (int)hs, |
1146 | (int)num_inner_iter, |
1147 | ocl::KernelArg::PtrReadWrite(m: u_S) |
1148 | ); |
1149 | if (!k2.run(dims: 2, globalsize: globalSize, localsize: localSize, sync: false)) |
1150 | return false; |
1151 | } |
1152 | else |
1153 | { |
1154 | ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, build_options); |
1155 | size_t global_sz[] = {(size_t)hs * 8}; |
1156 | size_t local_sz[] = {8}; |
1157 | |
1158 | k3.args( |
1159 | ocl::KernelArg::PtrReadOnly(m: I0), |
1160 | ocl::KernelArg::PtrReadOnly(m: I1), |
1161 | (int)w, (int)h, (int)ws, (int)hs, |
1162 | ocl::KernelArg::PtrReadWrite(m: u_S) |
1163 | ); |
1164 | if (!k3.run(dims: 1, globalsize: global_sz, localsize: local_sz, sync: false)) |
1165 | return false; |
1166 | |
1167 | ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc, build_options); |
1168 | |
1169 | k4.args( |
1170 | ocl::KernelArg::PtrReadOnly(m: I0), |
1171 | ocl::KernelArg::PtrReadOnly(m: I1), |
1172 | ocl::KernelArg::PtrReadOnly(m: I0x), |
1173 | ocl::KernelArg::PtrReadOnly(m: I0y), |
1174 | ocl::KernelArg::PtrReadOnly(m: u_I0xx_buf), |
1175 | ocl::KernelArg::PtrReadOnly(m: u_I0yy_buf), |
1176 | ocl::KernelArg::PtrReadOnly(m: u_I0xy_buf), |
1177 | ocl::KernelArg::PtrReadOnly(m: u_I0x_buf), |
1178 | ocl::KernelArg::PtrReadOnly(m: u_I0y_buf), |
1179 | (int)w, (int)h,(int)ws, (int)hs, |
1180 | (int)num_inner_iter, |
1181 | ocl::KernelArg::PtrReadWrite(m: u_S) |
1182 | ); |
1183 | if (!k4.run(dims: 2, globalsize: globalSize, localsize: localSize, sync: false)) |
1184 | return false; |
1185 | } |
1186 | } |
1187 | return true; |
1188 | } |
1189 | |
1190 | bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1) |
1191 | { |
1192 | CV_INSTRUMENT_REGION(); |
1193 | CV_INSTRUMENT_REGION_OPENCL(); |
1194 | |
1195 | size_t globalSize[] = {(size_t)w, (size_t)h}; |
1196 | size_t localSize[] = {16, 16}; |
1197 | |
1198 | String build_options = cv::format( |
1199 | fmt: "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d", |
1200 | patch_size, patch_stride |
1201 | ); |
1202 | |
1203 | ocl::Kernel kernel("dis_densification", ocl::video::dis_flow_oclsrc, build_options); |
1204 | kernel.args( |
1205 | ocl::KernelArg::PtrReadOnly(m: src_S), |
1206 | ocl::KernelArg::PtrReadOnly(m: _I0), |
1207 | ocl::KernelArg::PtrReadOnly(m: _I1), |
1208 | (int)w, (int)h, (int)ws, |
1209 | ocl::KernelArg::PtrWriteOnly(m: dst_U) |
1210 | ); |
1211 | return kernel.run(dims: 2, globalsize: globalSize, localsize: localSize, sync: false); |
1212 | } |
1213 | |
1214 | void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow) |
1215 | { |
1216 | CV_INSTRUMENT_REGION(); |
1217 | // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL(); |
1218 | |
1219 | u_I0s.resize(coarsest_scale + 1); |
1220 | u_I1s.resize(coarsest_scale + 1); |
1221 | u_I1s_ext.resize(coarsest_scale + 1); |
1222 | u_I0xs.resize(coarsest_scale + 1); |
1223 | u_I0ys.resize(coarsest_scale + 1); |
1224 | u_U.resize(coarsest_scale + 1); |
1225 | |
1226 | if (use_flow) |
1227 | { |
1228 | u_initial_U.resize(coarsest_scale + 1); |
1229 | } |
1230 | |
1231 | int fraction = 1; |
1232 | int cur_rows = 0, cur_cols = 0; |
1233 | |
1234 | for (int i = 0; i <= coarsest_scale; i++) |
1235 | { |
1236 | CV_TRACE_REGION("coarsest_scale_iteration"); |
1237 | /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */ |
1238 | if (i == finest_scale) |
1239 | { |
1240 | cur_rows = I0.rows / fraction; |
1241 | cur_cols = I0.cols / fraction; |
1242 | u_I0s[i].create(cur_rows, cur_cols, CV_8UC1); |
1243 | resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA); |
1244 | u_I1s[i].create(cur_rows, cur_cols, CV_8UC1); |
1245 | resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA); |
1246 | |
1247 | /* These buffers are reused in each scale so we initialize them once on the finest scale: */ |
1248 | u_S.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC2); |
1249 | u_I0xx_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1250 | u_I0yy_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1251 | u_I0xy_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1252 | u_I0x_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1253 | u_I0y_buf.create(rows: cur_rows / patch_stride, cols: cur_cols / patch_stride, CV_32FC1); |
1254 | |
1255 | u_I0xx_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1256 | u_I0yy_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1257 | u_I0xy_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1258 | u_I0x_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1259 | u_I0y_buf_aux.create(rows: cur_rows, cols: cur_cols / patch_stride, CV_32FC1); |
1260 | } |
1261 | else if (i > finest_scale) |
1262 | { |
1263 | cur_rows = u_I0s[i - 1].rows / 2; |
1264 | cur_cols = u_I0s[i - 1].cols / 2; |
1265 | u_I0s[i].create(cur_rows, cur_cols, CV_8UC1); |
1266 | resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA); |
1267 | u_I1s[i].create(cur_rows, cur_cols, CV_8UC1); |
1268 | resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA); |
1269 | } |
1270 | |
1271 | if (i >= finest_scale) |
1272 | { |
1273 | u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1); |
1274 | copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE); |
1275 | u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1); |
1276 | u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1); |
1277 | spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]); |
1278 | u_U[i].create(cur_rows, cur_cols, CV_32FC2); |
1279 | variational_refinement_processors[i]->setAlpha(variational_refinement_alpha); |
1280 | variational_refinement_processors[i]->setDelta(variational_refinement_delta); |
1281 | variational_refinement_processors[i]->setGamma(variational_refinement_gamma); |
1282 | variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon); |
1283 | variational_refinement_processors[i]->setSorIterations(5); |
1284 | variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter); |
1285 | |
1286 | if (use_flow) |
1287 | { |
1288 | UMat resized_flow; |
1289 | resize(src: flow, dst: resized_flow, dsize: Size(cur_cols, cur_rows)); |
1290 | float scale = 1.0f / fraction; |
1291 | resized_flow.convertTo(u_initial_U[i], CV_32FC2, scale, 0.0f); |
1292 | } |
1293 | } |
1294 | |
1295 | fraction *= 2; |
1296 | } |
1297 | } |

bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
                                                       UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
{
    CV_INSTRUMENT_REGION();
    CV_INSTRUMENT_REGION_OPENCL();

    size_t globalSizeX[] = {(size_t)h};
    size_t localSizeX[] = {16};

#if 0 // OpenCL debug
    u_I0xx_buf_aux = Scalar::all(0);
    u_I0yy_buf_aux = Scalar::all(0);
    u_I0xy_buf_aux = Scalar::all(0);
    u_I0x_buf_aux = Scalar::all(0);
    u_I0y_buf_aux = Scalar::all(0);
    dst_I0xx = Scalar::all(0);
    dst_I0yy = Scalar::all(0);
    dst_I0xy = Scalar::all(0);
    dst_I0x = Scalar::all(0);
    dst_I0y = Scalar::all(0);
#endif

    String build_options = cv::format(
        "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
        patch_size, patch_stride
    );

    ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc, build_options);
    kernelX.args(
        ocl::KernelArg::PtrReadOnly(I0x),
        ocl::KernelArg::PtrReadOnly(I0y),
        (int)w, (int)h, (int)ws,
        ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
        ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux)
    );
    if (!kernelX.run(1, globalSizeX, localSizeX, false))
        return false;

    size_t globalSizeY[] = {(size_t)ws};
    size_t localSizeY[] = {16};

    ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc, build_options);
    kernelY.args(
        ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
        ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
        (int)w, (int)h, (int)ws,
        ocl::KernelArg::PtrWriteOnly(dst_I0xx),
        ocl::KernelArg::PtrWriteOnly(dst_I0yy),
        ocl::KernelArg::PtrWriteOnly(dst_I0xy),
        ocl::KernelArg::PtrWriteOnly(dst_I0x),
        ocl::KernelArg::PtrWriteOnly(dst_I0y)
    );
    return kernelY.run(1, globalSizeY, localSizeY, false);
}
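
/* Reference sketch (illustrative, not the actual kernel code): the two kernels above
 * compute, for every patch position, the summed structure tensor entries. A scalar
 * equivalent for a single patch with top-left corner (x, y), using Mat access for
 * clarity, would be:
 *
 *     float sxx = 0.f, syy = 0.f, sxy = 0.f, sx = 0.f, sy = 0.f;
 *     for (int i = 0; i < patch_size; i++)
 *         for (int j = 0; j < patch_size; j++)
 *         {
 *             float gx = I0x.at<short>(y + i, x + j);
 *             float gy = I0y.at<short>(y + i, x + j);
 *             sxx += gx * gx; syy += gy * gy; sxy += gx * gy;
 *             sx += gx; sy += gy;
 *         }
 *
 * The "_hor" kernel accumulates these sums along each row into the *_aux buffers and
 * the "_ver" kernel then sums over patch_size rows, which avoids recomputing the full
 * 2D sum independently for every patch position.
 */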

bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_INSTRUMENT_REGION();
    // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();

    UMat I0Mat = I0.getUMat();
    UMat I1Mat = I1.getUMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code searches for a maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater than or equal to the patch size */
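
    /* Worked example (illustrative): for a 1024 x 436 input with patch_size = 8, the
     * first term is (int)(log2(1024 / 32.0) + 0.5) = (int)5.5 = 5 and the second is
     * (int)log2(436 / 8) = (int)log2(54) = (int)5.75 = 5, so coarsest_scale = 5.
     */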

    if (coarsest_scale < 0)
        CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");

    if (coarsest_scale < finest_scale)
    {
        // choose the finest level based on the coarsest level.
        // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
        int original_img_width = I0.size().width;
        autoSelectPatchSizeAndScales(original_img_width);
    }

    ocl_prepareBuffers(I0Mat, I1Mat, flow, use_input_flow);
    u_U[coarsest_scale].setTo(0.0f);

    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        CV_TRACE_REGION("coarsest_scale_iteration");
        w = u_I0s[i].cols;
        h = u_I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;
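        /* Illustrative: with w = 1024, patch_size = 8 and patch_stride = 4 this gives
         * ws = 1 + (1024 - 8) / 4 = 255 sparse flow positions per row. */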

        if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
                                           u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
            return false;

        if (!ocl_PatchInverseSearch(u_U[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
            return false;

        if (!ocl_Densification(u_U[i], u_S, u_I0s[i], u_I1s[i]))
            return false;

        if (variational_refinement_iter > 0)
        {
            std::vector<Mat> U_channels;
            split(u_U[i], U_channels); CV_Assert(U_channels.size() == 2);
            variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
                                                         U_channels[0], U_channels[1]);
            merge(U_channels, u_U[i]);
        }

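        /* Flow vectors are expressed in pixels of the current level; the next finer
         * level has twice the resolution, so the upsampled field is scaled by 2. */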
        if (i > finest_scale)
        {
            UMat resized;
            resize(u_U[i], resized, u_U[i - 1].size());
            multiply(resized, 2, u_U[i - 1]);
        }
    }

    UMat resized_flow;
    resize(u_U[finest_scale], resized_flow, I1Mat.size());
    multiply(resized_flow, 1 << finest_scale, flow);

    return true;
}
#endif

void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{
    CV_INSTRUMENT_REGION();

    CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
    CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
    CV_Assert(I0.sameSize(I1));
    CV_Assert(I0.isContinuous());
    CV_Assert(I1.isContinuous());

    CV_OCL_RUN(flow.isUMat() &&
               (patch_size == 8) && (use_spatial_propagation == true),
               ocl_calc(I0, I1, flow));
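
    /* The OpenCL branch above is only taken when the output is a UMat, patch_size is 8
     * and spatial propagation is enabled; other configurations fall through to the CPU
     * path below. */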

    Mat I0Mat = I0.getMat();
    Mat I1Mat = I1.getMat();
    bool use_input_flow = false;
    if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
        use_input_flow = true;
    else
        flow.create(I1Mat.size(), CV_32FC2);
    Mat flowMat = flow.getMat();
    coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code searches for a maximal movement of width/4 */
                         (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0)));              /* Deepest pyramid level greater than or equal to the patch size */

    if (coarsest_scale < 0)
        CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");

    if (coarsest_scale < finest_scale)
    {
        // choose the finest level based on the coarsest level.
        // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
        int original_img_width = I0.size().width;
        autoSelectPatchSizeAndScales(original_img_width);
    }

    int num_stripes = getNumThreads();

    prepareBuffers(I0Mat, I1Mat, flowMat, use_input_flow);
    Ux[coarsest_scale].setTo(0.0f);
    Uy[coarsest_scale].setTo(0.0f);

    for (int i = coarsest_scale; i >= finest_scale; i--)
    {
        CV_TRACE_REGION("coarsest_scale_iteration");
        w = I0s[i].cols;
        h = I0s[i].rows;
        ws = 1 + (w - patch_size) / patch_stride;
        hs = 1 + (h - patch_size) / patch_stride;

        precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
        if (use_spatial_propagation)
        {
            /* Use a fixed number of stripes regardless of the number of threads to make
             * the inverse search with spatial propagation reproducible
             */
            parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
                                                                  I1s_ext[i], I0xs[i], I0ys[i], 2, i));
        }
        else
        {
            parallel_for_(Range(0, num_stripes),
                          PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
                                                     I0xs[i], I0ys[i], 1, i));
        }

        parallel_for_(Range(0, num_stripes),
                      Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
        if (variational_refinement_iter > 0)
            variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);

        if (i > finest_scale)
        {
            resize(Ux[i], Ux[i - 1], Ux[i - 1].size());
            resize(Uy[i], Uy[i - 1], Uy[i - 1].size());
            Ux[i - 1] *= 2;
            Uy[i - 1] *= 2;
        }
    }
    Mat uxy[] = {Ux[finest_scale], Uy[finest_scale]};
    merge(uxy, 2, U);
    resize(U, flowMat, flowMat.size());
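    /* U was computed on pyramid level finest_scale, where one pixel spans
     * 2^finest_scale input pixels, so the resized field is scaled accordingly. */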
    flowMat *= 1 << finest_scale;
}

void DISOpticalFlowImpl::collectGarbage()
{
    CV_INSTRUMENT_REGION();

    I0s.clear();
    I1s.clear();
    I1s_ext.clear();
    I0xs.clear();
    I0ys.clear();
    Ux.clear();
    Uy.clear();
    U.release();
    Sx.release();
    Sy.release();
    I0xx_buf.release();
    I0yy_buf.release();
    I0xy_buf.release();
    I0xx_buf_aux.release();
    I0yy_buf_aux.release();
    I0xy_buf_aux.release();

#ifdef HAVE_OPENCL
    u_I0s.clear();
    u_I1s.clear();
    u_I1s_ext.clear();
    u_I0xs.clear();
    u_I0ys.clear();
    u_U.clear();
    u_S.release();
    u_I0xx_buf.release();
    u_I0yy_buf.release();
    u_I0xy_buf.release();
    u_I0xx_buf_aux.release();
    u_I0yy_buf_aux.release();
    u_I0xy_buf_aux.release();
#endif

    for (int i = finest_scale; i <= coarsest_scale; i++)
        variational_refinement_processors[i]->collectGarbage();
    variational_refinement_processors.clear();
}

Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
{
    CV_INSTRUMENT_REGION();

    Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
    dis->setPatchSize(8);
    if (preset == DISOpticalFlow::PRESET_ULTRAFAST)
    {
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(12);
        dis->setVariationalRefinementIterations(0);
    }
    else if (preset == DISOpticalFlow::PRESET_FAST)
    {
        dis->setFinestScale(2);
        dis->setPatchStride(4);
        dis->setGradientDescentIterations(16);
        dis->setVariationalRefinementIterations(5);
    }
    else if (preset == DISOpticalFlow::PRESET_MEDIUM)
    {
        dis->setFinestScale(1);
        dis->setPatchStride(3);
        dis->setGradientDescentIterations(25);
        dis->setVariationalRefinementIterations(5);
    }

    return dis;
}
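
/* Typical usage (illustrative sketch; prev_gray and next_gray are assumed to be 8-bit
 * single-channel frames, as required by the asserts in calc() above):
 *
 *     Ptr<DISOpticalFlow> dis = DISOpticalFlow::create(DISOpticalFlow::PRESET_MEDIUM);
 *     Mat flow;
 *     dis->calc(prev_gray, next_gray, flow); // flow becomes CV_32FC2
 *
 * Passing a CV_32FC2 flow of the same size as the input seeds the coarse-to-fine
 * search with an initial estimate (see use_input_flow in calc()).
 */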

} // namespace