/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
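// A minimal usage sketch of the public entry point implemented in this file
// (illustrative only; the horizontal-flip map and the names src/mapx/mapy/dst
// are made up for the example):
//
//     Mat src = imread("input.png");
//     Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
//     for (int y = 0; y < src.rows; y++)
//         for (int x = 0; x < src.cols; x++)
//         {
//             mapx.at<float>(y, x) = (float)(src.cols - 1 - x);  // flip horizontally
//             mapy.at<float>(y, x) = (float)y;
//         }
//     Mat dst;
//     remap(src, dst, mapx, mapy, INTER_LINEAR, BORDER_CONSTANT, Scalar());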

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
#include <opencv2/core/utils/configuration.private.hpp>
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/softfloat.hpp"
#include "imgwarp.hpp"

using namespace cv;

namespace cv
{

#if defined (HAVE_IPP) && (!IPP_DISABLE_WARPAFFINE || !IPP_DISABLE_WARPPERSPECTIVE || !IPP_DISABLE_REMAP)
typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);

template <int channels, typename Type>
bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
{
    CV_INSTRUMENT_REGION_IPP();

    Type values[channels];
    for( int i = 0; i < channels; i++ )
        values[i] = saturate_cast<Type>(value[i]);
    return func(values, dataPointer, step, size) >= 0;
}

static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
{
    CV_INSTRUMENT_REGION_IPP();

    if( channels == 1 )
    {
        switch( depth )
        {
        case CV_8U:
            return CV_INSTRUMENT_FUN_IPP(ippiSet_8u_C1R, saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
        case CV_16U:
            return CV_INSTRUMENT_FUN_IPP(ippiSet_16u_C1R, saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
        case CV_32F:
            return CV_INSTRUMENT_FUN_IPP(ippiSet_32f_C1R, saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
        }
    }
    else
    {
        if( channels == 3 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
            case CV_16U:
                return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
            case CV_32F:
                return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
            }
        }
        else if( channels == 4 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
            case CV_16U:
                return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
            case CV_32F:
                return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
            }
        }
    }
    return false;
}
#endif

/************** interpolation formulas and tables ***************/

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
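// The integer interpolation tables below store each floating-point weight w in
// Q15 fixed point, i.e. as saturate_cast<short>(w*INTER_REMAP_COEF_SCALE) with
// INTER_REMAP_COEF_SCALE == 1 << 15 == 32768, so a weight of 0.25 becomes 8192.
// Accumulated sums are brought back with a rounding right shift,
// (sum + (1 << 14)) >> 15, implemented by FixedPtCast further down.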

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SIMD128
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];

static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}

static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
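
// interpolateLanczos4() below evaluates the 8-tap Lanczos-4 window at offsets
// x+3-i for i = 0..7. The cs[][] table folds in sin/cos values at multiples of
// pi/4 so that only one sin/cos pair is computed per call, and the final loop
// normalizes the eight weights so they sum to 1 (up to float rounding).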
static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0 = std::cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}

static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( cv::Error::StsBadArg, "Unknown interpolation method" );
}

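// initInterTab2D() builds the 2D interpolation tables as outer products of the
// 1D tables above: for every fractional position on the
// INTER_TAB_SIZE x INTER_TAB_SIZE grid it stores a ksize x ksize block of
// weights, both as floats and in Q15 fixed point. Because each fixed-point
// weight is rounded independently, an integer block may not sum to exactly
// INTER_REMAP_COEF_SCALE; the correction step below pushes the leftover
// difference into one of the four central taps so that the sum is exact.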
static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( cv::Error::StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab.data(), INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                                Mk1 = k1, Mk2 = k2;
                        }
                    if( diff < 0 )
                        itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    else
                        itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
                }
            }
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SIMD128
        if( method == INTER_LINEAR )
        {
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                {
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
                }
        }
#endif
        inittab[method] = true;
    }
    return fixpt ? (const void*)itab : (const void*)tab;
}

#ifndef __MINGW32__
static bool initAllInterTab2D()
{
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );
}

static volatile bool doInitAllInterTab2D = initAllInterTab2D();
#endif

template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};

static inline int clip(int x, int a, int b)
{
    return x >= a ? (x < b ? x : b-1) : a;
}

/****************************************************************************************\
*                       General warping (affine, perspective, remap)                    *
\****************************************************************************************/
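
// The remap kernels below receive the coordinate map split into two parts:
// _xy is a CV_16SC2 matrix with the integer (top-left) source coordinates for
// each destination pixel, and _fxy, where used, is a CV_16UC1 matrix with the
// index of the fractional offset into the tables built by initInterTab2D().
// When WARP_RELATIVE_MAP is used (isRelative == true), the map holds
// displacements, so the destination coordinates plus _offset are added back in.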

template<typename T, bool isRelative>
static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                          int borderType, const Scalar& _borderValue, const Point& _offset )
{
    Size ssize = _src.size(), dsize = _dst.size();
    const int cn = _src.channels();
    const T* S0 = _src.ptr<T>();
    T cval[CV_CN_MAX];
    size_t sstep = _src.step/sizeof(S0[0]);

    for(int k = 0; k < cn; k++ )
        cval[k] = saturate_cast<T>(_borderValue[k & 3]);

    unsigned width1 = ssize.width, height1 = ssize.height;

    if( _dst.isContinuous() && _xy.isContinuous() && !isRelative )
    {
        dsize.width *= dsize.height;
        dsize.height = 1;
    }

    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);
        const int off_y = isRelative ? (_offset.y+dy) : 0;
        if( cn == 1 )
        {
            for(int dx = 0; dx < dsize.width; dx++ )
            {
                const int off_x = isRelative ? (_offset.x+dx) : 0;
                int sx = XY[dx*2]+off_x, sy = XY[dx*2+1]+off_y;
                if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                    D[dx] = S0[sy*sstep + sx];
                else
                {
                    if( borderType == BORDER_REPLICATE )
                    {
                        sx = clip(sx, 0, ssize.width);
                        sy = clip(sy, 0, ssize.height);
                        D[dx] = S0[sy*sstep + sx];
                    }
                    else if( borderType == BORDER_CONSTANT )
                        D[dx] = cval[0];
                    else if( borderType != BORDER_TRANSPARENT )
                    {
                        sx = borderInterpolate(sx, ssize.width, borderType);
                        sy = borderInterpolate(sy, ssize.height, borderType);
                        D[dx] = S0[sy*sstep + sx];
                    }
                }
            }
        }
        else
        {
            for(int dx = 0; dx < dsize.width; dx++, D += cn )
            {
                const int off_x = isRelative ? (_offset.x+dx) : 0;
                int sx = XY[dx*2]+off_x, sy = XY[dx*2+1]+off_y;
                const T *S;
                if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                {
                    if( cn == 3 )
                    {
                        S = S0 + sy*sstep + sx*3;
                        D[0] = S[0], D[1] = S[1], D[2] = S[2];
                    }
                    else if( cn == 4 )
                    {
                        S = S0 + sy*sstep + sx*4;
                        D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
                    }
                    else
                    {
                        S = S0 + sy*sstep + sx*cn;
                        for(int k = 0; k < cn; k++ )
                            D[k] = S[k];
                    }
                }
                else if( borderType != BORDER_TRANSPARENT )
                {
                    if( borderType == BORDER_REPLICATE )
                    {
                        sx = clip(sx, 0, ssize.width);
                        sy = clip(sy, 0, ssize.height);
                        S = S0 + sy*sstep + sx*cn;
                    }
                    else if( borderType == BORDER_CONSTANT )
                        S = &cval[0];
                    else
                    {
                        sx = borderInterpolate(sx, ssize.width, borderType);
                        sy = borderInterpolate(sy, ssize.height, borderType);
                        S = S0 + sy*sstep + sx*cn;
                    }
                    for(int k = 0; k < cn; k++ )
                        D[k] = S[k];
                }
            }
        }
    }
}

template<bool>
struct RemapNoVec
{
    int operator()( const Mat&, void*, const short*, const ushort*,
                    const void*, int, cv::Point& ) const { return 0; }
};

#if CV_SIMD128

typedef unsigned short CV_DECL_ALIGNED(1) unaligned_ushort;
typedef int CV_DECL_ALIGNED(1) unaligned_int;

template<bool isRelative>
struct RemapVec_8u
{
    int operator()( const Mat& _src, void* _dst, const short* XY,
                    const ushort* FXY, const void* _wtab, int width, const Point& _offset ) const
    {
        int cn = _src.channels(), x = 0, sstep = (int)_src.step;
        if( (cn != 1 && cn != 3 && cn != 4) || sstep >= 0x8000 )
            return 0;

        const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
        const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
        uchar* D = (uchar*)_dst;
        v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2);
        v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16)));
        int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
        const uchar* src_limit_8bytes = _src.datalimit - VTraits<v_int16x8>::vlanes();
#define CV_PICK_AND_PACK_RGB(ptr, offset, result)                                \
        {                                                                         \
            const uchar* const p = ((const uchar*)ptr) + (offset);                \
            if (p <= src_limit_8bytes)                                            \
            {                                                                     \
                v_uint8x16 rrggbb, dummy;                                         \
                v_uint16x8 rrggbb8, dummy8;                                       \
                v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0));     \
                v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + 3), 0, 0, 0)); \
                v_zip(rgb0, rgb1, rrggbb, dummy);                                 \
                v_expand(rrggbb, rrggbb8, dummy8);                                \
                result = v_reinterpret_as_s16(rrggbb8);                           \
            }                                                                     \
            else                                                                  \
            {                                                                     \
                result = v_int16x8((short)p[0], (short)p[3], /* r0r1 */           \
                                   (short)p[1], (short)p[4], /* g0g1 */           \
                                   (short)p[2], (short)p[5], /* b0b1 */ 0, 0);    \
            }                                                                     \
        }
#define CV_PICK_AND_PACK_RGBA(ptr, offset, result)                                \
        {                                                                         \
            const uchar* const p = ((const uchar*)ptr) + (offset);                \
            CV_DbgAssert(p <= src_limit_8bytes);                                  \
            v_uint8x16 rrggbbaa, dummy;                                           \
            v_uint16x8 rrggbbaa8, dummy8;                                         \
            v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0));        \
            v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + VTraits<v_int32x4>::vlanes()), 0, 0, 0)); \
            v_zip(rgba0, rgba1, rrggbbaa, dummy);                                 \
            v_expand(rrggbbaa, rrggbbaa8, dummy8);                                \
            result = v_reinterpret_as_s16(rrggbbaa8);                             \
        }
#define CV_PICK_AND_PACK4(base,offset)                                                             \
        v_uint16x8(*(unaligned_ushort*)(base + offset[0]), *(unaligned_ushort*)(base + offset[1]), \
                   *(unaligned_ushort*)(base + offset[2]), *(unaligned_ushort*)(base + offset[3]), \
                   0, 0, 0, 0)

        const short _rel_offset_x = static_cast<short>(_offset.x);
        const short _rel_offset_y = static_cast<short>(_offset.y);
        v_int16x8 v_dxy0(_rel_offset_x, _rel_offset_y, _rel_offset_x, _rel_offset_y, _rel_offset_x, _rel_offset_y, _rel_offset_x, _rel_offset_y);
        v_int16x8 v_dxy1 = v_dxy0;
        v_dxy0 = v_add(v_dxy0, v_int16x8(0, 0, 1, 0, 2, 0, 3, 0));
        v_dxy1 = v_add(v_dxy1, v_int16x8(4, 0, 5, 0, 6, 0, 7, 0));
        if( cn == 1 )
        {
            for( ; x <= width - 8; x += 8 )
            {
                v_int16x8 _xy0 = v_load(XY + x*2);
                v_int16x8 _xy1 = v_load(XY + x*2 + 8);
                if (isRelative)
                {
                    const short x_s16 = static_cast<short>(x);
                    v_int16x8 v_dxy01(x_s16, 0, x_s16, 0, x_s16, 0, x_s16, 0);
                    _xy0 = v_add(_xy0, v_add(v_dxy01, v_dxy0));
                    _xy1 = v_add(_xy1, v_add(v_dxy01, v_dxy1));
                }
                v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2;

                v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
                v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs );
                v_store( iofs0, xy0 );
                v_store( iofs1, xy1 );

                v_uint16x8 stub, dummy;
                v_uint16x8 vec16;
                vec16 = CV_PICK_AND_PACK4(S0, iofs0);
                v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
                v0 = v_reinterpret_as_s32(stub);
                vec16 = CV_PICK_AND_PACK4(S1, iofs0);
                v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
                v1 = v_reinterpret_as_s32(stub);

                v_zip(v_load_low((int*)(wtab + FXY[x] * 4)), v_load_low((int*)(wtab + FXY[x + 1] * 4)), a0, a1);
                v_zip(v_load_low((int*)(wtab + FXY[x + 2] * 4)), v_load_low((int*)(wtab + FXY[x + 3] * 4)), b0, b1);
                v_recombine(a0, b0, a2, b2);
                v1 = v_dotprod(v_reinterpret_as_s16(v1), v_reinterpret_as_s16(b2), delta);
                v0 = v_dotprod(v_reinterpret_as_s16(v0), v_reinterpret_as_s16(a2), v1);

                vec16 = CV_PICK_AND_PACK4(S0, iofs1);
                v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
                v2 = v_reinterpret_as_s32(stub);
                vec16 = CV_PICK_AND_PACK4(S1, iofs1);
                v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
                v3 = v_reinterpret_as_s32(stub);

                v_zip(v_load_low((int*)(wtab + FXY[x + 4] * 4)), v_load_low((int*)(wtab + FXY[x + 5] * 4)), c0, c1);
                v_zip(v_load_low((int*)(wtab + FXY[x + 6] * 4)), v_load_low((int*)(wtab + FXY[x + 7] * 4)), d0, d1);
                v_recombine(c0, d0, c2, d2);
                v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta);
                v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);

                v0 = v_shr<INTER_REMAP_COEF_BITS>(v0);
                v2 = v_shr<INTER_REMAP_COEF_BITS>(v2);
                v_pack_u_store(D + x, v_pack(v0, v2));
            }
        }
        else if( cn == 3 )
        {
            for( ; x <= width - 5; x += 4, D += 12 )
            {
                v_int16x8 u0, v0, u1, v1;
                v_int16x8 _xy0 = v_load(XY + x * 2);
                if (isRelative)
                {
                    const short x_s16 = static_cast<short>(x);
                    v_int16x8 v_dxy01(x_s16, 0, x_s16, 0, x_s16, 0, x_s16, 0);
                    _xy0 = v_add(_xy0, v_add(v_dxy01, v_dxy0));
                }

                v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs);
                v_store(iofs0, xy0);

                int offset0 = FXY[x] * 16;
                int offset1 = FXY[x + 1] * 16;
                int offset2 = FXY[x + 2] * 16;
                int offset3 = FXY[x + 3] * 16;
                v_int16x8 w00 = v_load(wtab + offset0);
                v_int16x8 w01 = v_load(wtab + offset0 + 8);
                v_int16x8 w10 = v_load(wtab + offset1);
                v_int16x8 w11 = v_load(wtab + offset1 + 8);

                CV_PICK_AND_PACK_RGB(S0, iofs0[0], u0);
                CV_PICK_AND_PACK_RGB(S1, iofs0[0], v0);
                CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1);
                CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1);

                v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
                v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));

                result0 = v_rotate_left<1>(result0);
                v_int16x8 result8 = v_pack(result0, result1);
                v_uint8x16 result16 = v_pack_u(result8, result8);
                v_store_low(D, v_rotate_right<1>(result16));

                w00 = v_load(wtab + offset2);
                w01 = v_load(wtab + offset2 + 8);
                w10 = v_load(wtab + offset3);
                w11 = v_load(wtab + offset3 + 8);
                CV_PICK_AND_PACK_RGB(S0, iofs0[2], u0);
                CV_PICK_AND_PACK_RGB(S1, iofs0[2], v0);
                CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1);
                CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1);

                result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
                result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));

                result0 = v_rotate_left<1>(result0);
                result8 = v_pack(result0, result1);
                result16 = v_pack_u(result8, result8);
                v_store_low(D + 6, v_rotate_right<1>(result16));
            }
        }
        else if( cn == 4 )
        {
            for( ; x <= width - 4; x += 4, D += 16 )
            {
                v_int16x8 _xy0 = v_load(XY + x * 2);
                if (isRelative)
                {
                    const short x_s16 = static_cast<short>(x);
                    v_int16x8 v_dxy01(x_s16, 0, x_s16, 0, x_s16, 0, x_s16, 0);
                    _xy0 = v_add(_xy0, v_add(v_dxy01, v_dxy0));
                }
                v_int16x8 u0, v0, u1, v1;

                v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
                v_store(iofs0, xy0);

                int offset0 = FXY[x] * 16;
                int offset1 = FXY[x + 1] * 16;
                int offset2 = FXY[x + 2] * 16;
                int offset3 = FXY[x + 3] * 16;

                v_int16x8 w00 = v_load(wtab + offset0);
                v_int16x8 w01 = v_load(wtab + offset0 + 8);
                v_int16x8 w10 = v_load(wtab + offset1);
                v_int16x8 w11 = v_load(wtab + offset1 + 8);
                CV_PICK_AND_PACK_RGBA(S0, iofs0[0], u0);
                CV_PICK_AND_PACK_RGBA(S1, iofs0[0], v0);
                CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1);
                CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1);

                v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
                v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
                v_int16x8 result8 = v_pack(result0, result1);
                v_pack_u_store(D, result8);

                w00 = v_load(wtab + offset2);
                w01 = v_load(wtab + offset2 + 8);
                w10 = v_load(wtab + offset3);
                w11 = v_load(wtab + offset3 + 8);
                CV_PICK_AND_PACK_RGBA(S0, iofs0[2], u0);
                CV_PICK_AND_PACK_RGBA(S1, iofs0[2], v0);
                CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1);
                CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1);

                result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
                result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
                result8 = v_pack(result0, result1);
                v_pack_u_store(D + 8, result8);
            }
        }

        return x;
    }
};

#else

template<bool isRelative> using RemapVec_8u = RemapNoVec<isRelative>;

#endif

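// remapBilinear() walks each destination row and splits it into runs of
// "inlier" pixels, whose 2x2 source neighborhood lies entirely inside the
// image, and border pixels. Inlier runs take the fast path (optionally
// vectorized by VecOp); border runs fall back to per-pixel handling of the
// requested borderType.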
template<class CastOp, class VecOp, typename AT, bool isRelative>
static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                           const Mat& _fxy, const void* _wtab,
                           int borderType, const Scalar& _borderValue, const Point& _offset )
{
    typedef typename CastOp::rtype T;
    typedef typename CastOp::type1 WT;
    Size ssize = _src.size(), dsize = _dst.size();
    const int cn = _src.channels();
    const AT* wtab = (const AT*)_wtab;
    const T* S0 = _src.ptr<T>();
    size_t sstep = _src.step/sizeof(S0[0]);
    T cval[CV_CN_MAX];
    CastOp castOp;
    VecOp vecOp;

    for(int k = 0; k < cn; k++ )
        cval[k] = saturate_cast<T>(_borderValue[k & 3]);

    unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
    CV_Assert( !ssize.empty() );
#if CV_SIMD128
    if( _src.type() == CV_8UC3 )
        width1 = std::max(ssize.width-2, 0);
#endif

    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);
        const ushort* FXY = _fxy.ptr<ushort>(dy);
        int X0 = 0;
        bool prevInlier = false;
        const int off_y = (isRelative ? (_offset.y+dy) : 0);
        for(int dx = 0; dx <= dsize.width; dx++ )
        {
            bool curInlier = dx < dsize.width ?
                (unsigned)XY[dx*2]+(isRelative ? (_offset.x+dx) : 0) < width1 &&
                (unsigned)XY[dx*2+1]+off_y < height1 : !prevInlier;
            if( curInlier == prevInlier )
                continue;

            int X1 = dx;
            dx = X0;
            X0 = X1;
            prevInlier = curInlier;

            if( !curInlier )
            {
                Point subOffset(_offset.x+dx, _offset.y+dy);
                int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx, subOffset );
                D += len*cn;
                dx += len;

                if( cn == 1 )
                {
                    for( ; dx < X1; dx++, D++ )
                    {
                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                        const AT* w = wtab + FXY[dx]*4;
                        const T* S = S0 + sy*sstep + sx;
                        *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
                    }
                }
                else if( cn == 2 )
                    for( ; dx < X1; dx++, D += 2 )
                    {
                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                        const AT* w = wtab + FXY[dx]*4;
                        const T* S = S0 + sy*sstep + sx*2;
                        WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
                        WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
                        D[0] = castOp(t0); D[1] = castOp(t1);
                    }
                else if( cn == 3 )
                    for( ; dx < X1; dx++, D += 3 )
                    {
                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                        const AT* w = wtab + FXY[dx]*4;
                        const T* S = S0 + sy*sstep + sx*3;
                        WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
                        WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
                        WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
                        D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
                    }
                else if( cn == 4 )
                    for( ; dx < X1; dx++, D += 4 )
                    {
                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                        const AT* w = wtab + FXY[dx]*4;
                        const T* S = S0 + sy*sstep + sx*4;
                        WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
                        WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
                        D[0] = castOp(t0); D[1] = castOp(t1);
                        t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
                        t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
                        D[2] = castOp(t0); D[3] = castOp(t1);
                    }
                else
                    for( ; dx < X1; dx++, D += cn )
                    {
                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                        const AT* w = wtab + FXY[dx]*4;
                        const T* S = S0 + sy*sstep + sx*cn;
                        for(int k = 0; k < cn; k++ )
                        {
                            WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
                            D[k] = castOp(t0);
                        }
                    }
            }
            else
            {
                if (borderType == BORDER_TRANSPARENT) {
                    for (; dx < X1; dx++, D += cn) {
                        if (dx >= dsize.width) continue;
                        const int sx = XY[dx * 2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx * 2 + 1]+off_y;
                        // If the mapped point is still within bounds, it did not get computed
                        // because it lacked 4 neighbors. Still, it can be computed with an
                        // approximate formula. If it is outside, the point is left untouched.
                        if (sx >= 0 && sx <= ssize.width - 1 && sy >= 0 && sy <= ssize.height - 1) {
                            const AT* w = wtab + FXY[dx] * 4;
                            WT w_tot = 0;
                            if (sx >= 0 && sy >= 0) w_tot += w[0];
                            if (sy >= 0 && sx < ssize.width - 1) w_tot += w[1];
                            if (sx >= 0 && sy < ssize.height - 1) w_tot += w[2];
                            if (sx < ssize.width - 1 && sy < ssize.height - 1) w_tot += w[3];
                            if (w_tot == 0.f) continue;
                            const WT w_tot_ini = (WT)w[0] + w[1] + w[2] + w[3];
                            const T* S = S0 + sy * sstep + sx * cn;
                            for (int k = 0; k < cn; k++) {
                                WT t0 = 0;
                                if (sx >= 0 && sy >= 0) t0 += S[k] * w[0];
                                if (sy >= 0 && sx < ssize.width - 1) t0 += S[k + cn] * w[1];
                                if (sx >= 0 && sy < ssize.height - 1) t0 += S[sstep + k] * w[2];
                                if (sx < ssize.width - 1 && sy < ssize.height - 1) t0 += S[sstep + k + cn] * w[3];
                                t0 = (WT)(t0 * (float)w_tot_ini / w_tot);
                                D[k] = castOp(t0);
                            }
                        }
                    }
                    continue;
                }

                if( cn == 1 )
                    for( ; dx < X1; dx++, D++ )
                    {
                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                        if( borderType == BORDER_CONSTANT &&
                            (sx >= ssize.width || sx+1 < 0 ||
                             sy >= ssize.height || sy+1 < 0) )
                        {
                            D[0] = cval[0];
                        }
                        else
                        {
                            int sx0, sx1, sy0, sy1;
                            T v0, v1, v2, v3;
                            const AT* w = wtab + FXY[dx]*4;
                            if( borderType == BORDER_REPLICATE )
                            {
                                sx0 = clip(sx, 0, ssize.width);
                                sx1 = clip(sx+1, 0, ssize.width);
                                sy0 = clip(sy, 0, ssize.height);
                                sy1 = clip(sy+1, 0, ssize.height);
                                v0 = S0[sy0*sstep + sx0];
                                v1 = S0[sy0*sstep + sx1];
                                v2 = S0[sy1*sstep + sx0];
                                v3 = S0[sy1*sstep + sx1];
                            }
                            else
                            {
                                sx0 = borderInterpolate(sx, ssize.width, borderType);
                                sx1 = borderInterpolate(sx+1, ssize.width, borderType);
                                sy0 = borderInterpolate(sy, ssize.height, borderType);
                                sy1 = borderInterpolate(sy+1, ssize.height, borderType);
                                v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
                                v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
                                v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
                                v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
                            }
                            D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
                        }
                    }
                else
                    for( ; dx < X1; dx++, D += cn )
                    {
                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                        if( borderType == BORDER_CONSTANT &&
                            (sx >= ssize.width || sx+1 < 0 ||
                             sy >= ssize.height || sy+1 < 0) )
                        {
                            for(int k = 0; k < cn; k++ )
                                D[k] = cval[k];
                        }
                        else
                        {
                            int sx0, sx1, sy0, sy1;
                            const T *v0, *v1, *v2, *v3;
                            const AT* w = wtab + FXY[dx]*4;
                            if( borderType == BORDER_REPLICATE )
                            {
                                sx0 = clip(sx, 0, ssize.width);
                                sx1 = clip(sx+1, 0, ssize.width);
                                sy0 = clip(sy, 0, ssize.height);
                                sy1 = clip(sy+1, 0, ssize.height);
                                v0 = S0 + sy0*sstep + sx0*cn;
                                v1 = S0 + sy0*sstep + sx1*cn;
                                v2 = S0 + sy1*sstep + sx0*cn;
                                v3 = S0 + sy1*sstep + sx1*cn;
                            }
                            else
                            {
                                sx0 = borderInterpolate(sx, ssize.width, borderType);
                                sx1 = borderInterpolate(sx+1, ssize.width, borderType);
                                sy0 = borderInterpolate(sy, ssize.height, borderType);
                                sy1 = borderInterpolate(sy+1, ssize.height, borderType);
                                v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
                                v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
                                v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
                                v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
                            }
                            for(int k = 0; k < cn; k++ )
                                D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
                        }
                    }
            }
        }
    }
}

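// remapBicubic() samples a 4x4 neighborhood around (sx, sy). In the border
// branch the accumulator is seeded with cval*ONE, and every available sample
// contributes (S - cval)*w, so taps rejected by borderInterpolate() under a
// constant border implicitly contribute the border value without extra
// bookkeeping. remapLanczos4() below uses the same scheme with an 8x8
// neighborhood.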
template<class CastOp, typename AT, int ONE, bool isRelative>
static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
                          const Mat& _fxy, const void* _wtab,
                          int borderType, const Scalar& _borderValue, const Point& _offset )
{
    typedef typename CastOp::rtype T;
    typedef typename CastOp::type1 WT;
    Size ssize = _src.size(), dsize = _dst.size();
    const int cn = _src.channels();
    const AT* wtab = (const AT*)_wtab;
    const T* S0 = _src.ptr<T>();
    size_t sstep = _src.step/sizeof(S0[0]);
    T cval[CV_CN_MAX];
    CastOp castOp;

    for(int k = 0; k < cn; k++ )
        cval[k] = saturate_cast<T>(_borderValue[k & 3]);

    int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;

    unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);

    if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() && !isRelative )
    {
        dsize.width *= dsize.height;
        dsize.height = 1;
    }

    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);
        const ushort* FXY = _fxy.ptr<ushort>(dy);
        const int off_y = isRelative ? (_offset.y+dy) : 0;
        for(int dx = 0; dx < dsize.width; dx++, D += cn )
        {
            const int off_x = isRelative ? (_offset.x+dx) : 0;
            int sx = XY[dx*2]-1+off_x, sy = XY[dx*2+1]-1+off_y;
            const AT* w = wtab + FXY[dx]*16;
            if( (unsigned)sx < width1 && (unsigned)sy < height1 )
            {
                const T* S = S0 + sy*sstep + sx*cn;
                for(int k = 0; k < cn; k++ )
                {
                    WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
                    S += sstep;
                    sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
                    S += sstep;
                    sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
                    S += sstep;
                    sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
                    S -= sstep * 3 - 1;
                    D[k] = castOp(sum);
                }
            }
            else
            {
                int x[4], y[4];
                if( borderType == BORDER_TRANSPARENT &&
                    ((unsigned)(sx+1) >= (unsigned)ssize.width ||
                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
                    continue;

                if( borderType1 == BORDER_CONSTANT &&
                    (sx >= ssize.width || sx+4 <= 0 ||
                     sy >= ssize.height || sy+4 <= 0))
                {
                    for(int k = 0; k < cn; k++ )
                        D[k] = cval[k];
                    continue;
                }

                for(int i = 0; i < 4; i++ )
                {
                    x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
                    y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
                }

                for(int k = 0; k < cn; k++, S0++, w -= 16 )
                {
                    WT cv = cval[k], sum = cv*ONE;
                    for(int i = 0; i < 4; i++, w += 4 )
                    {
                        int yi = y[i];
                        if( yi < 0 )
                            continue;
                        const T* S = S0 + yi*sstep;
                        if( x[0] >= 0 )
                            sum += (S[x[0]] - cv)*w[0];
                        if( x[1] >= 0 )
                            sum += (S[x[1]] - cv)*w[1];
                        if( x[2] >= 0 )
                            sum += (S[x[2]] - cv)*w[2];
                        if( x[3] >= 0 )
                            sum += (S[x[3]] - cv)*w[3];
                    }
                    D[k] = castOp(sum);
                }
                S0 -= cn;
            }
        }
    }
}


template<class CastOp, typename AT, int ONE, bool isRelative>
static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
                           const Mat& _fxy, const void* _wtab,
                           int borderType, const Scalar& _borderValue, const Point& _offset )
{
    typedef typename CastOp::rtype T;
    typedef typename CastOp::type1 WT;
    Size ssize = _src.size(), dsize = _dst.size();
    const int cn = _src.channels();
    const AT* wtab = (const AT*)_wtab;
    const T* S0 = _src.ptr<T>();
    size_t sstep = _src.step/sizeof(S0[0]);
    T cval[CV_CN_MAX];
    CastOp castOp;

    for(int k = 0; k < cn; k++ )
        cval[k] = saturate_cast<T>(_borderValue[k & 3]);

    int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;

    unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);

    if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() && !isRelative )
    {
        dsize.width *= dsize.height;
        dsize.height = 1;
    }

    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);
        const ushort* FXY = _fxy.ptr<ushort>(dy);
        const int off_y = isRelative ? (_offset.y+dy) : 0;
        for(int dx = 0; dx < dsize.width; dx++, D += cn )
        {
            const int off_x = isRelative ? (_offset.x+dx) : 0;
            int sx = XY[dx*2]-3+off_x, sy = XY[dx*2+1]-3+off_y;
            const AT* w = wtab + FXY[dx]*64;
            if( (unsigned)sx < width1 && (unsigned)sy < height1 )
            {
                const T* S = S0 + sy*sstep + sx*cn;
                for(int k = 0; k < cn; k++ )
                {
                    WT sum = 0;
                    for( int r = 0; r < 8; r++, S += sstep, w += 8 )
                        sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
                               S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
                    w -= 64;
                    S -= sstep*8 - 1;
                    D[k] = castOp(sum);
                }
            }
            else
            {
                int x[8], y[8];
                if( borderType == BORDER_TRANSPARENT &&
                    ((unsigned)(sx+3) >= (unsigned)ssize.width ||
                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
                    continue;

                if( borderType1 == BORDER_CONSTANT &&
                    (sx >= ssize.width || sx+8 <= 0 ||
                     sy >= ssize.height || sy+8 <= 0))
                {
                    for(int k = 0; k < cn; k++ )
                        D[k] = cval[k];
                    continue;
                }

                for(int i = 0; i < 8; i++ )
                {
                    x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
                    y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
                }

                for(int k = 0; k < cn; k++, S0++, w -= 64 )
                {
                    WT cv = cval[k], sum = cv*ONE;
                    for(int i = 0; i < 8; i++, w += 8 )
                    {
                        int yi = y[i];
                        if( yi < 0 )
                            continue;
                        const T* S1 = S0 + yi*sstep;
                        if( x[0] >= 0 )
                            sum += (S1[x[0]] - cv)*w[0];
                        if( x[1] >= 0 )
                            sum += (S1[x[1]] - cv)*w[1];
                        if( x[2] >= 0 )
                            sum += (S1[x[2]] - cv)*w[2];
                        if( x[3] >= 0 )
                            sum += (S1[x[3]] - cv)*w[3];
                        if( x[4] >= 0 )
                            sum += (S1[x[4]] - cv)*w[4];
                        if( x[5] >= 0 )
                            sum += (S1[x[5]] - cv)*w[5];
                        if( x[6] >= 0 )
                            sum += (S1[x[6]] - cv)*w[6];
                        if( x[7] >= 0 )
                            sum += (S1[x[7]] - cv)*w[7];
                    }
                    D[k] = castOp(sum);
                }
                S0 -= cn;
            }
        }
    }
}


typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
                            int borderType, const Scalar& _borderValue, const Point& _offset);

typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
                          const Mat& _fxy, const void* _wtab,
                          int borderType, const Scalar& _borderValue, const Point& _offset);

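// RemapInvoker processes the destination image in tiles of at most 1 << 14
// pixels (brows0 x bcols0, with brows0 capped at 128 rows) so that the
// temporary coordinate buffer bufxy and the fractional-index buffer bufa stay
// small. Floating-point maps are converted on the fly to the packed
// CV_16SC2 + CV_16UC1 representation expected by the per-tile kernels.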
class RemapInvoker :
    public ParallelLoopBody
{
public:
    RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
                 const Mat *_m2, int _borderType, const Scalar &_borderValue,
                 int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
        ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
        borderType(_borderType), borderValue(_borderValue),
        planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
    {
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        int x, y, x1, y1;
        const int buf_size = 1 << 14;
        int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
        int bcols0 = std::min(buf_size/brows0, dst->cols);
        brows0 = std::min(buf_size/bcols0, dst->rows);

        Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
        if( !nnfunc )
            _bufa.create(brows0, bcols0, CV_16UC1);

        for( y = range.start; y < range.end; y += brows0 )
        {
            for( x = 0; x < dst->cols; x += bcols0 )
            {
                int brows = std::min(brows0, range.end - y);
                int bcols = std::min(bcols0, dst->cols - x);
                Mat dpart(*dst, Rect(x, y, bcols, brows));
                Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));

                if( nnfunc )
                {
                    if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
                        bufxy = (*m1)(Rect(x, y, bcols, brows));
                    else if( map_depth != CV_32F )
                    {
                        for( y1 = 0; y1 < brows; y1++ )
                        {
                            short* XY = bufxy.ptr<short>(y1);
                            const short* sXY = m1->ptr<short>(y+y1) + x*2;
                            const ushort* sA = m2->ptr<ushort>(y+y1) + x;

                            for( x1 = 0; x1 < bcols; x1++ )
                            {
                                int a = sA[x1] & (INTER_TAB_SIZE2-1);
                                XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
                                XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
                            }
                        }
                    }
                    else if( !planar_input )
                        (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
                    else
                    {
                        for( y1 = 0; y1 < brows; y1++ )
                        {
                            short* XY = bufxy.ptr<short>(y1);
                            const float* sX = m1->ptr<float>(y+y1) + x;
                            const float* sY = m2->ptr<float>(y+y1) + x;
                            x1 = 0;

                            #if CV_SIMD128
                            {
                                int span = VTraits<v_float32x4>::vlanes();
                                for( ; x1 <= bcols - span * 2; x1 += span * 2 )
                                {
                                    v_int32x4 ix0 = v_round(v_load(sX + x1));
                                    v_int32x4 iy0 = v_round(v_load(sY + x1));
                                    v_int32x4 ix1 = v_round(v_load(sX + x1 + span));
                                    v_int32x4 iy1 = v_round(v_load(sY + x1 + span));

                                    v_int16x8 dx, dy;
                                    dx = v_pack(ix0, ix1);
                                    dy = v_pack(iy0, iy1);
                                    v_store_interleave(XY + x1 * 2, dx, dy);
                                }
                            }
                            #endif
                            for( ; x1 < bcols; x1++ )
                            {
                                XY[x1*2] = saturate_cast<short>(sX[x1]);
                                XY[x1*2+1] = saturate_cast<short>(sY[x1]);
                            }
                        }
                    }
                    nnfunc( *src, dpart, bufxy, borderType, borderValue, Point(x, y) );
                    continue;
                }

                Mat bufa(_bufa, Rect(0, 0, bcols, brows));
                for( y1 = 0; y1 < brows; y1++ )
                {
                    short* XY = bufxy.ptr<short>(y1);
                    ushort* A = bufa.ptr<ushort>(y1);

                    if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
                    {
                        bufxy = (*m1)(Rect(x, y, bcols, brows));

                        const ushort* sA = m2->ptr<ushort>(y+y1) + x;
                        x1 = 0;

                        #if CV_SIMD128
                        {
                            v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1);
                            int span = VTraits<v_uint16x8>::vlanes();
                            for( ; x1 <= bcols - span; x1 += span )
                                v_store((unsigned short*)(A + x1), v_and(v_load(sA + x1), v_scale));
                        }
                        #endif
                        for( ; x1 < bcols; x1++ )
                            A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
                    }
                    else if( planar_input )
                    {
                        const float* sX = m1->ptr<float>(y+y1) + x;
                        const float* sY = m2->ptr<float>(y+y1) + x;

                        x1 = 0;
                        #if CV_SIMD128
                        {
                            v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
                            v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1);
                            int span = VTraits<v_float32x4>::vlanes();
                            for( ; x1 <= bcols - span * 2; x1 += span * 2 )
                            {
                                v_int32x4 v_sx0 = v_round(v_mul(v_scale, v_load(sX + x1)));
                                v_int32x4 v_sy0 = v_round(v_mul(v_scale, v_load(sY + x1)));
                                v_int32x4 v_sx1 = v_round(v_mul(v_scale, v_load(sX + x1 + span)));
                                v_int32x4 v_sy1 = v_round(v_mul(v_scale, v_load(sY + x1 + span)));
                                v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_and(v_sx0, v_scale2), v_and(v_sx1, v_scale2)));
                                v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_and(v_sy0, v_scale2), v_and(v_sy1, v_scale2)));
                                v_uint16x8 v_v = v_or(v_shl<INTER_BITS>(v_sy8), v_sx8);
                                v_store(A + x1, v_v);

                                v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
                                v_int16x8 v_d1 = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
                                v_store_interleave(XY + (x1 << 1), v_d0, v_d1);
                            }
                        }
                        #endif
                        for( ; x1 < bcols; x1++ )
                        {
                            int sx = cvRound(sX[x1]*static_cast<float>(INTER_TAB_SIZE));
                            int sy = cvRound(sY[x1]*static_cast<float>(INTER_TAB_SIZE));
                            int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
                            XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
                            XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
                            A[x1] = (ushort)v;
                        }
                    }
                    else
                    {
                        const float* sXY = m1->ptr<float>(y+y1) + x*2;
                        x1 = 0;

                        #if CV_SIMD128
                        {
                            v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
                            v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE);
                            int span = VTraits<v_float32x4>::vlanes();
                            for( ; x1 <= bcols - span * 2; x1 += span * 2 )
                            {
                                v_float32x4 v_fx, v_fy;
                                v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy);
                                v_int32x4 v_sx0 = v_round(v_mul(v_fx, v_scale));
                                v_int32x4 v_sy0 = v_round(v_mul(v_fy, v_scale));
                                v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy);
                                v_int32x4 v_sx1 = v_round(v_mul(v_fx, v_scale));
                                v_int32x4 v_sy1 = v_round(v_mul(v_fy, v_scale));
                                v_int32x4 v_v0 = v_muladd(v_scale3, (v_and(v_sy0, v_scale2)), (v_and(v_sx0, v_scale2)));
                                v_int32x4 v_v1 = v_muladd(v_scale3, (v_and(v_sy1, v_scale2)), (v_and(v_sx1, v_scale2)));
                                v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1));
                                v_store(A + x1, v_v8);
                                v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
                                v_int16x8 v_dy = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
                                v_store_interleave(XY + (x1 << 1), v_dx, v_dy);
                            }
                        }
                        #endif

                        for( ; x1 < bcols; x1++ )
                        {
                            int sx = cvRound(sXY[x1*2]*static_cast<float>(INTER_TAB_SIZE));
                            int sy = cvRound(sXY[x1*2+1]*static_cast<float>(INTER_TAB_SIZE));
                            int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
                            XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
                            XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
                            A[x1] = (ushort)v;
                        }
                    }
                }
                ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue, Point(x, y));
            }
        }
    }

private:
    const Mat* src;
    Mat* dst;
    const Mat *m1, *m2;
    int borderType;
    Scalar borderValue;
    int planar_input;
    RemapNNFunc nnfunc;
    RemapFunc ifunc;
    const void *ctab;
};
#ifdef HAVE_OPENCL

static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
                      int interpolation, int borderType, const Scalar& borderValue)
{
    const bool hasRelativeFlag = ((interpolation & WARP_RELATIVE_MAP) != 0);
    interpolation &= ~WARP_RELATIVE_MAP;

    const ocl::Device & dev = ocl::Device::getDefault();
    int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
        rowsPerWI = dev.isIntel() ? 4 : 1;

    if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
        || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
        return false;

    UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();

    if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
        (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
    {
        if (map1.type() != CV_16SC2)
            std::swap(map1, map2);
    }
    else
        CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );

    _dst.create(map1.size(), type);
    UMat dst = _dst.getUMat();

    String kernelName = "remap";
    if (map1.type() == CV_32FC2 && map2.empty())
        kernelName += "_32FC2";
    else if (map1.type() == CV_16SC2)
    {
        kernelName += "_16SC2";
        if (!map2.empty())
            kernelName += "_16UC1";
    }
    else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
        kernelName += "_2_32FC1";
    else
        CV_Error(Error::StsBadArg, "Unsupported map types");

    static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
    static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                              "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
    String buildOptions = format("-D %s -D %s -D T=%s -D ROWS_PER_WI=%d -D WARP_RELATIVE=%d",
                                 interMap[interpolation], borderMap[borderType],
                                 ocl::typeToStr(type), rowsPerWI,
                                 hasRelativeFlag ? 1 : 0);

    if (interpolation != INTER_NEAREST)
    {
        char cvt[3][50];
        int wdepth = std::max(CV_32F, depth);
        buildOptions = buildOptions
                       + format(" -D WT=%s -D CONVERT_TO_T=%s -D CONVERT_TO_WT=%s"
                                " -D CONVERT_TO_WT2=%s -D WT2=%s",
                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0], sizeof(cvt[0])),
                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1], sizeof(cvt[1])),
                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2], sizeof(cvt[2])),
                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
    }
    int scalarcn = cn == 3 ? 4 : cn;
    int sctype = CV_MAKETYPE(depth, scalarcn);
    buildOptions += format(" -D T=%s -D T1=%s -D CN=%d -D ST=%s -D SRC_DEPTH=%d",
                           ocl::typeToStr(type), ocl::typeToStr(depth),
                           cn, ocl::typeToStr(sctype), depth);

    ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);

    Mat scalar(1, 1, sctype, borderValue);
    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
            map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
            scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());

    if (map2.empty())
        k.args(srcarg, dstarg, map1arg, scalararg);
    else
        k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);

    size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalThreads, NULL, false);
}

#if 0
/**
@deprecated with old version of cv::linearPolar
*/
static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
    Point2f center, double maxRadius, int flags)
{
    UMat src_with_border; // don't scope this variable (it holds image data)

    UMat mapx, mapy, r, cp_sp;
    UMat src = _src.getUMat();
    _dst.create(src.size(), src.type());
    Size dsize = src.size();
    r.create(Size(1, dsize.width), CV_32F);
    cp_sp.create(Size(1, dsize.height), CV_32FC2);

    mapx.create(dsize, CV_32F);
    mapy.create(dsize, CV_32F);
    size_t w = dsize.width;
    size_t h = dsize.height;
    String buildOptions;
    unsigned mem_size = 32;
    if (flags & cv::WARP_INVERSE_MAP)
    {
        buildOptions = "-D InverseMap";
    }
    else
    {
        buildOptions = format("-D ForwardMap -D MEM_SIZE=%d", mem_size);
    }
    String retval;
    ocl::Program p(ocl::imgproc::linearPolar_oclsrc, buildOptions, retval);
    ocl::Kernel k("linearPolar", p);
    ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
    ocl::KernelArg ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
    ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);

    if (!(flags & cv::WARP_INVERSE_MAP))
    {
        ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
        float PI2_height = (float) CV_2PI / dsize.height;
        float maxRadius_width = (float) maxRadius / dsize.width;
        computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, maxRadius_width, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
        size_t max_dim = max(h, w);
        computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
        k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
    }
    else
    {
        const int ANGLE_BORDER = 1;

        cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
        src = src_with_border;
        Size ssize = src_with_border.size();
        ssize.height -= 2 * ANGLE_BORDER;
        float ascale = ssize.height / ((float)CV_2PI);
        float pscale = ssize.width / ((float) maxRadius);

        k.args(ocl_mapx, ocl_mapy, ascale, pscale, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
    }
    size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
    size_t localThreads[2] = { mem_size , mem_size };
    k.run(2, globalThreads, localThreads, false);
    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
    return true;
}
static bool ocl_logPolar(InputArray _src, OutputArray _dst,
    Point2f center, double M, int flags)
{
    if (M <= 0)
        CV_Error(cv::Error::StsOutOfRange, "M should be >0");
    UMat src_with_border; // don't scope this variable (it holds image data)

    UMat mapx, mapy, r, cp_sp;
    UMat src = _src.getUMat();
    _dst.create(src.size(), src.type());
    Size dsize = src.size();
    r.create(Size(1, dsize.width), CV_32F);
    cp_sp.create(Size(1, dsize.height), CV_32FC2);

    mapx.create(dsize, CV_32F);
    mapy.create(dsize, CV_32F);
    size_t w = dsize.width;
    size_t h = dsize.height;
    String buildOptions;
    unsigned mem_size = 32;
    if (flags & cv::WARP_INVERSE_MAP)
    {
        buildOptions = "-D InverseMap";
    }
    else
    {
        buildOptions = format("-D ForwardMap -D MEM_SIZE=%d", mem_size);
    }
    String retval;
    ocl::Program p(ocl::imgproc::logPolar_oclsrc, buildOptions, retval);
    //ocl::Program p(ocl::imgproc::my_linearPolar_oclsrc, buildOptions, retval);
    //printf("%s\n", retval);
    ocl::Kernel k("logPolar", p);
    ocl::KernelArg ocl_mapx = ocl::KernelArg::PtrReadWrite(mapx), ocl_mapy = ocl::KernelArg::PtrReadWrite(mapy);
    ocl::KernelArg ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
    ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);

    if (!(flags & cv::WARP_INVERSE_MAP))
    {
        ocl::Kernel computeAngleRadius_Kernel("computeAngleRadius", p);
        float PI2_height = (float) CV_2PI / dsize.height;

        computeAngleRadius_Kernel.args(ocl_cp_sp, ocl_r, (float)M, PI2_height, (unsigned)dsize.width, (unsigned)dsize.height);
        size_t max_dim = max(h, w);
        computeAngleRadius_Kernel.run(1, &max_dim, NULL, false);
        k.args(ocl_mapx, ocl_mapy, ocl_cp_sp, ocl_r, center.x, center.y, (unsigned)dsize.width, (unsigned)dsize.height);
    }
    else
    {
        const int ANGLE_BORDER = 1;

        cv::copyMakeBorder(src, src_with_border, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
        src = src_with_border;
        Size ssize = src_with_border.size();
        ssize.height -= 2 * ANGLE_BORDER;
        float ascale = ssize.height / ((float)CV_2PI);

        k.args(ocl_mapx, ocl_mapy, ascale, (float)M, center.x, center.y, ANGLE_BORDER, (unsigned)dsize.width, (unsigned)dsize.height);
    }
    size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
    size_t localThreads[2] = { mem_size , mem_size };
    k.run(2, globalThreads, localThreads, false);
    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
    return true;
}
#endif

#endif

#if defined HAVE_IPP && !IPP_DISABLE_REMAP

typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
                                           const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
                                           void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);

class IPPRemapInvoker :
    public ParallelLoopBody
{
public:
    IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
                    int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
        ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
        ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
    {
        *ok = true;
    }

    virtual void operator() (const Range & range) const CV_OVERRIDE
    {
        IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
        Mat dstRoi = dst.rowRange(range);
        IppiSize dstRoiSize = ippiSize(dstRoi.size());
        int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

        if (borderType == BORDER_CONSTANT &&
                !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
        {
            *ok = false;
            return;
        }

        if (CV_INSTRUMENT_FUN_IPP(ippFunc, src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
                                  map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
                                  dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
            *ok = false;
        else
        {
            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
        }
    }

private:
    Mat & src, & dst, & map1, & map2;
    ippiRemap ippFunc;
    int ippInterpolation, borderType;
    Scalar borderValue;
    bool * ok;
};

#endif

}
1628
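// Usage sketch (illustrative only, not part of the library): cv::remap takes one
// coordinate per output pixel and gathers src(mapx(x,y), mapy(x,y)) into dst(x,y).
// A minimal horizontal-flip example with float maps:
//
//     #include <opencv2/imgproc.hpp>
//     cv::Mat src(480, 640, CV_8UC3, cv::Scalar::all(0));   // placeholder input
//     cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
//     for (int y = 0; y < src.rows; ++y)
//         for (int x = 0; x < src.cols; ++x)
//         {
//             mapx.at<float>(y, x) = (float)(src.cols - 1 - x);
//             mapy.at<float>(y, x) = (float)y;
//         }
//     cv::Mat dst;
//     cv::remap(src, dst, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT);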
void cv::remap( InputArray _src, OutputArray _dst,
                InputArray _map1, InputArray _map2,
                int interpolation, int borderType, const Scalar& borderValue )
{
    CV_INSTRUMENT_REGION();

    const bool hasRelativeFlag = ((interpolation & WARP_RELATIVE_MAP) != 0);

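    // Descriptive note (added comment): the tables below are indexed as
    // tab[relative][depth], where 'relative' is 1 when WARP_RELATIVE_MAP is set
    // and 'depth' is the CV_8U..CV_64F source depth; null entries mark depths
    // without a specialized implementation (e.g. CV_16F).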
1637 static RemapNNFunc nn_tab[2][8] =
1638 {
1639 {
1640 remapNearest<uchar, false>, remapNearest<schar, false>, remapNearest<ushort, false>, remapNearest<short, false>,
1641 remapNearest<int, false>, remapNearest<float, false>, remapNearest<double, false>, 0
1642 },
1643 {
1644 remapNearest<uchar, true>, remapNearest<schar, true>, remapNearest<ushort, true>, remapNearest<short, true>,
1645 remapNearest<int, true>, remapNearest<float, true>, remapNearest<double, true>, 0
1646 }
1647 };
1648
1649 static RemapFunc linear_tab[2][8] =
1650 {
1651 {
1652 remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<false>, short, false>, 0,
1653 remapBilinear<Cast<float, ushort>, RemapNoVec<false>, float, false>,
1654 remapBilinear<Cast<float, short>, RemapNoVec<false>, float, false>, 0,
1655 remapBilinear<Cast<float, float>, RemapNoVec<false>, float, false>,
1656 remapBilinear<Cast<double, double>, RemapNoVec<false>, float, false>, 0
1657 },
1658 {
1659 remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<true>, short, true>, 0,
1660 remapBilinear<Cast<float, ushort>, RemapNoVec<true>, float, true>,
1661 remapBilinear<Cast<float, short>, RemapNoVec<true>, float, true>, 0,
1662 remapBilinear<Cast<float, float>, RemapNoVec<true>, float, true>,
1663 remapBilinear<Cast<double, double>, RemapNoVec<true>, float, true>, 0
1664 }
1665 };
1666
1667 static RemapFunc cubic_tab[2][8] =
1668 {
1669 {
1670 remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
1671 remapBicubic<Cast<float, ushort>, float, 1, false>,
1672 remapBicubic<Cast<float, short>, float, 1, false>, 0,
1673 remapBicubic<Cast<float, float>, float, 1, false>,
1674 remapBicubic<Cast<double, double>, float, 1, false>, 0
1675 },
1676 {
1677 remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
1678 remapBicubic<Cast<float, ushort>, float, 1, true>,
1679 remapBicubic<Cast<float, short>, float, 1, true>, 0,
1680 remapBicubic<Cast<float, float>, float, 1, true>,
1681 remapBicubic<Cast<double, double>, float, 1, true>, 0
1682 }
1683};
1684
1685 static RemapFunc lanczos4_tab[2][8] =
1686 {
1687 {
1688 remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
1689 remapLanczos4<Cast<float, ushort>, float, 1, false>,
1690 remapLanczos4<Cast<float, short>, float, 1, false>, 0,
1691 remapLanczos4<Cast<float, float>, float, 1, false>,
1692 remapLanczos4<Cast<double, double>, float, 1, false>, 0
1693 },
1694 {
1695 remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
1696 remapLanczos4<Cast<float, ushort>, float, 1, true>,
1697 remapLanczos4<Cast<float, short>, float, 1, true>, 0,
1698 remapLanczos4<Cast<float, float>, float, 1, true>,
1699 remapLanczos4<Cast<double, double>, float, 1, true>, 0
1700 }
1701};
1702
    CV_Assert( !_map1.empty() );
    CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))

    Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
    _dst.create( map1.size(), src.type() );
    Mat dst = _dst.getMat();

    CV_Assert( dst.cols < SHRT_MAX && dst.rows < SHRT_MAX && src.cols < SHRT_MAX && src.rows < SHRT_MAX );

    if( dst.data == src.data )
        src = src.clone();

    if ((map1.type() == CV_32FC1) && (map2.type() == CV_32FC1))
    {
        CALL_HAL(remap32f, cv_hal_remap32f, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
                 map1.ptr<float>(), map1.step, map2.ptr<float>(), map2.step, interpolation, borderType, borderValue.val);
    }
    if ((map1.type() == CV_32FC2) && map2.empty())
    {
        CALL_HAL(remap32fc2, cv_hal_remap32fc2, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
                 map1.ptr<float>(), map1.step, interpolation, borderType, borderValue.val);
    }
    if ((map1.type() == CV_16SC2) && (map2.empty() || map2.type() == CV_16UC1))
    {
        CALL_HAL(remap16s, cv_hal_remap16s, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
                 map1.ptr<short>(), map1.step, map2.ptr<ushort>(), map2.step, interpolation, borderType, borderValue.val);
    }

    interpolation &= ~WARP_RELATIVE_MAP;
    if( interpolation == INTER_AREA )
        interpolation = INTER_LINEAR;
1737
1738 int type = src.type(), depth = CV_MAT_DEPTH(type);
1739
1740#if defined HAVE_IPP && !IPP_DISABLE_REMAP
1741 CV_IPP_CHECK()
1742 {
1743 if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
1744 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
1745 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
1746 {
1747 int ippInterpolation =
1748 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
1749 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
1750
1751 ippiRemap ippFunc =
1752 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
1753 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
1754 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
1755 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
1756 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
1757 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
1758 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
1759 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
1760 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
1761
1762 if (ippFunc)
1763 {
1764 bool ok;
1765 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
1766 borderType, borderValue, &ok);
1767 Range range(0, dst.rows);
1768 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
1769
1770 if (ok)
1771 {
1772 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
1773 return;
1774 }
1775 setIppErrorStatus();
1776 }
1777 }
1778 }
1779#endif
1780
    RemapNNFunc nnfunc = 0;
    RemapFunc ifunc = 0;
    const void* ctab = 0;
    bool fixpt = depth == CV_8U;
    bool planar_input = false;

    const int relativeOptionIndex = (hasRelativeFlag ? 1 : 0);
    if( interpolation == INTER_NEAREST )
    {
        nnfunc = nn_tab[relativeOptionIndex][depth];
        CV_Assert( nnfunc != 0 );
    }
    else
    {
        if( interpolation == INTER_LINEAR )
            ifunc = linear_tab[relativeOptionIndex][depth];
        else if( interpolation == INTER_CUBIC ){
            ifunc = cubic_tab[relativeOptionIndex][depth];
            CV_Assert( _src.channels() <= 4 );
        }
        else if( interpolation == INTER_LANCZOS4 ){
            ifunc = lanczos4_tab[relativeOptionIndex][depth];
            CV_Assert( _src.channels() <= 4 );
        }
        else
            CV_Error( cv::Error::StsBadArg, "Unknown interpolation method" );
        CV_Assert( ifunc != 0 );
        ctab = initInterTab2D( interpolation, fixpt );
    }

    const Mat *m1 = &map1, *m2 = &map2;

    if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
        (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
    {
        if( map1.type() != CV_16SC2 )
            std::swap(m1, m2);
    }
    else
    {
        CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
            (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
        planar_input = map1.channels() == 1;
    }

    RemapInvoker invoker(src, dst, m1, m2,
                         borderType, borderValue, planar_input, nnfunc, ifunc,
                         ctab);
    parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
}
1831
1832
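// Usage sketch (illustrative only, not from the original source): convertMaps is
// typically used to turn a pair of CV_32FC1 maps into the fixed-point
// CV_16SC2 + CV_16UC1 representation once, so that repeated remap() calls over
// the same geometry run faster:
//
//     cv::Mat mapx, mapy;        // CV_32FC1 maps prepared elsewhere
//     cv::Mat map1, map2;
//     cv::convertMaps(mapx, mapy, map1, map2, CV_16SC2, false);
//     cv::remap(src, dst, map1, map2, cv::INTER_LINEAR);   // src/dst as in the example above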
void cv::convertMaps( InputArray _map1, InputArray _map2,
                      OutputArray _dstmap1, OutputArray _dstmap2,
                      int dstm1type, bool nninterpolate )
{
    CV_INSTRUMENT_REGION();

    Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
    Size size = map1.size();
    const Mat *m1 = &map1, *m2 = &map2;
    int m1type = m1->type(), m2type = m2->type();

    CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
               (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
               (m1type == CV_32FC1 && m2type == CV_32FC1) ||
               (m1type == CV_32FC2 && m2->empty()) );

    if( m2type == CV_16SC2 )
    {
        std::swap( m1, m2 );
        std::swap( m1type, m2type );
    }

    if( dstm1type <= 0 )
        dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
    CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
    _dstmap1.create( size, dstm1type );
    dstmap1 = _dstmap1.getMat();

    if( !nninterpolate && dstm1type != CV_32FC2 )
    {
        _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
        dstmap2 = _dstmap2.getMat();
    }
    else
        _dstmap2.release();

    if( m1type == dstm1type || (nninterpolate &&
        ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
        (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
    {
        m1->convertTo( dstmap1, dstmap1.type() );
        if( !dstmap2.empty() && dstmap2.type() == m2->type() )
            m2->copyTo( dstmap2 );
        return;
    }

    if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
    {
        Mat vdata[] = { *m1, *m2 };
        merge( vdata, 2, dstmap1 );
        return;
    }

    if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
    {
        Mat mv[] = { dstmap1, dstmap2 };
        split( *m1, mv );
        return;
    }

    if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
        dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
    {
        size.width *= size.height;
        size.height = 1;
    }

#if CV_TRY_SSE4_1
    bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif

    const float scale = 1.f/static_cast<float>(INTER_TAB_SIZE);
    int x, y;
    for( y = 0; y < size.height; y++ )
    {
        const float* src1f = m1->ptr<float>(y);
        const float* src2f = m2->ptr<float>(y);
        const short* src1 = (const short*)src1f;
        const ushort* src2 = (const ushort*)src2f;

        float* dst1f = dstmap1.ptr<float>(y);
        float* dst2f = dstmap2.ptr<float>(y);
        short* dst1 = (short*)dst1f;
        ushort* dst2 = (ushort*)dst2f;
        x = 0;

        if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
        {
            if( nninterpolate )
            {
                #if CV_TRY_SSE4_1
                if (useSSE4_1)
                    opt_SSE4_1::convertMaps_nninterpolate32f1c16s_SSE41(src1f, src2f, dst1, size.width);
                else
                #endif
                {
                    #if CV_SIMD128
                    {
                        int span = VTraits<v_int16x8>::vlanes();
                        for( ; x <= size.width - span; x += span )
                        {
                            v_int16x8 v_dst[2];
                            #define CV_PACK_MAP(X) v_pack(v_round(v_load(X)), v_round(v_load((X)+4)))
                            v_dst[0] = CV_PACK_MAP(src1f + x);
                            v_dst[1] = CV_PACK_MAP(src2f + x);
                            #undef CV_PACK_MAP
                            v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
                        }
                    }
                    #endif
                    for( ; x < size.width; x++ )
                    {
                        dst1[x*2] = saturate_cast<short>(src1f[x]);
                        dst1[x*2+1] = saturate_cast<short>(src2f[x]);
                    }
                }
            }
            else
            {
                #if CV_TRY_SSE4_1
                if (useSSE4_1)
                    opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width);
                else
                #endif
                {
                    #if CV_SIMD128
                    {
                        v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
                        v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
                        v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
                        int span = VTraits<v_float32x4>::vlanes();
                        for( ; x <= size.width - span * 2; x += span * 2 )
                        {
                            v_int32x4 v_ix0 = v_round(v_mul(v_scale, v_load(src1f + x)));
                            v_int32x4 v_ix1 = v_round(v_mul(v_scale, v_load(src1f + x + span)));
                            v_int32x4 v_iy0 = v_round(v_mul(v_scale, v_load(src2f + x)));
                            v_int32x4 v_iy1 = v_round(v_mul(v_scale, v_load(src2f + x + span)));

                            v_int16x8 v_dst[2];
                            v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
                            v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
                            v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);

                            v_int32x4 v_dst0 = v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask)));
                            v_int32x4 v_dst1 = v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask)));
                            v_store(dst2 + x, v_pack_u(v_dst0, v_dst1));
                        }
                    }
                    #endif
                    for( ; x < size.width; x++ )
                    {
                        int ix = saturate_cast<int>(src1f[x]*static_cast<float>(INTER_TAB_SIZE));
                        int iy = saturate_cast<int>(src2f[x]*static_cast<float>(INTER_TAB_SIZE));
                        dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
                        dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
                        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
                    }
                }
            }
        }
1993 else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
1994 {
1995 #if CV_TRY_SSE4_1
1996 if( useSSE4_1 )
1997 opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, width: size.width);
1998 else
1999 #endif
2000 {
2001 #if CV_SIMD128
2002 {
2003 v_float32x4 v_scale = v_setall_f32(v: (float)INTER_TAB_SIZE);
2004 v_int32x4 v_mask = v_setall_s32(v: INTER_TAB_SIZE - 1);
2005 v_int32x4 v_scale3 = v_setall_s32(v: INTER_TAB_SIZE);
2006 int span = VTraits<v_uint16x8>::vlanes();
2007 for (; x <= size.width - span; x += span )
2008 {
2009 v_float32x4 v_src0[2], v_src1[2];
2010 v_load_deinterleave(ptr: src1f + (x << 1), a&: v_src0[0], b&: v_src0[1]);
2011 v_load_deinterleave(ptr: src1f + (x << 1) + span, a&: v_src1[0], b&: v_src1[1]);
2012 v_int32x4 v_ix0 = v_round(a: v_mul(a: v_src0[0], b: v_scale));
2013 v_int32x4 v_ix1 = v_round(a: v_mul(a: v_src1[0], b: v_scale));
2014 v_int32x4 v_iy0 = v_round(a: v_mul(a: v_src0[1], b: v_scale));
2015 v_int32x4 v_iy1 = v_round(a: v_mul(a: v_src1[1], b: v_scale));
2016
2017 v_int16x8 v_dst[2];
2018 v_dst[0] = v_pack(a: v_shr<INTER_BITS>(a: v_ix0), b: v_shr<INTER_BITS>(a: v_ix1));
2019 v_dst[1] = v_pack(a: v_shr<INTER_BITS>(a: v_iy0), b: v_shr<INTER_BITS>(a: v_iy1));
2020 v_store_interleave(ptr: dst1 + (x << 1), a0: v_dst[0], b0: v_dst[1]);
2021
2022 v_store(ptr: dst2 + x, a: v_pack_u(
2023 a: v_muladd(a: v_scale3, b: (v_and(a: v_iy0, b: v_mask)), c: (v_and(a: v_ix0, b: v_mask))),
2024 b: v_muladd(a: v_scale3, b: (v_and(a: v_iy1, b: v_mask)), c: (v_and(a: v_ix1, b: v_mask)))));
2025 }
2026 }
2027 #endif
2028 for( ; x < size.width; x++ )
2029 {
2030 int ix = saturate_cast<int>(v: src1f[x*2]*static_cast<float>(INTER_TAB_SIZE));
2031 int iy = saturate_cast<int>(v: src1f[x*2+1]*static_cast<float>(INTER_TAB_SIZE));
2032 dst1[x*2] = saturate_cast<short>(v: ix >> INTER_BITS);
2033 dst1[x*2+1] = saturate_cast<short>(v: iy >> INTER_BITS);
2034 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
2035 }
2036 }
2037 }
2038 else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
2039 {
2040 #if CV_SIMD128
2041 {
2042 v_uint16x8 v_mask2 = v_setall_u16(v: INTER_TAB_SIZE2-1);
2043 v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(v: INTER_TAB_SIZE-1);
2044 v_float32x4 v_scale = v_setall_f32(v: scale);
2045 int span = VTraits<v_float32x4>::vlanes();
2046 for( ; x <= size.width - span * 2; x += span * 2 )
2047 {
2048 v_uint32x4 v_fxy1, v_fxy2;
2049 if ( src2 )
2050 {
2051 v_uint16x8 v_src2 = v_and(a: v_load(ptr: src2 + x), b: v_mask2);
2052 v_expand(a: v_src2, b0&: v_fxy1, b1&: v_fxy2);
2053 }
2054 else
2055 v_fxy1 = v_fxy2 = v_zero;
2056
2057 v_int16x8 v_src[2];
2058 v_int32x4 v_src0[2], v_src1[2];
2059 v_load_deinterleave(ptr: src1 + (x << 1), a0&: v_src[0], b0&: v_src[1]);
2060 v_expand(a: v_src[0], b0&: v_src0[0], b1&: v_src0[1]);
2061 v_expand(a: v_src[1], b0&: v_src1[0], b1&: v_src1[1]);
2062 #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_and((FXY), v_mask))),\
2063 v_cvt_f32(v_reinterpret_as_s32(X)))
2064 #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_shr<INTER_BITS>((FXY)))),\
2065 v_cvt_f32(v_reinterpret_as_s32(Y)))
2066 v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
2067 v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
2068 v_store(ptr: dst1f + x, a: v_dst1);
2069 v_store(ptr: dst2f + x, a: v_dst2);
2070
2071 v_dst1 = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2);
2072 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2);
2073 v_store(ptr: dst1f + x + span, a: v_dst1);
2074 v_store(ptr: dst2f + x + span, a: v_dst2);
2075 #undef CV_COMPUTE_MAP_X
2076 #undef CV_COMPUTE_MAP_Y
2077 }
2078 }
2079 #endif
2080 for( ; x < size.width; x++ )
2081 {
2082 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
2083 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
2084 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
2085 }
2086 }
2087 else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
2088 {
2089 #if CV_SIMD128
2090 {
2091 v_int16x8 v_mask2 = v_setall_s16(v: INTER_TAB_SIZE2-1);
2092 v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(v: INTER_TAB_SIZE-1);
2093 v_float32x4 v_scale = v_setall_f32(v: scale);
2094 int span = VTraits<v_int16x8>::vlanes();
2095 for( ; x <= size.width - span; x += span )
2096 {
2097 v_int32x4 v_fxy1, v_fxy2;
2098 if (src2)
2099 {
2100 v_int16x8 v_src2 = v_and(a: v_load(ptr: (short *)src2 + x), b: v_mask2);
2101 v_expand(a: v_src2, b0&: v_fxy1, b1&: v_fxy2);
2102 }
2103 else
2104 v_fxy1 = v_fxy2 = v_zero;
2105
2106 v_int16x8 v_src[2];
2107 v_int32x4 v_src0[2], v_src1[2];
2108 v_float32x4 v_dst[2];
2109 v_load_deinterleave(ptr: src1 + (x << 1), a0&: v_src[0], b0&: v_src[1]);
2110 v_expand(a: v_src[0], b0&: v_src0[0], b1&: v_src0[1]);
2111 v_expand(a: v_src[1], b0&: v_src1[0], b1&: v_src1[1]);
2112
2113 #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_and((FXY), v_mask)), v_cvt_f32(X))
2114 #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_shr<INTER_BITS>((FXY))), v_cvt_f32(Y))
2115 v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
2116 v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
2117 v_store_interleave(ptr: dst1f + (x << 1), a: v_dst[0], b: v_dst[1]);
2118
2119 v_dst[0] = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2);
2120 v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2);
2121 v_store_interleave(ptr: dst1f + (x << 1) + span, a: v_dst[0], b: v_dst[1]);
2122 #undef CV_COMPUTE_MAP_X
2123 #undef CV_COMPUTE_MAP_Y
2124 }
2125 }
2126 #endif
2127 for( ; x < size.width; x++ )
2128 {
2129 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
2130 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
2131 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
2132 }
2133 }
2134 else
2135 CV_Error( cv::Error::StsNotImplemented, "Unsupported combination of input/output matrices" );
2136 }
2137}
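// Descriptive note (added comment): in the fixed-point representation produced
// above, the CV_16SC2 map stores the integer part of each (x, y) source
// coordinate, while the CV_16UC1 map stores the fractional part quantized to an
// INTER_TAB_SIZE x INTER_TAB_SIZE grid, packed as
//     (dy & (INTER_TAB_SIZE-1)) * INTER_TAB_SIZE + (dx & (INTER_TAB_SIZE-1)),
// i.e. an index into the interpolation-coefficient tables used by remap().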
2138
2139
2140namespace cv
2141{
2142
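// Descriptive note (added comment): WarpAffineInvoker processes the destination
// in BLOCK_SZ x BLOCK_SZ tiles. For every tile it builds a short-integer
// coordinate map (plus interpolation-table indices for non-NN modes) with the
// warpAffineBlockline helpers and then reuses remap() on that tile, so the
// actual sampling and border handling live in a single place.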
2143class WarpAffineInvoker :
2144 public ParallelLoopBody
2145{
2146public:
2147 WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
2148 const Scalar &_borderValue, int *_adelta, int *_bdelta, const double *_M) :
2149 ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
2150 borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
2151 M(_M)
2152 {
2153 }
2154
2155 virtual void operator() (const Range& range) const CV_OVERRIDE
2156 {
2157 const int BLOCK_SZ = 64;
2158 AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
2159 short *XY = __XY.data(), *A = __A.data();
2160 const int AB_BITS = MAX(10, (int)INTER_BITS);
2161 const int AB_SCALE = 1 << AB_BITS;
2162 int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, y1;
2163
2164 int bh0 = std::min(a: BLOCK_SZ/2, b: dst.rows);
2165 int bw0 = std::min(a: BLOCK_SZ*BLOCK_SZ/bh0, b: dst.cols);
2166 bh0 = std::min(a: BLOCK_SZ*BLOCK_SZ/bw0, b: dst.rows);
2167
2168 for( y = range.start; y < range.end; y += bh0 )
2169 {
2170 for( x = 0; x < dst.cols; x += bw0 )
2171 {
2172 int bw = std::min( a: bw0, b: dst.cols - x);
2173 int bh = std::min( a: bh0, b: range.end - y);
2174
2175 Mat _XY(bh, bw, CV_16SC2, XY);
2176 Mat dpart(dst, Rect(x, y, bw, bh));
2177
2178 for( y1 = 0; y1 < bh; y1++ )
2179 {
2180 short* xy = XY + y1*bw*2;
2181 int X0 = saturate_cast<int>(v: (M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
2182 int Y0 = saturate_cast<int>(v: (M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
2183
2184 if( interpolation == INTER_NEAREST )
2185 hal::warpAffineBlocklineNN(adelta: adelta + x, bdelta: bdelta + x, xy, X0, Y0, bw);
2186 else
2187 hal::warpAffineBlockline(adelta: adelta + x, bdelta: bdelta + x, xy, alpha: A + y1*bw, X0, Y0, bw);
2188 }
2189
2190 if( interpolation == INTER_NEAREST )
2191 remap( src: src, dst: dpart, map1: _XY, map2: Mat(), interpolation, borderType, borderValue );
2192 else
2193 {
2194 Mat _matA(bh, bw, CV_16U, A);
2195 remap( src: src, dst: dpart, map1: _XY, map2: _matA, interpolation, borderType, borderValue );
2196 }
2197 }
2198 }
2199 }
2200
2201private:
2202 Mat src;
2203 Mat dst;
2204 int interpolation, borderType;
2205 Scalar borderValue;
2206 int *adelta, *bdelta;
2207 const double *M;
2208};
2209
2210
2211#if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPAFFINE
2212typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);
2213
2214class IPPWarpAffineInvoker :
2215 public ParallelLoopBody
2216{
2217public:
2218 IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
2219 const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
2220 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
2221 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
2222 {
2223 *ok = true;
2224 }
2225
2226 virtual void operator() (const Range& range) const CV_OVERRIDE
2227 {
2228 IppiSize srcsize = { src.cols, src.rows };
2229 IppiRect srcroi = { 0, 0, src.cols, src.rows };
2230 IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
2231 int cnn = src.channels();
2232 if( borderType == BORDER_CONSTANT )
2233 {
2234 IppiSize setSize = { dst.cols, range.end - range.start };
2235 void *dataPointer = dst.ptr(range.start);
2236 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
2237 {
2238 *ok = false;
2239 return;
2240 }
2241 }
2242
2243 // Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr
2244 IppStatus status = CV_INSTRUMENT_FUN_IPP(func,( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
2245 (int)dst.step[0], dstroi, coeffs, mode ));
2246 if( status < 0)
2247 *ok = false;
2248 else
2249 {
2250 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2251 }
2252 }
2253private:
2254 Mat &src;
2255 Mat &dst;
2256 int mode;
2257 double (&coeffs)[2][3];
2258 int borderType;
2259 Scalar borderValue;
2260 ippiWarpAffineBackFunc func;
2261 bool *ok;
2262 const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
2263};
2264#endif
2265
2266#ifdef HAVE_OPENCL
2267
2268enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
2269
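// Descriptive note (added comment): ocl_warpTransform_cols4 is a specialized
// OpenCL path that only triggers on Intel devices for CV_8UC1 images whose
// destination width is a multiple of 4, with a constant border and
// nearest/linear/cubic interpolation; everything else falls through to the
// generic ocl_warpTransform below or to the CPU implementation.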
2270static bool ocl_warpTransform_cols4(InputArray _src, OutputArray _dst, InputArray _M0,
2271 Size dsize, int flags, int borderType, const Scalar& borderValue,
2272 int op_type)
2273{
2274 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
2275 const ocl::Device & dev = ocl::Device::getDefault();
2276 int type = _src.type(), dtype = _dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2277
2278 int interpolation = flags & INTER_MAX;
2279 if( interpolation == INTER_AREA )
2280 interpolation = INTER_LINEAR;
2281
2282 if ( !dev.isIntel() || !(type == CV_8UC1) ||
2283 !(dtype == CV_8UC1) || !(_dst.cols() % 4 == 0) ||
2284 !(borderType == cv::BORDER_CONSTANT &&
2285 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)))
2286 return false;
2287
2288 const char * const warp_op[2] = { "Affine", "Perspective" };
2289 const char * const interpolationMap[3] = { "nearest", "linear", "cubic" };
2290 ocl::ProgramSource program = ocl::imgproc::warp_transform_oclsrc;
2291 String kernelName = format(fmt: "warp%s_%s_8u", warp_op[op_type], interpolationMap[interpolation]);
2292
2293 bool is32f = (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
2294 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(a: is32f ? CV_32F : CV_32S, b: depth);
2295 int sctype = CV_MAKETYPE(wdepth, cn);
2296
2297 ocl::Kernel k;
2298 String opts = format(fmt: "-D ST=%s", ocl::typeToStr(t: sctype));
2299
2300 k.create(kname: kernelName.c_str(), prog: program, buildopts: opts);
2301 if (k.empty())
2302 return false;
2303
2304 float borderBuf[] = { 0, 0, 0, 0 };
2305 scalarToRawData(s: borderValue, buf: borderBuf, type: sctype);
2306
2307 UMat src = _src.getUMat(), M0;
2308 _dst.create( sz: dsize.empty() ? src.size() : dsize, type: src.type() );
2309 UMat dst = _dst.getUMat();
2310
2311 if (src.u == dst.u)
2312 src = src.clone();
2313
2314 float M[9] = {0};
2315 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
2316 Mat matM(matRows, 3, CV_32F, M), M1 = _M0.getMat();
2317 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && M1.rows == matRows && M1.cols == 3 );
2318 M1.convertTo(m: matM, rtype: matM.type());
2319
2320 if( !(flags & WARP_INVERSE_MAP) )
2321 {
2322 if (op_type == OCL_OP_PERSPECTIVE)
2323 invert(src: matM, dst: matM);
2324 else
2325 {
2326 float D = M[0]*M[4] - M[1]*M[3];
2327 D = D != 0 ? 1.f/D : 0;
2328 float A11 = M[4]*D, A22=M[0]*D;
2329 M[0] = A11; M[1] *= -D;
2330 M[3] *= -D; M[4] = A22;
2331 float b1 = -M[0]*M[2] - M[1]*M[5];
2332 float b2 = -M[3]*M[2] - M[4]*M[5];
2333 M[2] = b1; M[5] = b2;
2334 }
2335 }
2336 matM.convertTo(m: M0, CV_32F);
2337
2338 k.args(kernel_args: ocl::KernelArg::ReadOnly(m: src), kernel_args: ocl::KernelArg::WriteOnly(m: dst), kernel_args: ocl::KernelArg::PtrReadOnly(m: M0),
2339 kernel_args: ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
2340
2341 size_t globalThreads[2];
2342 globalThreads[0] = (size_t)(dst.cols / 4);
2343 globalThreads[1] = (size_t)dst.rows;
2344
2345 return k.run(dims: 2, globalsize: globalThreads, NULL, sync: false);
2346}
2347
2348static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
2349 Size dsize, int flags, int borderType, const Scalar& borderValue,
2350 int op_type)
2351{
2352 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
2353 const ocl::Device & dev = ocl::Device::getDefault();
2354
2355 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2356 const bool doubleSupport = dev.doubleFPConfig() > 0;
2357
2358 int interpolation = flags & INTER_MAX;
2359 if( interpolation == INTER_AREA )
2360 interpolation = INTER_LINEAR;
2361 int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
2362
2363 if ( !(borderType == cv::BORDER_CONSTANT &&
2364 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
2365 (!doubleSupport && depth == CV_64F) || cn > 4)
2366 return false;
2367
2368 bool useDouble = depth == CV_64F;
2369
2370 const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
2371 ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
2372 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
2373 const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
2374
2375 int scalarcn = cn == 3 ? 4 : cn;
2376 bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
2377 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(a: is32f ? CV_32F : CV_32S, b: depth);
2378 int sctype = CV_MAKETYPE(wdepth, scalarcn);
2379
2380 ocl::Kernel k;
2381 String opts;
2382 if (interpolation == INTER_NEAREST)
2383 {
2384 opts = format(fmt: "-D INTER_NEAREST -D T=%s%s -D CT=%s -D T1=%s -D ST=%s -D CN=%d -D ROWS_PER_WI=%d",
2385 ocl::typeToStr(t: type),
2386 doubleSupport ? " -D DOUBLE_SUPPORT" : "",
2387 useDouble ? "double" : "float",
2388 ocl::typeToStr(CV_MAT_DEPTH(type)),
2389 ocl::typeToStr(t: sctype), cn, rowsPerWI);
2390 }
2391 else
2392 {
2393 char cvt[2][50];
2394 opts = format(fmt: "-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D SRC_DEPTH=%d"
2395 " -D CONVERT_TO_WT=%s -D CONVERT_TO_T=%s%s -D CT=%s -D CN=%d -D ROWS_PER_WI=%d",
2396 interpolationMap[interpolation], ocl::typeToStr(t: type),
2397 ocl::typeToStr(CV_MAT_DEPTH(type)),
2398 ocl::typeToStr(t: sctype),
2399 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
2400 ocl::convertTypeStr(sdepth: depth, ddepth: wdepth, cn, buf: cvt[0], buf_size: sizeof(cvt[0])),
2401 ocl::convertTypeStr(sdepth: wdepth, ddepth: depth, cn, buf: cvt[1], buf_size: sizeof(cvt[1])),
2402 doubleSupport ? " -D DOUBLE_SUPPORT" : "",
2403 useDouble ? "double" : "float",
2404 cn, rowsPerWI);
2405 }
2406
2407 k.create(kname: kernelName, prog: program, buildopts: opts);
2408 if (k.empty())
2409 return false;
2410
2411 double borderBuf[] = { 0, 0, 0, 0 };
2412 scalarToRawData(s: borderValue, buf: borderBuf, type: sctype);
2413
2414 UMat src = _src.getUMat(), M0;
2415 _dst.create( sz: dsize.empty() ? src.size() : dsize, type: src.type() );
2416 UMat dst = _dst.getUMat();
2417
2418 if (src.u == dst.u)
2419 src = src.clone();
2420
2421 double M[9] = {0};
2422 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
2423 Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
2424 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
2425 M1.rows == matRows && M1.cols == 3 );
2426 M1.convertTo(m: matM, rtype: matM.type());
2427
2428 if( !(flags & WARP_INVERSE_MAP) )
2429 {
2430 if (op_type == OCL_OP_PERSPECTIVE)
2431 invert(src: matM, dst: matM);
2432 else
2433 {
2434 double D = M[0]*M[4] - M[1]*M[3];
2435 D = D != 0 ? 1./D : 0;
2436 double A11 = M[4]*D, A22=M[0]*D;
2437 M[0] = A11; M[1] *= -D;
2438 M[3] *= -D; M[4] = A22;
2439 double b1 = -M[0]*M[2] - M[1]*M[5];
2440 double b2 = -M[3]*M[2] - M[4]*M[5];
2441 M[2] = b1; M[5] = b2;
2442 }
2443 }
2444 matM.convertTo(m: M0, rtype: useDouble ? CV_64F : CV_32F);
2445
2446 k.args(kernel_args: ocl::KernelArg::ReadOnly(m: src), kernel_args: ocl::KernelArg::WriteOnly(m: dst), kernel_args: ocl::KernelArg::PtrReadOnly(m: M0),
2447 kernel_args: ocl::KernelArg(ocl::KernelArg::CONSTANT, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
2448
2449 size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
2450 return k.run(dims: 2, globalsize: globalThreads, NULL, sync: false);
2451}
2452
2453#endif
2454
2455#ifdef HAVE_IPP
2456#define IPP_WARPAFFINE_PARALLEL 1
2457
2458#ifdef HAVE_IPP_IW
2459
2460class ipp_warpAffineParallel: public ParallelLoopBody
2461{
2462public:
2463 ipp_warpAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, IppiInterpolationType _inter, double (&_coeffs)[2][3], ::ipp::IwiBorderType _borderType, IwTransDirection _iwTransDirection, bool *_ok):m_src(src), m_dst(dst)
2464 {
2465 pOk = _ok;
2466
2467 inter = _inter;
2468 borderType = _borderType;
2469 iwTransDirection = _iwTransDirection;
2470
2471 for( int i = 0; i < 2; i++ )
2472 for( int j = 0; j < 3; j++ )
2473 coeffs[i][j] = _coeffs[i][j];
2474
2475 *pOk = true;
2476 }
2477 ~ipp_warpAffineParallel() {}
2478
2479 virtual void operator() (const Range& range) const CV_OVERRIDE
2480 {
2481 CV_INSTRUMENT_REGION_IPP();
2482
2483 if(*pOk == false)
2484 return;
2485
2486 try
2487 {
2488 ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start);
2489 CV_INSTRUMENT_FUN_IPP(::ipp::iwiWarpAffine, m_src, m_dst, coeffs, iwTransDirection, inter, ::ipp::IwiWarpAffineParams(), borderType, tile);
2490 }
2491 catch(const ::ipp::IwException &)
2492 {
2493 *pOk = false;
2494 return;
2495 }
2496 }
2497private:
2498 ::ipp::IwiImage &m_src;
2499 ::ipp::IwiImage &m_dst;
2500
2501 IppiInterpolationType inter;
2502 double coeffs[2][3];
2503 ::ipp::IwiBorderType borderType;
2504 IwTransDirection iwTransDirection;
2505
2506 bool *pOk;
2507 const ipp_warpAffineParallel& operator= (const ipp_warpAffineParallel&);
2508};
2509
2510#endif
2511
2512static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation, int borderType, const Scalar & borderValue, InputArray _M, int flags )
2513{
2514#ifdef HAVE_IPP_IW
2515 CV_INSTRUMENT_REGION_IPP();
2516
2517 if (!cv::ipp::useIPP_NotExact())
2518 return false;
2519
2520 IppiInterpolationType ippInter = ippiGetInterpolation(inter: interpolation);
2521 if((int)ippInter < 0)
2522 return false;
2523
2524 // Acquire data and begin processing
2525 try
2526 {
2527 Mat src = _src.getMat();
2528 Mat dst = _dst.getMat();
2529 ::ipp::IwiImage iwSrc = ippiGetImage(src);
2530 ::ipp::IwiImage iwDst = ippiGetImage(src: dst);
2531 ::ipp::IwiBorderType ippBorder(ippiGetBorderType(borderTypeNI: borderType), ippiGetValue(scalar: borderValue));
2532 IwTransDirection iwTransDirection;
2533 if(!ippBorder)
2534 return false;
2535
2536 if( !(flags & WARP_INVERSE_MAP) )
2537 iwTransDirection = iwTransForward;
2538 else
2539 iwTransDirection = iwTransInverse;
2540
2541 Mat M = _M.getMat();
2542 double coeffs[2][3];
2543 for( int i = 0; i < 2; i++ )
2544 for( int j = 0; j < 3; j++ )
2545 coeffs[i][j] = M.at<double>(i0: i, i1: j);
2546
2547 const int threads = ippiSuggestThreadsNum(image: iwDst, multiplier: 2);
2548
2549 if(IPP_WARPAFFINE_PARALLEL && threads > 1)
2550 {
2551 bool ok = true;
2552 Range range(0, (int)iwDst.m_size.height);
2553 ipp_warpAffineParallel invoker(iwSrc, iwDst, ippInter, coeffs, ippBorder, iwTransDirection, &ok);
2554 if(!ok)
2555 return false;
2556
2557 parallel_for_(range, body: invoker, nstripes: threads*4);
2558
2559 if(!ok)
2560 return false;
2561 } else {
2562 CV_INSTRUMENT_FUN_IPP(::ipp::iwiWarpAffine, iwSrc, iwDst, coeffs, iwTransDirection, ippInter, ::ipp::IwiWarpAffineParams(), ippBorder);
2563 }
2564
2565 }
2566 catch (const ::ipp::IwException &)
2567 {
2568 return false;
2569 }
2570
2571 return true;
2572#else
2573 CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(interpolation);
2574 CV_UNUSED(borderType); CV_UNUSED(borderValue); CV_UNUSED(_M); CV_UNUSED(flags);
2575 return false;
2576#endif
2577}
2578
2579#endif
2580
2581namespace hal {
2582
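// Descriptive note (added comment): the fallback below evaluates the affine map
// in fixed point. With AB_BITS fractional bits (AB_SCALE = 1 << AB_BITS),
//     X = (M[0]*x + M[1]*y + M[2]) * AB_SCALE
//     Y = (M[3]*x + M[4]*y + M[5]) * AB_SCALE
// is split into per-column increments adelta[x] = M[0]*x*AB_SCALE,
// bdelta[x] = M[3]*x*AB_SCALE and a per-row offset (X0, Y0); the blockline
// helpers then shift these down to integer coordinates plus INTER_BITS of
// sub-pixel precision.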
2583void warpAffine(int src_type,
2584 const uchar * src_data, size_t src_step, int src_width, int src_height,
2585 uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
2586 const double M[6], int interpolation, int borderType, const double borderValue[4])
2587{
2588 CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
2589
2590 Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
2591 Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
2592
2593 int x;
2594 AutoBuffer<int> _abdelta(dst.cols*2);
2595 int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
2596 const int AB_BITS = MAX(10, (int)INTER_BITS);
2597 const int AB_SCALE = 1 << AB_BITS;
2598
2599 for( x = 0; x < dst.cols; x++ )
2600 {
2601 adelta[x] = saturate_cast<int>(v: M[0]*x*AB_SCALE);
2602 bdelta[x] = saturate_cast<int>(v: M[3]*x*AB_SCALE);
2603 }
2604
2605 Range range(0, dst.rows);
2606 WarpAffineInvoker invoker(src, dst, interpolation, borderType,
2607 Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
2608 adelta, bdelta, M);
2609 parallel_for_(range, body: invoker, nstripes: dst.total()/(double)(1<<16));
2610}
2611
void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);

    constexpr int AB_BITS = MAX(10, static_cast<int>(INTER_BITS));
    int x1 = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    {
        const v_int32 v_X0 = vx_setall_s32(X0);
        const v_int32 v_Y0 = vx_setall_s32(Y0);
        const int step = VTraits<v_int16>::vlanes();
        for (; x1 <= bw - step; x1 += step)
        {
            v_int16 v_X = v_pack(v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1))),
                                 v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1 + step / 2))));
            v_int16 v_Y = v_pack(v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1))),
                                 v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1 + step / 2))));
            v_store_interleave(xy + 2 * x1, v_X, v_Y);
        }
    }
#endif
    for (; x1 < bw; x1++)
    {
        const int X = (X0 + adelta[x1]) >> AB_BITS;
        const int Y = (Y0 + bdelta[x1]) >> AB_BITS;
        xy[x1 * 2] = saturate_cast<short>(X);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
    }
}

void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
    CALL_HAL(warpAffineBlockline, cv_hal_warpAffineBlockline, adelta, bdelta, xy, alpha, X0, Y0, bw);

    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
    #if CV_TRY_AVX2
    bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
    if ( useAVX2 )
        x1 = opt_AVX2::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
    #endif
    #if CV_TRY_LASX
    bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
    if ( useLASX )
        x1 = opt_LASX::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
    #endif
    {
        #if CV_SIMD128
        {
            v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
            v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
            int span = VTraits<v_float32x4>::vlanes();
            for( ; x1 <= bw - span * 2; x1 += span * 2 )
            {
                v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1)));
                v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1)));
                v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1 + span)));
                v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1 + span)));

                v_int16x8 v_xy[2];
                v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
                v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
                v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);

                v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
                v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
                v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
            }
        }
        #endif
        for( ; x1 < bw; x1++ )
        {
            int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS);
            int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                                (X & (INTER_TAB_SIZE-1)));
        }
    }
}
2693
2694} // hal::
2695} // cv::
2696
2697
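// Usage sketch (illustrative only, not from the original source): the public
// warpAffine entry point below is normally driven by a 2x3 matrix, e.g. a
// rotation about the image center:
//
//     cv::Mat src(480, 640, CV_8UC3, cv::Scalar::all(0));   // placeholder input
//     cv::Point2f center(src.cols * 0.5f, src.rows * 0.5f);
//     cv::Mat R = cv::getRotationMatrix2D(center, /*angle=*/30.0, /*scale=*/1.0);
//     cv::Mat dst;
//     cv::warpAffine(src, dst, R, src.size(), cv::INTER_LINEAR,
//                    cv::BORDER_CONSTANT, cv::Scalar());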
void cv::warpAffine( InputArray _src, OutputArray _dst,
                     InputArray _M0, Size dsize,
                     int flags, int borderType, const Scalar& borderValue )
{
    CV_INSTRUMENT_REGION();

    int interpolation = flags & INTER_MAX;
    CV_Assert( _src.channels() <= 4 || (interpolation != INTER_LANCZOS4 &&
                                        interpolation != INTER_CUBIC) );

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
               _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
               ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
                                       borderValue, OCL_OP_AFFINE))

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
                                 borderValue, OCL_OP_AFFINE))

    Mat src = _src.getMat(), M0 = _M0.getMat();
    _dst.create( dsize.empty() ? src.size() : dsize, src.type() );
    Mat dst = _dst.getMat();
    CV_Assert( src.cols > 0 && src.rows > 0 );
    if( dst.data == src.data )
        src = src.clone();

    double M[6] = {0};
    Mat matM(2, 3, CV_64F, M);
    if( interpolation == INTER_AREA )
        interpolation = INTER_LINEAR;

    CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
    M0.convertTo(matM, matM.type());

    CV_IPP_RUN_FAST(ipp_warpAffine(src, dst, interpolation, borderType, borderValue, matM, flags));

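    // Added comment: unless WARP_INVERSE_MAP is given, the forward matrix
    // M = [A | b] is inverted in place to [A^-1 | -A^-1*b] (A is the 2x2 linear
    // part, b the translation), because the per-pixel loop maps destination
    // coordinates back into the source image.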
    if( !(flags & WARP_INVERSE_MAP) )
    {
        double D = M[0]*M[4] - M[1]*M[3];
        D = D != 0 ? 1./D : 0;
        double A11 = M[4]*D, A22=M[0]*D;
        M[0] = A11; M[1] *= -D;
        M[3] *= -D; M[4] = A22;
        double b1 = -M[0]*M[2] - M[1]*M[5];
        double b2 = -M[3]*M[2] - M[4]*M[5];
        M[2] = b1; M[5] = b2;
    }
2745
2746#if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPAFFINE
2747 CV_IPP_CHECK()
2748 {
2749 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
2750 if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
2751 ( cn == 1 || cn == 3 || cn == 4 ) &&
2752 ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
2753 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
2754 {
2755 ippiWarpAffineBackFunc ippFunc = 0;
2756 if ((flags & WARP_INVERSE_MAP) != 0)
2757 {
2758 ippFunc =
2759 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
2760 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
2761 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
2762 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
2763 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
2764 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
2765 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
2766 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
2767 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
2768 0;
2769 }
2770 else
2771 {
2772 ippFunc =
2773 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
2774 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
2775 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
2776 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
2777 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
2778 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
2779 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
2780 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
2781 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
2782 0;
2783 }
2784 int mode =
2785 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
2786 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
2787 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
2788 0;
2789 CV_Assert(mode && ippFunc);
2790
2791 double coeffs[2][3];
2792 for( int i = 0; i < 2; i++ )
2793 for( int j = 0; j < 3; j++ )
2794 coeffs[i][j] = matM.at<double>(i, j);
2795
2796 bool ok;
2797 Range range(0, dst.rows);
2798 IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
2799 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
2800 if( ok )
2801 {
2802 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
2803 return;
2804 }
2805 setIppErrorStatus();
2806 }
2807 }
2808#endif
2809
    hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
                    M, interpolation, borderType, borderValue.val);
}
2813
2814
2815namespace cv
2816{
2817#if CV_SIMD128_64F
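// Descriptive note (added comment): the two SIMD helpers below compute one
// destination row of a perspective warp. For column x1 they evaluate
//     W = W0 + M[6]*x1,   X = (X0 + M[0]*x1)/W,   Y = (Y0 + M[3]*x1)/W
// (clamped to the int range, with W == 0 mapped to 0). The NN variant stores the
// rounded integer coordinates; the other variant scales by INTER_TAB_SIZE and
// additionally emits the packed sub-pixel indices into 'alpha'.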
2818void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0, double Y0, double W0, int bw)
2819{
2820 const v_float64x2 v_M0 = v_setall_f64(v: M[0]);
2821 const v_float64x2 v_M3 = v_setall_f64(v: M[3]);
2822 const v_float64x2 v_M6 = v_setall_f64(v: M[6]);
2823 const v_float64x2 v_intmax = v_setall_f64(v: (double)INT_MAX);
2824 const v_float64x2 v_intmin = v_setall_f64(v: (double)INT_MIN);
2825 const v_float64x2 v_2 = v_setall_f64(v: 2.0);
2826 const v_float64x2 v_zero = v_setzero_f64();
2827 const v_float64x2 v_1 = v_setall_f64(v: 1.0);
2828
2829 int x1 = 0;
2830 v_float64x2 v_X0d = v_setall_f64(v: X0);
2831 v_float64x2 v_Y0d = v_setall_f64(v: Y0);
2832 v_float64x2 v_W0 = v_setall_f64(v: W0);
2833 v_float64x2 v_x1(0.0, 1.0);
2834
2835 for (; x1 <= bw - 16; x1 += 16)
2836 {
2837 // 0-3
2838 v_int32x4 v_X0, v_Y0;
2839 {
2840 v_float64x2 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2841 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2842 v_float64x2 v_fX0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2843 v_float64x2 v_fY0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2844 v_x1 = v_add(a: v_x1, b: v_2);
2845
2846 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2847 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2848 v_float64x2 v_fX1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2849 v_float64x2 v_fY1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2850 v_x1 = v_add(a: v_x1, b: v_2);
2851
2852 v_X0 = v_round(a: v_fX0, b: v_fX1);
2853 v_Y0 = v_round(a: v_fY0, b: v_fY1);
2854 }
2855
2856 // 4-7
2857 v_int32x4 v_X1, v_Y1;
2858 {
2859 v_float64x2 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2860 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2861 v_float64x2 v_fX0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2862 v_float64x2 v_fY0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2863 v_x1 = v_add(a: v_x1, b: v_2);
2864
2865 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2866 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2867 v_float64x2 v_fX1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2868 v_float64x2 v_fY1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2869 v_x1 = v_add(a: v_x1, b: v_2);
2870
2871 v_X1 = v_round(a: v_fX0, b: v_fX1);
2872 v_Y1 = v_round(a: v_fY0, b: v_fY1);
2873 }
2874
2875 // 8-11
2876 v_int32x4 v_X2, v_Y2;
2877 {
2878 v_float64x2 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2879 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2880 v_float64x2 v_fX0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2881 v_float64x2 v_fY0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2882 v_x1 = v_add(a: v_x1, b: v_2);
2883
2884 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2885 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2886 v_float64x2 v_fX1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2887 v_float64x2 v_fY1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2888 v_x1 = v_add(a: v_x1, b: v_2);
2889
2890 v_X2 = v_round(a: v_fX0, b: v_fX1);
2891 v_Y2 = v_round(a: v_fY0, b: v_fY1);
2892 }
2893
2894 // 12-15
2895 v_int32x4 v_X3, v_Y3;
2896 {
2897 v_float64x2 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2898 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2899 v_float64x2 v_fX0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2900 v_float64x2 v_fY0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2901 v_x1 = v_add(a: v_x1, b: v_2);
2902
2903 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2904 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_1, b: v_W), b: v_zero);
2905 v_float64x2 v_fX1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2906 v_float64x2 v_fY1 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2907 v_x1 = v_add(a: v_x1, b: v_2);
2908
2909 v_X3 = v_round(a: v_fX0, b: v_fX1);
2910 v_Y3 = v_round(a: v_fY0, b: v_fY1);
2911 }
2912
2913 // convert to 16s
2914 v_X0 = v_reinterpret_as_s32(a: v_pack(a: v_X0, b: v_X1));
2915 v_X1 = v_reinterpret_as_s32(a: v_pack(a: v_X2, b: v_X3));
2916 v_Y0 = v_reinterpret_as_s32(a: v_pack(a: v_Y0, b: v_Y1));
2917 v_Y1 = v_reinterpret_as_s32(a: v_pack(a: v_Y2, b: v_Y3));
2918
2919 v_store_interleave(ptr: xy + x1 * 2, a0: (v_reinterpret_as_s16)(a: v_X0), b0: (v_reinterpret_as_s16)(a: v_Y0));
2920 v_store_interleave(ptr: xy + x1 * 2 + 16, a0: (v_reinterpret_as_s16)(a: v_X1), b0: (v_reinterpret_as_s16)(a: v_Y1));
2921 }
2922
2923 for( ; x1 < bw; x1++ )
2924 {
2925 double W = W0 + M[6]*x1;
2926 W = W ? 1./W : 0;
2927 double fX = std::max(a: (double)INT_MIN, b: std::min(a: (double)INT_MAX, b: (X0 + M[0]*x1)*W));
2928 double fY = std::max(a: (double)INT_MIN, b: std::min(a: (double)INT_MAX, b: (Y0 + M[3]*x1)*W));
2929 int X = saturate_cast<int>(v: fX);
2930 int Y = saturate_cast<int>(v: fY);
2931
2932 xy[x1*2] = saturate_cast<short>(v: X);
2933 xy[x1*2+1] = saturate_cast<short>(v: Y);
2934 }
2935}
2936
2937void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
2938{
2939 const v_float64x2 v_M0 = v_setall_f64(v: M[0]);
2940 const v_float64x2 v_M3 = v_setall_f64(v: M[3]);
2941 const v_float64x2 v_M6 = v_setall_f64(v: M[6]);
2942 const v_float64x2 v_intmax = v_setall_f64(v: (double)INT_MAX);
2943 const v_float64x2 v_intmin = v_setall_f64(v: (double)INT_MIN);
2944 const v_float64x2 v_2 = v_setall_f64(v: 2.0);
2945 const v_float64x2 v_zero = v_setzero_f64();
2946 const v_float64x2 v_its = v_setall_f64(v: (double)INTER_TAB_SIZE);
2947 const v_int32x4 v_itsi1 = v_setall_s32(v: INTER_TAB_SIZE - 1);
2948
2949 int x1 = 0;
2950
2951 v_float64x2 v_X0d = v_setall_f64(v: X0);
2952 v_float64x2 v_Y0d = v_setall_f64(v: Y0);
2953 v_float64x2 v_W0 = v_setall_f64(v: W0);
2954 v_float64x2 v_x1(0.0, 1.0);
2955
2956 for (; x1 <= bw - 16; x1 += 16)
2957 {
2958 // 0-3
2959 v_int32x4 v_X0, v_Y0;
2960 {
2961 v_float64x2 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2962 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_its, b: v_W), b: v_zero);
2963 v_float64x2 v_fX0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M0, b: v_x1, c: v_X0d), b: v_W)));
2964 v_float64x2 v_fY0 = v_max(a: v_intmin, b: v_min(a: v_intmax, b: v_mul(a: v_muladd(a: v_M3, b: v_x1, c: v_Y0d), b: v_W)));
2965 v_x1 = v_add(a: v_x1, b: v_2);
2966
2967 v_W = v_muladd(a: v_M6, b: v_x1, c: v_W0);
2968 v_W = v_select(mask: v_ne(a: v_W, b: v_zero), a: v_div(a: v_its, b: v_W), b: v_zero);
            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
            v_x1 = v_add(v_x1, v_2);

            v_X0 = v_round(v_fX0, v_fX1);
            v_Y0 = v_round(v_fY0, v_fY1);
        }

        // 4-7
        v_int32x4 v_X1, v_Y1;
        {
            v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
            v_x1 = v_add(v_x1, v_2);

            v_W = v_muladd(v_M6, v_x1, v_W0);
            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
            v_x1 = v_add(v_x1, v_2);

            v_X1 = v_round(v_fX0, v_fX1);
            v_Y1 = v_round(v_fY0, v_fY1);
        }

        // 8-11
        v_int32x4 v_X2, v_Y2;
        {
            v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
            v_x1 = v_add(v_x1, v_2);

            v_W = v_muladd(v_M6, v_x1, v_W0);
            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
            v_x1 = v_add(v_x1, v_2);

            v_X2 = v_round(v_fX0, v_fX1);
            v_Y2 = v_round(v_fY0, v_fY1);
        }

        // 12-15
        v_int32x4 v_X3, v_Y3;
        {
            v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
            v_x1 = v_add(v_x1, v_2);

            v_W = v_muladd(v_M6, v_x1, v_W0);
            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
            v_x1 = v_add(v_x1, v_2);

            v_X3 = v_round(v_fX0, v_fX1);
            v_Y3 = v_round(v_fY0, v_fY1);
        }

        // store alpha
        v_int32x4 v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y0, v_itsi1)), v_and(v_X0, v_itsi1));
        v_int32x4 v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y1, v_itsi1)), v_and(v_X1, v_itsi1));
        v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));

        v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y2, v_itsi1)), v_and(v_X2, v_itsi1));
        v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y3, v_itsi1)), v_and(v_X3, v_itsi1));
        v_store(alpha + x1 + 8, v_pack(v_alpha0, v_alpha1));

        // convert to 16s
        v_X0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1)));
        v_X1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X2), v_shr<INTER_BITS>(v_X3)));
        v_Y0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1)));
        v_Y1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y2), v_shr<INTER_BITS>(v_Y3)));

        v_store_interleave(xy + x1 * 2, v_reinterpret_as_s16(v_X0), v_reinterpret_as_s16(v_Y0));
        v_store_interleave(xy + x1 * 2 + 16, v_reinterpret_as_s16(v_X1), v_reinterpret_as_s16(v_Y1));
    }

    for( ; x1 < bw; x1++ )
    {
        double W = W0 + M[6]*x1;
        W = W ? static_cast<double>(INTER_TAB_SIZE)/W : 0;
        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
        int X = saturate_cast<int>(fX);
        int Y = saturate_cast<int>(fY);

        xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
        xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
        alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                            (X & (INTER_TAB_SIZE-1)));
    }
}
#endif

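/* Note on the fixed-point map layout produced above (grounded in the scalar fallback loop):
 * for every destination x the projective mapping
 *
 *     X = (X0 + M[0]*x) / (W0 + M[6]*x) * INTER_TAB_SIZE
 *     Y = (Y0 + M[3]*x) / (W0 + M[6]*x) * INTER_TAB_SIZE
 *
 * is rounded to integers; the high bits (X >> INTER_BITS, Y >> INTER_BITS) are stored in the
 * interleaved xy[] coordinate map, while the low INTER_BITS bits of both coordinates are packed
 * into alpha[] = (Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (X & (INTER_TAB_SIZE-1)),
 * i.e. an index into the interpolation-weight tables consumed by remap().
 */
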
class WarpPerspectiveInvoker :
    public ParallelLoopBody
{
public:
    WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, const double *_M, int _interpolation,
                           int _borderType, const Scalar &_borderValue) :
        ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
        borderType(_borderType), borderValue(_borderValue)
    {
#if defined(_MSC_VER) && _MSC_VER == 1800 /* MSVS 2013 */ && CV_AVX
        // details: https://github.com/opencv/opencv/issues/11026
        borderValue.val[2] = _borderValue.val[2];
        borderValue.val[3] = _borderValue.val[3];
#endif
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        const int BLOCK_SZ = 32;
        short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
        int x, y, y1, width = dst.cols, height = dst.rows;

        int bh0 = std::min(BLOCK_SZ/2, height);
        int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
        bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);

        for( y = range.start; y < range.end; y += bh0 )
        {
            for( x = 0; x < width; x += bw0 )
            {
                int bw = std::min( bw0, width - x);
                int bh = std::min( bh0, range.end - y); // height

                Mat _XY(bh, bw, CV_16SC2, XY);
                Mat dpart(dst, Rect(x, y, bw, bh));

                for( y1 = 0; y1 < bh; y1++ )
                {
                    short* xy = XY + y1*bw*2;
                    double X0 = M[0]*x + M[1]*(y + y1) + M[2];
                    double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
                    double W0 = M[6]*x + M[7]*(y + y1) + M[8];

                    if( interpolation == INTER_NEAREST )
                        hal::warpPerspectiveBlocklineNN(M, xy, X0, Y0, W0, bw);
                    else
                        hal::warpPerspectiveBlockline(M, xy, A + y1*bw, X0, Y0, W0, bw);
                }

                if( interpolation == INTER_NEAREST )
                    remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
                else
                {
                    Mat _matA(bh, bw, CV_16U, A);
                    remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
                }
            }
        }
    }

private:
    Mat src;
    Mat dst;
    const double* M;
    int interpolation, borderType;
    Scalar borderValue;
};

#if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPPERSPECTIVE
typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);

class IPPWarpPerspectiveInvoker :
    public ParallelLoopBody
{
public:
    IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
                              int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
        ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
        borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
    {
        *ok = true;
    }

    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        IppiSize srcsize = {src.cols, src.rows};
        IppiRect srcroi = {0, 0, src.cols, src.rows};
        IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
        int cnn = src.channels();

        if( borderType == BORDER_CONSTANT )
        {
            IppiSize setSize = {dst.cols, range.end - range.start};
            void *dataPointer = dst.ptr(range.start);
            if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
            {
                *ok = false;
                return;
            }
        }
        IppStatus status = CV_INSTRUMENT_FUN_IPP(func,(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode));
        if (status != ippStsNoErr)
            *ok = false;
        else
        {
            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
        }
    }
private:
    Mat &src;
    Mat &dst;
    int mode;
    double (&coeffs)[3][3];
    int borderType;
    const Scalar borderValue;
    ippiWarpPerspectiveFunc func;
    bool *ok;

    const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
};
#endif

namespace hal {

void warpPerspective(int src_type,
                     const uchar * src_data, size_t src_step, int src_width, int src_height,
                     uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
                     const double M[9], int interpolation, int borderType, const double borderValue[4])
{
    CALL_HAL(warpPerspective, cv_hal_warpPerspective, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);

    Range range(0, dst.rows);
    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}

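/* The two block-line helpers below compute one row of the coordinate map for a tile.
 * Each tries, in order: an external HAL implementation (CALL_HAL), a runtime-dispatched
 * SSE4.1 path, the universal-intrinsics path above (CV_SIMD128_64F), and finally a plain
 * scalar loop.
 */
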
void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
{
    CALL_HAL(warpPerspectiveBlocklineNN, cv_hal_warpPerspectiveBlocklineNN, M, xy, X0, Y0, W0, bw);

    #if CV_TRY_SSE4_1
    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
    if(CV_CPU_HAS_SUPPORT_SSE4_1)
        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);

    if (pwarp_impl_sse4)
        pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
    else
    #endif
    {
        #if CV_SIMD128_64F
        WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
        #else
        for( int x1 = 0; x1 < bw; x1++ )
        {
            double W = W0 + M[6]*x1;
            W = W ? 1./W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1*2] = saturate_cast<short>(X);
            xy[x1*2+1] = saturate_cast<short>(Y);
        }
        #endif
    }
}

void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
{
    CALL_HAL(warpPerspectiveBlockline, cv_hal_warpPerspectiveBlockline, M, xy, alpha, X0, Y0, W0, bw);

    #if CV_TRY_SSE4_1
    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
    if(CV_CPU_HAS_SUPPORT_SSE4_1)
        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);

    if (pwarp_impl_sse4)
        pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
    else
    #endif
    {
        #if CV_SIMD128_64F
        WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
        #else
        for( int x1 = 0; x1 < bw; x1++ )
        {
            double W = W0 + M[6]*x1;
            W = W ? INTER_TAB_SIZE/W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                                (X & (INTER_TAB_SIZE-1)));
        }
        #endif
    }
}

} // hal::
} // cv::

void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
                          Size dsize, int flags, int borderType, const Scalar& borderValue )
{
    CV_INSTRUMENT_REGION();

    CV_Assert( _src.total() > 0 );

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
               _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
               ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType, borderValue,
                                       OCL_OP_PERSPECTIVE))

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
                                 OCL_OP_PERSPECTIVE))

    Mat src = _src.getMat(), M0 = _M0.getMat();
    _dst.create( dsize.empty() ? src.size() : dsize, src.type() );
    Mat dst = _dst.getMat();

    if( dst.data == src.data )
        src = src.clone();

    double M[9];
    Mat matM(3, 3, CV_64F, M);
    int interpolation = flags & INTER_MAX;
    if( interpolation == INTER_AREA )
        interpolation = INTER_LINEAR;

    CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
    M0.convertTo(matM, matM.type());

#if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPPERSPECTIVE
    CV_IPP_CHECK()
    {
        int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
        if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
            (cn == 1 || cn == 3 || cn == 4) &&
            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
        {
            ippiWarpPerspectiveFunc ippFunc = 0;
            if ((flags & WARP_INVERSE_MAP) != 0)
            {
                ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
                    type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
                    type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
                    type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
                    type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
                    type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
                    type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
                    type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
                    type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
            }
            else
            {
                ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
                    type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
                    type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
                    type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
                    type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
                    type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
                    type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
                    type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
                    type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
            }
            int mode =
                interpolation == INTER_NEAREST ? IPPI_INTER_NN :
                interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
                interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
            CV_Assert(mode && ippFunc);

            double coeffs[3][3];
            for( int i = 0; i < 3; i++ )
                for( int j = 0; j < 3; j++ )
                    coeffs[i][j] = matM.at<double>(i, j);

            bool ok;
            Range range(0, dst.rows);
            IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
            if( ok )
            {
                CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                return;
            }
            setIppErrorStatus();
        }
    }
#endif

    if( !(flags & WARP_INVERSE_MAP) )
        invert(matM, matM);

    hal::warpPerspective(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
                         matM.ptr<double>(), interpolation, borderType, borderValue.val);
}

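/* A minimal usage sketch for cv::warpPerspective (illustrative only; the 3x3 matrix values
 * below are arbitrary placeholders, not taken from this file):
 *
 *     cv::Mat src(480, 640, CV_8UC3, cv::Scalar::all(128));
 *     cv::Matx33d H(1.0,  0.2, 10.0,
 *                   0.1,  1.0, 20.0,
 *                   1e-4, 1e-4, 1.0);
 *     cv::Mat dst;
 *     // forward warp; add cv::WARP_INVERSE_MAP to flags if H already maps dst -> src
 *     cv::warpPerspective(src, dst, H, src.size(),
 *                         cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar());
 */
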
cv::Matx23d cv::getRotationMatrix2D_(Point2f center, double angle, double scale)
{
    CV_INSTRUMENT_REGION();

    angle *= CV_PI/180;
    double alpha = std::cos(angle)*scale;
    double beta = std::sin(angle)*scale;

    Matx23d M(
        alpha, beta, (1-alpha)*center.x - beta*center.y,
        -beta, alpha, beta*center.x + (1-alpha)*center.y
    );
    return M;
}

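/* Usage sketch for the public rotation-matrix helper (illustrative only; image size, angle
 * and scale are placeholders):
 *
 *     cv::Mat img(400, 400, CV_8UC1, cv::Scalar(0));
 *     cv::Point2f c(img.cols * 0.5f, img.rows * 0.5f);
 *     cv::Mat R = cv::getRotationMatrix2D(c, 30.0, 1.0);   // 2x3 CV_64F
 *     cv::Mat rotated;
 *     cv::warpAffine(img, rotated, R, img.size());
 */
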
/* Calculates coefficients of perspective transformation
 * which maps (xi,yi) to (ui,vi), (i=1,2,3,4):
 *
 *      c00*xi + c01*yi + c02
 * ui = ---------------------
 *      c20*xi + c21*yi + c22
 *
 *      c10*xi + c11*yi + c12
 * vi = ---------------------
 *      c20*xi + c21*yi + c22
 *
 * Coefficients are calculated by solving one of 2 linear systems:
 *  / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
 *  | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
 *  | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
 *  | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
 *  |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
 *  |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
 *  |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
 *  \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
 *
 * where:
 *   cij - matrix coefficients, c22 = 1
 *
 * or
 *
 *  / x0 y0  1  0  0  0 -x0*u0 -y0*u0 -u0 \ /c00\ /0\
 *  | x1 y1  1  0  0  0 -x1*u1 -y1*u1 -u1 | |c01| |0|
 *  | x2 y2  1  0  0  0 -x2*u2 -y2*u2 -u2 | |c02| |0|
 *  | x3 y3  1  0  0  0 -x3*u3 -y3*u3 -u3 |.|c10|=|0|,
 *  |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 -v0 | |c11| |0|
 *  |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 -v1 | |c12| |0|
 *  |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 -v2 | |c20| |0|
 *  \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 -v3 / |c21| \0/
 *                                          \c22/
 *
 * where:
 *   cij - matrix coefficients, c00^2 + c01^2 + c02^2 + c10^2 + c11^2 + c12^2 + c20^2 + c21^2 + c22^2 = 1
 */
cv::Mat cv::getPerspectiveTransform(const Point2f src[], const Point2f dst[], int solveMethod)
{
    CV_INSTRUMENT_REGION();

    // try c22 = 1
    Mat M(3, 3, CV_64F), X8(8, 1, CV_64F, M.ptr());
    double a[8][8], b[8];
    Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);

    for( int i = 0; i < 4; ++i )
    {
        a[i][0] = a[i+4][3] = src[i].x;
        a[i][1] = a[i+4][4] = src[i].y;
        a[i][2] = a[i+4][5] = 1;
        a[i][3] = a[i][4] = a[i][5] =
        a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
        a[i][6] = -src[i].x*dst[i].x;
        a[i][7] = -src[i].y*dst[i].x;
        a[i+4][6] = -src[i].x*dst[i].y;
        a[i+4][7] = -src[i].y*dst[i].y;
        b[i] = dst[i].x;
        b[i+4] = dst[i].y;
    }

    if (solve(A, B, X8, solveMethod) && norm(A * X8, B) < 1e-8)
    {
        M.ptr<double>()[8] = 1.;

        return M;
    }

    // c00^2 + c01^2 + c02^2 + c10^2 + c11^2 + c12^2 + c20^2 + c21^2 + c22^2 = 1
    hconcat(A, -B, A);

    Mat AtA;
    mulTransposed(A, AtA, true);

    Mat D, U;
    SVDecomp(AtA, D, U, noArray());

    Mat X9(9, 1, CV_64F, M.ptr());
    U.col(8).copyTo(X9);

    return M;
}

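/* Usage sketch (illustrative; the point coordinates are arbitrary placeholders):
 *
 *     std::vector<cv::Point2f> srcPts = { {0.f, 0.f}, {639.f, 0.f}, {639.f, 479.f}, {0.f, 479.f} };
 *     std::vector<cv::Point2f> dstPts = { {20.f, 10.f}, {600.f, 30.f}, {620.f, 470.f}, {10.f, 460.f} };
 *     cv::Mat H = cv::getPerspectiveTransform(srcPts, dstPts);   // 3x3 CV_64F
 *     // H maps srcPts[i] onto dstPts[i]; pass it to cv::warpPerspective or
 *     // cv::perspectiveTransform. solveMethod (e.g. cv::DECOMP_LU) selects the solver
 *     // used for the 8x8 system above.
 */
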
/* Calculates coefficients of affine transformation
 * which maps (xi,yi) to (ui,vi), (i=1,2,3):
 *
 * ui = c00*xi + c01*yi + c02
 *
 * vi = c10*xi + c11*yi + c12
 *
 * Coefficients are calculated by solving linear system:
 *  / x0 y0  1  0  0  0 \ /c00\ /u0\
 *  | x1 y1  1  0  0  0 | |c01| |u1|
 *  | x2 y2  1  0  0  0 | |c02| |u2|
 *  |  0  0  0 x0 y0  1 | |c10| |v0|
 *  |  0  0  0 x1 y1  1 | |c11| |v1|
 *  \  0  0  0 x2 y2  1 / |c12| |v2|
 *
 * where:
 *   cij - matrix coefficients
 */

cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
{
    Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
    double a[6*6], b[6];
    Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);

    for( int i = 0; i < 3; i++ )
    {
        int j = i*12;
        int k = i*12+6;
        a[j] = a[k+3] = src[i].x;
        a[j+1] = a[k+4] = src[i].y;
        a[j+2] = a[k+5] = 1;
        a[j+3] = a[j+4] = a[j+5] = 0;
        a[k] = a[k+1] = a[k+2] = 0;
        b[i*2] = dst[i].x;
        b[i*2+1] = dst[i].y;
    }

    solve( A, B, X );
    return M;
}

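/* Usage sketch (illustrative; the three point pairs are arbitrary placeholders):
 *
 *     cv::Point2f srcTri[3] = { {0.f, 0.f}, {100.f, 0.f}, {0.f, 100.f} };
 *     cv::Point2f dstTri[3] = { {10.f, 10.f}, {110.f, 20.f}, {5.f, 115.f} };
 *     cv::Mat A = cv::getAffineTransform(srcTri, dstTri);   // 2x3 CV_64F
 *     // A maps srcTri[i] onto dstTri[i]; use it with cv::warpAffine or cv::transform.
 */
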
void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
{
    Mat matM = _matM.getMat();
    CV_Assert(matM.rows == 2 && matM.cols == 3);
    __iM.create(2, 3, matM.type());
    Mat _iM = __iM.getMat();

    if( matM.type() == CV_32F )
    {
        const softfloat* M = matM.ptr<softfloat>();
        softfloat* iM = _iM.ptr<softfloat>();
        int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));

        softdouble D = M[0]*M[step+1] - M[1]*M[step];
        D = D != 0. ? softdouble(1.)/D : softdouble(0.);
        softdouble A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
        softdouble b1 = -A11*M[2] - A12*M[step+2];
        softdouble b2 = -A21*M[2] - A22*M[step+2];

        iM[0] = A11; iM[1] = A12; iM[2] = b1;
        iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
    }
    else if( matM.type() == CV_64F )
    {
        const softdouble* M = matM.ptr<softdouble>();
        softdouble* iM = _iM.ptr<softdouble>();
        int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));

        softdouble D = M[0]*M[step+1] - M[1]*M[step];
        D = D != 0. ? softdouble(1.)/D : softdouble(0.);
        softdouble A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
        softdouble b1 = -A11*M[2] - A12*M[step+2];
        softdouble b2 = -A21*M[2] - A22*M[step+2];

        iM[0] = A11; iM[1] = A12; iM[2] = b1;
        iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
    }
    else
        CV_Error( cv::Error::StsUnsupportedFormat, "" );
}

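/* The 2x3 inverse computed above is the closed form
 *     [A | b]^-1 = [A^-1 | -A^-1*b],  with  A^-1 = 1/det(A) * [ a11 -a01; -a10 a00 ].
 * A small usage sketch (illustrative; the transform is a placeholder):
 *
 *     cv::Mat A = cv::getRotationMatrix2D(cv::Point2f(50.f, 50.f), 15.0, 1.0);
 *     cv::Mat Ainv;
 *     cv::invertAffineTransform(A, Ainv);   // also 2x3, same type as A
 */
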
cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst, int solveMethod)
{
    Mat src = _src.getMat(), dst = _dst.getMat();
    CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
    return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data, solveMethod);
}

cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
{
    Mat src = _src.getMat(), dst = _dst.getMat();
    CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
    return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
}

CV_IMPL void
cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
              int flags, CvScalar fillval )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat matrix = cv::cvarrToMat(marr);
    CV_Assert( src.type() == dst.type() );
    cv::warpAffine( src, dst, matrix, dst.size(), flags,
        (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
        fillval );
}

CV_IMPL void
cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
                   int flags, CvScalar fillval )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat matrix = cv::cvarrToMat(marr);
    CV_Assert( src.type() == dst.type() );
    cv::warpPerspective( src, dst, matrix, dst.size(), flags,
        (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
        fillval );
}

CV_IMPL void
cvRemap( const CvArr* srcarr, CvArr* dstarr,
         const CvArr* _mapx, const CvArr* _mapy,
         int flags, CvScalar fillval )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
    cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
    CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
    cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
        (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
        fillval );
    CV_Assert( dst0.data == dst.data );
}


CV_IMPL CvMat*
cv2DRotationMatrix( CvPoint2D32f center, double angle,
                    double scale, CvMat* matrix )
{
    cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
}


CV_IMPL CvMat*
cvGetPerspectiveTransform( const CvPoint2D32f* src,
                           const CvPoint2D32f* dst,
                           CvMat* matrix )
{
    cv::Mat M0 = cv::cvarrToMat(matrix),
        M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
}


CV_IMPL CvMat*
cvGetAffineTransform( const CvPoint2D32f* src,
                      const CvPoint2D32f* dst,
                      CvMat* matrix )
{
    cv::Mat M0 = cv::cvarrToMat(matrix),
        M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
}


CV_IMPL void
cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
{
    cv::Mat map1 = cv::cvarrToMat(arr1), map2;
    cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;

    if( arr2 )
        map2 = cv::cvarrToMat(arr2);
    if( dstarr2 )
    {
        dstmap2 = cv::cvarrToMat(dstarr2);
        if( dstmap2.type() == CV_16SC1 )
            dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
    }

    cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
}

/****************************************************************************************
PkLab.net 2018 based on cv::linearPolar from OpenCV by J.L. Blanco, Apr 2009
****************************************************************************************/
void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
                   Point2f center, double maxRadius, int flags)
{
    // if the destination size is not given, compute it proportionally so that
    // the result keeps roughly the same area as the source's bounding circle
    if ((dsize.width <= 0) && (dsize.height <= 0))
    {
        dsize.width = cvRound(maxRadius);
        dsize.height = cvRound(maxRadius * CV_PI);
    }
    else if (dsize.height <= 0)
    {
        dsize.height = cvRound(dsize.width * CV_PI);
    }

    Mat mapx, mapy;
    mapx.create(dsize, CV_32F);
    mapy.create(dsize, CV_32F);
    bool semiLog = (flags & WARP_POLAR_LOG) != 0;

    if (!(flags & cv::WARP_INVERSE_MAP))
    {
        CV_Assert(!dsize.empty());
        double Kangle = CV_2PI / dsize.height;
        int phi, rho;

        // precalculate scaled rho
        Mat rhos = Mat(1, dsize.width, CV_32F);
        float* bufRhos = (float*)(rhos.data);
        if (semiLog)
        {
            double Kmag = std::log(maxRadius) / dsize.width;
            for (rho = 0; rho < dsize.width; rho++)
                bufRhos[rho] = (float)(std::exp(rho * Kmag) - 1.0);

        }
        else
        {
            double Kmag = maxRadius / dsize.width;
            for (rho = 0; rho < dsize.width; rho++)
                bufRhos[rho] = (float)(rho * Kmag);
        }

        for (phi = 0; phi < dsize.height; phi++)
        {
            double KKy = Kangle * phi;
            double cp = std::cos(KKy);
            double sp = std::sin(KKy);
            float* mx = (float*)(mapx.data + phi*mapx.step);
            float* my = (float*)(mapy.data + phi*mapy.step);

            for (rho = 0; rho < dsize.width; rho++)
            {
                double x = bufRhos[rho] * cp + center.x;
                double y = bufRhos[rho] * sp + center.y;

                mx[rho] = (float)x;
                my[rho] = (float)y;
            }
        }
        remap(_src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
    }
    else
    {
        const int ANGLE_BORDER = 1;
        cv::copyMakeBorder(_src, _dst, ANGLE_BORDER, ANGLE_BORDER, 0, 0, BORDER_WRAP);
        Mat src = _dst.getMat();
        Size ssize = _dst.size();
        ssize.height -= 2 * ANGLE_BORDER;
        CV_Assert(!ssize.empty());
        const double Kangle = CV_2PI / ssize.height;
        double Kmag;
        if (semiLog)
            Kmag = std::log(maxRadius) / ssize.width;
        else
            Kmag = maxRadius / ssize.width;

        Mat bufx, bufy, bufp, bufa;

        bufx = Mat(1, dsize.width, CV_32F);
        bufy = Mat(1, dsize.width, CV_32F);
        bufp = Mat(1, dsize.width, CV_32F);
        bufa = Mat(1, dsize.width, CV_32F);

        for (int x = 0; x < dsize.width; x++)
            bufx.at<float>(0, x) = (float)x - center.x;

        cv::parallel_for_(cv::Range(0, dsize.height), [&](const cv::Range& range) {
            for (int y = range.start; y < range.end; ++y) {
                Mat local_bufx = bufx.clone();
                Mat local_bufy = Mat(1, dsize.width, CV_32F);
                Mat local_bufp = Mat(1, dsize.width, CV_32F);
                Mat local_bufa = Mat(1, dsize.width, CV_32F);

                for (int x = 0; x < dsize.width; x++) {
                    local_bufy.at<float>(0, x) = static_cast<float>(y) - center.y;
                }

                cartToPolar(local_bufx, local_bufy, local_bufp, local_bufa, false);

                if (semiLog) {
                    local_bufp += 1.f;
                    log(local_bufp, local_bufp);
                }

                float* mx = (float*)(mapx.data + y * mapx.step);
                float* my = (float*)(mapy.data + y * mapy.step);

                for (int x = 0; x < dsize.width; x++) {
                    double rho = local_bufp.at<float>(0, x) / Kmag;
                    double phi = local_bufa.at<float>(0, x) / Kangle;
                    mx[x] = static_cast<float>(rho);
                    my[x] = static_cast<float>(phi) + ANGLE_BORDER;
                }
            }
        });

        remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX,
              (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
    }
}
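
/* Usage sketch for cv::warpPolar (illustrative only; image size and radius are placeholders):
 *
 *     cv::Mat img(512, 512, CV_8UC3, cv::Scalar::all(0));
 *     cv::Point2f c(img.cols * 0.5f, img.rows * 0.5f);
 *     double maxR = img.cols * 0.5;
 *     cv::Mat polar, back;
 *     // linear polar unwrap; add cv::WARP_POLAR_LOG for a semilog (log-polar) mapping
 *     cv::warpPolar(img, polar, cv::Size(), c, maxR, cv::INTER_LINEAR);
 *     // inverse mapping back to Cartesian coordinates
 *     cv::warpPolar(polar, back, img.size(), c, maxR,
 *                   cv::INTER_LINEAR | cv::WARP_INVERSE_MAP);
 */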

void cv::linearPolar( InputArray _src, OutputArray _dst,
                      Point2f center, double maxRadius, int flags )
{
    warpPolar(_src, _dst, _src.size(), center, maxRadius, flags & ~WARP_POLAR_LOG);
}

void cv::logPolar( InputArray _src, OutputArray _dst,
                   Point2f center, double maxRadius, int flags )
{
    Size ssize = _src.size();
    double M = maxRadius > 0 ? std::exp(ssize.width / maxRadius) : 1;
    warpPolar(_src, _dst, ssize, center, M, flags | WARP_POLAR_LOG);
}
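
/* Note: linearPolar and logPolar are kept as thin wrappers that forward to warpPolar.
 * For logPolar, the legacy magnitude-scale parameter is converted above into the radius
 * expected by warpPolar via exp(src.cols / scale), so new code should prefer calling
 * warpPolar directly with WARP_POLAR_LOG.
 */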

CV_IMPL
void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
                    CvPoint2D32f center, double maxRadius, int flags )
{
    Mat src = cvarrToMat(srcarr);
    Mat dst = cvarrToMat(dstarr);

    CV_Assert(src.size == dst.size);
    CV_Assert(src.type() == dst.type());

    cv::linearPolar(src, dst, center, maxRadius, flags);
}

CV_IMPL
void cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
                 CvPoint2D32f center, double M, int flags )
{
    Mat src = cvarrToMat(srcarr);
    Mat dst = cvarrToMat(dstarr);

    CV_Assert(src.size == dst.size);
    CV_Assert(src.type() == dst.type());

    cv::logPolar(src, dst, center, M, flags);
}

/* End of file. */