/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
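
// This file provides AVX2-optimized kernels for nearest-neighbor image
// resizing: resizeNNInvokerAVX4 handles 4-byte pixels (e.g. CV_8UC4) and
// resizeNNInvokerAVX2 handles 2-byte pixels (e.g. CV_8UC2/CV_16UC1). Both
// are driven by the resizeNN2_AVX2/resizeNN4_AVX2 entry points below.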

#include "precomp.hpp"
#include "resize.hpp"

namespace cv
{
namespace opt_AVX2
{

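// Nearest-neighbor resize for 4-byte pixels: each output pixel is fetched as
// one 32-bit load, 8 pixels per iteration via _mm256_i32gather_epi32.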
class resizeNNInvokerAVX4 CV_FINAL :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int avxWidth = width - (width & 0x7);
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
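        // Choose the store path based on whether the second output row
        // (dst.data + dst.step) is 32-byte aligned.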
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
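                // x_ofs[x] holds the byte offset of source pixel x, so the
                // gather (scale 1) copies 8 4-byte pixels per iteration.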
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_maskstore_epi32((int*)D, mask, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_storeu_si256((__m256i*)D, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
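        // Clear the upper halves of the ymm registers to avoid AVX/SSE
        // transition penalties in subsequent SSE code.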
        _mm256_zeroupper();
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
};

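// Nearest-neighbor resize for 2-byte pixels. AVX2 has no 16-bit gather, so
// each iteration gathers 32-bit words twice (from S for pixels x..x+7 and
// from S-2 for pixels x+8..x+15), blends the two results together and
// shuffles them back into sequential order, writing 16 pixels at a time.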
class resizeNNInvokerAVX2 CV_FINAL :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int avxWidth = width - (width & 0xf);
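        // Constants for reassembling the gathered pixels: after the blend,
        // the even 16-bit words hold pixels x..x+7 and the odd words hold
        // pixels x+8..x+15; shuffle_mask reorders bytes within each 128-bit
        // lane and permute_mask reorders dwords across lanes.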
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
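                // Gather pixels x..x+7 from S (pixel lands in the low word of
                // each dword) and pixels x+8..x+15 from S2 = S-2 (pixel lands
                // in the high word), then blend, shuffle and permute in order.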
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }
            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }
            }
        }
        _mm256_zeroupper();
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
};

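// Entry points: run the 2-byte/4-byte invokers in parallel over `range`,
// sizing the stripes so each covers roughly 1<<16 output pixels.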
void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerAVX2 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerAVX4 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
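
/* Illustrative sketch (not part of the library): the caller is expected to
   precompute x_ofs as clamped *byte* offsets of the source pixels, roughly:

       // hypothetical setup for a 4-byte-per-pixel image
       double ifx = (double)src.cols / dst.cols;
       double ify = (double)src.rows / dst.rows;
       AutoBuffer<int> buf(dst.cols);
       int* x_ofs = buf.data();
       int pix_size = (int)src.elemSize();
       for (int x = 0; x < dst.cols; x++)
           x_ofs[x] = std::min(cvFloor(x*ifx), src.cols - 1) * pix_size;
       resizeNN4_AVX2(Range(0, dst.rows), src, dst, x_ofs, ify);
*/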

}
}
/* End of file. */