/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "resize.hpp"

namespace cv
{
namespace opt_SSE4_1
{

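// Nearest-neighbor resize body for 2-byte pixels: each destination row maps to
// source row sy = floor(y*ify), and each destination column copies one source
// pixel through the precomputed byte offsets in x_ofs.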
class resizeNNInvokerSSE2 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x7);
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
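            // Gather eight 2-byte pixels through x_ofs and write them out
            // with a single unaligned 128-bit store.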
            for(x = 0; x < sseWidth; x += 8)
            {
                ushort imm = *(ushort*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi16(pixels, imm, 0);
                imm = *(ushort*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi16(pixels, imm, 1);
                imm = *(ushort*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi16(pixels, imm, 2);
                imm = *(ushort*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi16(pixels, imm, 3);
                imm = *(ushort*)(S + x_ofs[x + 4]);
                pixels = _mm_insert_epi16(pixels, imm, 4);
                imm = *(ushort*)(S + x_ofs[x + 5]);
                pixels = _mm_insert_epi16(pixels, imm, 5);
                imm = *(ushort*)(S + x_ofs[x + 6]);
                pixels = _mm_insert_epi16(pixels, imm, 6);
                imm = *(ushort*)(S + x_ofs[x + 7]);
                pixels = _mm_insert_epi16(pixels, imm, 7);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
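            // Scalar tail for the remaining (width % 8) pixels.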
            for(; x < width; x++)
            {
                *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE2(const resizeNNInvokerSSE2&);
    resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&);
};

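// Same nearest-neighbor scheme for 4-byte pixels, using the SSE4.1
// _mm_insert_epi32 instruction to gather four pixels per 128-bit store.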
class resizeNNInvokerSSE4 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }
#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x3);
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
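            // Gather four 4-byte pixels through x_ofs and write them out
            // with a single unaligned 128-bit store.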
            for(x = 0; x < sseWidth; x += 4)
            {
                int imm = *(int*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi32(pixels, imm, 0);
                imm = *(int*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi32(pixels, imm, 1);
                imm = *(int*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi32(pixels, imm, 2);
                imm = *(int*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi32(pixels, imm, 3);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
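            // Scalar tail for the remaining (width % 4) pixels.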
            for(; x < width; x++)
            {
                *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE4(const resizeNNInvokerSSE4&);
    resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&);
};

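// Parallel driver for the 2-byte nearest-neighbor body; nstripes is chosen so
// that each stripe covers roughly 64K destination pixels.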
void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE2 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

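// Parallel driver for the 4-byte nearest-neighbor body.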
void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE4 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

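// Vertical pass of the Lanczos4 resize for the 32f -> 16u case: blends eight
// intermediate rows with the beta weights, rounds to integer and packs to
// ushort with unsigned saturation (SSE4.1 _mm_packus_epi32).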
int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width)
{
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
    int x = 0;
    __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
           v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
           v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
           v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

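    // Accumulate the beta-weighted sum of the eight rows, producing eight
    // output values per iteration (two 4-float vectors).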
    for (; x <= width - 8; x += 8)
    {
        __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

        __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

        __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
        __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

        _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
    }

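    // Return how many elements were vectorized here; the remaining tail
    // (width % 8 elements) is left to the caller.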
    return x;
}

} // namespace opt_SSE4_1
} // namespace cv
/* End of file. */