/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "resize.hpp"

namespace cv
{
namespace opt_SSE4_1
{

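// Nearest-neighbor resize body for 2-byte pixels: each destination row maps to
// source row sy = floor(y*ify), and each destination column copies one source
// pixel through the precomputed byte offsets in x_ofs.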
class resizeNNInvokerSSE2 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x7);
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
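            // Gather eight 2-byte pixels through x_ofs and write them out
            // with a single unaligned 128-bit store.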
            for(x = 0; x < sseWidth; x += 8)
            {
                ushort imm = *(ushort*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi16(pixels, imm, 0);
                imm = *(ushort*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi16(pixels, imm, 1);
                imm = *(ushort*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi16(pixels, imm, 2);
                imm = *(ushort*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi16(pixels, imm, 3);
                imm = *(ushort*)(S + x_ofs[x + 4]);
                pixels = _mm_insert_epi16(pixels, imm, 4);
                imm = *(ushort*)(S + x_ofs[x + 5]);
                pixels = _mm_insert_epi16(pixels, imm, 5);
                imm = *(ushort*)(S + x_ofs[x + 6]);
                pixels = _mm_insert_epi16(pixels, imm, 6);
                imm = *(ushort*)(S + x_ofs[x + 7]);
                pixels = _mm_insert_epi16(pixels, imm, 7);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
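            // Scalar tail for the remaining (width % 8) pixels.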
            for(; x < width; x++)
            {
                *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE2(const resizeNNInvokerSSE2&);
    resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&);
};

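// Same nearest-neighbor scheme for 4-byte pixels, using the SSE4.1
// _mm_insert_epi32 instruction to gather four pixels per 128-bit store.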
class resizeNNInvokerSSE4 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }
#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=SSE4.2
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int sseWidth = width - (width & 0x3);
        for(y = range.start; y < range.end; y++)
        {
            uchar* D = dst.data + dst.step*y;
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.data + sy*src.step;
            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
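            // Gather four 4-byte pixels through x_ofs and write them out
            // with a single unaligned 128-bit store.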
            for(x = 0; x < sseWidth; x += 4)
            {
                int imm = *(int*)(S + x_ofs[x + 0]);
                pixels = _mm_insert_epi32(pixels, imm, 0);
                imm = *(int*)(S + x_ofs[x + 1]);
                pixels = _mm_insert_epi32(pixels, imm, 1);
                imm = *(int*)(S + x_ofs[x + 2]);
                pixels = _mm_insert_epi32(pixels, imm, 2);
                imm = *(int*)(S + x_ofs[x + 3]);
                pixels = _mm_insert_epi32(pixels, imm, 3);
                _mm_storeu_si128((__m128i*)D, pixels);
                D += 16;
            }
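            // Scalar tail for the remaining (width % 4) pixels.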
            for(; x < width; x++)
            {
                *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
            }
        }
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerSSE4(const resizeNNInvokerSSE4&);
    resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&);
};

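// Parallel driver for the 2-byte nearest-neighbor body; nstripes is chosen so
// that each stripe covers roughly 64K destination pixels.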
void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE2 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

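// Parallel driver for the 4-byte nearest-neighbor body.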
void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerSSE4 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

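// Vertical pass of the Lanczos4 resize for the 32f -> 16u case: blends eight
// intermediate rows with the beta weights, rounds to integer and packs to
// ushort with unsigned saturation (SSE4.1 _mm_packus_epi32).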
int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width)
{
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
    int x = 0;
    __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
           v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
           v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
           v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

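    // Accumulate the beta-weighted sum of the eight rows, producing eight
    // output values per iteration (two 4-float vectors).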
    for (; x <= width - 8; x += 8)
    {
        __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

        __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

        __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
        __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

        _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
    }

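    // Return how many elements were vectorized here; the remaining tail
    // (width % 8 elements) is left to the caller.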
    return x;
}

} // namespace opt_SSE4_1
} // namespace cv
/* End of file. */