/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
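
// This file provides AVX2-optimized kernels for nearest-neighbor image
// resizing: resizeNNInvokerAVX4 handles 4-byte pixels (e.g. CV_8UC4) and
// resizeNNInvokerAVX2 handles 2-byte pixels (e.g. CV_8UC2/CV_16UC1). Both
// are driven by the resizeNN2_AVX2/resizeNN4_AVX2 entry points below.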

#include "precomp.hpp"
#include "resize.hpp"

namespace cv
{
namespace opt_AVX2
{

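// Nearest-neighbor resize for 4-byte pixels: each output pixel is fetched as
// one 32-bit load, 8 pixels per iteration via _mm256_i32gather_epi32.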
class resizeNNInvokerAVX4 CV_FINAL :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int avxWidth = width - (width & 0x7);
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
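        // Choose the store path based on whether the second output row
        // (dst.data + dst.step) is 32-byte aligned.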
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
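                // x_ofs[x] holds the byte offset of source pixel x, so the
                // gather (scale 1) copies 8 4-byte pixels per iteration.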
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_maskstore_epi32((int*)D, mask, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_storeu_si256((__m256i*)D, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
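        // Clear the upper halves of the ymm registers to avoid AVX/SSE
        // transition penalties in subsequent SSE code.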
        _mm256_zeroupper();
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
};

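// Nearest-neighbor resize for 2-byte pixels. AVX2 has no 16-bit gather, so
// each iteration gathers 32-bit words twice (from S for pixels x..x+7 and
// from S-2 for pixels x+8..x+15), blends the two results together and
// shuffles them back into sequential order, writing 16 pixels at a time.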
class resizeNNInvokerAVX2 CV_FINAL :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const CV_OVERRIDE
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int avxWidth = width - (width & 0xf);
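        // Constants for reassembling the gathered pixels: after the blend,
        // the even 16-bit words hold pixels x..x+7 and the odd words hold
        // pixels x+8..x+15; shuffle_mask reorders bytes within each 128-bit
        // lane and permute_mask reorders dwords across lanes.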
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
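                // Gather pixels x..x+7 from S (pixel lands in the low word of
                // each dword) and pixels x+8..x+15 from S2 = S-2 (pixel lands
                // in the high word), then blend, shuffle and permute in order.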
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }
            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }
            }
        }
        _mm256_zeroupper();
    }

private:
    const Mat& src;
    Mat& dst;
    int* x_ofs;
    double ify;

    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
};

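// Entry points: run the 2-byte/4-byte invokers in parallel over `range`,
// sizing the stripes so each covers roughly 1<<16 output pixels.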
void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerAVX2 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
{
    resizeNNInvokerAVX4 invoker(src, dst, x_ofs, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
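
/* Illustrative sketch (not part of the library): the caller is expected to
   precompute x_ofs as clamped *byte* offsets of the source pixels, roughly:

       // hypothetical setup for a 4-byte-per-pixel image
       double ifx = (double)src.cols / dst.cols;
       double ify = (double)src.rows / dst.rows;
       AutoBuffer<int> buf(dst.cols);
       int* x_ofs = buf.data();
       int pix_size = (int)src.elemSize();
       for (int x = 0; x < dst.cols; x++)
           x_ofs[x] = std::min(cvFloor(x*ifx), src.cols - 1) * pix_size;
       resizeNN4_AVX2(Range(0, dst.rows), src, dst, x_ofs, ify);
*/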

}
}
/* End of file. */