1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html.
4
5// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
6// Here is the original license:
7/*
8 This file is a part of ficus language project.
9 See ficus/LICENSE for the licensing terms
10*/
11
12#include "../../precomp.hpp"
13#include "convolution.hpp"
14
15#include "conv_block.simd.hpp"
16#include "layers/cpu_kernels/conv_block.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
17#include <opencv2/core/utils/logger.hpp>
18
19namespace cv { namespace dnn {
20enum { VEC_ALIGN = 32}; // Memory alignment.
21
22void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
23 const int convMR, const int convNR);
24void convBlockMR1_F32(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
25 const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
26
27#ifdef CONV_ARM_FP16
// Fast conversion from float32 to float16.
29static inline void _cvt32f16f(const float* src, __fp16* dst, int len)
30{
31 int j = 0;
32 const int VECSZ = 4;
33 __fp16* dst_FP16 = (__fp16 *)dst;
34 if (len > VECSZ * 4)
35 {
36 const int VECSZ4 = 4 * VECSZ;
37 for( ; j + VECSZ4 < len; j += VECSZ4)
38 {
39
40 float32x4_t v0 = vld1q_f32(src + j);
41 float32x4_t v1 = vld1q_f32(src + j + 4);
42 float32x4_t v2 = vld1q_f32(src + j + 8);
43 float32x4_t v3 = vld1q_f32(src + j + 12);
44
45 vst1q_f16(dst_FP16 + j, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
46 vst1q_f16(dst_FP16 + j + 8, vcombine_f16(vcvt_f16_f32(v2), vcvt_f16_f32(v3)));
47 }
48 }
49
50 for( ; j < len; j += VECSZ )
51 {
52 if( j > len - VECSZ )
53 {
54 if( j == 0 )
55 break;
56 j = len - VECSZ;
57 }
58
59 float16x4_t hv = vcvt_f16_f32(vld1q_f32(src + j));
60 vst1_f16(dst_FP16 + j, hv);
61 }
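    // Scalar tail: only reached when len < VECSZ; otherwise the vector loop above already covered the
    // whole buffer (it steps back so the final, possibly overlapping, vector ends exactly at len).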
62 for( ; j < len; j++ )
63 dst[j] = __fp16(src[j]);
64}
65#endif
66
float* FastConv::getWeights()
{
    return alignPtr(weightsBuf.data(), VEC_ALIGN);
}

float* FastConv::getWeightsWino()
{
    return alignPtr(weightsWinoBuf.data(), VEC_ALIGN);
}

hfloat* FastConv::getWeightsFP16()
{
    return alignPtr(weightsBuf_FP16.data(), VEC_ALIGN);
}

hfloat* FastConv::getWeightsWinoFP16()
{
    return alignPtr(weightsWinoBuf_FP16.data(), VEC_ALIGN);
}
86
87Ptr<FastConv> initFastConv(
88 InputArray _weightsMat,
89 float* srcBias,
90 int ngroups,
91 int K, int C,
92 const std::vector<size_t>& kernel_size,
93 const std::vector<size_t>& strides,
94 const std::vector<size_t>& dilations,
95 const std::vector<size_t>& pads_begin,
96 const std::vector<size_t>& pads_end,
97 int conv_dim,
98 const bool _useFP16,
99 bool useWinograd)
100{
101 Ptr<FastConv> conv = makePtr<FastConv>();
102 CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0);
103
104 // Weight shape, [K, C, Dk, Hk, Wk] for Conv3D, [K, C, Hk, Wk] for Conv2D, [K, C, Wk] for Conv1D.
105 int Dk = conv_dim == CONV_3D ? (int)kernel_size[0] : 1;
106 int Hk = conv_dim == CONV_1D ? 1 : (int)kernel_size[kernel_size.size() - 2];
107 int Wk = (int)kernel_size.back();
108 int karea = Wk*Hk*Dk;
109
110 conv->pad_front = conv_dim == CONV_3D ? (int)pads_begin[0] : 0;
111 conv->pad_top = conv_dim == CONV_1D ? 0 : (int)pads_begin[pads_begin.size() - 2];
112 conv->pad_left = (int)pads_begin.back();
113
114 conv->pad_behind = conv_dim == CONV_3D ? (int)pads_end[0] : 0;
115 conv->pad_bottom = conv_dim == CONV_1D ? 0 : (int)pads_end[pads_end.size() - 2];
116 conv->pad_right = (int)pads_end.back();
117
118 int stride_d = conv_dim == CONV_3D ? (int)strides[0] : 1;
119 int stride_h = conv_dim == CONV_1D ? 1 : (int)strides[strides.size() - 2];
120 int stride_w = (int)strides.back();
121
122 int dilation_d = conv_dim == CONV_3D ? (int)dilations[0] : 1;
123 int dilation_h = conv_dim == CONV_1D ? 1 : (int)dilations[dilations.size() - 2];
124 int dilation_w = (int)dilations.back();
125
126 CV_Assert(Dk > 0 && Hk > 0 && Wk > 0);
127 CV_Assert(stride_d >= 0 && stride_h >= 0 && stride_w > 0);
128 CV_Assert(dilation_d > 0 && dilation_h > 0 && dilation_w > 0);
129
130 conv->K = K; conv->C = C; conv->Hk = Hk; conv->Wk = Wk, conv->Dk = Dk;
131
132 conv->stride_d = stride_d;
133 conv->stride_h = stride_h;
134 conv->stride_w = stride_w;
135
136 conv->dilation_d = dilation_d;
137 conv->dilation_h = dilation_h;
138 conv->dilation_w = dilation_w;
139 conv->conv_dim = conv_dim;
140 conv->ngroups = ngroups;
141
142 bool ifRunDepthWise = ngroups > 1 && ngroups == K && ngroups == C;
    bool ifRunDepthWiseRemain = false; // Used for depth-wise cases with large padding, a large kernel, or Conv3D.
144
145 if (ifRunDepthWise)
146 {
147 if (conv_dim == CONV_1D)
148 {
            ifRunDepthWise &= Hk == 1 && Wk == 3 && (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
                    && max(stride_w, dilation_w) >= conv->pad_left && conv->pad_left <= 1;
151 }
152 else if (conv_dim == CONV_2D)
153 {
            ifRunDepthWise &= Hk == 3 && Wk == 3 && ((stride_w == 1) || (stride_w == 2 && dilation_w == 1)) &&
                              max(stride_w, dilation_w) >= conv->pad_left && max(stride_h, dilation_h) >= conv->pad_top
                              && conv->pad_left <= 1 && conv->pad_top <= 1;
157 }
158
159 if (!ifRunDepthWise || conv_dim == CONV_3D)
160 {
161 ifRunDepthWise = false;
162 ifRunDepthWiseRemain = true;
163 }
164 }
165
166 conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? CONV_TYPE_DEPTHWISE :
167 useWinograd && (conv_dim == CONV_2D && (conv->useSIMD128 || conv->useAVX || conv->useAVX2 || conv->useNEON) &&
168 Hk == 3 && Wk == 3 && dilation_h == 1 && dilation_w == 1 && stride_h == 1 && stride_w == 1) ?
169 CONV_TYPE_WINOGRAD3X3 :
170 (ifRunDepthWiseRemain ? CONV_TYPE_DEPTHWISE_REMAIN : CONV_TYPE_GENERIC);
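    // Dispatch summary: small-kernel depth-wise 1D/2D cases (3-wide kernels, stride 1, or stride 2 with
    // dilation 1, padding <= 1) use the dedicated depth-wise path; 3x3 stride-1 dilation-1 Conv2D uses
    // Winograd when SIMD is available and enabled; remaining depth-wise cases (large padding/kernel or
    // Conv3D) use CONV_TYPE_DEPTHWISE_REMAIN; everything else falls back to the generic im2row-based path.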
171
172#if !(CV_NEON || CV_SIMD128 || CV_TRY_AVX || CV_TRY_AVX2)
    if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // Disable Winograd when CV_NEON, CV_SIMD128, CV_TRY_AVX and CV_TRY_AVX2 are not available.
174 conv->conv_type = CONV_TYPE_GENERIC;
175#endif
176
177 Mat weightsMat = _weightsMat.getMat();
    auto wShape = shape(weightsMat);
179 const size_t wstep = weightsMat.step1();
180
181 conv->useFP16 = false;
182#ifdef CONV_ARM_FP16
183 if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN
184 || conv->conv_type == CONV_TYPE_WINOGRAD3X3))
185 conv->useFP16 = true;
186
187 // Runtime FP16 check.
188 if (conv->useFP16 && !checkHardwareSupport(CPU_NEON_FP16))
189 {
190 conv->useFP16 = false;
        CV_LOG_ONCE_WARNING(NULL, "DNN: the CPU does not support the instruction set required by FP16, falling back to FP32.");
192 }
193#endif
194
195 float *srcWeights = (float *)weightsMat.data;
196 if (conv->conv_type == CONV_TYPE_DEPTHWISE || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
197 {
198 // Handle the Conv1D, Conv2D and Conv3D depth-wise.
199 // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
200 // but add some padding to make the weights array layout more SIMD-friendly
201 int ksize = karea;
202
        // TODO: simplify the following code with std::copy.
        // This code pads each kernel to a multiple of the vector size so the packed weights stay SIMD-friendly.
205 int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN;
206 int nweights = C * padded_ksize;
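        // For example, a 3x3 depth-wise kernel has ksize = 9, which is padded up to padded_ksize = 32
        // (the next multiple of VEC_ALIGN), so every channel's weights start at an aligned offset.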
207
208#ifdef CONV_ARM_FP16
209 if (conv->useFP16)
210 {
211 conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
212 auto weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
213
214 parallel_for_(Range(0, C), [&](const Range& r0){
215 for(int c = r0.start; c < r0.end; c++)
216 _cvt32f16f(srcWeights + c*wstep, weightsPtr_FP16 + c*padded_ksize, ksize);
217 });
218 }
219 else
220#endif
221 {
            conv->weightsBuf.resize(nweights + VEC_ALIGN);
            auto weightsPtr = conv->getWeights();

            parallel_for_(Range(0, C), [&](const Range& r0) {
            for(int c = r0.start; c < r0.end; c++)
                memcpy(weightsPtr + c*padded_ksize, srcWeights + c*wstep, ksize*sizeof(weightsPtr[0]));
            });
229 }
230 }
231 else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
232 {
233 static const float ktm[8][3] = {
234 {1.0f, 0.0f, 0.0f},
235 {-2.0f / 9, -2.0f / 9, -2.0f / 9},
236 {-2.0f / 9, 2.0f / 9, -2.0f / 9},
237 {1.0f / 90, 1.0f / 45, 2.0f / 45},
238 {1.0f / 90, -1.0f / 45, 2.0f / 45},
239 {32.f/45, 16.f/45, 8.f/45},
240 {32.f/45, -16.f/45, 8.f/45},
241 {0.0f, 0.0f, 1.0f}
242 };
243
244 const int CONV_WINO_KBLOCK = 4;
245
246#if CV_TRY_AVX || CV_TRY_AVX2
247 const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
248#else
249 const int CONV_WINO_ATOM_F32 = 4;
250#endif
251 const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
252
253#ifdef CONV_ARM_FP16
254 // FP 16
255 const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
256 const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
257#endif
258
259 // the weights are packed as 6-dim tensor:
260 // ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
261 // where W is the size of Winograd-transformed kernel (8x8),
262 // ATOM_SIZE is number of lanes in SIMD register (4 for NEON and FP32),
263 // KBLOCK is some platform-dependent constant dependent on the number of SIMD registers.
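        // Illustrative size check (hypothetical layer sizes): with FP32 NEON packing (ATOM_SIZE = 4, KBLOCK = 4)
        // and a non-grouped layer with K = 64, C = 32, the packed tensor is 1 x 16 x 16 x 32 x 4 x 4 floats,
        // i.e. exactly K*C*8*8 transformed coefficients, just reordered for sequential SIMD access.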
264 int ksize = CONV_WINO_KSIZE * CONV_WINO_KSIZE;
265 int Cg = C/ngroups;
266 int Kg = K/ngroups;
267 int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
268 size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA;
269
270 float* wptrWino = nullptr;
271#ifdef CONV_ARM_FP16
272 __fp16* wptrWino_FP16 = nullptr;
273 if (conv->useFP16)
274 {
275 conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN);
276 wptrWino_FP16 = (__fp16*)conv->getWeightsWinoFP16();
277 }
278 else
279#endif
280 {
            conv->weightsWinoBuf.resize(nweights + VEC_ALIGN);
282 wptrWino = conv->getWeightsWino();
283 }
284
        parallel_for_(Range(0, K), [&](const Range& r0){
286 float kernelTm[CONV_WINO_AREA];
287 for (int k = r0.start; k < r0.end; k++)
288 {
289 int g = k / Kg;
290 int k_ = k - g*Kg;
291 int ki = k_ / CONV_WINO_KBLOCK;
292 int dk = k_ - ki*CONV_WINO_KBLOCK;
293
294 for (int c = 0; c < Cg; c++)
295 {
296 // wstep = Hk*Wk*Cg
297 const float *kernel0 = srcWeights + k * wstep + c * ksize;
298
299 // transform kernel, transposed
300 const float *k0 = kernel0;
301 const float *k1 = kernel0 + 3;
302 const float *k2 = kernel0 + 6;
303
304 // h
305 float tmp[8][3];
306 for (int i = 0; i < 8; i++)
307 {
308 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
309 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
310 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
311 }
312
313 // v
314 for (int j = 0; j < 8; j++)
315 {
316 float *tmpp = &tmp[j][0];
317
318 for (int i = 0; i < 8; i++)
319 kernelTm[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
320 }
321
322 // repack the data.
323#ifdef CONV_ARM_FP16
324 if (conv->useFP16)
325 {
326 __fp16* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
327 (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16;
328 for (int i = 0; i < CONV_WINO_NATOMS_F16; i++,
329 wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16)
330 {
331 CV_Assert(wptrWino_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= wptrWino_FP16 + nweights);
332 for (int j = 0; j < CONV_WINO_ATOM_F16; j++)
333 {
334 wptr[j] = (__fp16)kernelTm[i * CONV_WINO_ATOM_F16 + j];
335 }
336 }
337 }
338 else
339#endif
340 {
341 float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
342 (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32;
343 for (int i = 0; i < CONV_WINO_NATOMS_F32; i++,
344 wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32)
345 {
346 CV_Assert(wptrWino <= wptr && wptr + CONV_WINO_ATOM_F32 <= wptrWino + nweights);
                            memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0]));
348 }
349 }
350 }
351 }
352 });
353 }
354 else if (conv->conv_type == CONV_TYPE_GENERIC)
355 {
356 // The weights are packed as
357 // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor
        int Kg = K/ngroups, Cg = max(C/ngroups, 1);
359 int DkHkWkCg = Dk*Hk*Wk*Cg;
360
361 int numStripsMR = (Kg + CONV_MR_FP32 - 1) / CONV_MR_FP32;
362 int Kg_aligned = numStripsMR * CONV_MR_FP32;
363 size_t nweights = ngroups*Kg_aligned*DkHkWkCg;
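        // Illustrative example (assuming CONV_MR_FP32 == 4): a non-grouped 3x3 Conv2D with K = 64, C = 32
        // gives numStripsMR = 16 and Kg_aligned = 64; each strip stores a DkHkWkCg x CONV_MR = 288 x 4
        // panel of weights, zero-padded along K when Kg is not a multiple of CONV_MR.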
364 float* weightsPtr = nullptr;
365
366#ifdef CONV_ARM_FP16
367 int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16;
368 int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
369 size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
370 __fp16* weightsPtr_FP16 = nullptr;
371
372 if (conv->useFP16)
373 {
374 conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN);
375 weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
376 }
377 else
378#endif
379 {
            conv->weightsBuf.resize(nweights + VEC_ALIGN);
381 weightsPtr = conv->getWeights();
382 }
383
        // Pack the weights.
385#ifdef CONV_ARM_FP16
386 if (conv->useFP16)
387 {
388 parallel_for_(Range(0, ngroups * numStripsMR_FP16), [&](const Range& r0){
389 for (int gsi = r0.start; gsi < r0.end; gsi++)
390 {
391 int g = gsi / numStripsMR_FP16;
392 int si = gsi - g * numStripsMR_FP16;
393
394 int startK = si * CONV_MR_FP16;
395 CV_Assert(startK < Kg_aligned_FP16);
396
397 __fp16* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
398 int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding.
399
400 int k_idx = g*Kg + startK;
401 for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++)
402 {
403 for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP16)
404 {
405 const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
406 int k = 0;
407 for(; k < dk; k++, wptr += wstep)
408 packed_wptr[k] = (__fp16)(*wptr);
409 for(; k < CONV_MR_FP16; k++)
410 packed_wptr[k] = (__fp16)0.f;
411 }
412 }
413 }});
414 }
415 else
416#endif
417 {
        parallel_for_(Range(0, ngroups * numStripsMR), [&](const Range& r0){
419 for (int gsi = r0.start; gsi < r0.end; gsi++)
420 {
421 int g = gsi / numStripsMR;
422 int si = gsi - g * numStripsMR;
423
424 int startK = si * CONV_MR_FP32;
425 CV_Assert(startK < Kg_aligned);
426
427 float* packed_wptr = weightsPtr + DkHkWkCg * (startK + g * Kg_aligned);
428 int dk = Kg - startK < CONV_MR_FP32 ? Kg - startK : CONV_MR_FP32; // check if we need zero padding.
429
430 int k_idx = g*Kg + startK;
431 for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++)
432 {
433 for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP32)
434 {
435 const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
436 int k = 0;
437 for(; k < dk; k++, wptr += wstep)
438 packed_wptr[k] = *wptr;
439 for(; k < CONV_MR_FP32; k++)
440 packed_wptr[k] = 0.f;
441 }
442 }
443 }});
444 }
445 }
446 else
447 CV_Error(cv::Error::StsUnsupportedFormat, "Unknown convolution type.");
448
    // store bias; append some zeros to make sure that
    // we can always read MR elements starting from any valid index
    {
        int k = 0, nbias = K + VEC_ALIGN;
        conv->biasBuf.resize(nbias);
454 float* biasBufPtr = conv->biasBuf.data();
455 for(; k < K; k++)
456 biasBufPtr[k] = srcBias ? srcBias[k] : 0.f;
457 for(; k < nbias; k++)
458 biasBufPtr[k] = 0.f;
459 }
460 return conv;
461}
462
463static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
464 const int stride_w, const int ksize, const int esz)
465{
466 char * inpbufC = inpbuf + s0 * esz;
467 float* inptrInC = (float* )inptrIn;
468
469#ifdef CONV_ARM_FP16
470 __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
471 if (esz == sizeof(__fp16))
472 {
473 if (stride_w == 1)
474 {
475 for (int k = 0; k < ksize; k++)
476 {
477 int k1 = ofstab[k];
478
479 float32x4_t v0 = vld1q_f32(inptrInC + k1);
480 float32x4_t v1 = vld1q_f32(inptrInC + k1 + 4);
481 vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
482 }
483 }
484 else
485 {
486 for (int k = 0; k < ksize; k++)
487 {
488 int k1 = ofstab[k];
489 float32x4_t v0 = {inptrInC[k1], inptrInC[k1 + stride_w], inptrInC[k1 + 2*stride_w], inptrInC[k1 + 3*stride_w]};
490 float32x4_t v1 = {inptrInC[k1 + 4*stride_w], inptrInC[k1 + 5*stride_w], inptrInC[k1 + 6*stride_w], inptrInC[k1 + 7*stride_w]};
491
492 vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
493 }
494 }
495 }
496 else // float 32
497#endif
498 {
499 CV_Assert(esz == sizeof(float ));
500 float* inpbufC_FP32 = (float* )inpbufC;
501 if (stride_w == 1)
502 for (int k = 0; k < ksize; k++)
503 {
504 int k1 = ofstab[k];
505#if CV_SIMD256
506 vx_store(inpbufC_FP32 + k*CONV_NR_FP32, vx_load(inptrInC + k1));
507#elif CV_SIMD128
                v_float32x4 vv0 = v_load(inptrInC + k1);
                v_float32x4 vv1 = v_load(inptrInC + k1 + 4);
                v_store(inpbufC_FP32 + k*CONV_NR_FP32, vv0);
                v_store(inpbufC_FP32 + k*CONV_NR_FP32 + 4, vv1);
512#else
513 float v0 = inptrInC[k1];
514 float v1 = inptrInC[k1 + 1];
515 float v2 = inptrInC[k1 + 2];
516 float v3 = inptrInC[k1 + 3];
517 float v4 = inptrInC[k1 + 4];
518 float v5 = inptrInC[k1 + 5];
519 float v6 = inptrInC[k1 + 6];
520 float v7 = inptrInC[k1 + 7];
521
522 inpbufC_FP32[k*CONV_NR_FP32] = v0;
523 inpbufC_FP32[k*CONV_NR_FP32+1] = v1;
524 inpbufC_FP32[k*CONV_NR_FP32+2] = v2;
525 inpbufC_FP32[k*CONV_NR_FP32+3] = v3;
526 inpbufC_FP32[k*CONV_NR_FP32+4] = v4;
527 inpbufC_FP32[k*CONV_NR_FP32+5] = v5;
528 inpbufC_FP32[k*CONV_NR_FP32+6] = v6;
529 inpbufC_FP32[k*CONV_NR_FP32+7] = v7;
530#endif
531 }
532 else
533 for (int k = 0; k < ksize; k++)
534 {
535 int k1 = ofstab[k];
536 float v0 = inptrInC[k1];
537 float v1 = inptrInC[k1 + stride_w];
538 float v2 = inptrInC[k1 + 2*stride_w];
539 float v3 = inptrInC[k1 + 3*stride_w];
540 float v4 = inptrInC[k1 + 4*stride_w];
541 float v5 = inptrInC[k1 + 5*stride_w];
542 float v6 = inptrInC[k1 + 6*stride_w];
543 float v7 = inptrInC[k1 + 7*stride_w];
544
545 inpbufC_FP32[k*CONV_NR_FP32] = v0;
546 inpbufC_FP32[k*CONV_NR_FP32+1] = v1;
547 inpbufC_FP32[k*CONV_NR_FP32+2] = v2;
548 inpbufC_FP32[k*CONV_NR_FP32+3] = v3;
549 inpbufC_FP32[k*CONV_NR_FP32+4] = v4;
550 inpbufC_FP32[k*CONV_NR_FP32+5] = v5;
551 inpbufC_FP32[k*CONV_NR_FP32+6] = v6;
552 inpbufC_FP32[k*CONV_NR_FP32+7] = v7;
553 }
554 }
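    // Advance by 7 positions here; the caller's loop increment (x0++, s0++, inptrIn += stride_w,
    // in_w += stride_w) supplies the 8th step, so 8 output positions have been packed in total.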
555 x0+=7;
556 s0+=7;
557 inptrIn += 7*stride_w;
558 in_w += 7*stride_w;
559}
560
561static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
562 const int stride_w, const int ksize, const int esz)
563{
564 char* inpbufC = inpbuf + s0 * esz;
565 float* inptrInC = inptrIn;
566
567#ifdef CONV_ARM_FP16
568 __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
569 if (esz == sizeof(__fp16))
570 {
571 for (int k = 0; k < ksize; k++)
572 {
573 int k1 = ofstab[k];
574 float v0 = inptrInC[k1];
575 float v1 = inptrInC[k1 + stride_w];
576 inpbufC_FP16[k*CONV_NR_FP16] = (__fp16)v0;
577 inpbufC_FP16[k*CONV_NR_FP16+1] = (__fp16)v1;
578 }
579 } else
580#endif
581 {
582 float * inpbufC_FP32 = (float *)inpbufC;
583 for (int k = 0; k < ksize; k++)
584 {
585 int k1 = ofstab[k];
586 float v0 = inptrInC[k1];
587 float v1 = inptrInC[k1 + stride_w];
588 inpbufC_FP32[k*CONV_NR_FP32] = v0;
589 inpbufC_FP32[k*CONV_NR_FP32+1] = v1;
590 }
591 }
592
593 x0++;
594 s0++;
595 inptrIn += stride_w;
596 in_w += stride_w;
597}
598
599static inline void packInputData(char* inpbuf_task, float* inp, const int* ofstab, const int* dhwTab, int zyx0, int zyx_limit,
600 int ksize, int stride_d, int stride_h, int stride_w, int pad_front, int pad_top, int pad_left,
601 int Dk, int Hk, int Wk, int dilation_d, int dilation_h, int dilation_w, int Di, int Hi, int Wi,
602 int H0, int W0, int Cg, int stripesize, int inp_plane_ofs, int inp_planesize, int conv_dim, int conv_type,
603 const int CONV_NR, const int esz, bool fast_1x1, bool useFP16)
604{
605 for (int stripe = 0; zyx0 < zyx_limit; stripe++, zyx0 += CONV_NR)
606 {
607 char *inpbuf = inpbuf_task + stripe * stripesize * esz;
608 float *inptr = inp + inp_plane_ofs;
609
610 /*
611 1. pack the data. Copy the HkxWk CONV_NR-wide slices from
612 each feature plane of the input tensor to the input buffer.
613 */
614 if (fast_1x1)
615 {
616 int slice_len = zyx_limit - zyx0;
617 bool partial = slice_len < CONV_NR;
618 const int CONV_NR_esz = CONV_NR * esz;
619 // Superfast branch for 1x1 convolutions with sy=sx=1.
620 // in this case each feature plane can be safely treated
621 // as 1D array, and we just extract next portion
622 // of CONV_NR elements from each feature plane and
623 // put it together.
624 inptr += zyx0;
625 if (!partial)
626 {
627 // Make special branch where memcpy() is called with a constant buffer size.
628 // Compilers will likely unroll this loop properly.
629#ifdef CONV_ARM_FP16
630 if (useFP16)
631 {
632 for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
633 _cvt32f16f(inptr, (__fp16 *)inpbuf, CONV_NR);
634 }
635 else
636#endif
637 for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
                    memcpy(inpbuf, inptr, CONV_NR_esz);
639 }
640 else
641 {
642#ifdef CONV_ARM_FP16
643 if (useFP16)
644 {
645 for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
646 {
647 _cvt32f16f(inptr, (__fp16 *)inpbuf, slice_len);
648 }
649 }
650 else
651#endif
652 for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
653 {
                    memcpy(inpbuf, inptr, slice_len * esz);
655 }
656 }
657 }
658 else if (conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
659 {
660 CV_Assert(Cg == 1);
661 const int HW0 = H0 * W0;
662 const int HWi = Hi * Wi;
            int slice_len = std::min(zyx_limit - zyx0, CONV_NR);
664
            // Here some non-contiguous sub-rows of the row will not be
            // filled from the tensor; we need to make sure that the uncovered
            // elements are explicitly set to 0's. The easiest way is to
            // set all the elements to 0's before the loop.
            memset(inpbuf, 0, stripesize * esz);
670
671 int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0;
672 int y0 = yx0 / W0, x0 = yx0 - y0 * W0;
673
674 if (conv_dim == CONV_1D)
675 {
676 for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
677 {
                    int delta = std::min(slice_len - slice_i, W0 - x0);
679 int x1 = x0 + delta;
680
681 int in_w = x0 * stride_w - pad_left;
682 float* inptrIn = inptr + in_w;
683
684 int s0 = slice_i;
685
686 for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
687 {
688 // Pack 8
689 if (x0 + 8 <= x1 && 0 <= in_w &&
690 in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
691 {
692 packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
693 }
694 else if (x0 + 2 <= x1 && 0 <= in_w &&
695 in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
696 {
697 packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
698 }
699 else
700 {
                            int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
                            int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
703 const float* inptrInC = inptrIn;
704#ifdef CONV_ARM_FP16
705 if (useFP16)
706 {
707 __fp16* inpbufC = (__fp16 *)inpbuf + s0;
708 for (int w = w0; w < w1; w++)
709 {
710 int imgofs = w*dilation_w;
711 inpbufC[w*CONV_NR] = (__fp16)inptrInC[imgofs];
712 }
713 }
714 else
715#endif
716 {
717 float* inpbufC = (float *)inpbuf + s0;
718 for (int w = w0; w < w1; w++)
719 {
720 int imgofs = w*dilation_w;
721 inpbufC[w*CONV_NR] = inptrInC[imgofs];
722 }
723 }
724 }
725 }
726 slice_i += delta;
727 }
728 }
729 else if (conv_dim == CONV_2D)
730 {
731 for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
732 {
                    int delta = std::min(slice_len - slice_i, W0 - x0);
734 int x1 = x0 + delta;
735
736 int in_h = y0 * stride_h - pad_top;
737 int in_w = x0 * stride_w - pad_left;
738
739 float* inptrIn = inptr + in_h*Wi + in_w;
740
741 bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h;
                    int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h);
                    int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h);
744
745 int s0 = slice_i;
746 for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
747 {
748 // Pack 8
749 if (ok_i && x0 + 8 <= x1 && 0 <= in_w &&
750 in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
751 {
752 packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
753 }
754 else if (ok_i && x0 + 2 <= x1 && 0 <= in_w &&
755 in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
756 {
757 packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
758 }
759 else
760 {
                            int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
                            int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
763
764 const float* inptrInC = inptrIn;
765#ifdef CONV_ARM_FP16
766 if (useFP16)
767 {
768 __fp16* inpbufC = (__fp16 *)inpbuf + s0;
769
770 for (int h = h0; h < h1; h++)
771 {
772 for (int w = w0; w < w1; w++)
773 {
774 int imgofs = h*(dilation_h*Wi) + w*dilation_w;
775 inpbufC[(h*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
776 }
777 }
778 }
779 else
780#endif
781 {
782 float* inpbufC = (float *)inpbuf + s0;
783
784 for (int h = h0; h < h1; h++)
785 {
786 for (int w = w0; w < w1; w++)
787 {
788 int imgofs = h*(dilation_h*Wi) + w*dilation_w;
789 inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs];
790 }
791 }
792 }
793 }
794 }
795 slice_i += delta;
796 }
797 }
798 else if (conv_dim == CONV_3D)
799 {
800 for (int slice_i = 0; slice_i < slice_len; z0 += (y0+1)/H0, y0 = (y0+1)%H0, x0=0)
801 {
                    int delta = std::min(slice_len - slice_i, W0 - x0);
803 int x1 = x0 + delta;
804
805 int in_d = z0 * stride_d - pad_front;
806 int in_h = y0 * stride_h - pad_top;
807 int in_w = x0 * stride_w - pad_left;
808
809 float* inptrIn = inptr + in_d*HWi + in_h*Wi + in_w;
810
                    int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d);
                    int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d);
813
814 bool ok_i = 0 <= in_d && in_d < Di - (Dk-1)*dilation_d &&
815 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h;
                    int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h);
                    int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h);
818
819 int s0 = slice_i;
820 for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
821 {
822 // Pack 8
823 if (ok_i && x0 + 8 <= x1 && 0 <= in_w &&
824 in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
825 {
826 packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
827 }
828 else if (ok_i && x0 + 2 <= x1 && 0 <= in_w &&
829 in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
830 {
831 packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
832 }
833 else
834 {
                            int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
                            int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
837 const float* inptrInC = inptrIn;
838#ifdef CONV_ARM_FP16
839 if (useFP16)
840 {
841 __fp16* inpbufC = (__fp16* )inpbuf + s0;
842
843 for ( int d = d0; d < d1; d++)
844 {
845 for (int h = h0; h < h1; h++)
846 {
847 for (int w = w0; w < w1; w++)
848 {
849 int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
850 inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
851 }
852 }
853 }
854 }
855 else
856#endif
857 {
858 float* inpbufC = (float* )inpbuf + s0;
859
860 for ( int d = d0; d < d1; d++)
861 {
862 for (int h = h0; h < h1; h++)
863 {
864 for (int w = w0; w < w1; w++)
865 {
866 int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
867 inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs];
868 }
869 }
870 }
871 }
872 }
873 }
874 slice_i += delta;
875 }
876 }
877 }
878 else
879 {
880 const int HW0 = H0 * W0;
881 const int HWi = Hi * Wi;
882 int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0;
883 int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0;
884 for (int k = 0; k < ksize; k++)
885 {
886 int dz = dhwTab[k * 3], dy = dhwTab[k * 3 + 1], dx = dhwTab[k * 3 + 2];
887 int i = 0, z0 = z0_, y0 = y0_, x0 = x0_;
888 for (; i < CONV_NR;)
889 {
890 float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
891#ifdef CONV_ARM_FP16
892 __fp16 * inpbuf_ki_FP16 = (__fp16 *)inpbuf + k * CONV_NR * Cg + i;
893#endif
894
895 int zi = z0 * stride_d + dz - pad_front;
896 int yi = y0 * stride_h + dy - pad_top;
897 int xi = x0 * stride_w + dx - pad_left;
898
899 if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi &&
900 (unsigned) xi < (unsigned) Wi)
901 {
902 const float *inptr_ki = inptr + zi * HWi + yi * Wi + xi;
903 if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_w * 8 <= Wi)
904 {
905 if (stride_w == 1)
906 {
907#ifdef CONV_ARM_FP16
908 if (useFP16)
909 {
910 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
911 {
912 float32x4_t v0 = vld1q_f32(inptr_ki);
913 float32x4_t v1 = vld1q_f32(inptr_ki + 4);
914
915 vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
916 }
917 }
918 else
919#endif
920 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
921 {
922 float t0 = inptr_ki[0], t1 = inptr_ki[1];
923 float t2 = inptr_ki[2], t3 = inptr_ki[3];
924 float t4 = inptr_ki[4], t5 = inptr_ki[5];
925 float t6 = inptr_ki[6], t7 = inptr_ki[7];
926 inpbuf_ki[0] = t0;
927 inpbuf_ki[1] = t1;
928 inpbuf_ki[2] = t2;
929 inpbuf_ki[3] = t3;
930 inpbuf_ki[4] = t4;
931 inpbuf_ki[5] = t5;
932 inpbuf_ki[6] = t6;
933 inpbuf_ki[7] = t7;
934 }
935 }
936 else if (stride_w == 2)
937 {
938#ifdef CONV_ARM_FP16
939 if (useFP16)
940 {
941 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
942 {
943 float32x4_t v0 = {inptr_ki[0], inptr_ki[2], inptr_ki[4], inptr_ki[6]};
944 float32x4_t v1 = {inptr_ki[8], inptr_ki[10], inptr_ki[12], inptr_ki[14]};
945 vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
946 }
947 }
948 else
949#endif
950 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
951 {
952 float t0 = inptr_ki[0], t1 = inptr_ki[2];
953 float t2 = inptr_ki[4], t3 = inptr_ki[6];
954 float t4 = inptr_ki[8], t5 = inptr_ki[10];
955 float t6 = inptr_ki[12], t7 = inptr_ki[14];
956 inpbuf_ki[0] = t0;
957 inpbuf_ki[1] = t1;
958 inpbuf_ki[2] = t2;
959 inpbuf_ki[3] = t3;
960 inpbuf_ki[4] = t4;
961 inpbuf_ki[5] = t5;
962 inpbuf_ki[6] = t6;
963 inpbuf_ki[7] = t7;
964 }
965 }
966 else
967 {
968#ifdef CONV_ARM_FP16
969 if (useFP16)
970 {
971 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
972 {
973 float32x4_t v0 = {inptr_ki[0], inptr_ki[stride_w], inptr_ki[stride_w * 2], inptr_ki[stride_w * 3]};
974 float32x4_t v1 = {inptr_ki[stride_w * 4], inptr_ki[stride_w * 5], inptr_ki[stride_w * 6], inptr_ki[stride_w * 7]};
975 vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
976 }
977 }
978 else
979#endif
980 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
981 {
982 float t0 = inptr_ki[0], t1 = inptr_ki[stride_w];
983 float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3];
984 float t4 = inptr_ki[stride_w * 4], t5 = inptr_ki[stride_w * 5];
985 float t6 = inptr_ki[stride_w * 6], t7 = inptr_ki[stride_w * 7];
986 inpbuf_ki[0] = t0;
987 inpbuf_ki[1] = t1;
988 inpbuf_ki[2] = t2;
989 inpbuf_ki[3] = t3;
990 inpbuf_ki[4] = t4;
991 inpbuf_ki[5] = t5;
992 inpbuf_ki[6] = t6;
993 inpbuf_ki[7] = t7;
994 }
995 }
996 i += 8;
997 x0 += 8;
998 }
999 else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_w * 4 <= Wi)
1000 {
1001 if (stride_w == 1)
1002 {
1003#ifdef CONV_ARM_FP16
1004 if (useFP16)
1005 {
1006 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
1007 {
1008 float32x4_t v0 = vld1q_f32(inptr_ki);
1009 vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0));
1010 }
1011 }
1012 else
1013#endif
1014 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
1015 {
1016 float t0 = inptr_ki[0], t1 = inptr_ki[1];
1017 float t2 = inptr_ki[2], t3 = inptr_ki[3];
1018 inpbuf_ki[0] = t0;
1019 inpbuf_ki[1] = t1;
1020 inpbuf_ki[2] = t2;
1021 inpbuf_ki[3] = t3;
1022 }
1023 }
1024 else
1025 {
1026#ifdef CONV_ARM_FP16
1027 if (useFP16)
1028 {
1029 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
1030 {
1031 float32x4_t v0 = {inptr_ki[0], inptr_ki[stride_w], inptr_ki[stride_w * 2], inptr_ki[stride_w * 3]};
1032 vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0));
1033 }
1034 }
1035 else
1036#endif
1037 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
1038 {
1039 float t0 = inptr_ki[0], t1 = inptr_ki[stride_w];
1040 float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3];
1041 inpbuf_ki[0] = t0;
1042 inpbuf_ki[1] = t1;
1043 inpbuf_ki[2] = t2;
1044 inpbuf_ki[3] = t3;
1045 }
1046 }
1047 i += 4;
1048 x0 += 4;
1049 }
1050 else
1051 {
1052#ifdef CONV_ARM_FP16
1053 if (useFP16)
1054 {
1055 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
1056 inpbuf_ki_FP16[0] = (__fp16)(*inptr_ki);
1057 }
1058 else
1059#endif
1060 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
1061 *inpbuf_ki = *inptr_ki;
1062 i++;
1063 x0++;
1064 }
1065 }
1066 else
1067 {
1068#ifdef CONV_ARM_FP16
1069 if (useFP16)
1070 {
1071 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
1072 inpbuf_ki_FP16[0] = (__fp16)0.f;
1073 }
1074 else
1075#endif
1076 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR)
1077 inpbuf_ki[0] = 0.f;
1078 i++;
1079 x0++;
1080 }
1081
1082 int mask = x0 >= W0;
1083 y0 += mask;
1084 x0 &= mask - 1;
1085
                    mask = y0 >= H0; // Only Conv3D needs to advance along the z0 dimension.
1087 if (mask && conv_dim != CONV_3D)
1088 break;
1089
1090 z0 += mask;
1091 y0 &= mask - 1;
1092 }
1093 }
1094 }
1095 }
1096}
1097
1098void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
1099 const Ptr<ActivationLayer>& actLayer, const std::vector<float>& reluslope, bool fusedAdd)
1100{
1101 Mat input = _input.getMat();
1102 Mat output = _output.getMat();
1103 int conv_dim = conv->conv_dim;
1104
1105 CV_Assert_N(input.dims == output.dims,
1106 input.size[0] == output.size[0],
1107 conv->C == input.size[1],
1108 conv->K == output.size[1],
1109 input.type() == output.type(),
1110 input.isContinuous(),
1111 output.isContinuous());
1112
1113 const bool useFP16 = conv->useFP16;
1114 Mat fusedAddMat;
1115 if (fusedAdd)
1116 {
1117 CV_Assert(conv->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!");
1118 fusedAddMat = _output.getMat();
1119 }
1120
1121 if (conv->conv_type == CONV_TYPE_DEPTHWISE)
1122 {
1123 // Depthwise-Convolution layer should not be followed by Add layer.
1124 CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D) && !useFP16);
        return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd);
1126 }
1127
    MatShape inputShape = shape(input);
    MatShape outputShape = shape(output);
1130
1131 CV_Assert(inputShape.size() == outputShape.size());
1132
1133 ActivationLayer* activ = nullptr;
1134 float minval = -FLT_MAX, maxval = FLT_MAX;
1135 bool ifMinMaxAct = false;
1136
1137 if (actLayer)
1138 {
1139 Ptr<ReLULayer> activ_relu = actLayer.dynamicCast<ReLULayer>();
1140 Ptr<ReLU6Layer> activ_relu6 = actLayer.dynamicCast<ReLU6Layer>();
1141
1142 if (!activ_relu.empty())
1143 {
1144 if (activ_relu->negativeSlope == 0.0f)
1145 {
1146 minval = 0.0f;
1147 ifMinMaxAct = true;
1148 activ = nullptr;
1149 }
1150 else // Leaky ReLU
1151 {
1152 activ = actLayer.get();
1153 }
1154 }
1155 else if (!activ_relu6.empty())
1156 {
1157 minval = activ_relu6->minValue;
1158 maxval = activ_relu6->maxValue;
1159
1160 ifMinMaxAct = true;
1161 activ = nullptr;
1162 }
1163 else
1164 activ = actLayer.get();
1165 }
1166 else
1167 activ = nullptr;
1168
1169 if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
1170 {
1171 CV_Assert((!conv->weightsWinoBuf.empty() || !conv->weightsWinoBuf_FP16.empty()) && input.dims == 4 && conv_dim == CONV_2D);
        if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
1173 return;
1174 }
1175
1176 int N = inputShape[0], C = inputShape[1];
1177
1178 // input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D.
1179 int Di = conv_dim == CONV_3D ? inputShape[2] : 1;
1180 int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
1181 int Wi = inputShape[inputShape.size() - 1];
1182
1183 int ngroups = conv->ngroups;
1184 int K = conv->K, Dk = conv->Dk, Hk = conv->Hk, Wk = conv->Wk;
1185
1186 int D0 = conv_dim == CONV_3D ? outputShape[2] : 1;
1187 int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
1188 int W0 = outputShape[outputShape.size() - 1];
1189
1190 int Cg = C/ngroups, Kg = K/ngroups;
1191
1192 const size_t inp_planesize = (size_t)Di*Hi*Wi;
1193 const size_t out_planesize = (size_t)D0*H0*W0;
1194
1195 int pad_front = conv->pad_front;
1196 int pad_top = conv->pad_top;
1197 int pad_left = conv->pad_left;
1198
1199 int stride_d = conv->stride_d, stride_h = conv->stride_h, stride_w = conv->stride_w;
1200 int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
1201
1202 int ksize = Dk*Hk*Wk;
1203 bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1
1204 && pad_front == 0 && pad_left == 0 && pad_top == 0;
1205 int DkHkWkCg = Dk*Hk*Wk*Cg;
1206
1207 std::vector<int> ofstab_(Hk*Wk*Dk*4, 0);
1208 int* ofstab = ofstab_.data();
1209 int* dhwTab = ofstab + Hk*Wk*Dk;
1210 int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN;
1211
1212 if (conv_dim == CONV_1D)
1213 {
1214 for( int w = 0; w < Wk; w++)
1215 {
1216 int dw = w*dilation_w;
1217 dhwTab[w*3+2] = dw;
1218 ofstab[w] = dw;
1219 }
1220 }
1221 else if (conv_dim == CONV_2D)
1222 {
1223 for (int h = 0; h < Hk; h++)
1224 for( int w = 0; w < Wk; w++)
1225 {
1226 int k = h*Wk + w;
1227 int dh = h*dilation_h, dw = w*dilation_w;
1228 dhwTab[k*3+1] = dh;
1229 dhwTab[k*3+2] = dw;
1230 ofstab[k] = dh*Wi + dw;
1231 }
1232 }
1233 else
1234 {
1235 for (int d = 0; d < Dk; d++)
1236 for (int h = 0; h < Hk; h++)
1237 {
1238 for (int w = 0; w < Wk; w++)
1239 {
1240 int k = d*Hk*Wk + h*Wk + w;
1241 int dd = d*dilation_d, dh = h*dilation_h, dw = w*dilation_w;
1242 dhwTab[k*3] = dd;
1243 dhwTab[k*3+1] = dh;
1244 dhwTab[k*3+2] = dw;
1245 ofstab[k] = dd*Hi*Wi + dh*Wi + dw;
1246 }
1247 }
1248 }
1249
1250 int CONV_NR = CONV_NR_FP32;
1251 int CONV_MR = CONV_MR_FP32;
1252 int esz = sizeof(float );
1253
1254#ifdef CONV_ARM_FP16
1255 if (useFP16)
1256 {
1257 // works at FP 16.
1258 CONV_NR = CONV_NR_FP16;
1259 CONV_MR = CONV_MR_FP16;
1260 esz = sizeof(__fp16);
1261 }
1262#endif
1263
1264 int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR;
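    // Each task packs and consumes at most MAX_STRIPES stripes of CONV_NR output positions at a time
    // (roughly 56 positions); the depth-wise-remain path always uses a single stripe.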
1265
1266 // Friendly to L1 cache
1267 const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
1268 const int C_BLOCK_SIZE = 256;
1269
1270 int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR;
1271 int Kg_aligned = Kg_nblocks * CONV_MR;
1272
1273 int stripes_per_plane0 = ((int)out_planesize + CONV_NR - 1) / CONV_NR;
1274 int stripes_per_plane = stripes_per_plane0;
1275
1276 if (stripes_per_plane < ntasks * 4 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
1277 {
1278 MAX_STRIPES = 1;
1279 stripes_per_plane = 1;
1280 }
1281 else
1282 Kg_nblocks = 1;
1283
1284 bool separateIm2col = (fast_1x1 || stripes_per_plane == 1) && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN;
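    // When im2row can be done once per plane (fast 1x1, or a single stripe per plane), it is performed as a
    // separate phase into inpbuf_all_0 below; otherwise each task re-packs its stripes inside the compute loop.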
1285
1286 int Kstripes = Kg_nblocks * stripes_per_plane;
1287 int nsubtasks = N * ngroups * Kstripes;
1288
    size_t stripesize = alignSize(CONV_NR * ksize * Cg, VEC_ALIGN);
    size_t cbufsize = alignSize(CONV_NR * K_BLOCK_SIZE * MAX_STRIPES, VEC_ALIGN);
1291
1292 size_t taskbufsize = cbufsize * sizeof(float );
1293
1294 if (!separateIm2col)
1295 taskbufsize += MAX_STRIPES * stripesize * esz;
1296
1297 size_t totalbufsize_base = taskbufsize * ntasks;
1298 size_t totalbufsize = totalbufsize_base;
1299 if (separateIm2col)
1300 totalbufsize += N * ngroups * stripes_per_plane0 * stripesize * esz;
1301
1302 AutoBuffer<char> inpbuf_all_;
1303 char* inpbuf_all = nullptr;
1304
    inpbuf_all_.allocate(totalbufsize + VEC_ALIGN * sizeof(float ));
    inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN * sizeof(float )));
1307 char* inpbuf_all_0 = inpbuf_all + totalbufsize_base;
1308
1309 float* inp = input.ptr<float>();
1310 float* out = output.ptr<float>();
1311 float* fusedAddPtr0 = fusedAddMat.empty() ? 0 : fusedAddMat.ptr<float>();
1312
1313 // In the case of 1x1 convolution we first reorder the whole input tensor.
1314 // In general, im2row results in Hk*Wk-x unrolling factor
1315 // (e.g. 3*3=9x unrolling for 3x3 convolution), thus for 1x1 convolution
1316 // the reordered tensor will take as much space as the original tensor.
1317 if (separateIm2col)
1318 {
1319 // the optional phase 1. im2row
        parallel_for_(Range(0, ntasks), [&](const Range& r0) {
1321 for (int task_id = r0.start; task_id < r0.end; task_id++)
1322 {
1323 if (fast_1x1)
1324 {
1325 int nc0 = task_id*N*C/ntasks, nc1 = (task_id+1)*N*C/ntasks, dc = 0;
1326 for (; nc0 < nc1; nc0 += dc)
1327 {
1328 int n = nc0/C, c0 = nc0 - n*C;
1329 int g = c0 / Cg;
1330 c0 -= g*Cg;
1331 dc = Cg - c0 <= nc1 - nc0 ? Cg - c0 : nc1 - nc0;
1332
1333 float * inptr_ = inp + (size_t)nc0*inp_planesize;
1334 char* inpbuf_ = inpbuf_all_0 + ((n*ngroups + g)*stripes_per_plane0*stripesize + c0*CONV_NR)*esz;
1335
                    packInputData(inpbuf_, inptr_, ofstab, dhwTab, 0, out_planesize, ksize, stride_d, stride_h,
                                  stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
                                  Di, Hi, Wi, H0, W0, dc, stripesize, 0, inp_planesize, conv->conv_dim,
                                  conv->conv_type, CONV_NR, esz, fast_1x1, useFP16);
1340 }
1341 }
1342 else
1343 {
1344 const int allTasks = N * ngroups * stripes_per_plane0;
1345 int ngs0 = task_id*allTasks/ntasks, ngs1 = (task_id+1)*allTasks/ntasks, ds = 0;
1346
1347 for (; ngs0 < ngs1; ngs0 += ds)
1348 {
1349 int n = ngs0 / (ngroups * stripes_per_plane0), gs0 = ngs0 - n*ngroups*stripes_per_plane0;
1350 int g = gs0 / stripes_per_plane0, s0 = gs0 - g*stripes_per_plane0;
1351
1352 ds = stripes_per_plane0 - s0 <= ngs1 - ngs0 ? stripes_per_plane0 - s0 : ngs1 - ngs0;
1353
1354 int zyx = s0 * CONV_NR;
1355 int zyx_limit = (s0 + ds) * CONV_NR < out_planesize ? (s0 + ds) * CONV_NR : out_planesize;
1356
1357 float * inptr_ = inp + (size_t)(n * ngroups + g) * Cg * inp_planesize;
1358 char* inpbuf_ = inpbuf_all_0 + ((n * ngroups + g) * stripes_per_plane0 * stripesize + s0 * stripesize) * esz;
1359
                        packInputData(inpbuf_, inptr_, ofstab, dhwTab, zyx, zyx_limit, ksize, stride_d, stride_h,
                                      stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
                                      Di, Hi, Wi, H0, W0, Cg, stripesize, 0, inp_planesize, conv->conv_dim,
                                      conv->conv_type, CONV_NR, esz, fast_1x1, useFP16);
1364 }
1365 }
1366 }
1367 });
1368 }
1369
1370 // Compute
    parallel_for_(Range(0, ntasks), [&](const Range& r0) {
1372 for (int task_id = r0.start; task_id < r0.end; task_id++)
1373 {
1374 float * cbuf_task = (float *)(inpbuf_all + taskbufsize * task_id);
1375 char * inpbuf_task = (char*)(cbuf_task + cbufsize);
1376
1377 int ngs0 = (int)((size_t)nsubtasks * task_id / ntasks);
1378 int ngs1 = (int)((size_t)nsubtasks * (task_id+1) / ntasks);
1379 for (int subtask = ngs0; subtask < ngs1; )
1380 {
1381 int ng = subtask / Kstripes;
1382 int kzyx0 = subtask - ng * Kstripes;
1383 int kzyx1 = kzyx0 + (ngs1 - subtask);
1384 int n = ng / ngroups, g = ng % ngroups; // ng - n * ngroups;
1385 size_t inp_plane_ofs = (size_t)(n * ngroups + g) * Cg * inp_planesize;
1386 kzyx1 = kzyx1 <= Kstripes ? kzyx1 : Kstripes;
1387 subtask += kzyx1 - kzyx0;
1388 int k0, k1;
1389 int zyx0, zyx_limit, zyx_block_limit = 0;
1390
1391 if (stripes_per_plane == 1 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
1392 {
1393 k0 = kzyx0 * CONV_MR;
1394 k1 = kzyx1 * CONV_MR;
1395 k1 = k1 <= Kg ? k1 : Kg;
1396 zyx0 = 0;
1397 zyx_limit = (int)out_planesize;
1398 }
1399 else
1400 {
1401 k0 = 0;
1402 k1 = Kg;
1403 zyx0 = kzyx0 * CONV_NR;
1404 zyx_limit = kzyx1 * CONV_NR;
1405 zyx_limit = zyx_limit < out_planesize ? zyx_limit : (int)out_planesize;
1406 }
1407
1408 for (; zyx0 < zyx_limit; zyx0 = zyx_block_limit)
1409 {
1410 // step 1. extract part of input tensor and represent it in zigzag form
1411 zyx_block_limit = zyx0 + CONV_NR * MAX_STRIPES;
1412 zyx_block_limit = zyx_block_limit < zyx_limit ? zyx_block_limit : zyx_limit;
1413
1414 int nstripes = (zyx_block_limit - zyx0 + CONV_NR - 1) / CONV_NR;
1415
1416 CV_Assert(nstripes <= MAX_STRIPES);
1417
1418 if (!separateIm2col)
1419 {
                    packInputData(inpbuf_task, inp, ofstab, dhwTab, zyx0, zyx_block_limit, ksize, stride_d, stride_h,
                                  stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
                                  Di, Hi, Wi, H0, W0, Cg, stripesize, inp_plane_ofs, inp_planesize, conv->conv_dim,
                                  conv->conv_type, CONV_NR, esz, fast_1x1, useFP16);
1424 }
1425
1426 char *weights = nullptr;
1427#ifdef CONV_ARM_FP16
1428 if (useFP16)
1429 {
1430 CV_Assert(!conv->weightsBuf_FP16.empty());
1431 weights = (char *)conv->getWeightsFP16();
1432 }
1433 else
1434#endif
1435 {
1436 CV_Assert(!conv->weightsBuf.empty());
1437 weights = (char *)conv->getWeights();
1438 }
1439 // optional branch, only for depth-wise convolution which was implemented by generic convolution.
1440 // In this case, CONV_MR is 1, and CONV_NR remains the same.
1441 if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
1442 {
1443 CV_Assert(weights);
1444 size_t outofs = (n * ngroups + g) * out_planesize + zyx0;
1445 float *cptr0 = cbuf_task;
1446 weights += g * padded_ksize * esz;
1447
1448 int out_width = zyx_block_limit - zyx0;
1449 float *outptr = out + outofs;
1450 const float biasVal = *(conv->biasBuf.data() + g);
1451 const char *inptr_ = separateIm2col ? inpbuf_all_0 + (ng * stripes_per_plane0 + zyx0 / CONV_NR) * stripesize * esz :
1452 inpbuf_task;
1453
1454 for (int stripe = 0; stripe < nstripes; stripe++)
1455 {
1456 const char *inptr = inptr_ + stripe * stripesize * esz;
                        const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
1458 bool ifBuffer = outLen < CONV_NR;
1459 float *cptr = outptr + stripe * CONV_NR;
1460 if (ifBuffer)
1461 {
                            memcpy(cptr0, cptr, outLen * sizeof(float ));
1463 cptr = cptr0;
1464 }
1465#if CV_NEON && CV_NEON_AARCH64
1466 if (conv->useNEON)
1467 {
1468#ifdef CONV_ARM_FP16
1469 if (useFP16)
1470 {
1471 opt_NEON_FP16::convBlockMR1_F16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
1472 }
1473 else
1474#endif
1475 opt_NEON::convBlockMR1_F32(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
1476 }
1477 else
1478#endif
                            convBlockMR1_F32(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
1480
1481 if (ifBuffer)
1482 {
                            memcpy(outptr + stripe * CONV_NR, cptr, outLen * sizeof(float ));
1484 }
1485 }
1486 if (activ)
                        activ->forwardSlice(outptr, outptr, out_width, out_planesize, g, g + 1);
1488 continue;
1489 }
1490
1491 CV_Assert(weights);
1492 weights += g * Kg_aligned * DkHkWkCg * esz;
1493
1494 const float *biasptr = conv->biasBuf.data() + Kg * g;
1495 int ldc = nstripes * CONV_NR;
1496
1497 // 2. do convolution, compute Kg x (zyx_block_limit - zyx0) part of the output tensor
1498 int out_width = zyx_block_limit - zyx0;
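                // K_BLOCK_SIZE x C_BLOCK_SIZE tiling: accumulate at most K_BLOCK_SIZE output channels against
                // C_BLOCK_SIZE packed input values at a time, so the cbuf_task accumulator and the packed
                // input slice stay resident in L1 cache between convBlock calls.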
1499 for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE)
1500 {
1501 int k1_block = k0_block + K_BLOCK_SIZE < k1 ? k0_block + K_BLOCK_SIZE : k1;
1502 for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE)
1503 {
1504 int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg;
1505 const char *inptr = separateIm2col ? inpbuf_all_0 + (ng * stripes_per_plane0 + zyx0 / CONV_NR) * stripesize * esz :
1506 inpbuf_task;
1507 inptr += (c0 * CONV_NR) * esz;
1508 for (int stripe = 0; stripe < nstripes; stripe++, inptr += stripesize * esz)
1509 {
                            const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
1511
1512 char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz;
1513 float *cptr = cbuf_task + stripe * CONV_NR;
1514 hfloat* cptr_f16 = (hfloat*)cbuf_task + stripe*CONV_NR;
1515 for (int k = k0_block; k < k1_block; k += CONV_MR,
1516 wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc)
1517 {
1518#if CV_TRY_AVX2
1519 if (conv->useAVX2)
                                    opt_AVX2::convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
1521 else
1522#endif
1523#if CV_TRY_AVX
1524 if (conv->useAVX)
                                    opt_AVX::convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
1526 else
1527#endif
1528#if CV_NEON
1529 if (conv->useNEON)
1530 {
1531#ifdef CONV_ARM_FP16
1532 if (useFP16)
1533 {
1534 opt_NEON_FP16::convBlock_F16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
1535 }
1536 else
1537#endif
1538 opt_NEON::convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
1539 }
1540 else
1541#endif
                                // The possible outLen values are 24 or 1~8.
                                convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
1544 }
1545 }
1546 }
1547
1548 size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
1549 const float *cptr = cbuf_task;
1550 const hfloat *cptr_fp16 = (const hfloat *)cbuf_task;
1551 float *outptr = out + outofs;
1552 const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0;
1553
1554 for (int k = k0_block; k < k1_block; k++,
1555 cptr += ldc, cptr_fp16 += ldc, outptr += out_planesize,
1556 pbptr += (pbptr ? out_planesize : 0))
1557 {
1558 float biasval = biasptr[k];
1559 int j = 0;
1560
1561#ifdef CONV_ARM_FP16
1562 if (useFP16)
1563 {
1564 float32x4_t vbias = vdupq_n_f32(biasval);
1565 float32x4_t vmax = vdupq_n_f32(maxval);
1566 float32x4_t vmin = vdupq_n_f32(minval);
1567 if (pbptr)
1568 {
1569 for (; j + 7 < out_width; j += 8)
1570 {
1571 float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias;
                                    float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j + 4)) + vbias;
1573
1574 v0 += vld1q_f32(pbptr + j);
1575 v1 += vld1q_f32(pbptr + j + 4);
1576
1577 if (ifMinMaxAct)
1578 {
1579 v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax);
1580 v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax);
1581 }
1582
1583 vst1q_f32(outptr + j, v0);
1584 vst1q_f32(outptr + j + 4, v1);
1585 }
1586 }
1587 else
1588 {
1589 for (; j + 7 < out_width; j += 8)
1590 {
1591 float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias;
1592 float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j + 4)) + vbias;
1593
1594 if (ifMinMaxAct)
1595 {
1596 v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax);
1597 v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax);
1598 }
1599
1600 vst1q_f32(outptr + j, v0);
1601 vst1q_f32(outptr + j + 4, v1);
1602 }
1603 }
1604
1605 if (pbptr)
1606 {
1607 for (; j < out_width; j++)
1608 {
1609 float v = (float )cptr_fp16[j] + biasval;
1610 v += pbptr[j];
1611 if (ifMinMaxAct)
1612 v = std::min(std::max(v, minval), maxval);
1613 outptr[j] = v;
1614 }
1615 }
1616 else
1617 {
1618 for (; j < out_width; j++)
1619 {
1620 float v = (float )cptr_fp16[j] + biasval;
1621
1622 if (ifMinMaxAct)
1623 v = std::min(std::max(v, minval), maxval);
1624 outptr[j] = v;
1625 }
1626 }
1627 }
1628 else
1629#endif
1630 {
1631#if CV_SIMD128
                            v_float32x4 vbias = v_setall_f32(biasval);
                            v_float32x4 vmax = v_setall_f32(maxval);
                            v_float32x4 vmin = v_setall_f32(minval);
1635
1636 if (pbptr)
1637 {
1638 for (; j + 7 < out_width; j += 8)
1639 {
                                    v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
                                    v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);

                                    v0 = v_add(v0, v_load(pbptr + j));
                                    v1 = v_add(v1, v_load(pbptr + j + 4));

                                    if (ifMinMaxAct)
                                    {
                                        v0 = v_min(v_max(v0, vmin), vmax);
                                        v1 = v_min(v_max(v1, vmin), vmax);
                                    }

                                    v_store(outptr + j, v0);
                                    v_store(outptr + j + 4, v1);
1654 }
1655 }
1656 else
1657 {
1658 for (; j + 7 < out_width; j += 8)
1659 {
                                    v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
                                    v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);

                                    if (ifMinMaxAct)
                                    {
                                        v0 = v_min(v_max(v0, vmin), vmax);
                                        v1 = v_min(v_max(v1, vmin), vmax);
                                    }

                                    v_store(outptr + j, v0);
                                    v_store(outptr + j + 4, v1);
1671 }
1672 }
1673#endif
1674 if (pbptr)
1675 {
1676 for (; j < out_width; j++)
1677 {
1678 float v = cptr[j] + biasval;
1679 v += pbptr[j];
1680 if (ifMinMaxAct)
                                        v = std::min(std::max(v, minval), maxval);
1682 outptr[j] = v;
1683 }
1684 }
1685 else
1686 {
1687 for (; j < out_width; j++)
1688 {
1689 float v = cptr[j] + biasval;
1690
1691 if (ifMinMaxAct)
                                        v = std::min(std::max(v, minval), maxval);
1693 outptr[j] = v;
1694 }
1695 }
1696 }
1697
1698 if (activ)
                            activ->forwardSlice(outptr, outptr, out_width, out_planesize, Kg * g + k, Kg * g + k + 1);
1700 }
1701 }
1702 }
1703 }
1704 }
1705 });
1706}
1707
1708
1709/****************************************************************************************\
1710 SIMD and no-SIMD code for convBlock
1711\****************************************************************************************/
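// convBlockMR1_* computes one convNR-wide row of the output micro-tile:
//   c[j] (+)= sum_p a[p] * b[p*convNR + j] + bias   for j in [0, outLen),
// where init_c selects accumulation into the existing contents of c (used for fused Add) and
// ifMinMaxAct applies the optional min/max clamp. convBlockMR1NoSIMD below is the reference version.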
1712
1713static inline void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
1714 const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
1715{
1716 std::vector<float> cbuffer(outLen, 0);
1717 float* cbuf = cbuffer.data();
1718 for( int p = 0; p < np; p++ )
1719 {
1720 float ai = a[p];
1721 for( int j = 0; j < outLen; j++ )
1722 cbuf[j] += b[convNR*p + j] * ai;
1723 }
1724
1725 if (init_c)
1726 {
1727 for(int j = 0; j < outLen; j++)
1728 {
1729 c[j] += cbuf[j] + bias;
1730 if (ifMinMaxAct)
                c[j] = std::min(std::max(c[j], minval), maxval);
1732 }
1733 }
1734 else
1735 {
1736 for(int j = 0; j < outLen; j++)
1737 {
1738 c[j] = cbuf[j] + bias;
1739 if (ifMinMaxAct)
                c[j] = std::min(std::max(c[j], minval), maxval);
1741 }
1742 }
1743}
1744
1745#if CV_SIMD128
1746static inline void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
1747 const float minval, const float maxval, bool ifMinMaxAct, const int convNR)
1748{
1749 CV_Assert(convNR == 24);
    v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0;
    v_float32x4 c3 = c0, c4 = c0, c5 = c0;

    for (int p = 0; p < np; p++, a++, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
        v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);
        c2 = v_fma(b2, a0, c2);
        c3 = v_fma(b3, a0, c3);
        c4 = v_fma(b4, a0, c4);
        c5 = v_fma(b5, a0, c5);
1765 }
1766
1767 if (init_c)
1768 {
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));
        c2 = v_add(c2, v_load(c + 8));
        c3 = v_add(c3, v_load(c + 12));
        c4 = v_add(c4, v_load(c + 16));
        c5 = v_add(c5, v_load(c + 20));
    }

    if (ifMinMaxAct)
    {
        v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
        c0 = v_min(v_max(c0, vmin), vmax);
        c1 = v_min(v_max(c1, vmin), vmax);
        c2 = v_min(v_max(c2, vmin), vmax);
        c3 = v_min(v_max(c3, vmin), vmax);
        c4 = v_min(v_max(c4, vmin), vmax);
        c5 = v_min(v_max(c5, vmin), vmax);
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + 8, c2);
    v_store(c + 12, c3);
    v_store(c + 16, c4);
    v_store(c + 20, c5);
1794}
1795
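// Same as convBlockMR1x24, but for convNR == 12: the 1x12 row fits in three v_float32x4 accumulators.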
static inline void convBlockMR1x12(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                                   const float minval, const float maxval, bool ifMinMaxAct, const int convNR)
{
    CV_Assert(convNR == 12);
    v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0;
    for (int p = 0; p < np; p++, a++, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);
        c2 = v_fma(b2, a0, c2);
    }

    if (init_c)
    {
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));
        c2 = v_add(c2, v_load(c + 8));
    }

    if (ifMinMaxAct)
    {
        v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
        c0 = v_min(v_max(c0, vmin), vmax);
        c1 = v_min(v_max(c1, vmin), vmax);
        c2 = v_min(v_max(c2, vmin), vmax);
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + 8, c2);
}
#endif

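// Dispatcher for the MR == 1 micro-kernel: when enough of the block is valid (outLen > convNR/3),
// the 24- or 12-wide SIMD kernel is used; otherwise the scalar reference kernel handles the tail.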
void convBlockMR1_F32(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                      const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
#if CV_SIMD128
    // outLen is the number of valid output values within the CONV_NR-wide block.
    // When outLen is very small, the no-SIMD branch is used.
    const int convNRby3 = convNR/3;
    if (outLen > convNRby3)
    {
        if (convNR == 24)
            convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR);
        else if (convNR == 12)
            convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR);
        else
            convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
    }
    else
        convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
#else
    convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
#endif
}

#if CV_SIMD128
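// 4x24 GEMM micro-kernel: computes a convMR x 24 block of C from packed A (advanced by convMR
// per step) and packed B (advanced by convNR per step). The block overwrites c when init_c is
// true, otherwise it is added to the existing contents of c; rows of c are ldc floats apart.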
static inline void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
    v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
    v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
    v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;

    for (int p = 0; p < np; p++, a += convMR, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
        v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);
        c2 = v_fma(b2, a0, c2);
        c3 = v_fma(b3, a0, c3);
        c4 = v_fma(b4, a0, c4);
        c5 = v_fma(b5, a0, c5);

        a0 = v_setall_f32(a[1]);
        c6 = v_fma(b0, a0, c6);
        c7 = v_fma(b1, a0, c7);
        c8 = v_fma(b2, a0, c8);
        c9 = v_fma(b3, a0, c9);
        c10 = v_fma(b4, a0, c10);
        c11 = v_fma(b5, a0, c11);

        a0 = v_setall_f32(a[2]);
        c12 = v_fma(b0, a0, c12);
        c13 = v_fma(b1, a0, c13);
        c14 = v_fma(b2, a0, c14);
        c15 = v_fma(b3, a0, c15);
        c16 = v_fma(b4, a0, c16);
        c17 = v_fma(b5, a0, c17);

        a0 = v_setall_f32(a[3]);
        c18 = v_fma(b0, a0, c18);
        c19 = v_fma(b1, a0, c19);
        c20 = v_fma(b2, a0, c20);
        c21 = v_fma(b3, a0, c21);
        c22 = v_fma(b4, a0, c22);
        c23 = v_fma(b5, a0, c23);
    }

    if (!init_c)
    {
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));
        c2 = v_add(c2, v_load(c + 8));
        c3 = v_add(c3, v_load(c + 12));
        c4 = v_add(c4, v_load(c + 16));
        c5 = v_add(c5, v_load(c + 20));

        c6 = v_add(c6, v_load(c + ldc));
        c7 = v_add(c7, v_load(c + ldc + 4));
        c8 = v_add(c8, v_load(c + ldc + 8));
        c9 = v_add(c9, v_load(c + ldc + 12));
        c10 = v_add(c10, v_load(c + ldc + 16));
        c11 = v_add(c11, v_load(c + ldc + 20));

        c12 = v_add(c12, v_load(c + ldc*2));
        c13 = v_add(c13, v_load(c + ldc*2 + 4));
        c14 = v_add(c14, v_load(c + ldc*2 + 8));
        c15 = v_add(c15, v_load(c + ldc*2 + 12));
        c16 = v_add(c16, v_load(c + ldc*2 + 16));
        c17 = v_add(c17, v_load(c + ldc*2 + 20));

        c18 = v_add(c18, v_load(c + ldc*3));
        c19 = v_add(c19, v_load(c + ldc*3 + 4));
        c20 = v_add(c20, v_load(c + ldc*3 + 8));
        c21 = v_add(c21, v_load(c + ldc*3 + 12));
        c22 = v_add(c22, v_load(c + ldc*3 + 16));
        c23 = v_add(c23, v_load(c + ldc*3 + 20));
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + 8, c2);
    v_store(c + 12, c3);
    v_store(c + 16, c4);
    v_store(c + 20, c5);

    v_store(c + ldc, c6);
    v_store(c + ldc + 4, c7);
    v_store(c + ldc + 8, c8);
    v_store(c + ldc + 12, c9);
    v_store(c + ldc + 16, c10);
    v_store(c + ldc + 20, c11);

    v_store(c + ldc * 2, c12);
    v_store(c + ldc * 2 + 4, c13);
    v_store(c + ldc * 2 + 8, c14);
    v_store(c + ldc * 2 + 12, c15);
    v_store(c + ldc * 2 + 16, c16);
    v_store(c + ldc * 2 + 20, c17);

    v_store(c + ldc * 3, c18);
    v_store(c + ldc * 3 + 4, c19);
    v_store(c + ldc * 3 + 8, c20);
    v_store(c + ldc * 3 + 12, c21);
    v_store(c + ldc * 3 + 16, c22);
    v_store(c + ldc * 3 + 20, c23);
}

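// 4x8 micro-kernel used for short tails (4 < outLen <= 8): two v_float32x4 accumulators per output row.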
static inline void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
    CV_Assert(convNR >= 4);
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
    v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0;

    for (int p = 0; p < np; p++, a += convMR, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 a1 = v_setall_f32(a[1]);
        v_float32x4 a2 = v_setall_f32(a[2]);
        v_float32x4 a3 = v_setall_f32(a[3]);

        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);

        c2 = v_fma(b0, a1, c2);
        c3 = v_fma(b1, a1, c3);

        c4 = v_fma(b0, a2, c4);
        c5 = v_fma(b1, a2, c5);

        c6 = v_fma(b0, a3, c6);
        c7 = v_fma(b1, a3, c7);
    }

    if (!init_c)
    {
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + 4));

        c2 = v_add(c2, v_load(c + ldc));
        c3 = v_add(c3, v_load(c + ldc + 4));

        c4 = v_add(c4, v_load(c + ldc*2));
        c5 = v_add(c5, v_load(c + ldc*2 + 4));

        c6 = v_add(c6, v_load(c + ldc*3));
        c7 = v_add(c7, v_load(c + ldc*3 + 4));
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + ldc, c2);
    v_store(c + ldc + 4, c3);
    v_store(c + ldc * 2, c4);
    v_store(c + ldc * 2 + 4, c5);
    v_store(c + ldc * 3, c6);
    v_store(c + ldc * 3 + 4, c7);
}

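// 4x4 micro-kernel used for the shortest tails (1 < outLen <= 4): one v_float32x4 accumulator per output row.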
static inline void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
    CV_Assert(convNR >= 4);
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;

    for (int p = 0; p < np; p++, a += convMR, b += convNR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 a1 = v_setall_f32(a[1]);
        v_float32x4 a2 = v_setall_f32(a[2]);
        v_float32x4 a3 = v_setall_f32(a[3]);

        v_float32x4 b0 = v_load(b);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b0, a1, c1);
        c2 = v_fma(b0, a2, c2);
        c3 = v_fma(b0, a3, c3);
    }

    if (!init_c)
    {
        c0 = v_add(c0, v_load(c));
        c1 = v_add(c1, v_load(c + ldc));
        c2 = v_add(c2, v_load(c + ldc*2));
        c3 = v_add(c3, v_load(c + ldc*3));
    }

    v_store(c, c0);
    v_store(c + ldc, c1);
    v_store(c + ldc * 2, c2);
    v_store(c + ldc * 3, c3);
}
#endif

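// Reference (no-SIMD) kernel: accumulates a convMR x outLen block into a temporary buffer,
// then adds it to c (init_c == false) or overwrites c (init_c == true), with row stride ldc.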
static inline void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                                   const int convMR, const int convNR)
{
    std::vector<float> cbuffer(convMR * outLen, 0);
    float* cbuf = cbuffer.data();
    for( int p = 0; p < np; p++ )
    {
        for( int i = 0; i < convMR; i++ )
        {
            float ai = a[convMR*p + i];
            for( int j = 0; j < outLen; j++ )
                cbuf[i * outLen + j] += b[convNR*p + j] * ai;
        }
    }

    if (!init_c)
    {
        for(int i = 0; i < convMR; i++)
        {
            for(int j = 0; j < outLen; j++)
                c[i*ldc + j] += cbuf[i*outLen + j];
        }
    }
    else
    {
        for(int i = 0; i < convMR; i++)
        {
            for(int j = 0; j < outLen; j++)
                c[i*ldc + j] = cbuf[i*outLen + j];
        }
    }
}

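// Dispatcher for the convMR x convNR GEMM micro-kernel (convMR == 4 in the SIMD path):
// full-width blocks with convNR == 24 go to convBlock4x24, short tails to convBlock4x8 or
// convBlock4x4, and everything else to the scalar reference kernel.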
void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                   const int convMR, const int convNR)
{
    // The possible outLen values are either the full block width (24) or a tail in the range 1..8.
#if CV_SIMD128
    CV_Assert(convMR == 4);
    if (outLen > 8 && convNR == 24)
    {
        convBlock4x24(np, a, b, c, ldc, init_c, convMR, convNR);
        return;
    }

    if (outLen <= 8 && outLen > 4)
    {
        convBlock4x8(np, a, b, c, ldc, init_c, convMR, convNR);
        return;
    }

    if (outLen <= 4 && outLen > 1)
    {
        convBlock4x4(np, a, b, c, ldc, init_c, convMR, convNR);
        return;
    }
    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#else
    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#endif
}

}} // namespace cv::dnn
