convolution.cpp source code [opencv/modules/dnn/src/layers/cpu_kernels/convolution.cpp]

1	// This file is part of OpenCV project.
2	// It is subject to the license terms in the LICENSE file found in the top-level directory
3	// of this distribution and at http://opencv.org/license.html.
4
5	// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
6	// Here is the original license:
7	/*
8	This file is a part of ficus language project.
9	See ficus/LICENSE for the licensing terms
10	*/
11
12	#include "../../precomp.hpp"
13	#include "convolution.hpp"
14
15	#include "conv_block.simd.hpp"
16	#include "layers/cpu_kernels/conv_block.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
17	#include <opencv2/core/utils/logger.hpp>
18
19	namespace cv { namespace dnn {
// Alignment, in bytes, applied by the FastConv::getWeights*() accessors to the packed
// weight buffers. The same constant is also used below as the element-count slack added
// when those buffers (and the bias buffer) are resized.
enum { VEC_ALIGN = 32 };
21
// Forward declarations of the FP32 block kernels used by this file.
// Both take `np`, the input pointers `a` and `b`, and write through the output pointer `c`;
// `init_c`, `outLen`, `convMR` and `convNR` are forwarded unchanged to the kernels.
void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                   const int convMR, const int convNR);
// Single-row variant: takes a scalar `bias` plus an optional [minval, maxval] activation
// clamp that is enabled by `ifMinMaxAct`.
void convBlockMR1_F32(int np, const float* a, const float* b, float* c, const float bias, bool init_c,
                      const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
26
#ifdef CONV_ARM_FP16
// Fast conversion of `len` float32 values from `src` to float16 values in `dst` (NEON).
//
// Three stages:
//   1. 16 elements per iteration (four 128-bit loads, two 128-bit FP16 stores);
//   2. 4 elements per iteration; when fewer than 4 elements remain the index is moved
//      back to `len - 4`, so the last vector overlaps elements that were already
//      converted instead of running past the end of either buffer;
//   3. a scalar loop, which only does work when `len` is smaller than one vector.
static inline void _cvt32f16f(const float* src, __fp16* dst, int len)
{
    int j = 0;
    const int VECSZ = 4;
    __fp16* dst_FP16 = (__fp16 *)dst;
    if (len > VECSZ * 4)
    {
        const int VECSZ4 = 4 * VECSZ;
        for( ; j + VECSZ4 < len; j += VECSZ4)
        {
            float32x4_t v0 = vld1q_f32(src + j);
            float32x4_t v1 = vld1q_f32(src + j + 4);
            float32x4_t v2 = vld1q_f32(src + j + 8);
            float32x4_t v3 = vld1q_f32(src + j + 12);

            vst1q_f16(dst_FP16 + j, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
            vst1q_f16(dst_FP16 + j + 8, vcombine_f16(vcvt_f16_f32(v2), vcvt_f16_f32(v3)));
        }
    }

    for( ; j < len; j += VECSZ )
    {
        if( j > len - VECSZ )
        {
            // Fewer than VECSZ elements in total: leave everything to the scalar loop.
            if( j == 0 )
                break;
            // Step back so that the final vector ends exactly at `len`.
            j = len - VECSZ;
        }

        float16x4_t hv = vcvt_f16_f32(vld1q_f32(src + j));
        vst1_f16(dst_FP16 + j, hv);
    }
    for( ; j < len; j++ )
        dst[j] = __fp16(src[j]);
}
#endif
66
67	float* FastConv::getWeights()
68	{
69	return alignPtr(ptr: weightsBuf.data(), n: VEC_ALIGN);
70	}
71
72	float* FastConv::getWeightsWino()
73	{
74	return alignPtr(ptr: weightsWinoBuf.data(), n: VEC_ALIGN);
75	}
76
77	hfloat* FastConv::getWeightsFP16()
78	{
79	return alignPtr(ptr: weightsBuf_FP16.data(), n: VEC_ALIGN);
80	}
81
82	hfloat* FastConv::getWeightsWinoFP16()
83	{
84	return alignPtr(ptr: weightsWinoBuf_FP16.data(), n: VEC_ALIGN);
85	}
86
87	Ptr<FastConv> initFastConv(
88	InputArray _weightsMat,
89	float* srcBias,
90	int ngroups,
91	int K, int C,
92	const std::vector<size_t>& kernel_size,
93	const std::vector<size_t>& strides,
94	const std::vector<size_t>& dilations,
95	const std::vector<size_t>& pads_begin,
96	const std::vector<size_t>& pads_end,
97	int conv_dim,
98	const bool _useFP16,
99	bool useWinograd)
100	{
101	Ptr<FastConv> conv = makePtr<FastConv>();
102	CV_Assert(ngroups > `0` && K > `0` && C > `0` && K % ngroups == `0`);
103
104	// Weight shape, [K, C, Dk, Hk, Wk] for Conv3D, [K, C, Hk, Wk] for Conv2D, [K, C, Wk] for Conv1D.
105	int Dk = conv_dim == CONV_3D ? (int)kernel_size [`0`] : `1`;
106	int Hk = conv_dim == CONV_1D ? `1` : (int)kernel_size [kernel_size.size() - `2`];
107	int Wk = (int)kernel_size.back();
108	int karea = WkHkDk;
109
110	conv ->pad_front = conv_dim == CONV_3D ? (int)pads_begin [`0`] : `0`;
111	conv ->pad_top = conv_dim == CONV_1D ? `0` : (int)pads_begin [pads_begin.size() - `2`];
112	conv ->pad_left = (int)pads_begin.back();
113
114	conv ->pad_behind = conv_dim == CONV_3D ? (int)pads_end [`0`] : `0`;
115	conv ->pad_bottom = conv_dim == CONV_1D ? `0` : (int)pads_end [pads_end.size() - `2`];
116	conv ->pad_right = (int)pads_end.back();
117
118	int stride_d = conv_dim == CONV_3D ? (int)strides [`0`] : `1`;
119	int stride_h = conv_dim == CONV_1D ? `1` : (int)strides [strides.size() - `2`];
120	int stride_w = (int)strides.back();
121
122	int dilation_d = conv_dim == CONV_3D ? (int)dilations [`0`] : `1`;
123	int dilation_h = conv_dim == CONV_1D ? `1` : (int)dilations [dilations.size() - `2`];
124	int dilation_w = (int)dilations.back();
125
126	CV_Assert(Dk > `0` && Hk > `0` && Wk > `0`);
127	CV_Assert(stride_d >= `0` && stride_h >= `0` && stride_w > `0`);
128	CV_Assert(dilation_d > `0` && dilation_h > `0` && dilation_w > `0`);
129
130	conv ->K = K; conv ->C = C; conv ->Hk = Hk; conv ->Wk = Wk, conv ->Dk = Dk;
131
132	conv ->stride_d = stride_d;
133	conv ->stride_h = stride_h;
134	conv ->stride_w = stride_w;
135
136	conv ->dilation_d = dilation_d;
137	conv ->dilation_h = dilation_h;
138	conv ->dilation_w = dilation_w;
139	conv ->conv_dim = conv_dim;
140	conv ->ngroups = ngroups;
141
142	bool ifRunDepthWise = ngroups > `1` && ngroups == K && ngroups == C;
143	bool ifRunDepthWiseRemain = false; // It's for big padding or big kernel or Conv3D depth-wise convolution.
144
145	if (ifRunDepthWise)
146	{
147	if (conv_dim == CONV_1D)
148	{
149	ifRunDepthWise &= Hk == `1` && Wk == `3` && (stride_w == `1` \|\| (stride_w == `2` && dilation_w == `1`))
150	&& max(a: stride_w, b: dilation_w) >= conv ->pad_left && conv ->pad_left <= `1`;
151	}
152	else if (conv_dim == CONV_2D)
153	{
154	ifRunDepthWise &= Hk == `3` && Wk == `3` && ((stride_w == `1`) \|\| (stride_w == `2` && dilation_w == `1`)) &&
155	max(a: stride_w, b: dilation_w) >= conv ->pad_left && max(a: stride_h, b: dilation_h) >= conv ->pad_top
156	&& conv ->pad_left <= `1` && conv ->pad_top <= `1`;
157	}
158
159	if (!ifRunDepthWise \|\| conv_dim == CONV_3D)
160	{
161	ifRunDepthWise = false;
162	ifRunDepthWiseRemain = true;
163	}
164	}
165
166	conv ->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? CONV_TYPE_DEPTHWISE :
167	useWinograd && (conv_dim == CONV_2D && (conv ->useSIMD128 \|\| conv ->useAVX \|\| conv ->useAVX2 \|\| conv ->useNEON) &&
168	Hk == `3` && Wk == `3` && dilation_h == `1` && dilation_w == `1` && stride_h == `1` && stride_w == `1`) ?
169	CONV_TYPE_WINOGRAD3X3 :
170	(ifRunDepthWiseRemain ? CONV_TYPE_DEPTHWISE_REMAIN : CONV_TYPE_GENERIC);
171
172	#if !(CV_NEON \|\| CV_SIMD128 \|\| CV_TRY_AVX \|\| CV_TRY_AVX2)
173	if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // Disabel Winograd when CV_NEON, CV_SIMD128 ,CV_TRY_AVX and CV_TRY_AVX2 are not available.
174	conv->conv_type = CONV_TYPE_GENERIC;
175	#endif
176
177	Mat weightsMat = _weightsMat.getMat();
178	auto wShape = shape(mat: weightsMat);
179	const size_t wstep = weightsMat.step1();
180
181	conv ->useFP16 = false;
182	#ifdef CONV_ARM_FP16
183	if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC \|\| conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN
184	\|\| conv->conv_type == CONV_TYPE_WINOGRAD3X3))
185	conv->useFP16 = true;
186
187	// Runtime FP16 check.
188	if (conv->useFP16 && !checkHardwareSupport(CPU_NEON_FP16))
189	{
190	conv->useFP16 = false;
191	CV_LOG_ONCE_WARNING(NULL, "DNN: the CPU does not support the instruction set required by FP16, fallback to FP32.");
192	}
193	#endif
194
195	float srcWeights = (float* *)weightsMat.data;
196	if (conv ->conv_type == CONV_TYPE_DEPTHWISE \|\| conv ->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
197	{
198	// Handle the Conv1D, Conv2D and Conv3D depth-wise.
199	// for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
200	// but add some padding to make the weights array layout more SIMD-friendly
201	int ksize = karea;
202
203	// TODO: simplify the following code with std::copy.
204	// this code aims to let memory fit with vector size.
205	int padded_ksize = ((ksize + VEC_ALIGN-`1`) / VEC_ALIGN) * VEC_ALIGN;
206	int nweights = C * padded_ksize;
207
208	#ifdef CONV_ARM_FP16
209	if (conv->useFP16)
210	{
211	conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
212	auto weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
213
214	parallel_for_(Range(`0`, C), [&](const Range& r0){
215	for(int c = r0.start; c < r0.end; c++)
216	_cvt32f16f(srcWeights + cwstep, weightsPtr_FP16 + cpadded_ksize, ksize);
217	});
218	}
219	else
220	#endif
221	{
222	conv ->weightsBuf.resize(new_size: nweights + VEC_ALIGN);
223	auto weightsPtr = conv ->getWeights();
224
225	parallel_for_(range: Range (`0`, C), functor: [&](const Range& r0) {
226	for(int c = r0.start; c < r0.end; c++)
227	memcpy(dest: weightsPtr + cpadded_ksize, src: srcWeights + cwstep, n: ksize*sizeof(weightsPtr[`0`]));
228	});
229	}
230	}
231	else if(conv ->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
232	{
233	static const float ktm[`8`][`3`] = {
234	{`1.0f`, `0.0f`, `0.0f`},
235	{-`2.0f` / `9`, -`2.0f` / `9`, -`2.0f` / `9`},
236	{-`2.0f` / `9`, `2.0f` / `9`, -`2.0f` / `9`},
237	{`1.0f` / `90`, `1.0f` / `45`, `2.0f` / `45`},
238	{`1.0f` / `90`, -`1.0f` / `45`, `2.0f` / `45`},
239	{`32.f`/`45`, `16.f`/`45`, `8.f`/`45`},
240	{`32.f`/`45`, -`16.f`/`45`, `8.f`/`45`},
241	{`0.0f`, `0.0f`, `1.0f`}
242	};
243
244	const int CONV_WINO_KBLOCK = `4`;
245
246	#if CV_TRY_AVX \|\| CV_TRY_AVX2
247	const int CONV_WINO_ATOM_F32 = (conv ->useAVX \|\| conv ->useAVX2) ? `8` : `4`;
248	#else
249	const int CONV_WINO_ATOM_F32 = `4`;
250	#endif
251	const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
252
253	#ifdef CONV_ARM_FP16
254	// FP 16
255	const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * `2`;
256	const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
257	#endif
258
259	// the weights are packed as 6-dim tensor:
260	// ngroups ceil((K/ngroups)/KBLOCK) * (WW/ATOM_SIZE) (C/ngroups) * KBLOCK * ATOM_SIZE,*
261	// where W is the size of Winograd-transformed kernel (8x8),
262	// ATOM_SIZE is number of lanes in SIMD register (4 for NEON and FP32),
263	// KBLOCK is some platform-dependent constant dependent on the number of SIMD registers.
264	int ksize = CONV_WINO_KSIZE * CONV_WINO_KSIZE;
265	int Cg = C/ngroups;
266	int Kg = K/ngroups;
267	int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - `1`)/CONV_WINO_KBLOCK;
268	size_t nweights = ngroupsKg_nblocksCgCONV_WINO_KBLOCKCONV_WINO_AREA;
269
270	float* wptrWino = nullptr;
271	#ifdef CONV_ARM_FP16
272	__fp16* wptrWino_FP16 = nullptr;
273	if (conv->useFP16)
274	{
275	conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN);
276	wptrWino_FP16 = (__fp16*)conv->getWeightsWinoFP16();
277	}
278	else
279	#endif
280	{
281	conv ->weightsWinoBuf.resize(new_size: nweights + VEC_ALIGN);
282	wptrWino = conv ->getWeightsWino();
283	}
284
285	parallel_for_(range: Range (`0`, K), functor: [&](const Range& r0){
286	float kernelTm[CONV_WINO_AREA];
287	for (int k = r0.start; k < r0.end; k++)
288	{
289	int g = k / Kg;
290	int k_ = k - g*Kg;
291	int ki = k_ / CONV_WINO_KBLOCK;
292	int dk = k_ - ki*CONV_WINO_KBLOCK;
293
294	for (int c = `0`; c < Cg; c++)
295	{
296	// wstep = HkWkCg
297	const float kernel0 = srcWeights + k wstep + c * ksize;
298
299	// transform kernel, transposed
300	const float *k0 = kernel0;
301	const float *k1 = kernel0 + `3`;
302	const float *k2 = kernel0 + `6`;
303
304	// h
305	float tmp[`8`][`3`];
306	for (int i = `0`; i < `8`; i++)
307	{
308	tmp[i][`0`] = k0[`0`] * ktm[i][`0`] + k0[`1`] * ktm[i][`1`] + k0[`2`] * ktm[i][`2`];
309	tmp[i][`1`] = k1[`0`] * ktm[i][`0`] + k1[`1`] * ktm[i][`1`] + k1[`2`] * ktm[i][`2`];
310	tmp[i][`2`] = k2[`0`] * ktm[i][`0`] + k2[`1`] * ktm[i][`1`] + k2[`2`] * ktm[i][`2`];
311	}
312
313	// v
314	for (int j = `0`; j < `8`; j++)
315	{
316	float *tmpp = &tmp[j][`0`];
317
318	for (int i = `0`; i < `8`; i++)
319	kernelTm[j * `8` + i] = tmpp[`0`] * ktm[i][`0`] + tmpp[`1`] * ktm[i][`1`] + tmpp[`2`] * ktm[i][`2`];
320	}
321
322	// repack the data.
323	#ifdef CONV_ARM_FP16
324	if (conv->useFP16)
325	{
326	__fp16* wptr = wptrWino_FP16 + (gKg_nblocks + ki) Cg CONV_WINO_KBLOCKCONV_WINO_AREA +
327	(cCONV_WINO_KBLOCK + dk)CONV_WINO_ATOM_F16;
328	for (int i = `0`; i < CONV_WINO_NATOMS_F16; i++,
329	wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16)
330	{
331	CV_Assert(wptrWino_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= wptrWino_FP16 + nweights);
332	for (int j = `0`; j < CONV_WINO_ATOM_F16; j++)
333	{
334	wptr[j] = (__fp16)kernelTm[i * CONV_WINO_ATOM_F16 + j];
335	}
336	}
337	}
338	else
339	#endif
340	{
341	float* wptr = wptrWino + (gKg_nblocks + ki) Cg CONV_WINO_KBLOCKCONV_WINO_AREA +
342	(cCONV_WINO_KBLOCK + dk)CONV_WINO_ATOM_F32;
343	for (int i = `0`; i < CONV_WINO_NATOMS_F32; i++,
344	wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32)
345	{
346	CV_Assert(wptrWino <= wptr && wptr + CONV_WINO_ATOM_F32 <= wptrWino + nweights);
347	memcpy(dest: wptr, src: kernelTm + i * CONV_WINO_ATOM_F32, n: CONV_WINO_ATOM_F32*sizeof (wptr[`0`]));
348	}
349	}
350	}
351	}
352	});
353	}
354	else if (conv ->conv_type == CONV_TYPE_GENERIC)
355	{
356	// The weights are packed as
357	// ngroups x (ceil((K/ngroups)/CONV_MR)CONV_MR) x (CgHkWkDk) x CONV_MR tensor
358	int Kg = K/ngroups, Cg = max(a: C/ngroups, b: `1`);
359	int DkHkWkCg = DkHkWk*Cg;
360
361	int numStripsMR = (Kg + CONV_MR_FP32 - `1`) / CONV_MR_FP32;
362	int Kg_aligned = numStripsMR * CONV_MR_FP32;
363	size_t nweights = ngroupsKg_alignedDkHkWkCg;
364	float* weightsPtr = nullptr;
365
366	#ifdef CONV_ARM_FP16
367	int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - `1`) / CONV_MR_FP16;
368	int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
369	size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
370	__fp16* weightsPtr_FP16 = nullptr;
371
372	if (conv->useFP16)
373	{
374	conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN);
375	weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
376	}
377	else
378	#endif
379	{
380	conv ->weightsBuf.resize(new_size: nweights + VEC_ALIGN);
381	weightsPtr = conv ->getWeights();
382	}
383
384	// Pack the weight.
385	#ifdef CONV_ARM_FP16
386	if (conv->useFP16)
387	{
388	parallel_for_(Range(`0`, ngroups * numStripsMR_FP16), [&](const Range& r0){
389	for (int gsi = r0.start; gsi < r0.end; gsi++)
390	{
391	int g = gsi / numStripsMR_FP16;
392	int si = gsi - g * numStripsMR_FP16;
393
394	int startK = si * CONV_MR_FP16;
395	CV_Assert(startK < Kg_aligned_FP16);
396
397	__fp16* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
398	int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding.
399
400	int k_idx = g*Kg + startK;
401	for(int hwd = `0`; hwd < HkWkDk; hwd++)
402	{
403	for(int c = `0`; c < Cg; c++, packed_wptr += CONV_MR_FP16)
404	{
405	const float* wptr = srcWeights + wstep * k_idx + cHkWk*Dk + hwd;
406	int k = `0`;
407	for(; k < dk; k++, wptr += wstep)
408	packed_wptr[k] = (__fp16)(*wptr);
409	for(; k < CONV_MR_FP16; k++)
410	packed_wptr[k] = (__fp16)`0.f`;
411	}
412	}
413	}});
414	}
415	else
416	#endif
417	{
418	parallel_for_(range: Range (`0`, ngroups * numStripsMR), functor: [&](const Range& r0){
419	for (int gsi = r0.start; gsi < r0.end; gsi++)
420	{
421	int g = gsi / numStripsMR;
422	int si = gsi - g * numStripsMR;
423
424	int startK = si * CONV_MR_FP32;
425	CV_Assert(startK < Kg_aligned);
426
427	float* packed_wptr = weightsPtr + DkHkWkCg * (startK + g * Kg_aligned);
428	int dk = Kg - startK < CONV_MR_FP32 ? Kg - startK : CONV_MR_FP32; // check if we need zero padding.
429
430	int k_idx = g*Kg + startK;
431	for(int hwd = `0`; hwd < HkWkDk; hwd++)
432	{
433	for(int c = `0`; c < Cg; c++, packed_wptr += CONV_MR_FP32)
434	{
435	const float* wptr = srcWeights + wstep * k_idx + cHkWk*Dk + hwd;
436	int k = `0`;
437	for(; k < dk; k++, wptr += wstep)
438	packed_wptr[k] = *wptr;
439	for(; k < CONV_MR_FP32; k++)
440	packed_wptr[k] = `0.f`;
441	}
442	}
443	}});
444	}
445	}
446	else
447	CV_Error(cv::Error::StsUnsupportedFormat, "Unknown convolution type.");
448
449	// store bias; append some zero's to make sure that
450	// we can always read MR elements starting from any valid index
451	{
452	int k = `0`, nbias = K + VEC_ALIGN;
453	conv ->biasBuf.resize(new_size: nbias);
454	float* biasBufPtr = conv ->biasBuf.data();
455	for(; k < K; k++)
456	biasBufPtr[k] = srcBias ? srcBias[k] : `0.f`;
457	for(; k < nbias; k++)
458	biasBufPtr[k] = `0.f`;
459	}
460	return conv;
461	}
462
463	static inline void packData8(char& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const* int* ofstab,
464	const int stride_w, const int ksize, const int esz)
465	{
466	char * inpbufC = inpbuf + s0 * esz;
467	float* inptrInC = (float* )inptrIn;
468
469	#ifdef CONV_ARM_FP16
470	__fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
471	if (esz == sizeof(__fp16))
472	{
473	if (stride_w == `1`)
474	{
475	for (int k = `0`; k < ksize; k++)
476	{
477	int k1 = ofstab[k];
478
479	float32x4_t v0 = vld1q_f32(inptrInC + k1);
480	float32x4_t v1 = vld1q_f32(inptrInC + k1 + `4`);
481	vst1q_f16((__fp16)inpbufC_FP16 + k CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
482	}
483	}
484	else
485	{
486	for (int k = `0`; k < ksize; k++)
487	{
488	int k1 = ofstab[k];
489	float32x4_t v0 = {inptrInC[k1], inptrInC[k1 + stride_w], inptrInC[k1 + `2`stride_w], inptrInC[k1 + `3`stride_w]};
490	float32x4_t v1 = {inptrInC[k1 + `4`stride_w], inptrInC[k1 + `5`stride_w], inptrInC[k1 + `6`stride_w], inptrInC[k1 + `7`stride_w]};
491
492	vst1q_f16((__fp16)inpbufC_FP16 + k CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
493	}
494	}
495	}
496	else // float 32
497	#endif
498	{
499	CV_Assert(esz == sizeof(float ));
500	float* inpbufC_FP32 = (float* )inpbufC;
501	if (stride_w == `1`)
502	for (int k = `0`; k < ksize; k++)
503	{
504	int k1 = ofstab[k];
505	#if CV_SIMD256
506	vx_store(inpbufC_FP32 + k*CONV_NR_FP32, vx_load(inptrInC + k1));
507	#elif CV_SIMD128
508	v_float32x4 vv0 = v_load(ptr: inptrInC + k1);
509	v_float32x4 vv1 = v_load(ptr: inptrInC + k1 + `4`);
510	v_store(ptr: inpbufC_FP32 + k*CONV_NR_FP32, a: vv0);
511	v_store(ptr: inpbufC_FP32 + k*CONV_NR_FP32 + `4`, a: vv1);
512	#else
513	float v0 = inptrInC[k1];
514	float v1 = inptrInC[k1 + `1`];
515	float v2 = inptrInC[k1 + `2`];
516	float v3 = inptrInC[k1 + `3`];
517	float v4 = inptrInC[k1 + `4`];
518	float v5 = inptrInC[k1 + `5`];
519	float v6 = inptrInC[k1 + `6`];
520	float v7 = inptrInC[k1 + `7`];
521
522	inpbufC_FP32[k*CONV_NR_FP32] = v0;
523	inpbufC_FP32[k*CONV_NR_FP32+`1`] = v1;
524	inpbufC_FP32[k*CONV_NR_FP32+`2`] = v2;
525	inpbufC_FP32[k*CONV_NR_FP32+`3`] = v3;
526	inpbufC_FP32[k*CONV_NR_FP32+`4`] = v4;
527	inpbufC_FP32[k*CONV_NR_FP32+`5`] = v5;
528	inpbufC_FP32[k*CONV_NR_FP32+`6`] = v6;
529	inpbufC_FP32[k*CONV_NR_FP32+`7`] = v7;
530	#endif
531	}
532	else
533	for (int k = `0`; k < ksize; k++)
534	{
535	int k1 = ofstab[k];
536	float v0 = inptrInC[k1];
537	float v1 = inptrInC[k1 + stride_w];
538	float v2 = inptrInC[k1 + `2`*stride_w];
539	float v3 = inptrInC[k1 + `3`*stride_w];
540	float v4 = inptrInC[k1 + `4`*stride_w];
541	float v5 = inptrInC[k1 + `5`*stride_w];
542	float v6 = inptrInC[k1 + `6`*stride_w];
543	float v7 = inptrInC[k1 + `7`*stride_w];
544
545	inpbufC_FP32[k*CONV_NR_FP32] = v0;
546	inpbufC_FP32[k*CONV_NR_FP32+`1`] = v1;
547	inpbufC_FP32[k*CONV_NR_FP32+`2`] = v2;
548	inpbufC_FP32[k*CONV_NR_FP32+`3`] = v3;
549	inpbufC_FP32[k*CONV_NR_FP32+`4`] = v4;
550	inpbufC_FP32[k*CONV_NR_FP32+`5`] = v5;
551	inpbufC_FP32[k*CONV_NR_FP32+`6`] = v6;
552	inpbufC_FP32[k*CONV_NR_FP32+`7`] = v7;
553	}
554	}
555	x0+=`7`;
556	s0+=`7`;
557	inptrIn += `7`*stride_w;
558	in_w += `7`*stride_w;
559	}
560
561	static inline void packData2(char & inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const* int* ofstab,
562	const int stride_w, const int ksize, const int esz)
563	{
564	char* inpbufC = inpbuf + s0 * esz;
565	float* inptrInC = inptrIn;
566
567	#ifdef CONV_ARM_FP16
568	__fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
569	if (esz == sizeof(__fp16))
570	{
571	for (int k = `0`; k < ksize; k++)
572	{
573	int k1 = ofstab[k];
574	float v0 = inptrInC[k1];
575	float v1 = inptrInC[k1 + stride_w];
576	inpbufC_FP16[kCONV_NR_FP16] = (__fp16*)v0;
577	inpbufC_FP16[kCONV_NR_FP16+`1`] = (__fp16*)v1;
578	}
579	} else
580	#endif
581	{
582	float * inpbufC_FP32 = (float *)inpbufC;
583	for (int k = `0`; k < ksize; k++)
584	{
585	int k1 = ofstab[k];
586	float v0 = inptrInC[k1];
587	float v1 = inptrInC[k1 + stride_w];
588	inpbufC_FP32[k*CONV_NR_FP32] = v0;
589	inpbufC_FP32[k*CONV_NR_FP32+`1`] = v1;
590	}
591	}
592
593	x0++;
594	s0++;
595	inptrIn += stride_w;
596	in_w += stride_w;
597	}
598
599	static inline void packInputData(char* inpbuf_task, float* inp, const int* ofstab, const int* dhwTab, int zyx0, int zyx_limit,
600	int ksize, int stride_d, int stride_h, int stride_w, int pad_front, int pad_top, int pad_left,
601	int Dk, int Hk, int Wk, int dilation_d, int dilation_h, int dilation_w, int Di, int Hi, int Wi,
602	int H0, int W0, int Cg, int stripesize, int inp_plane_ofs, int inp_planesize, int conv_dim, int conv_type,
603	const int CONV_NR, const int esz, bool fast_1x1, bool useFP16)
604	{
605	for (int stripe = `0`; zyx0 < zyx_limit; stripe++, zyx0 += CONV_NR)
606	{
607	char inpbuf = inpbuf_task + stripe stripesize * esz;
608	float *inptr = inp + inp_plane_ofs;
609
610	/*
611	1. pack the data. Copy the HkxWk CONV_NR-wide slices from
612	each feature plane of the input tensor to the input buffer.
613	*/
614	if (fast_1x1)
615	{
616	int slice_len = zyx_limit - zyx0;
617	bool partial = slice_len < CONV_NR;
618	const int CONV_NR_esz = CONV_NR * esz;
619	// Superfast branch for 1x1 convolutions with sy=sx=1.
620	// in this case each feature plane can be safely treated
621	// as 1D array, and we just extract next portion
622	// of CONV_NR elements from each feature plane and
623	// put it together.
624	inptr += zyx0;
625	if (!partial)
626	{
627	// Make special branch where memcpy() is called with a constant buffer size.
628	// Compilers will likely unroll this loop properly.
629	#ifdef CONV_ARM_FP16
630	if (useFP16)
631	{
632	for (int c = `0`; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
633	_cvt32f16f(inptr, (__fp16 *)inpbuf, CONV_NR);
634	}
635	else
636	#endif
637	for (int c = `0`; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
638	memcpy(dest: inpbuf, src: inptr, n: CONV_NR_esz);
639	}
640	else
641	{
642	#ifdef CONV_ARM_FP16
643	if (useFP16)
644	{
645	for (int c = `0`; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
646	{
647	_cvt32f16f(inptr, (__fp16 *)inpbuf, slice_len);
648	}
649	}
650	else
651	#endif
652	for (int c = `0`; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
653	{
654	memcpy(dest: inpbuf, src: inptr, n: slice_len * esz);
655	}
656	}
657	}
658	else if (conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
659	{
660	CV_Assert(Cg == `1`);
661	const int HW0 = H0 * W0;
662	const int HWi = Hi * Wi;
663	int slice_len = std::min(a: zyx_limit - zyx0, b: CONV_NR);
664
665	// here some non-continuous sub-row of the row will not be
666	// filled from the tensor; we need to make sure that the uncovered
667	// elements are explicitly set to 0's. the easiest way is to
668	// set all the elements to 0's before the loop.
669	memset(s: inpbuf, c: `0`, n: stripesize * esz);
670
671	int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0;
672	int y0 = yx0 / W0, x0 = yx0 - y0 * W0;
673
674	if (conv_dim == CONV_1D)
675	{
676	for (int slice_i = `0`; slice_i < slice_len; y0++, x0=`0`)
677	{
678	int delta = std::min(a: slice_len - slice_i, b: W0 - x0);
679	int x1 = x0 + delta;
680
681	int in_w = x0 * stride_w - pad_left;
682	float* inptrIn = inptr + in_w;
683
684	int s0 = slice_i;
685
686	for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
687	{
688	// Pack 8
689	if (x0 + `8` <= x1 && `0` <= in_w &&
690	in_w + stride_w`8` <= Wi - (Wk-`1`)dilation_w)
691	{
692	packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
693	}
694	else if (x0 + `2` <= x1 && `0` <= in_w &&
695	in_w + stride_w`2` <= Wi - (Wk-`1`)dilation_w)
696	{
697	packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
698	}
699	else
700	{
701	int w0 = std::max(a: `0`, b: (-in_w + dilation_w-`1`)/dilation_w);
702	int w1 = std::min(a: Wk, b: (Wi - in_w + dilation_w-`1`)/dilation_w);
703	const float* inptrInC = inptrIn;
704	#ifdef CONV_ARM_FP16
705	if (useFP16)
706	{
707	__fp16* inpbufC = (__fp16 *)inpbuf + s0;
708	for (int w = w0; w < w1; w++)
709	{
710	int imgofs = w*dilation_w;
711	inpbufC[wCONV_NR] = (__fp16*)inptrInC[imgofs];
712	}
713	}
714	else
715	#endif
716	{
717	float* inpbufC = (float *)inpbuf + s0;
718	for (int w = w0; w < w1; w++)
719	{
720	int imgofs = w*dilation_w;
721	inpbufC[w*CONV_NR] = inptrInC[imgofs];
722	}
723	}
724	}
725	}
726	slice_i += delta;
727	}
728	}
729	else if (conv_dim == CONV_2D)
730	{
731	for (int slice_i = `0`; slice_i < slice_len; y0++, x0=`0`)
732	{
733	int delta = std::min(a: slice_len - slice_i, b: W0 - x0);
734	int x1 = x0 + delta;
735
736	int in_h = y0 * stride_h - pad_top;
737	int in_w = x0 * stride_w - pad_left;
738
739	float* inptrIn = inptr + in_h*Wi + in_w;
740
741	bool ok_i = `0` <= in_h && in_h < Hi - (Hk-`1`)*dilation_h;
742	int h0 = std::max(a: `0`, b: (-in_h + dilation_h-`1`)/dilation_h);
743	int h1 = std::min(a: Hk, b: (Hi - in_h + dilation_h-`1`)/dilation_h);
744
745	int s0 = slice_i;
746	for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
747	{
748	// Pack 8
749	if (ok_i && x0 + `8` <= x1 && `0` <= in_w &&
750	in_w + stride_w`8` <= Wi - (Wk-`1`)dilation_w)
751	{
752	packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
753	}
754	else if (ok_i && x0 + `2` <= x1 && `0` <= in_w &&
755	in_w + stride_w`2` <= Wi - (Wk-`1`)dilation_w)
756	{
757	packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
758	}
759	else
760	{
761	int w0 = std::max(a: `0`, b: (-in_w + dilation_w-`1`)/dilation_w);
762	int w1 = std::min(a: Wk, b: (Wi - in_w + dilation_w-`1`)/dilation_w);
763
764	const float* inptrInC = inptrIn;
765	#ifdef CONV_ARM_FP16
766	if (useFP16)
767	{
768	__fp16* inpbufC = (__fp16 *)inpbuf + s0;
769
770	for (int h = h0; h < h1; h++)
771	{
772	for (int w = w0; w < w1; w++)
773	{
774	int imgofs = h(dilation_hWi) + w*dilation_w;
775	inpbufC[(hWk + w)CONV_NR] = (__fp16)inptrInC[imgofs];
776	}
777	}
778	}
779	else
780	#endif
781	{
782	float* inpbufC = (float *)inpbuf + s0;
783
784	for (int h = h0; h < h1; h++)
785	{
786	for (int w = w0; w < w1; w++)
787	{
788	int imgofs = h(dilation_hWi) + w*dilation_w;
789	inpbufC[(hWk + w)CONV_NR] = inptrInC[imgofs];
790	}
791	}
792	}
793	}
794	}
795	slice_i += delta;
796	}
797	}
798	else if (conv_dim == CONV_3D)
799	{
800	for (int slice_i = `0`; slice_i < slice_len; z0 += (y0+`1`)/H0, y0 = (y0+`1`)%H0, x0=`0`)
801	{
802	int delta = std::min(a: slice_len - slice_i, b: W0 - x0);
803	int x1 = x0 + delta;
804
805	int in_d = z0 * stride_d - pad_front;
806	int in_h = y0 * stride_h - pad_top;
807	int in_w = x0 * stride_w - pad_left;
808
809	float* inptrIn = inptr + in_dHWi + in_hWi + in_w;
810
811	int d0 = std::max(a: `0`, b: (-in_d + dilation_d - `1`) / dilation_d);
812	int d1 = std::min(a: Dk, b: (Di - in_d + dilation_d - `1`) / dilation_d);
813
814	bool ok_i = `0` <= in_d && in_d < Di - (Dk-`1`)*dilation_d &&
815	`0` <= in_h && in_h < Hi - (Hk-`1`)*dilation_h;
816	int h0 = std::max(a: `0`, b: (-in_h + dilation_h-`1`)/dilation_h);
817	int h1 = std::min(a: Hk, b: (Hi - in_h + dilation_h-`1`)/dilation_h);
818
819	int s0 = slice_i;
820	for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
821	{
822	// Pack 8
823	if (ok_i && x0 + `8` <= x1 && `0` <= in_w &&
824	in_w + stride_w`8` <= Wi - (Wk-`1`)dilation_w)
825	{
826	packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
827	}
828	else if (ok_i && x0 + `2` <= x1 && `0` <= in_w &&
829	in_w + stride_w`2` <= Wi - (Wk-`1`)dilation_w)
830	{
831	packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz);
832	}
833	else
834	{
835	int w0 = std::max(a: `0`, b: (-in_w + dilation_w-`1`)/dilation_w);
836	int w1 = std::min(a: Wk, b: (Wi - in_w + dilation_w-`1`)/dilation_w);
837	const float* inptrInC = inptrIn;
838	#ifdef CONV_ARM_FP16
839	if (useFP16)
840	{
841	__fp16* inpbufC = (__fp16* )inpbuf + s0;
842
843	for ( int d = d0; d < d1; d++)
844	{
845	for (int h = h0; h < h1; h++)
846	{
847	for (int w = w0; w < w1; w++)
848	{
849	int imgofs = ddilation_dHWi + h(dilation_hWi) + w*dilation_w;
850	inpbufC[((dHk + h)Wk + w)CONV_NR] = (__fp16*)inptrInC[imgofs];
851	}
852	}
853	}
854	}
855	else
856	#endif
857	{
858	float* inpbufC = (float* )inpbuf + s0;
859
860	for ( int d = d0; d < d1; d++)
861	{
862	for (int h = h0; h < h1; h++)
863	{
864	for (int w = w0; w < w1; w++)
865	{
866	int imgofs = ddilation_dHWi + h(dilation_hWi) + w*dilation_w;
867	inpbufC[((dHk + h)Wk + w)*CONV_NR] = inptrInC[imgofs];
868	}
869	}
870	}
871	}
872	}
873	}
874	slice_i += delta;
875	}
876	}
877	}
878	else
879	{
880	const int HW0 = H0 * W0;
881	const int HWi = Hi * Wi;
882	int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0;
883	int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0;
884	for (int k = `0`; k < ksize; k++)
885	{
886	int dz = dhwTab[k * `3`], dy = dhwTab[k * `3` + `1`], dx = dhwTab[k * `3` + `2`];
887	int i = `0`, z0 = z0_, y0 = y0_, x0 = x0_;
888	for (; i < CONV_NR;)
889	{
890	float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
891	#ifdef CONV_ARM_FP16
892	__fp16 * inpbuf_ki_FP16 = (__fp16 )inpbuf + k CONV_NR * Cg + i;
893	#endif
894
895	int zi = z0 * stride_d + dz - pad_front;
896	int yi = y0 * stride_h + dy - pad_top;
897	int xi = x0 * stride_w + dx - pad_left;
898
899	if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi &&
900	(unsigned) xi < (unsigned) Wi)
901	{
902	const float inptr_ki = inptr + zi HWi + yi * Wi + xi;
903	if (i + `8` <= CONV_NR && x0 + `8` <= W0 && xi + stride_w * `8` <= Wi)
904	{
905	if (stride_w == `1`)
906	{
907	#ifdef CONV_ARM_FP16
908	if (useFP16)
909	{
910	for (int c = `0`; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
911	{
912	float32x4_t v0 = vld1q_f32(inptr_ki);
913	float32x4_t v1 = vld1q_f32(inptr_ki + `4`);
914
915	vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
916	}
917	}
918	else
919	#endif
920	for (int c = `0`; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
921	{
922	float t0 = inptr_ki[`0`], t1 = inptr_ki[`1`];
923	float t2 = inptr_ki[`2`], t3 = inptr_ki[`3`];
924	float t4 = inptr_ki[`4`], t5 = inptr_ki[`5`];
925	float t6 = inptr_ki[`6`], t7 = inptr_ki[`7`];
926	inpbuf_ki[`0`] = t0;
927	inpbuf_ki[`1`] = t1;
928	inpbuf_ki[`2`] = t2;
929	inpbuf_ki[`3`] = t3;
930	inpbuf_ki[`4`] = t4;
931	inpbuf_ki[`5`] = t5;
932	inpbuf_ki[`6`] = t6;
933	inpbuf_ki[`7`] = t7;
934	}
935	}
936	else if (stride_w == `2`)
937	{
938	#ifdef CONV_ARM_FP16
939	if (useFP16)
940	{
941	for (int c = `0`; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
942	{
943	float32x4_t v0 = {inptr_ki[`0`], inptr_ki[`2`], inptr_ki[`4`], inptr_ki[`6`]};
944	float32x4_t v1 = {inptr_ki[`8`], inptr_ki[`10`], inptr_ki[`12`], inptr_ki[`14`]};
945	vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
946	}
947	}
948	else
949	#endif
950	for (int c = `0`; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
951	{
952	float t0 = inptr_ki[`0`], t1 = inptr_ki[`2`];
953	float t2 = inptr_ki[`4`], t3 = inptr_ki[`6`];
954	float t4 = inptr_ki[`8`], t5 = inptr_ki[`10`];
955	float t6 = inptr_ki[`12`], t7 = inptr_ki[`14`];
956	inpbuf_ki[`0`] = t0;
957	inpbuf_ki[`1`] = t1;
958	inpbuf_ki[`2`] = t2;
959	inpbuf_ki[`3`] = t3;
960	inpbuf_ki[`4`] = t4;
961	inpbuf_ki[`5`] = t5;
962	inpbuf_ki[`6`] = t6;
963	inpbuf_ki[`7`] = t7;
964	}
965	}
966	else
967	{
968	#ifdef CONV_ARM_FP16
969	if (useFP16)
970	{
971	for (int c = `0`; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
972	{
973	float32x4_t v0 = {inptr_ki[`0`], inptr_ki[stride_w], inptr_ki[stride_w * `2`], inptr_ki[stride_w * `3`]};
974	float32x4_t v1 = {inptr_ki[stride_w * `4`], inptr_ki[stride_w * `5`], inptr_ki[stride_w * `6`], inptr_ki[stride_w * `7`]};
975	vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
976	}
977	}
978	else
979	#endif
980	for (int c = `0`; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
981	{
982	float t0 = inptr_ki[`0`], t1 = inptr_ki[stride_w];
983	float t2 = inptr_ki[stride_w * `2`], t3 = inptr_ki[stride_w * `3`];
984	float t4 = inptr_ki[stride_w * `4`], t5 = inptr_ki[stride_w * `5`];
985	float t6 = inptr_ki[stride_w * `6`], t7 = inptr_ki[stride_w * `7`];
986	inpbuf_ki[`0`] = t0;
987	inpbuf_ki[`1`] = t1;
988	inpbuf_ki[`2`] = t2;
989	inpbuf_ki[`3`] = t3;
990	inpbuf_ki[`4`] = t4;
991	inpbuf_ki[`5`] = t5;
992	inpbuf_ki[`6`] = t6;
993	inpbuf_ki[`7`] = t7;
994	}
995	}
996	i += `8`;
997	x0 += `8`;
998	}
999	else if (i + `4` <= CONV_NR && x0 + `4` <= W0 && xi + stride_w * `4` <= Wi)
1000	{
1001	if (stride_w == `1`)
1002	{
1003	#ifdef CONV_ARM_FP16
1004	if (useFP16)
1005	{
1006	for (int c = `0`; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
1007	{
1008	float32x4_t v0 = vld1q_f32(inptr_ki);
1009	vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0));
1010	}
1011	}
1012	else
1013	#endif
1014	for (int c = `0`; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
1015	{
1016	float t0 = inptr_ki[`0`], t1 = inptr_ki[`1`];
1017	float t2 = inptr_ki[`2`], t3 = inptr_ki[`3`];
1018	inpbuf_ki[`0`] = t0;
1019	inpbuf_ki[`1`] = t1;
1020	inpbuf_ki[`2`] = t2;
1021	inpbuf_ki[`3`] = t3;
1022	}
1023	}
1024	else
1025	{
1026	#ifdef CONV_ARM_FP16
1027	if (useFP16)
1028	{
1029	for (int c = `0`; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
1030	{
1031	float32x4_t v0 = {inptr_ki[`0`], inptr_ki[stride_w], inptr_ki[stride_w * `2`], inptr_ki[stride_w * `3`]};
1032	vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0));
1033	}
1034	}
1035	else
1036	#endif
1037	for (int c = `0`; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
1038	{
1039	float t0 = inptr_ki[`0`], t1 = inptr_ki[stride_w];
1040	float t2 = inptr_ki[stride_w * `2`], t3 = inptr_ki[stride_w * `3`];
1041	inpbuf_ki[`0`] = t0;
1042	inpbuf_ki[`1`] = t1;
1043	inpbuf_ki[`2`] = t2;
1044	inpbuf_ki[`3`] = t3;
1045	}
1046	}
1047	i += `4`;
1048	x0 += `4`;
1049	}
1050	else
1051	{
1052	#ifdef CONV_ARM_FP16
1053	if (useFP16)
1054	{
1055	for (int c = `0`; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
1056	inpbuf_ki_FP16[`0`] = (__fp16)(*inptr_ki);
1057	}
1058	else
1059	#endif
1060	for (int c = `0`; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
1061	inpbuf_ki = inptr_ki;
1062	i++;
1063	x0++;
1064	}
1065	}
1066	else
1067	{
1068	#ifdef CONV_ARM_FP16
1069	if (useFP16)
1070	{
1071	for (int c = `0`; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
1072	inpbuf_ki_FP16[`0`] = (__fp16)`0.f`;
1073	}
1074	else
1075	#endif
1076	for (int c = `0`; c < Cg; c++, inpbuf_ki += CONV_NR)
1077	inpbuf_ki[`0`] = `0.f`;
1078	i++;
1079	x0++;
1080	}
1081
1082	int mask = x0 >= W0;
1083	y0 += mask;
1084	x0 &= mask - `1`;
1085
1086	mask = y0 >= H0; // Only Conv 3D need jump at z0 dimension
1087	if (mask && conv_dim != CONV_3D)
1088	break;
1089
1090	z0 += mask;
1091	y0 &= mask - `1`;
1092	}
1093	}
1094	}
1095	}
1096	}
1097
// runFastConv: generic CPU convolution driver.
//
// Flow visible in this function:
//   1. Validate that input/output agree in dims, batch size, channel counts and type, and are continuous.
//   2. CONV_TYPE_DEPTHWISE is delegated to runDepthwise(); CONV_TYPE_WINOGRAD3X3 is tried through
//      runWinograd63() and the function returns if that call reports success.
//   3. Otherwise the input is packed with packInputData() (either once up front when `separateIm2col`
//      is set, or per block inside the compute loop) and multiplied with the packed weights by the
//      convBlock* / convBlockMR1* kernels, split over `ntasks` parallel_for_ tasks.
//   4. Bias, the optional fused add, the optional min/max clamp and the optional activation layer
//      are applied while results are written to the output.
//
// Parameters:
//   _input / _output - source and destination tensors, accessed here as float data.
//   conv             - convolution descriptor (kernel/stride/dilation/pad sizes, packed weights, bias, flags).
//   ntasks           - number of tasks passed to parallel_for_ (Range(0, ntasks)).
//   actLayer         - optional activation; ReLU with zero negative slope and ReLU6 are turned into a
//                      min/max clamp, any other activation is applied through forwardSlice().
//   reluslope        - in this function only forwarded to runDepthwise().
//   fusedAdd         - when true, the existing contents of _output are added to the result
//                      (asserted to be unsupported for CONV_3D).
//
// NOTE(review): this listing looks like it lost some '*' characters during extraction
// (e.g. "DiHiWi", "task_idNC/ntasks", "char weights = nullptr*"); code tokens are left untouched here.
1098	void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
1099	const Ptr<ActivationLayer>& actLayer, const std::vector<float>& reluslope, bool fusedAdd)
1100	{
1101	Mat input = _input.getMat();
1102	Mat output = _output.getMat();
1103	int conv_dim = conv ->conv_dim;
1104
// Basic consistency checks between the tensors and the convolution descriptor.
1105	CV_Assert_N(input.dims == output.dims,
1106	input.size[`0`] == output.size[`0`],
1107	conv ->C == input.size[`1`],
1108	conv ->K == output.size[`1`],
1109	input.type() == output.type(),
1110	input.isContinuous(),
1111	output.isContinuous());
1112
1113	const bool useFP16 = conv ->useFP16;
// For Conv+Add fusion the addend is read from the output tensor itself.
1114	Mat fusedAddMat;
1115	if (fusedAdd)
1116	{
1117	CV_Assert(conv ->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!");
1118	fusedAddMat = _output.getMat();
1119	}
1120
1121	if (conv ->conv_type == CONV_TYPE_DEPTHWISE)
1122	{
1123	// Depthwise-Convolution layer should not be followed by Add layer.
1124	CV_Assert((conv_dim == CONV_1D \|\| conv_dim == CONV_2D) && !useFP16);
1125	return runDepthwise(input: input, output: output, conv, activ: actLayer.get(), reluslope, fusedAdd);
1126	}
1127
1128	MatShape inputShape = shape(mat: input);
1129	MatShape outputShape = shape(mat: output);
1130
1131	CV_Assert(inputShape.size() == outputShape.size());
1132
// Activation handling: `activ` stays non-null only for activations that cannot be expressed as a
// plain clamp to [minval, maxval]; clamp-style activations set ifMinMaxAct instead.
1133	ActivationLayer* activ = nullptr;
1134	float minval = -FLT_MAX, maxval = FLT_MAX;
1135	bool ifMinMaxAct = false;
1136
1137	if (actLayer)
1138	{
1139	Ptr<ReLULayer> activ_relu = actLayer.dynamicCast<ReLULayer>();
1140	Ptr<ReLU6Layer> activ_relu6 = actLayer.dynamicCast<ReLU6Layer>();
1141
1142	if (!activ_relu.empty())
1143	{
1144	if (activ_relu ->negativeSlope == `0.0f`)
1145	{
1146	minval = `0.0f`;
1147	ifMinMaxAct = true;
1148	activ = nullptr;
1149	}
1150	else // Leaky ReLU
1151	{
1152	activ = actLayer.get();
1153	}
1154	}
1155	else if (!activ_relu6.empty())
1156	{
1157	minval = activ_relu6 ->minValue;
1158	maxval = activ_relu6 ->maxValue;
1159
1160	ifMinMaxAct = true;
1161	activ = nullptr;
1162	}
1163	else
1164	activ = actLayer.get();
1165	}
1166	else
1167	activ = nullptr;
1168
// Winograd path: if runWinograd63() returns false, execution falls through to the generic path below.
1169	if (conv ->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
1170	{
1171	CV_Assert((!conv ->weightsWinoBuf.empty() \|\| !conv ->weightsWinoBuf_FP16.empty()) && input.dims == `4` && conv_dim == CONV_2D);
1172	if (runWinograd63(input: input, fusedAddMat: fusedAddMat, output: output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
1173	return;
1174	}
1175
1176	int N = inputShape [`0`], C = inputShape [`1`];
1177
1178	// input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D.
1179	int Di = conv_dim == CONV_3D ? inputShape [`2`] : `1`;
1180	int Hi = conv_dim == CONV_1D ? `1` : inputShape [inputShape.size() - `2`];
1181	int Wi = inputShape [inputShape.size() - `1`];
1182
1183	int ngroups = conv ->ngroups;
1184	int K = conv ->K, Dk = conv ->Dk, Hk = conv ->Hk, Wk = conv ->Wk;
1185
// Output extents, derived from the output shape with the same per-dimension layout as the input.
1186	int D0 = conv_dim == CONV_3D ? outputShape [`2`] : `1`;
1187	int H0 = conv_dim == CONV_1D ? `1` : outputShape [outputShape.size() - `2`];
1188	int W0 = outputShape [outputShape.size() - `1`];
1189
// Channels per group on the input (Cg) and output (Kg) side.
1190	int Cg = C/ngroups, Kg = K/ngroups;
1191
1192	const size_t inp_planesize = (size_t)DiHiWi;
1193	const size_t out_planesize = (size_t)D0H0W0;
1194
1195	int pad_front = conv ->pad_front;
1196	int pad_top = conv ->pad_top;
1197	int pad_left = conv ->pad_left;
1198
1199	int stride_d = conv ->stride_d, stride_h = conv ->stride_h, stride_w = conv ->stride_w;
1200	int dilation_d = conv ->dilation_d, dilation_h = conv ->dilation_h, dilation_w = conv ->dilation_w;
1201
// fast_1x1: single-tap kernel with unit strides and no leading padding.
1202	int ksize = DkHkWk;
1203	bool fast_1x1 = ksize == `1` && stride_d == `1` && stride_w == `1` && stride_h == `1`
1204	&& pad_front == `0` && pad_left == `0` && pad_top == `0`;
1205	int DkHkWkCg = DkHkWk*Cg;
1206
// One allocation holds two tables: `ofstab` (per-tap linear input offset) followed by
// `dhwTab` (per-tap dilated (d, h, w) displacement, 3 ints per tap).
1207	std::vector<int> ofstab_(HkWkDk*`4`, `0`);
1208	int* ofstab = ofstab_.data();
1209	int* dhwTab = ofstab + HkWkDk;
1210	int padded_ksize = ((ksize + VEC_ALIGN-`1`) / VEC_ALIGN) * VEC_ALIGN;
1211
1212	if (conv_dim == CONV_1D)
1213	{
1214	for( int w = `0`; w < Wk; w++)
1215	{
1216	int dw = w*dilation_w;
1217	dhwTab[w*`3`+`2`] = dw;
1218	ofstab[w] = dw;
1219	}
1220	}
1221	else if (conv_dim == CONV_2D)
1222	{
1223	for (int h = `0`; h < Hk; h++)
1224	for( int w = `0`; w < Wk; w++)
1225	{
1226	int k = h*Wk + w;
1227	int dh = hdilation_h, dw = wdilation_w;
1228	dhwTab[k*`3`+`1`] = dh;
1229	dhwTab[k*`3`+`2`] = dw;
1230	ofstab[k] = dh*Wi + dw;
1231	}
1232	}
1233	else
1234	{
1235	for (int d = `0`; d < Dk; d++)
1236	for (int h = `0`; h < Hk; h++)
1237	{
1238	for (int w = `0`; w < Wk; w++)
1239	{
1240	int k = dHkWk + h*Wk + w;
1241	int dd = ddilation_d, dh = hdilation_h, dw = w*dilation_w;
1242	dhwTab[k*`3`] = dd;
1243	dhwTab[k*`3`+`1`] = dh;
1244	dhwTab[k*`3`+`2`] = dw;
1245	ofstab[k] = ddHiWi + dh*Wi + dw;
1246	}
1247	}
1248	}
1249
// Micro-kernel tile sizes and packed-element size; FP32 values by default, replaced below for FP16.
1250	int CONV_NR = CONV_NR_FP32;
1251	int CONV_MR = CONV_MR_FP32;
1252	int esz = sizeof(float );
1253
1254	#ifdef CONV_ARM_FP16
1255	if (useFP16)
1256	{
1257	// works at FP 16.
1258	CONV_NR = CONV_NR_FP16;
1259	CONV_MR = CONV_MR_FP16;
1260	esz = sizeof(__fp16);
1261	}
1262	#endif
1263
1264	int MAX_STRIPES = conv ->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? `1` : (`56` + CONV_NR - `1`)/CONV_NR;
1265
1266	// Friendly to L1 cache
1267	const int K_BLOCK_SIZE = conv ->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? `1` : `32`;
1268	const int C_BLOCK_SIZE = `256`;
1269
1270	int Kg_nblocks = (Kg + CONV_MR-`1`)/CONV_MR;
1271	int Kg_aligned = Kg_nblocks * CONV_MR;
1272
1273	int stripes_per_plane0 = ((int)out_planesize + CONV_NR - `1`) / CONV_NR;
1274	int stripes_per_plane = stripes_per_plane0;
1275
// Work splitting: with few stripes per plane (or for DEPTHWISE_REMAIN) subtasks are split along the
// output-channel blocks (Kg_nblocks); otherwise they are split along the stripes of the output plane.
1276	if (stripes_per_plane < ntasks * `4` \|\| conv ->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
1277	{
1278	MAX_STRIPES = `1`;
1279	stripes_per_plane = `1`;
1280	}
1281	else
1282	Kg_nblocks = `1`;
1283
// When set, the whole input is packed once in a separate parallel phase before the compute phase.
1284	bool separateIm2col = (fast_1x1 \|\| stripes_per_plane == `1`) && conv ->conv_type != CONV_TYPE_DEPTHWISE_REMAIN;
1285
1286	int Kstripes = Kg_nblocks * stripes_per_plane;
1287	int nsubtasks = N * ngroups * Kstripes;
1288
// stripesize / cbufsize are element counts; they are scaled to bytes (esz / sizeof(float)) below.
1289	size_t stripesize = alignSize(sz: CONV_NR * ksize * Cg, n: VEC_ALIGN);
1290	size_t cbufsize = alignSize(sz: CONV_NR * K_BLOCK_SIZE * MAX_STRIPES, n: VEC_ALIGN);
1291
1292	size_t taskbufsize = cbufsize * sizeof(float );
1293
1294	if (!separateIm2col)
1295	taskbufsize += MAX_STRIPES * stripesize * esz;
1296
1297	size_t totalbufsize_base = taskbufsize * ntasks;
1298	size_t totalbufsize = totalbufsize_base;
1299	if (separateIm2col)
1300	totalbufsize += N * ngroups * stripes_per_plane0 * stripesize * esz;
1301
// Scratch layout: [per-task buffers (taskbufsize each) x ntasks][optional pre-packed input at inpbuf_all_0].
1302	AutoBuffer<char> inpbuf_all_;
1303	char* inpbuf_all = nullptr;
1304
1305	inpbuf_all_.allocate(size: totalbufsize + VEC_ALIGN * sizeof(float ));
1306	inpbuf_all = alignPtr(ptr: inpbuf_all_.data(), n: (int)(VEC_ALIGN * sizeof(float )));
1307	char* inpbuf_all_0 = inpbuf_all + totalbufsize_base;
1308
1309	float* inp = input.ptr<float>();
1310	float* out = output.ptr<float>();
1311	float* fusedAddPtr0 = fusedAddMat.empty() ? `0` : fusedAddMat.ptr<float>();
1312
1313	// In the case of 1x1 convolution we first reorder the whole input tensor.
1314	// In general, im2row results in a Hk*Wk-x unrolling factor
1315	// (e.g. 3*3=9x unrolling for 3x3 convolution), thus for 1x1 convolution
1316	// the reordered tensor will take as much space as the original tensor.
1317	if (separateIm2col)
1318	{
1319	// the optional phase 1. im2row
1320	parallel_for_(range: Range (`0`, ntasks), functor: [&](const Range& r0) {
1321	for (int task_id = r0.start; task_id < r0.end; task_id++)
1322	{
1323	if (fast_1x1)
1324	{
// 1x1 case: each task takes a contiguous range [nc0, nc1) of (batch, channel) planes and packs
// up to `dc` channels at a time, never crossing a group boundary.
1325	int nc0 = task_idNC/ntasks, nc1 = (task_id+`1`)NC/ntasks, dc = `0`;
1326	for (; nc0 < nc1; nc0 += dc)
1327	{
1328	int n = nc0/C, c0 = nc0 - n*C;
1329	int g = c0 / Cg;
1330	c0 -= g*Cg;
1331	dc = Cg - c0 <= nc1 - nc0 ? Cg - c0 : nc1 - nc0;
1332
1333	float * inptr_ = inp + (size_t)nc0*inp_planesize;
1334	char* inpbuf_ = inpbuf_all_0 + ((nngroups + g)stripes_per_plane0stripesize + c0CONV_NR)*esz;
1335
1336	packInputData(inpbuf_task: inpbuf_, inp: inptr_, ofstab, dhwTab, zyx0: `0`, zyx_limit: out_planesize, ksize, stride_d, stride_h,
1337	stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
1338	Di, Hi, Wi, H0, W0, Cg: dc, stripesize, inp_plane_ofs: `0`, inp_planesize, conv_dim: conv ->conv_dim,
1339	conv_type: conv ->conv_type, CONV_NR, esz, fast_1x1, useFP16);
1340	}
1341	}
1342	else
1343	{
// General case: each task takes a contiguous range [ngs0, ngs1) of (batch, group, stripe) items and
// packs up to `ds` stripes at a time, never crossing a (batch, group) boundary.
1344	const int allTasks = N * ngroups * stripes_per_plane0;
1345	int ngs0 = task_idallTasks/ntasks, ngs1 = (task_id+`1`)allTasks/ntasks, ds = `0`;
1346
1347	for (; ngs0 < ngs1; ngs0 += ds)
1348	{
1349	int n = ngs0 / (ngroups * stripes_per_plane0), gs0 = ngs0 - nngroupsstripes_per_plane0;
1350	int g = gs0 / stripes_per_plane0, s0 = gs0 - g*stripes_per_plane0;
1351
1352	ds = stripes_per_plane0 - s0 <= ngs1 - ngs0 ? stripes_per_plane0 - s0 : ngs1 - ngs0;
1353
1354	int zyx = s0 * CONV_NR;
1355	int zyx_limit = (s0 + ds) * CONV_NR < out_planesize ? (s0 + ds) * CONV_NR : out_planesize;
1356
1357	float * inptr_ = inp + (size_t)(n * ngroups + g) * Cg * inp_planesize;
1358	char* inpbuf_ = inpbuf_all_0 + ((n * ngroups + g) * stripes_per_plane0 * stripesize + s0 * stripesize) * esz;
1359
1360	packInputData(inpbuf_task: inpbuf_, inp: inptr_, ofstab, dhwTab, zyx0: zyx, zyx_limit, ksize, stride_d, stride_h,
1361	stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
1362	Di, Hi, Wi, H0, W0, Cg, stripesize, inp_plane_ofs: `0`, inp_planesize, conv_dim: conv ->conv_dim,
1363	conv_type: conv ->conv_type, CONV_NR, esz, fast_1x1, useFP16);
1364	}
1365	}
1366	}
1367	});
1368	}
1369
1370	// Compute
1371	parallel_for_(range: Range (`0`, ntasks), functor: [&](const Range& r0) {
1372	for (int task_id = r0.start; task_id < r0.end; task_id++)
1373	{
// Per-task scratch: the accumulator buffer comes first, the task-local packed input follows it.
1374	float * cbuf_task = (float )(inpbuf_all + taskbufsize task_id);
1375	char * inpbuf_task = (char*)(cbuf_task + cbufsize);
1376
// Subtask range [ngs0, ngs1) owned by this task.
1377	int ngs0 = (int)((size_t)nsubtasks * task_id / ntasks);
1378	int ngs1 = (int)((size_t)nsubtasks * (task_id+`1`) / ntasks);
1379	for (int subtask = ngs0; subtask < ngs1; )
1380	{
1381	int ng = subtask / Kstripes;
1382	int kzyx0 = subtask - ng * Kstripes;
1383	int kzyx1 = kzyx0 + (ngs1 - subtask);
1384	int n = ng / ngroups, g = ng % ngroups; // i.e. g == ng - n * ngroups
1385	size_t inp_plane_ofs = (size_t)(n * ngroups + g) * Cg * inp_planesize;
1386	kzyx1 = kzyx1 <= Kstripes ? kzyx1 : Kstripes;
1387	subtask += kzyx1 - kzyx0;
1388	int k0, k1;
1389	int zyx0, zyx_limit, zyx_block_limit = `0`;
1390
// [kzyx0, kzyx1) is interpreted either as a range of output-channel blocks (whole plane processed)
// or as a range of output-plane stripes (all Kg channels processed), matching the split chosen above.
1391	if (stripes_per_plane == `1` \|\| conv ->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
1392	{
1393	k0 = kzyx0 * CONV_MR;
1394	k1 = kzyx1 * CONV_MR;
1395	k1 = k1 <= Kg ? k1 : Kg;
1396	zyx0 = `0`;
1397	zyx_limit = (int)out_planesize;
1398	}
1399	else
1400	{
1401	k0 = `0`;
1402	k1 = Kg;
1403	zyx0 = kzyx0 * CONV_NR;
1404	zyx_limit = kzyx1 * CONV_NR;
1405	zyx_limit = zyx_limit < out_planesize ? zyx_limit : (int)out_planesize;
1406	}
1407
1408	for (; zyx0 < zyx_limit; zyx0 = zyx_block_limit)
1409	{
1410	// step 1. extract part of input tensor and represent it in zigzag form
1411	zyx_block_limit = zyx0 + CONV_NR * MAX_STRIPES;
1412	zyx_block_limit = zyx_block_limit < zyx_limit ? zyx_block_limit : zyx_limit;
1413
1414	int nstripes = (zyx_block_limit - zyx0 + CONV_NR - `1`) / CONV_NR;
1415
1416	CV_Assert(nstripes <= MAX_STRIPES);
1417
1418	if (!separateIm2col)
1419	{
1420	packInputData(inpbuf_task, inp, ofstab, dhwTab, zyx0, zyx_limit: zyx_block_limit, ksize, stride_d, stride_h,
1421	stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w,
1422	Di, Hi, Wi, H0, W0, Cg, stripesize, inp_plane_ofs, inp_planesize, conv_dim: conv ->conv_dim,
1423	conv_type: conv ->conv_type, CONV_NR, esz, fast_1x1, useFP16);
1424	}
1425
// Select the packed weights buffer; it is addressed in bytes so FP16 and FP32 share the offset code.
1426	char weights = nullptr*;
1427	#ifdef CONV_ARM_FP16
1428	if (useFP16)
1429	{
1430	CV_Assert(!conv->weightsBuf_FP16.empty());
1431	weights = (char *)conv->getWeightsFP16();
1432	}
1433	else
1434	#endif
1435	{
1436	CV_Assert(!conv ->weightsBuf.empty());
1437	weights = (char *)conv ->getWeights();
1438	}
1439	// optional branch, only for depth-wise convolution which was implemented by generic convolution.
1440	// In this case, CONV_MR is 1, and CONV_NR remains the same.
1441	if (conv ->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
1442	{
1443	CV_Assert(weights);
1444	size_t outofs = (n * ngroups + g) * out_planesize + zyx0;
1445	float *cptr0 = cbuf_task;
1446	weights += g * padded_ksize * esz;
1447
1448	int out_width = zyx_block_limit - zyx0;
1449	float *outptr = out + outofs;
1450	const float biasVal = *(conv ->biasBuf.data() + g);
1451	const char inptr_ = separateIm2col ? inpbuf_all_0 + (ng stripes_per_plane0 + zyx0 / CONV_NR) * stripesize * esz :
1452	inpbuf_task;
1453
1454	for (int stripe = `0`; stripe < nstripes; stripe++)
1455	{
1456	const char inptr = inptr_ + stripe stripesize * esz;
1457	const int outLen = std::min(a: out_width - stripe * CONV_NR, b: CONV_NR);
// A partial stripe is processed in the task scratch buffer (cptr0): the current output values are
// copied in before the kernel call and the first outLen results are copied back afterwards.
1458	bool ifBuffer = outLen < CONV_NR;
1459	float cptr = outptr + stripe CONV_NR;
1460	if (ifBuffer)
1461	{
1462	memcpy(dest: cptr0, src: cptr, n: outLen * sizeof(float ));
1463	cptr = cptr0;
1464	}
1465	#if CV_NEON && CV_NEON_AARCH64
1466	if (conv->useNEON)
1467	{
1468	#ifdef CONV_ARM_FP16
1469	if (useFP16)
1470	{
1471	opt_NEON_FP16::convBlockMR1_F16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
1472	}
1473	else
1474	#endif
1475	opt_NEON::convBlockMR1_F32(DkHkWkCg, (const float )weights, (const* float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
1476	}
1477	else
1478	#endif
1479	convBlockMR1_F32(np: DkHkWkCg, a: (const float )weights, b: (const* float *)inptr, c: cptr, bias: biasVal, init_c: fusedAdd, minval, maxval, ifMinMaxAct, outLen, convNR: CONV_NR);
1480
1481	if (ifBuffer)
1482	{
1483	memcpy(dest: outptr + stripe * CONV_NR, src: cptr, n: outLen * sizeof(float ));
1484	}
1485	}
1486	if (activ)
1487	activ->forwardSlice(src: outptr, dst: outptr, len: out_width, outPlaneSize: out_planesize, cn0: g, cn1: g + `1`);
1488	continue;
1489	}
1490
1491	CV_Assert(weights);
1492	weights += g * Kg_aligned * DkHkWkCg * esz;
1493
1494	const float biasptr = conv ->biasBuf.data() + Kg g;
// Row stride (in elements) of the accumulator buffer cbuf_task.
1495	int ldc = nstripes * CONV_NR;
1496
1497	// 2. do convolution, compute Kg x (zyx_block_limit - zyx0) part of the output tensor
1498	int out_width = zyx_block_limit - zyx0;
1499	for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE)
1500	{
1501	int k1_block = k0_block + K_BLOCK_SIZE < k1 ? k0_block + K_BLOCK_SIZE : k1;
// The reduction dimension is processed in chunks of C_BLOCK_SIZE; the first chunk (c0 == 0)
// initialises the accumulators, later chunks add to them (see the init_c argument below).
1502	for (int c0 = `0`; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE)
1503	{
1504	int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg;
1505	const char inptr = separateIm2col ? inpbuf_all_0 + (ng stripes_per_plane0 + zyx0 / CONV_NR) * stripesize * esz :
1506	inpbuf_task;
1507	inptr += (c0 * CONV_NR) * esz;
1508	for (int stripe = `0`; stripe < nstripes; stripe++, inptr += stripesize * esz)
1509	{
1510	const int outLen = std::min(a: out_width - stripe * CONV_NR, b: CONV_NR);
1511
1512	char wptr = weights + (k0_block DkHkWkCg + c0 * CONV_MR) * esz;
1513	float cptr = cbuf_task + stripe CONV_NR;
1514	hfloat* cptr_f16 = (hfloat)cbuf_task + stripeCONV_NR;
// Kernel dispatch: AVX2, AVX, NEON (FP16 or FP32) if enabled at build time and in `conv`,
// otherwise the generic convBlock_F32.
1515	for (int k = k0_block; k < k1_block; k += CONV_MR,
1516	wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc)
1517	{
1518	#if CV_TRY_AVX2
1519	if (conv ->useAVX2)
1520	opt_AVX2::convBlock_F32(np: c1 - c0, a: (const float )wptr, b: (const* float *)inptr, c: cptr, ldc, init_c: c0 == `0`, width: outLen, convMR: CONV_MR, convNR: CONV_NR);
1521	else
1522	#endif
1523	#if CV_TRY_AVX
1524	if (conv ->useAVX)
1525	opt_AVX::convBlock_F32(np: c1 - c0, a: (const float )wptr, b: (const* float *)inptr, c: cptr, ldc, init_c: c0 == `0`, width: outLen, convMR: CONV_MR, convNR: CONV_NR);
1526	else
1527	#endif
1528	#if CV_NEON
1529	if (conv->useNEON)
1530	{
1531	#ifdef CONV_ARM_FP16
1532	if (useFP16)
1533	{
1534	opt_NEON_FP16::convBlock_F16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == `0`, outLen, CONV_MR, CONV_NR);
1535	}
1536	else
1537	#endif
1538	opt_NEON::convBlock_F32(c1 - c0, (const float )wptr, (const* float *)inptr, cptr, ldc, c0 == `0`, outLen, CONV_MR, CONV_NR);
1539	}
1540	else
1541	#endif
1542	// The possible outLen range is 24 or 8~1.
1543	convBlock_F32(np: c1 - c0, a: (const float )wptr, b: (const* float *)inptr, c: cptr, ldc, init_c: c0 == `0`, outLen, convMR: CONV_MR, convNR: CONV_NR);
1544	}
1545	}
1546	}
1547
// Write-back: for every output channel of this block, add the bias (and the fused addend, if any),
// apply the optional clamp, store to the output, then run the remaining activation if present.
1548	size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
1549	const float *cptr = cbuf_task;
1550	const hfloat cptr_fp16 = (const* hfloat *)cbuf_task;
1551	float *outptr = out + outofs;
1552	const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : `0`;
1553
1554	for (int k = k0_block; k < k1_block; k++,
1555	cptr += ldc, cptr_fp16 += ldc, outptr += out_planesize,
1556	pbptr += (pbptr ? out_planesize : `0`))
1557	{
1558	float biasval = biasptr[k];
1559	int j = `0`;
1560
1561	#ifdef CONV_ARM_FP16
1562	if (useFP16)
1563	{
// FP16 accumulators are widened to FP32 before bias/add/clamp; 8 values per iteration, scalar tail below.
1564	float32x4_t vbias = vdupq_n_f32(biasval);
1565	float32x4_t vmax = vdupq_n_f32(maxval);
1566	float32x4_t vmin = vdupq_n_f32(minval);
1567	if (pbptr)
1568	{
1569	for (; j + `7` < out_width; j += `8`)
1570	{
1571	float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias;
1572	float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + + j + `4`)) + vbias;
1573
1574	v0 += vld1q_f32(pbptr + j);
1575	v1 += vld1q_f32(pbptr + j + `4`);
1576
1577	if (ifMinMaxAct)
1578	{
1579	v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax);
1580	v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax);
1581	}
1582
1583	vst1q_f32(outptr + j, v0);
1584	vst1q_f32(outptr + j + `4`, v1);
1585	}
1586	}
1587	else
1588	{
1589	for (; j + `7` < out_width; j += `8`)
1590	{
1591	float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias;
1592	float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j + `4`)) + vbias;
1593
1594	if (ifMinMaxAct)
1595	{
1596	v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax);
1597	v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax);
1598	}
1599
1600	vst1q_f32(outptr + j, v0);
1601	vst1q_f32(outptr + j + `4`, v1);
1602	}
1603	}
1604
1605	if (pbptr)
1606	{
1607	for (; j < out_width; j++)
1608	{
1609	float v = (float )cptr_fp16[j] + biasval;
1610	v += pbptr[j];
1611	if (ifMinMaxAct)
1612	v = std::min(std::max(v, minval), maxval);
1613	outptr[j] = v;
1614	}
1615	}
1616	else
1617	{
1618	for (; j < out_width; j++)
1619	{
1620	float v = (float )cptr_fp16[j] + biasval;
1621
1622	if (ifMinMaxAct)
1623	v = std::min(std::max(v, minval), maxval);
1624	outptr[j] = v;
1625	}
1626	}
1627	}
1628	else
1629	#endif
1630	{
// FP32 path: 8 values per iteration with universal intrinsics when available, scalar tail below.
1631	#if CV_SIMD128
1632	v_float32x4 vbias = v_setall_f32(v: biasval);
1633	v_float32x4 vmax = v_setall_f32(v: maxval);
1634	v_float32x4 vmin = v_setall_f32(v: minval);
1635
1636	if (pbptr)
1637	{
1638	for (; j + `7` < out_width; j += `8`)
1639	{
1640	v_float32x4 v0 = v_add(a: v_load(ptr: cptr + j), b: vbias);
1641	v_float32x4 v1 = v_add(a: v_load(ptr: cptr + j + `4`), b: vbias);
1642
1643	v0 = v_add(a: v0, b: v_load(ptr: pbptr + j));
1644	v1 = v_add(a: v1, b: v_load(ptr: pbptr + j + `4`));
1645
1646	if (ifMinMaxAct)
1647	{
1648	v0 = v_min(a: v_max(a: v0, b: vmin), b: vmax);
1649	v1 = v_min(a: v_max(a: v1, b: vmin), b: vmax);
1650	}
1651
1652	v_store(ptr: outptr + j, a: v0);
1653	v_store(ptr: outptr + j + `4`, a: v1);
1654	}
1655	}
1656	else
1657	{
1658	for (; j + `7` < out_width; j += `8`)
1659	{
1660	v_float32x4 v0 = v_add(a: v_load(ptr: cptr + j), b: vbias);
1661	v_float32x4 v1 = v_add(a: v_load(ptr: cptr + j + `4`), b: vbias);
1662
1663	if (ifMinMaxAct)
1664	{
1665	v0 = v_min(a: v_max(a: v0, b: vmin), b: vmax);
1666	v1 = v_min(a: v_max(a: v1, b: vmin), b: vmax);
1667	}
1668
1669	v_store(ptr: outptr + j, a: v0);
1670	v_store(ptr: outptr + j + `4`, a: v1);
1671	}
1672	}
1673	#endif
1674	if (pbptr)
1675	{
1676	for (; j < out_width; j++)
1677	{
1678	float v = cptr[j] + biasval;
1679	v += pbptr[j];
1680	if (ifMinMaxAct)
1681	v = std::min(a: std::max(a: v, b: minval), b: maxval);
1682	outptr[j] = v;
1683	}
1684	}
1685	else
1686	{
1687	for (; j < out_width; j++)
1688	{
1689	float v = cptr[j] + biasval;
1690
1691	if (ifMinMaxAct)
1692	v = std::min(a: std::max(a: v, b: minval), b: maxval);
1693	outptr[j] = v;
1694	}
1695	}
1696	}
1697
1698	if (activ)
1699	activ->forwardSlice(src: outptr, dst: outptr, len: out_width, outPlaneSize: out_planesize, cn0: Kg * g + k, cn1: Kg * g + k + `1`);
1700	}
1701	}
1702	}
1703	}
1704	}
1705	});
1706	}
1707
1708
1709	/**************************************************************************************\
1710	SIMD and no-SIMD code for convBlock
1711	\**************************************************************************************/
1712
// Scalar single-output-row kernel.
//
// For j in [0, outLen):
//     acc[j] = sum over p in [0, np) of a[p] * b[convNR * p + j]
//     c[j]   = (init_c ? c[j] : 0) + acc[j] + bias
// and, when ifMinMaxAct is set, c[j] is clamped to [minval, maxval].
//
//   np        - length of the reduction (number of weights in `a`).
//   a         - np weights.
//   b         - packed input, one row of convNR floats per reduction step.
//   c         - output row; read only when init_c is true, always written for the first outLen entries.
//   bias      - scalar added to every output value.
//   init_c    - when true the previous contents of c are accumulated into the result.
//   outLen    - number of valid outputs; non-positive values make the call a no-op.
//   convNR    - row stride of b, in floats.
static inline void convBlockMR1NoSIMD(int np, const float* a, const float* b, float* c, const float bias, bool init_c,
                                      const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
    // A negative length would otherwise be converted to a huge unsigned vector size.
    if (outLen <= 0)
        return;

    std::vector<float> cbuffer(outLen, 0.f);
    float* cbuf = cbuffer.data();
    for (int p = 0; p < np; p++)
    {
        const float ai = a[p];
        const float* brow = b + convNR * p;
        for (int j = 0; j < outLen; j++)
            cbuf[j] += brow[j] * ai;
    }

    if (init_c)
    {
        // Accumulate on top of the values already stored in c.
        for (int j = 0; j < outLen; j++)
        {
            c[j] += cbuf[j] + bias;
            if (ifMinMaxAct)
                c[j] = std::min(std::max(c[j], minval), maxval);
        }
    }
    else
    {
        for (int j = 0; j < outLen; j++)
        {
            c[j] = cbuf[j] + bias;
            if (ifMinMaxAct)
                c[j] = std::min(std::max(c[j], minval), maxval);
        }
    }
}
1744
1745	#if CV_SIMD128
1746	static inline void convBlockMR1x24(int np, const float* a, const float* b, float c, const* float bias, bool init_c,
1747	const float minval, const float maxval, bool ifMinMaxAct, const int convNR)
1748	{
1749	CV_Assert(convNR == `24`);
1750	v_float32x4 c0 = v_setall_f32(v: bias), c1 = c0, c2 = c0;
1751	v_float32x4 c3 = c0, c4 = c0, c5 = c0;
1752
1753	for (int p = `0`; p < np; p++, a++, b += convNR)
1754	{
1755	v_float32x4 a0 = v_setall_f32(v: a[`0`]);
1756	v_float32x4 b0 = v_load(ptr: b), b1 = v_load(ptr: b + `4`), b2 = v_load(ptr: b + `8`);
1757	v_float32x4 b3 = v_load(ptr: b + `12`), b4 = v_load(ptr: b + `16`), b5 = v_load(ptr: b + `20`);
1758
1759	c0 = v_fma(a: b0, b: a0, c: c0);
1760	c1 = v_fma(a: b1, b: a0, c: c1);
1761	c2 = v_fma(a: b2, b: a0, c: c2);
1762	c3 = v_fma(a: b3, b: a0, c: c3);
1763	c4 = v_fma(a: b4, b: a0, c: c4);
1764	c5 = v_fma(a: b5, b: a0, c: c5);
1765	}
1766
1767	if (init_c)
1768	{
1769	c0 = v_add(a: c0, b: v_load(ptr: c));
1770	c1 = v_add(a: c1, b: v_load(ptr: c + `4`));
1771	c2 = v_add(a: c2, b: v_load(ptr: c + `8`));
1772	c3 = v_add(a: c3, b: v_load(ptr: c + `12`));
1773	c4 = v_add(a: c4, b: v_load(ptr: c + `16`));
1774	c5 = v_add(a: c5, b: v_load(ptr: c + `20`));
1775	}
1776
1777	if (ifMinMaxAct)
1778	{
1779	v_float32x4 vmax = v_setall_f32(v: maxval), vmin = v_setall_f32(v: minval);
1780	c0 = v_min(a: v_max(a: c0, b: vmin), b: vmax);
1781	c1 = v_min(a: v_max(a: c1, b: vmin), b: vmax);
1782	c2 = v_min(a: v_max(a: c2, b: vmin), b: vmax);
1783	c3 = v_min(a: v_max(a: c3, b: vmin), b: vmax);
1784	c4 = v_min(a: v_max(a: c4, b: vmin), b: vmax);
1785	c5 = v_min(a: v_max(a: c5, b: vmin), b: vmax);
1786	}
1787
1788	v_store(ptr: c, a: c0);
1789	v_store(ptr: c + `4`, a: c1);
1790	v_store(ptr: c + `8`, a: c2);
1791	v_store(ptr: c + `12`, a: c3);
1792	v_store(ptr: c + `16`, a: c4);
1793	v_store(ptr: c + `20`, a: c5);
1794	}
1795
1796	static inline void convBlockMR1x12(int np, const float* a, const float* b, float c, const* float bias, bool init_c,
1797	const float minval, const float maxval, bool ifMinMaxAct, const int convNR)
1798	{
1799	CV_Assert(convNR == `12`);
1800	v_float32x4 c0 = v_setall_f32(v: bias), c1 = c0, c2 = c0;
1801	for (int p = `0`; p < np; p++, a++, b += convNR)
1802	{
1803	v_float32x4 a0 = v_setall_f32(v: a[`0`]);
1804	v_float32x4 b0 = v_load(ptr: b), b1 = v_load(ptr: b + `4`), b2 = v_load(ptr: b + `8`);
1805
1806	c0 = v_fma(a: b0, b: a0, c: c0);
1807	c1 = v_fma(a: b1, b: a0, c: c1);
1808	c2 = v_fma(a: b2, b: a0, c: c2);
1809	}
1810
1811	if (init_c)
1812	{
1813	c0 = v_add(a: c0, b: v_load(ptr: c));
1814	c1 = v_add(a: c1, b: v_load(ptr: c + `4`));
1815	c2 = v_add(a: c2, b: v_load(ptr: c + `8`));
1816	}
1817
1818	if (ifMinMaxAct)
1819	{
1820	v_float32x4 vmax = v_setall_f32(v: maxval), vmin = v_setall_f32(v: minval);
1821	c0 = v_min(a: v_max(a: c0, b: vmin), b: vmax);
1822	c1 = v_min(a: v_max(a: c1, b: vmin), b: vmax);
1823	c2 = v_min(a: v_max(a: c2, b: vmin), b: vmax);
1824	}
1825
1826	v_store(ptr: c, a: c0);
1827	v_store(ptr: c + `4`, a: c1);
1828	v_store(ptr: c + `8`, a: c2);
1829	}
1830	#endif
1831
1832	void convBlockMR1_F32(int np, const float* a, const float* b, float c, const* float bias, bool init_c,
1833	const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
1834	{
1835	#if CV_SIMD128
1836	// The outLen represents the valid output value in CONV_NR length.
1837	// When outLen is very small, we use the no-SIMD branch.
1838	const int convNRby3 = convNR/`3`;
1839	if (outLen > convNRby3)
1840	{
1841	if (convNR == `24`)
1842	convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR);
1843	else if (convNR == `12`)
1844	convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR);
1845	else
1846	convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
1847	}
1848	else
1849	convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
1850	#else
1851	convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
1852	#endif
1853	}
1854
1855	#if CV_SIMD128
1856	static inline void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
1857	{
1858	v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
1859	v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
1860	v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
1861	v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;
1862
1863	for (int p = `0`; p < np; p++, a += convMR, b += convNR)
1864	{
1865	v_float32x4 a0 = v_setall_f32(v: a[`0`]);
1866	v_float32x4 b0 = v_load(ptr: b), b1 = v_load(ptr: b + `4`), b2 = v_load(ptr: b + `8`);
1867	v_float32x4 b3 = v_load(ptr: b + `12`), b4 = v_load(ptr: b + `16`), b5 = v_load(ptr: b + `20`);
1868
1869	c0 = v_fma(a: b0, b: a0, c: c0);
1870	c1 = v_fma(a: b1, b: a0, c: c1);
1871	c2 = v_fma(a: b2, b: a0, c: c2);
1872	c3 = v_fma(a: b3, b: a0, c: c3);
1873	c4 = v_fma(a: b4, b: a0, c: c4);
1874	c5 = v_fma(a: b5, b: a0, c: c5);
1875
1876	a0 = v_setall_f32(v: a[`1`]);
1877	c6 = v_fma(a: b0, b: a0, c: c6);
1878	c7 = v_fma(a: b1, b: a0, c: c7);
1879	c8 = v_fma(a: b2, b: a0, c: c8);
1880	c9 = v_fma(a: b3, b: a0, c: c9);
1881	c10 = v_fma(a: b4, b: a0, c: c10);
1882	c11 = v_fma(a: b5, b: a0, c: c11);
1883
1884	a0 = v_setall_f32(v: a[`2`]);
1885	c12 = v_fma(a: b0, b: a0, c: c12);
1886	c13 = v_fma(a: b1, b: a0, c: c13);
1887	c14 = v_fma(a: b2, b: a0, c: c14);
1888	c15 = v_fma(a: b3, b: a0, c: c15);
1889	c16 = v_fma(a: b4, b: a0, c: c16);
1890	c17 = v_fma(a: b5, b: a0, c: c17);
1891
1892	a0 = v_setall_f32(v: a[`3`]);
1893	c18 = v_fma(a: b0, b: a0, c: c18);
1894	c19 = v_fma(a: b1, b: a0, c: c19);
1895	c20 = v_fma(a: b2, b: a0, c: c20);
1896	c21 = v_fma(a: b3, b: a0, c: c21);
1897	c22 = v_fma(a: b4, b: a0, c: c22);
1898	c23 = v_fma(a: b5, b: a0, c: c23);
1899	}
1900
1901	if (!init_c)
1902	{
1903	c0 = v_add(a: c0, b: v_load(ptr: c));
1904	c1 = v_add(a: c1, b: v_load(ptr: c + `4`));
1905	c2 = v_add(a: c2, b: v_load(ptr: c + `8`));
1906	c3 = v_add(a: c3, b: v_load(ptr: c + `12`));
1907	c4 = v_add(a: c4, b: v_load(ptr: c + `16`));
1908	c5 = v_add(a: c5, b: v_load(ptr: c + `20`));
1909
1910	c6 = v_add(a: c6 , b: v_load(ptr: c + ldc));
1911	c7 = v_add(a: c7 , b: v_load(ptr: c + ldc + `4`));
1912	c8 = v_add(a: c8 , b: v_load(ptr: c + ldc + `8`));
1913	c9 = v_add(a: c9 , b: v_load(ptr: c + ldc + `12`));
1914	c10 = v_add(a: c10, b: v_load(ptr: c + ldc + `16`));
1915	c11 = v_add(a: c11, b: v_load(ptr: c + ldc + `20`));
1916
1917	c12 = v_add(a: c12, b: v_load(ptr: c + ldc*`2`));
1918	c13 = v_add(a: c13, b: v_load(ptr: c + ldc*`2` + `4`));
1919	c14 = v_add(a: c14, b: v_load(ptr: c + ldc*`2` + `8`));
1920	c15 = v_add(a: c15, b: v_load(ptr: c + ldc*`2` + `12`));
1921	c16 = v_add(a: c16, b: v_load(ptr: c + ldc*`2` + `16`));
1922	c17 = v_add(a: c17, b: v_load(ptr: c + ldc*`2` + `20`));
1923
1924	c18 = v_add(a: c18, b: v_load(ptr: c + ldc*`3`));
1925	c19 = v_add(a: c19, b: v_load(ptr: c + ldc*`3` + `4`));
1926	c20 = v_add(a: c20, b: v_load(ptr: c + ldc*`3` + `8`));
1927	c21 = v_add(a: c21, b: v_load(ptr: c + ldc*`3` + `12`));
1928	c22 = v_add(a: c22, b: v_load(ptr: c + ldc*`3` + `16`));
1929	c23 = v_add(a: c23, b: v_load(ptr: c + ldc*`3` + `20`));
1930	}
1931
1932	v_store(ptr: c, a: c0);
1933	v_store(ptr: c + `4`, a: c1);
1934	v_store(ptr: c + `8`, a: c2);
1935	v_store(ptr: c + `12`, a: c3);
1936	v_store(ptr: c + `16`, a: c4);
1937	v_store(ptr: c + `20`, a: c5);
1938
1939	v_store(ptr: c + ldc, a: c6);
1940	v_store(ptr: c + ldc + `4`, a: c7);
1941	v_store(ptr: c + ldc + `8`, a: c8);
1942	v_store(ptr: c + ldc + `12`, a: c9);
1943	v_store(ptr: c + ldc + `16`, a: c10);
1944	v_store(ptr: c + ldc + `20`, a: c11);
1945
1946	v_store(ptr: c + ldc * `2`, a: c12);
1947	v_store(ptr: c + ldc * `2` + `4`, a: c13);
1948	v_store(ptr: c + ldc * `2` + `8`, a: c14);
1949	v_store(ptr: c + ldc * `2` + `12`, a: c15);
1950	v_store(ptr: c + ldc * `2` + `16`, a: c16);
1951	v_store(ptr: c + ldc * `2` + `20`, a: c17);
1952
1953	v_store(ptr: c + ldc * `3`, a: c18);
1954	v_store(ptr: c + ldc * `3` + `4`, a: c19);
1955	v_store(ptr: c + ldc * `3` + `8`, a: c20);
1956	v_store(ptr: c + ldc * `3` + `12`, a: c21);
1957	v_store(ptr: c + ldc * `3` + `16`, a: c22);
1958	v_store(ptr: c + ldc * `3` + `20`, a: c23);
1959	}
1960
1961	static inline void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
1962	{
1963	CV_Assert(convNR >= `4`);
1964	v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
1965	v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0;
1966
1967	for (int p = `0`; p < np; p++, a += convMR, b += convNR)
1968	{
1969	v_float32x4 a0 = v_setall_f32(v: a[`0`]);
1970	v_float32x4 a1 = v_setall_f32(v: a[`1`]);
1971	v_float32x4 a2 = v_setall_f32(v: a[`2`]);
1972	v_float32x4 a3 = v_setall_f32(v: a[`3`]);
1973
1974	v_float32x4 b0 = v_load(ptr: b), b1 = v_load(ptr: b + `4`);
1975
1976	c0 = v_fma(a: b0, b: a0, c: c0);
1977	c1 = v_fma(a: b1, b: a0, c: c1);
1978
1979	c2 = v_fma(a: b0, b: a1, c: c2);
1980	c3 = v_fma(a: b1, b: a1, c: c3);
1981
1982	c4 = v_fma(a: b0, b: a2, c: c4);
1983	c5 = v_fma(a: b1, b: a2, c: c5);
1984
1985	c6 = v_fma(a: b0, b: a3, c: c6);
1986	c7 = v_fma(a: b1, b: a3, c: c7);
1987	}
1988
1989	if (!init_c)
1990	{
1991	c0 = v_add(a: c0, b: v_load(ptr: c));
1992	c1 = v_add(a: c1, b: v_load(ptr: c + `4`));
1993
1994	c2 = v_add(a: c2, b: v_load(ptr: c + ldc));
1995	c3 = v_add(a: c3, b: v_load(ptr: c + ldc + `4`));
1996
1997	c4 = v_add(a: c4, b: v_load(ptr: c + ldc*`2`));
1998	c5 = v_add(a: c5, b: v_load(ptr: c + ldc*`2` + `4`));
1999
2000	c6 = v_add(a: c6, b: v_load(ptr: c + ldc*`3`));
2001	c7 = v_add(a: c7, b: v_load(ptr: c + ldc*`3` + `4`));
2002	}
2003
2004	v_store(ptr: c, a: c0);
2005	v_store(ptr: c + `4`, a: c1);
2006	v_store(ptr: c + ldc, a: c2);
2007	v_store(ptr: c + ldc + `4`, a: c3);
2008	v_store(ptr: c + ldc * `2`, a: c4);
2009	v_store(ptr: c + ldc * `2` + `4`, a: c5);
2010	v_store(ptr: c + ldc * `3`, a: c6);
2011	v_store(ptr: c + ldc * `3` + `4`, a: c7);
2012	}
2013
2014	static inline void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
2015	{
2016	CV_Assert(convNR >= `4`);
2017	v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
2018
2019	for (int p = `0`; p < np; p++, a += convMR, b += convNR)
2020	{
2021	v_float32x4 a0 = v_setall_f32(v: a[`0`]);
2022	v_float32x4 a1 = v_setall_f32(v: a[`1`]);
2023	v_float32x4 a2 = v_setall_f32(v: a[`2`]);
2024	v_float32x4 a3 = v_setall_f32(v: a[`3`]);
2025
2026	v_float32x4 b0 = v_load(ptr: b);
2027
2028	c0 = v_fma(a: b0, b: a0, c: c0);
2029	c1 = v_fma(a: b0, b: a1, c: c1);
2030	c2 = v_fma(a: b0, b: a2, c: c2);
2031	c3 = v_fma(a: b0, b: a3, c: c3);
2032	}
2033
2034	if (!init_c)
2035	{
2036	c0 = v_add(a: c0, b: v_load(ptr: c));
2037	c1 = v_add(a: c1, b: v_load(ptr: c + ldc));
2038	c2 = v_add(a: c2, b: v_load(ptr: c + ldc*`2`));
2039	c3 = v_add(a: c3, b: v_load(ptr: c + ldc*`3`));
2040	}
2041
2042	v_store(ptr: c, a: c0);
2043	v_store(ptr: c + ldc, a: c1);
2044	v_store(ptr: c + ldc * `2`, a: c2);
2045	v_store(ptr: c + ldc * `3`, a: c3);
2046	}
2047	#endif
2048
// Scalar reference kernel that produces a convMR x outLen tile of the output block `c`.
//
// For every step p in [0, np) and every row i in [0, convMR) it accumulates
// a[convMR*p + i] * b[convNR*p + j] for j in [0, outLen) into a zero-initialised
// temporary tile, then writes the tile to `c`.
//
//   np     - number of accumulation steps.
//   a      - left operand, convMR values per step.
//   b      - right operand, convNR values per step, of which the first outLen are used.
//   c      - output tile; row i starts at c + i*ldc, outLen floats are written per row.
//   ldc    - distance (in floats) between consecutive rows of `c`.
//   init_c - true: the tile overwrites `c`; false: the tile is added to the values already in `c`.
//   outLen - number of output columns to compute.
//   convMR - number of output rows and per-step stride of `a`.
//   convNR - per-step stride of `b`.
static inline void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                                   const int convMR, const int convNR)
{
    // An empty tile leaves `c` untouched; this also avoids passing a negative
    // element count to the std::vector constructor below.
    if (convMR <= 0 || outLen <= 0)
        return;

    // The sums are built in a separate buffer so that, when init_c is false,
    // the complete sum is added to `c` in a single operation per element.
    std::vector<float> cbuffer(static_cast<size_t>(convMR) * static_cast<size_t>(outLen), 0.f);
    float* cbuf = cbuffer.data();

    for (int p = 0; p < np; p++)
    {
        const float* ap = a + convMR * p;
        const float* bp = b + convNR * p;
        for (int i = 0; i < convMR; i++)
        {
            const float ai = ap[i];
            float* acc = cbuf + i * outLen;
            for (int j = 0; j < outLen; j++)
                acc[j] += bp[j] * ai;
        }
    }

    for (int i = 0; i < convMR; i++)
    {
        const float* src = cbuf + i * outLen;
        float* dst = c + i * ldc;
        if (init_c)
        {
            for (int j = 0; j < outLen; j++)
                dst[j] = src[j];
        }
        else
        {
            for (int j = 0; j < outLen; j++)
                dst[j] += src[j];
        }
    }
}
2081
2082	void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
2083	const int convMR, const int convNR)
2084	{
2085	// The possible outLen range is [24, 8~1].
2086	#if CV_SIMD128
2087	CV_Assert(convMR == `4`);
2088	if (outLen > `8` && convNR == `24`)
2089	{
2090	convBlock4x24(np, a, b, c, ldc, init_c, convMR, convNR);
2091	return;
2092	}
2093
2094	if (outLen <= `8` && outLen > `4`)
2095	{
2096	convBlock4x8(np, a, b, c, ldc, init_c, convMR, convNR);
2097	return;
2098	}
2099
2100	if (outLen <= `4` && outLen > `1`)
2101	{
2102	convBlock4x4(np, a, b, c, ldc, init_c, convMR, convNR);
2103	return;
2104	}
2105	convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
2106	#else
2107	convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
2108	#endif
2109	}
2110
2111	}} // namespace cv::dnn
2112

source code of opencv/modules/dnn/src/layers/cpu_kernels/convolution.cpp