// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
4
5
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "stat.hpp"

#include <climits> // INT_MAX

#include "sum.simd.hpp"
#include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
12
13namespace cv
14{
15
16SumFunc getSumFunc(int depth)
17{
18 CV_INSTRUMENT_REGION();
19 CV_CPU_DISPATCH(getSumFunc, (depth),
20 CV_CPU_DISPATCH_MODES_ALL);
21}
22
#ifdef HAVE_OPENCL

// OpenCL implementation of the per-channel reductions "sum", "sum of absolute
// values" and "sum of squares", built on the "reduce" kernel.
//
// _src   - input array; at most 4 channels, depth below CV_16F, and CV_64F only
//          when the default device reports double support.
// res    - receives the reduction result.
// sum_op - one of OCL_OP_SUM, OCL_OP_SUM_ABS, OCL_OP_SUM_SQR (asserted).
// _mask  - optional mask; when present it must be CV_8UC1 (asserted).
// _src2  - optional second input; when present it must have the type of _src
//          (asserted).
// calc2  - when true the kernel is built with OP_CALC2, the device buffer is
//          twice as large, and its second half is reduced into res2.
// res2   - second result, written only when calc2 is true. It is declared
//          const in the interface and written through a const_cast.
//
// Returns true when the kernel ran and the results were stored. Returns false,
// leaving the results untouched, when the input is not supported or the kernel
// could not be built or run; the caller is then expected to use another path.
bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask,
              InputArray _src2, bool calc2, const Scalar & res2 )
{
    CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0,
        haveMask = _mask.kind() != _InputArray::NONE,
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    // Vectorised loads (kercn > 1) are only considered for unmasked
    // single-channel input; mcn is the channel count of the type the kernel
    // actually loads.
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
            mcn = std::max(cn, kercn);
    CV_Assert(!haveSrc2 || _src2.type() == type);
    int convert_cn = haveSrc2 ? mcn : cn;

    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
        return false;

    // Depths from CV_16F upwards are not handled by this path.
    if (depth >= CV_16F)
        return false;

    // One partial result per work group; a second set of partial results
    // follows the first when calc2 is requested.
    int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
    size_t wgs = dev.maxWorkGroupSize();

    // Accumulator depth: at least CV_32S, or at least CV_32F for sums of
    // squares, and never narrower than the source depth.
    int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
            dtype = CV_MAKE_TYPE(ddepth, cn);
    CV_Assert(!haveMask || _mask.type() == CV_8UC1);

    // Half of the smallest power of two that is >= wgs.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    // Indexed directly by sum_op.
    // NOTE(review): relies on OCL_OP_SUM, OCL_OP_SUM_ABS, OCL_OP_SUM_SQR being 0, 1, 2 - confirm in stat.hpp.
    static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
    char cvt[2][50];
    String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                         ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                         ocl::typeToStr(ddepth), ddepth, cn,
                         ocl::convertTypeStr(depth, ddepth, mcn, cvt[0], sizeof(cvt[0])),
                         opMap[sum_op], (int)wgs, wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         haveMask ? " -D HAVE_MASK" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1], sizeof(cvt[1])) : "noconvert");

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), src2 = _src2.getUMat(),
        db(1, dbsize, dtype), mask = _mask.getUMat();

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dbarg = ocl::KernelArg::PtrWriteOnly(db),
            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);

    // The kernel's argument list depends on which optional inputs were
    // compiled in (HAVE_MASK / HAVE_SRC2), so only the matching ones are passed.
    if (haveMask)
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
    }
    else
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
    }

    size_t globalsize = ngroups * wgs;
    if (k.run(1, &globalsize, &wgs, true))
    {
        // Final reduction of the per-group partial results on the host.
        // ddepth is one of CV_32S / CV_32F / CV_64F here, hence the indexing.
        typedef Scalar (*part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
                func = funcs[ddepth - CV_32S];

        Mat mres = db.getMat(ACCESS_READ);
        if (calc2)
            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));

        res = func(mres.colRange(0, ngroups));
        return true;
    }
    return false;
}

#endif
120
121Scalar sum(InputArray _src)
122{
123 CV_INSTRUMENT_REGION();
124
125 Scalar _res = Scalar::all(v0: 0.0);
126
127#ifdef HAVE_OPENCL
128 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
129 ocl_sum(_src, res&: _res, sum_op: OCL_OP_SUM),
130 _res);
131#endif
132
133 Mat src = _src.getMat();
134 int cn = src.channels();
135 CV_CheckLE( cn, 4, "cv::sum does not support more than 4 channels" );
136
137 if (_src.dims() <= 2)
138 {
139 CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, src.step, src.type(), src.cols, src.rows, &_res[0]);
140 }
141 else if (_src.isContinuous())
142 {
143 CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, 0, src.type(), (int)src.total(), 1, &_res[0]);
144 }
145
146 int k, depth = src.depth();
147 SumFunc func = getSumFunc(depth);
148 CV_Assert( func != nullptr );
149
150 const Mat* arrays[] = {&src, 0};
151 uchar* ptrs[1] = {};
152 NAryMatIterator it(arrays, ptrs);
153 int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
154 int j, count = 0;
155 AutoBuffer<int> _buf;
156 int* buf = (int*)&_res[0];
157 size_t esz = 0;
158 bool blockSum = depth < CV_32S;
159
160 if( blockSum )
161 {
162 intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
163 blockSize = std::min(a: blockSize, b: intSumBlockSize);
164 _buf.allocate(size: cn);
165 buf = _buf.data();
166
167 for( k = 0; k < cn; k++ )
168 buf[k] = 0;
169 esz = src.elemSize();
170 }
171
172 for( size_t i = 0; i < it.nplanes; i++, ++it )
173 {
174 for( j = 0; j < total; j += blockSize )
175 {
176 int bsz = std::min(a: total - j, b: blockSize);
177 func( ptrs[0], 0, (uchar*)buf, bsz, cn );
178 count += bsz;
179 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
180 {
181 for( k = 0; k < cn; k++ )
182 {
183 _res[k] += buf[k];
184 buf[k] = 0;
185 }
186 count = 0;
187 }
188 ptrs[0] += bsz*esz;
189 }
190 }
191 return _res;
192}
193
194} // namespace
195

// source code of opencv/modules/core/src/sum.dispatch.cpp