// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
| 4 | |
| 5 | |
| 6 | #include "precomp.hpp" |
| 7 | #include "opencl_kernels_core.hpp" |
| 8 | #include "stat.hpp" |
| 9 | |
| 10 | #include "sum.simd.hpp" |
| 11 | #include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content |
| 12 | |
| 13 | namespace cv |
| 14 | { |
| 15 | |
| 16 | SumFunc getSumFunc(int depth) |
| 17 | { |
| 18 | CV_INSTRUMENT_REGION(); |
| 19 | CV_CPU_DISPATCH(getSumFunc, (depth), |
| 20 | CV_CPU_DISPATCH_MODES_ALL); |
| 21 | } |
| 22 | |
| 23 | #ifdef HAVE_OPENCL |
| 24 | |
| 25 | bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask, |
| 26 | InputArray _src2, bool calc2, const Scalar & res2 ) |
| 27 | { |
| 28 | CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR); |
| 29 | |
| 30 | const ocl::Device & dev = ocl::Device::getDefault(); |
| 31 | bool doubleSupport = dev.doubleFPConfig() > 0, |
| 32 | haveMask = _mask.kind() != _InputArray::NONE, |
| 33 | haveSrc2 = _src2.kind() != _InputArray::NONE; |
| 34 | int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), |
| 35 | kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(src1: _src, src2: _src2) : 1, |
| 36 | mcn = std::max(a: cn, b: kercn); |
| 37 | CV_Assert(!haveSrc2 || _src2.type() == type); |
| 38 | int convert_cn = haveSrc2 ? mcn : cn; |
| 39 | |
| 40 | if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) |
| 41 | return false; |
| 42 | |
| 43 | if (depth >= CV_16F) |
| 44 | return false; |
| 45 | |
| 46 | int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1); |
| 47 | size_t wgs = dev.maxWorkGroupSize(); |
| 48 | |
| 49 | int ddepth = std::max(a: sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, b: depth), |
| 50 | dtype = CV_MAKE_TYPE(ddepth, cn); |
| 51 | CV_Assert(!haveMask || _mask.type() == CV_8UC1); |
| 52 | |
| 53 | int wgs2_aligned = 1; |
| 54 | while (wgs2_aligned < (int)wgs) |
| 55 | wgs2_aligned <<= 1; |
| 56 | wgs2_aligned >>= 1; |
| 57 | |
| 58 | static const char * const opMap[3] = { "OP_SUM" , "OP_SUM_ABS" , "OP_SUM_SQR" }; |
| 59 | char cvt[2][50]; |
| 60 | String opts = format(fmt: "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d" |
| 61 | " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s" , |
| 62 | ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(t: depth), |
| 63 | ocl::typeToStr(t: dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)), |
| 64 | ocl::typeToStr(t: ddepth), ddepth, cn, |
| 65 | ocl::convertTypeStr(sdepth: depth, ddepth, cn: mcn, buf: cvt[0], buf_size: sizeof(cvt[0])), |
| 66 | opMap[sum_op], (int)wgs, wgs2_aligned, |
| 67 | doubleSupport ? " -D DOUBLE_SUPPORT" : "" , |
| 68 | haveMask ? " -D HAVE_MASK" : "" , |
| 69 | _src.isContinuous() ? " -D HAVE_SRC_CONT" : "" , |
| 70 | haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "" , kercn, |
| 71 | haveSrc2 ? " -D HAVE_SRC2" : "" , calc2 ? " -D OP_CALC2" : "" , |
| 72 | haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "" , |
| 73 | depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, cn: convert_cn, buf: cvt[1], buf_size: sizeof(cvt[1])) : "noconvert" ); |
| 74 | |
| 75 | ocl::Kernel k("reduce" , ocl::core::reduce_oclsrc, opts); |
| 76 | if (k.empty()) |
| 77 | return false; |
| 78 | |
| 79 | UMat src = _src.getUMat(), src2 = _src2.getUMat(), |
| 80 | db(1, dbsize, dtype), mask = _mask.getUMat(); |
| 81 | |
| 82 | ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(m: src), |
| 83 | dbarg = ocl::KernelArg::PtrWriteOnly(m: db), |
| 84 | maskarg = ocl::KernelArg::ReadOnlyNoSize(m: mask), |
| 85 | src2arg = ocl::KernelArg::ReadOnlyNoSize(m: src2); |
| 86 | |
| 87 | if (haveMask) |
| 88 | { |
| 89 | if (haveSrc2) |
| 90 | k.args(kernel_args: srcarg, kernel_args: src.cols, kernel_args: (int)src.total(), kernel_args: ngroups, kernel_args: dbarg, kernel_args: maskarg, kernel_args: src2arg); |
| 91 | else |
| 92 | k.args(kernel_args: srcarg, kernel_args: src.cols, kernel_args: (int)src.total(), kernel_args: ngroups, kernel_args: dbarg, kernel_args: maskarg); |
| 93 | } |
| 94 | else |
| 95 | { |
| 96 | if (haveSrc2) |
| 97 | k.args(kernel_args: srcarg, kernel_args: src.cols, kernel_args: (int)src.total(), kernel_args: ngroups, kernel_args: dbarg, kernel_args: src2arg); |
| 98 | else |
| 99 | k.args(kernel_args: srcarg, kernel_args: src.cols, kernel_args: (int)src.total(), kernel_args: ngroups, kernel_args: dbarg); |
| 100 | } |
| 101 | |
| 102 | size_t globalsize = ngroups * wgs; |
| 103 | if (k.run(dims: 1, globalsize: &globalsize, localsize: &wgs, sync: true)) |
| 104 | { |
| 105 | typedef Scalar (*part_sum)(Mat m); |
| 106 | part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> }, |
| 107 | func = funcs[ddepth - CV_32S]; |
| 108 | |
| 109 | Mat mres = db.getMat(flags: ACCESS_READ); |
| 110 | if (calc2) |
| 111 | const_cast<Scalar &>(res2) = func(mres.colRange(startcol: ngroups, endcol: dbsize)); |
| 112 | |
| 113 | res = func(mres.colRange(startcol: 0, endcol: ngroups)); |
| 114 | return true; |
| 115 | } |
| 116 | return false; |
| 117 | } |
| 118 | |
| 119 | #endif |
| 120 | |
| 121 | Scalar sum(InputArray _src) |
| 122 | { |
| 123 | CV_INSTRUMENT_REGION(); |
| 124 | |
| 125 | Scalar _res = Scalar::all(v0: 0.0); |
| 126 | |
| 127 | #ifdef HAVE_OPENCL |
| 128 | CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, |
| 129 | ocl_sum(_src, res&: _res, sum_op: OCL_OP_SUM), |
| 130 | _res); |
| 131 | #endif |
| 132 | |
| 133 | Mat src = _src.getMat(); |
| 134 | int cn = src.channels(); |
| 135 | CV_CheckLE( cn, 4, "cv::sum does not support more than 4 channels" ); |
| 136 | |
| 137 | if (_src.dims() <= 2) |
| 138 | { |
| 139 | CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, src.step, src.type(), src.cols, src.rows, &_res[0]); |
| 140 | } |
| 141 | else if (_src.isContinuous()) |
| 142 | { |
| 143 | CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, 0, src.type(), (int)src.total(), 1, &_res[0]); |
| 144 | } |
| 145 | |
| 146 | int k, depth = src.depth(); |
| 147 | SumFunc func = getSumFunc(depth); |
| 148 | CV_Assert( func != nullptr ); |
| 149 | |
| 150 | const Mat* arrays[] = {&src, 0}; |
| 151 | uchar* ptrs[1] = {}; |
| 152 | NAryMatIterator it(arrays, ptrs); |
| 153 | int total = (int)it.size, blockSize = total, intSumBlockSize = 0; |
| 154 | int j, count = 0; |
| 155 | AutoBuffer<int> _buf; |
| 156 | int* buf = (int*)&_res[0]; |
| 157 | size_t esz = 0; |
| 158 | bool blockSum = depth < CV_32S; |
| 159 | |
| 160 | if( blockSum ) |
| 161 | { |
| 162 | intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15); |
| 163 | blockSize = std::min(a: blockSize, b: intSumBlockSize); |
| 164 | _buf.allocate(size: cn); |
| 165 | buf = _buf.data(); |
| 166 | |
| 167 | for( k = 0; k < cn; k++ ) |
| 168 | buf[k] = 0; |
| 169 | esz = src.elemSize(); |
| 170 | } |
| 171 | |
| 172 | for( size_t i = 0; i < it.nplanes; i++, ++it ) |
| 173 | { |
| 174 | for( j = 0; j < total; j += blockSize ) |
| 175 | { |
| 176 | int bsz = std::min(a: total - j, b: blockSize); |
| 177 | func( ptrs[0], 0, (uchar*)buf, bsz, cn ); |
| 178 | count += bsz; |
| 179 | if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) |
| 180 | { |
| 181 | for( k = 0; k < cn; k++ ) |
| 182 | { |
| 183 | _res[k] += buf[k]; |
| 184 | buf[k] = 0; |
| 185 | } |
| 186 | count = 0; |
| 187 | } |
| 188 | ptrs[0] += bsz*esz; |
| 189 | } |
| 190 | } |
| 191 | return _res; |
| 192 | } |
| 193 | |
| 194 | } // namespace |
| 195 | |