| 1 | // This file is part of OpenCV project. |
| 2 | // It is subject to the license terms in the LICENSE file found in the top-level directory |
| 3 | // of this distribution and at http://opencv.org/license.html |
| 4 | |
| 5 | |
| 6 | #include "precomp.hpp" |
| 7 | #include "opencl_kernels_core.hpp" |
| 8 | #include "stat.hpp" |
| 9 | |
| 10 | #include "count_non_zero.simd.hpp" |
| 11 | #include "count_non_zero.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content |
| 12 | |
| 13 | namespace cv { |
| 14 | |
| 15 | static CountNonZeroFunc getCountNonZeroTab(int depth) |
| 16 | { |
| 17 | CV_INSTRUMENT_REGION(); |
| 18 | CV_CPU_DISPATCH(getCountNonZeroTab, (depth), |
| 19 | CV_CPU_DISPATCH_MODES_ALL); |
| 20 | } |
| 21 | |
#ifdef HAVE_OPENCL
// OpenCL implementation of countNonZero().
//
// Runs the "reduce" kernel (built with OP_COUNT_NON_ZERO) over _src, writing
// one partial count per compute unit into a temporary 1 x dbsize CV_32SC1
// buffer, then sums that buffer on the host.
//
// _src : input array (single channel is hard-coded through "-D cn=1").
// res  : receives the total count; written only when the kernel ran.
// Returns false when the device lacks double support for CV_64F input, when
// the kernel could not be created, or when running it failed.
static bool ocl_countNonZero( InputArray _src, int & res )
{
    int type = _src.type();
    int depth = CV_MAT_DEPTH(type);
    int kercn = ocl::predictOptimalVectorWidth(_src);

    const ocl::Device & device = ocl::Device::getDefault();
    bool doubleSupport = device.doubleFPConfig() > 0;

    if (!doubleSupport && depth == CV_64F)
        return false;

    int dbsize = device.maxComputeUnits();
    size_t wgs = device.maxWorkGroupSize();

    // Smallest power of two that is >= wgs, then halved.
    int pow2 = 1;
    for (; pow2 < (int)wgs; pow2 <<= 1)
    {
    }
    int wgs2_aligned = pow2 >> 1;

    const char * doubleOpt = doubleSupport ? " -D DOUBLE_SUPPORT" : "";
    const char * contOpt = _src.isContinuous() ? " -D HAVE_SRC_CONT" : "";
    String buildOpts = format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
                              " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
                              ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                              ocl::typeToStr(depth), (int)wgs, kercn,
                              wgs2_aligned, doubleOpt, contOpt);

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, buildOpts);
    if (k.empty())
        return false;

    UMat src = _src.getUMat();
    UMat db(1, dbsize, CV_32SC1);   // per-group partial counts
    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
           dbsize, ocl::KernelArg::PtrWriteOnly(db));

    size_t globalsize = dbsize * wgs;
    if (!k.run(1, &globalsize, &wgs, true))
        return false;

    res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]);
    return true;
}
#endif
| 59 | |
#if defined HAVE_IPP
// IPP implementation of countNonZero() for CV_8U and CV_32F data.
//
// ippiCountInRange_* is called with both bounds set to 0, so the value it
// reports is the number of elements equal to zero; the non-zero count is the
// element total minus that value.
//
// src : matrix to scan; arrays with more than two dimensions are processed
//       plane by plane through NAryMatIterator.
// res : receives the non-zero count. It is written ONLY when the function
//       returns true, so a failed attempt never leaves a partial sum behind.
// Returns false (caller should fall back to the generic path) when the depth
// is not CV_8U/CV_32F, when IPP reports an error, or when IPP reports more
// zero elements than the region contains.
static bool ipp_countNonZero( Mat &src, int &res )
{
    CV_INSTRUMENT_REGION_IPP();

#if IPP_VERSION_X100 < 201801
    // Poor performance of SSE42
    if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
        return false;
#endif

    const int depth = src.depth();
    if(depth != CV_8U && depth != CV_32F)
        return false;

    Ipp32s count = 0;   // zero-element count reported by the last IPP call
    int nonZero = 0;    // accumulated locally; published to res on success only

    if(src.dims <= 2)
    {
        IppStatus status;
        IppiSize size = {src.cols*src.channels(), src.rows};

        if(depth == CV_8U)
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
        else
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);

        // Same sanity check as the multi-plane branch: never produce a negative count.
        const int total = size.width*size.height;
        if(status < 0 || total < count)
            return false;

        nonZero = total - count;
    }
    else
    {
        IppStatus status;
        const Mat *arrays[] = {&src, NULL};
        Mat planes[1];
        NAryMatIterator it(arrays, planes, 1);
        IppiSize size = {(int)it.size*src.channels(), 1};

        for (size_t i = 0; i < it.nplanes; i++, ++it)
        {
            if(depth == CV_8U)
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
            else
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);

            const int planeTotal = (int)it.planes->total()*src.channels();
            if(status < 0 || planeTotal < count)
                return false;

            nonZero += planeTotal - count;
        }
    }

    res = nonZero;
    return true;
}
#endif
| 118 | |
| 119 | int countNonZero(InputArray _src) |
| 120 | { |
| 121 | CV_INSTRUMENT_REGION(); |
| 122 | |
| 123 | int type = _src.type(), cn = CV_MAT_CN(type); |
| 124 | CV_Assert( cn == 1 ); |
| 125 | |
| 126 | #if defined HAVE_OPENCL || defined HAVE_IPP |
| 127 | int res = -1; |
| 128 | #endif |
| 129 | |
| 130 | #ifdef HAVE_OPENCL |
| 131 | CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, |
| 132 | ocl_countNonZero(_src, res), |
| 133 | res) |
| 134 | #endif |
| 135 | |
| 136 | Mat src = _src.getMat(); |
| 137 | CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res); |
| 138 | |
| 139 | CountNonZeroFunc func = getCountNonZeroTab(depth: src.depth()); |
| 140 | CV_Assert( func != 0 ); |
| 141 | |
| 142 | const Mat* arrays[] = {&src, 0}; |
| 143 | uchar* ptrs[1] = {}; |
| 144 | NAryMatIterator it(arrays, ptrs); |
| 145 | int total = (int)it.size, nz = 0; |
| 146 | |
| 147 | for( size_t i = 0; i < it.nplanes; i++, ++it ) |
| 148 | nz += func( ptrs[0], total ); |
| 149 | |
| 150 | return nz; |
| 151 | } |
| 152 | |
| 153 | void findNonZero(InputArray _src, OutputArray _idx) |
| 154 | { |
| 155 | Mat src = _src.getMat(); |
| 156 | CV_Assert( src.channels() == 1 && src.dims == 2 ); |
| 157 | |
| 158 | int depth = src.depth(); |
| 159 | std::vector<Point> idxvec; |
| 160 | int rows = src.rows, cols = src.cols; |
| 161 | AutoBuffer<int> buf_(cols + 1); |
| 162 | int* buf = buf_.data(); |
| 163 | |
| 164 | for( int i = 0; i < rows; i++ ) |
| 165 | { |
| 166 | int j, k = 0; |
| 167 | const uchar* ptr8 = src.ptr(y: i); |
| 168 | if( depth == CV_8U || depth == CV_8S ) |
| 169 | { |
| 170 | for( j = 0; j < cols; j++ ) |
| 171 | if( ptr8[j] != 0 ) buf[k++] = j; |
| 172 | } |
| 173 | else if( depth == CV_16U || depth == CV_16S ) |
| 174 | { |
| 175 | const ushort* ptr16 = (const ushort*)ptr8; |
| 176 | for( j = 0; j < cols; j++ ) |
| 177 | if( ptr16[j] != 0 ) buf[k++] = j; |
| 178 | } |
| 179 | else if( depth == CV_32S ) |
| 180 | { |
| 181 | const int* ptr32s = (const int*)ptr8; |
| 182 | for( j = 0; j < cols; j++ ) |
| 183 | if( ptr32s[j] != 0 ) buf[k++] = j; |
| 184 | } |
| 185 | else if( depth == CV_32F ) |
| 186 | { |
| 187 | const float* ptr32f = (const float*)ptr8; |
| 188 | for( j = 0; j < cols; j++ ) |
| 189 | if( ptr32f[j] != 0 ) buf[k++] = j; |
| 190 | } |
| 191 | else |
| 192 | { |
| 193 | const double* ptr64f = (const double*)ptr8; |
| 194 | for( j = 0; j < cols; j++ ) |
| 195 | if( ptr64f[j] != 0 ) buf[k++] = j; |
| 196 | } |
| 197 | |
| 198 | if( k > 0 ) |
| 199 | { |
| 200 | size_t sz = idxvec.size(); |
| 201 | idxvec.resize(new_size: sz + k); |
| 202 | for( j = 0; j < k; j++ ) |
| 203 | idxvec[sz + j] = Point(buf[j], i); |
| 204 | } |
| 205 | } |
| 206 | |
| 207 | if( idxvec.empty() || (_idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous()) ) |
| 208 | _idx.release(); |
| 209 | |
| 210 | if( !idxvec.empty() ) |
| 211 | Mat(idxvec).copyTo(m: _idx); |
| 212 | } |
| 213 | |
| 214 | } // namespace |
| 215 | |