/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include <opencv2/core/utils/logger.hpp>

#include "opencl_kernels_core.hpp"
#include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#include "intel_gpu_gemm.inl.hpp"

#include "matmul.simd.hpp"
#include "matmul.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content

namespace cv
{

/****************************************************************************************\
*                                         GEMM                                           *
\****************************************************************************************/

#ifdef HAVE_CLAMDBLAS

static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
                              InputArray matC, double beta, OutputArray matD, int flags )
{
    int type = matA.type(), esz = CV_ELEM_SIZE(type);
    bool haveC = matC.kind() != cv::_InputArray::NONE;
    Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
    bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;

    if (atrans)
        sizeA = Size(sizeA.height, sizeA.width);
    if (btrans)
        sizeB = Size(sizeB.height, sizeB.width);
    if (haveC && ctrans)
        sizeC = Size(sizeC.height, sizeC.width);

    Size sizeD(sizeB.width, sizeA.height);

    CV_Assert( matB.type() == type && (!haveC || matC.type() == type) );
    CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );

    matD.create(sizeD, type);
    if ( matA.offset() % esz != 0 || matA.step() % esz != 0 ||
         matB.offset() % esz != 0 || matB.step() % esz != 0 ||
         (haveC && (matC.offset() % esz != 0 || matC.step() % esz != 0)) )
        return false;

    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
    if (!ocl::internal::isCLBuffer(A) || !ocl::internal::isCLBuffer(B) || !ocl::internal::isCLBuffer(D))
    {
        return false;
    }
    if (haveC)
    {
        UMat C = matC.getUMat();
        if (!ocl::internal::isCLBuffer(C))
            return false;
    }
    if (haveC)
        ctrans ? transpose(matC, D) : matC.copyTo(D);
    else
        D.setTo(Scalar::all(0));

    int M = sizeD.height, N = sizeD.width, K = sizeA.width;
    int lda = (int)A.step / esz, ldb = (int)B.step / esz, ldc = (int)D.step / esz;
    int offa = (int)A.offset / esz, offb = (int)B.offset / esz, offc = (int)D.offset / esz;

    cl_command_queue clq = (cl_command_queue)ocl::Queue::getDefault().ptr();
    clblasTranspose transA = atrans ? clblasTrans : clblasNoTrans;
    clblasTranspose transB = btrans ? clblasTrans : clblasNoTrans;
    clblasOrder order = clblasRowMajor;
    clblasStatus status = clblasSuccess;

    if (type == CV_32FC1)
        status = clblasSgemm(order, transA, transB, M, N, K,
                             (cl_float)alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                             (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                             (cl_float)beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                             1, &clq, 0, NULL, NULL);
    else if (type == CV_64FC1)
        status = clblasDgemm(order, transA, transB, M, N, K,
                             alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                             (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                             beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                             1, &clq, 0, NULL, NULL);
    else if (type == CV_32FC2)
    {
        cl_float2 alpha_2 = { { (cl_float)alpha, 0 } };
        cl_float2 beta_2  = { { (cl_float)beta, 0 } };
        status = clblasCgemm(order, transA, transB, M, N, K,
                             alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                             (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                             beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                             1, &clq, 0, NULL, NULL);
    }
    else if (type == CV_64FC2)
    {
        cl_double2 alpha_2 = { { alpha, 0 } };
        cl_double2 beta_2  = { { beta, 0 } };
        status = clblasZgemm(order, transA, transB, M, N, K,
                             alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                             (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                             beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                             1, &clq, 0, NULL, NULL);
    }
    else
        CV_Error(Error::StsUnsupportedFormat, "");

    return status == clblasSuccess;
}

#endif

#ifdef HAVE_OPENCL
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
                      InputArray matC, double beta, OutputArray matD, int flags )
{
    int type = matA.type();
    int depth = CV_MAT_DEPTH(type);
    int cn = CV_MAT_CN(type);

    CV_CheckTypeEQ(type, matB.type(), "");
    CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0;

    if (!doubleSupport && depth == CV_64F)
        return false;

    bool haveC = matC.kind() != cv::_InputArray::NONE;
    Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
    bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;

    if (haveC)
        CV_CheckTypeEQ(type, matC.type(), "");

    Size sizeD(((btrans) ? sizeB.height : sizeB.width),
               ((atrans) ? sizeA.width : sizeA.height));

    if (atrans)
        sizeA = Size(sizeA.height, sizeA.width);
    if (btrans)
        sizeB = Size(sizeB.height, sizeB.width);
    if (haveC && ctrans)
        sizeC = Size(sizeC.height, sizeC.width);

    CV_CheckEQ(sizeA.width, sizeB.height, "");
    if (haveC)
        CV_CheckEQ(sizeC, sizeD, "");

    UMat A = matA.getUMat();
    UMat B = matB.getUMat();

    matD.create(sizeD, type);
    UMat D = matD.getUMat();

    bool isPropagatedC2D = false; // D content is updated with C / C.t()

    if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
    {
        if (haveC && beta != 0.0)
        {
            ctrans ? transpose(matC, D) : matC.copyTo(D);
            isPropagatedC2D = true;
        }
        else
        {
            beta = 0.0;
        }

        bool res = intel_gpu_gemm(A, matA.size(),
                                  B, matB.size(),
                                  D, sizeD,
                                  alpha,
                                  beta,
                                  atrans, btrans,
                                  isPropagatedC2D);
        if (res)
            return true;
        // fallback on generic OpenCL code
    }

    if (sizeD.width < 8 || sizeD.height < 8)
        return false;

    String opts;

    int wg_size = (int)dev.maxWorkGroupSize();
    int sizeDmin = std::min(sizeD.width, sizeD.height);
    wg_size = std::min(wg_size, sizeDmin * sizeDmin);
    int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;

    if (atrans)
        A = A.t();

    if (btrans)
        B = B.t();

    if (haveC && !isPropagatedC2D)
        ctrans ? transpose(matC, D) : matC.copyTo(D);

    int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
    int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);

    opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
                   ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
                   cn, kercn, block_size,
                   (sizeA.width % block_size != 0) ? " -D NO_MULT" : "",
                   haveC ? " -D HAVE_C" : "",
                   doubleSupport ? " -D DOUBLE_SUPPORT" : "");

    ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
    if (k.empty())
        return false;

    if (depth == CV_64F)
        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
               ocl::KernelArg::ReadWrite(D, cn, kercn),
               sizeA.width, alpha, beta);
    else
        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
               ocl::KernelArg::ReadWrite(D, cn, kercn),
               sizeA.width, (float)alpha, (float)beta);

    size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height };
    size_t localsize[2] = { (size_t)block_size, (size_t)block_size };

    return k.run(2, globalsize, block_size != 1 ? localsize : NULL, false);
}
#endif


namespace hal {

void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
             float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
             int m_a, int n_a, int n_d, int flags)
{
    CV_INSTRUMENT_REGION();
    CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
#ifdef CV_GEMM_BASELINE_ONLY
    CV_CPU_CALL_BASELINE(gemm32f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags));
#else
    CV_CPU_DISPATCH(gemm32f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags),
        CV_CPU_DISPATCH_MODES_ALL);
#endif
}

void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
             double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
             int m_a, int n_a, int n_d, int flags)
{
    CV_INSTRUMENT_REGION();
    CALL_HAL(gemm64f, cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
#ifdef CV_GEMM_BASELINE_ONLY
    CV_CPU_CALL_BASELINE(gemm64f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags));
#else
    CV_CPU_DISPATCH(gemm64f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags),
        CV_CPU_DISPATCH_MODES_ALL);
#endif
}

void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
              float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
              int m_a, int n_a, int n_d, int flags)
{
    CV_INSTRUMENT_REGION();
    CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
#ifdef CV_GEMM_BASELINE_ONLY
    CV_CPU_CALL_BASELINE(gemm32fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags));
#else
    CV_CPU_DISPATCH(gemm32fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags),
        CV_CPU_DISPATCH_MODES_ALL);
#endif
}

void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
              double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
              int m_a, int n_a, int n_d, int flags)
{
    CV_INSTRUMENT_REGION();
    CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
#ifdef CV_GEMM_BASELINE_ONLY
    CV_CPU_CALL_BASELINE(gemm64fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags));
#else
    CV_CPU_DISPATCH(gemm64fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags),
        CV_CPU_DISPATCH_MODES_ALL);
#endif
}

} // namespace hal
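
// Illustrative sketch, not part of the library: the hal:: entry points operate
// on raw row-major buffers with steps given in bytes; m_a x n_a is the size of
// src1 and n_d is the number of columns of dst. The buffers below are made up,
// and a NULL src3 with beta == 0 mirrors what cv::gemm passes for an empty C.
//
//   float A[2*3] = { 1, 2, 3,  4, 5, 6 };      // 2x3
//   float B[3*2] = { 7, 8,  9, 10,  11, 12 };  // 3x2
//   float D[2*2];
//   cv::hal::gemm32f(A, 3*sizeof(float), B, 2*sizeof(float), 1.0f,
//                    NULL, 0, 0.0f, D, 2*sizeof(float),
//                    2, 3, 2, 0);              // D = 1.0*A*B + 0.0*C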

void gemm(InputArray matA, InputArray matB, double alpha,
          InputArray matC, double beta, OutputArray _matD, int flags)
{
#ifdef HAVE_CLAMDBLAS
    CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() &&
        matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrectly for small sizes
        ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags))
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2,
               ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
#endif

    Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0.0 ? matC.getMat() : Mat();
    Size a_size = A.size(), d_size;
    int len = 0, type = A.type();

    CV_Assert_N( type == B.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );

    switch( flags & (GEMM_1_T|GEMM_2_T) )
    {
    case 0:
        d_size = Size( B.cols, a_size.height );
        len = B.rows;
        CV_Assert( a_size.width == len );
        break;
    case 1:
        d_size = Size( B.cols, a_size.width );
        len = B.rows;
        CV_Assert( a_size.height == len );
        break;
    case 2:
        d_size = Size( B.rows, a_size.height );
        len = B.cols;
        CV_Assert( a_size.width == len );
        break;
    case 3:
        d_size = Size( B.rows, a_size.width );
        len = B.cols;
        CV_Assert( a_size.height == len );
        break;
    }

    if( !C.empty() )
    {
        CV_Assert_N( C.type() == type,
            (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) ||
             ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height)));
    }

    _matD.create( d_size.height, d_size.width, type );
    Mat D = _matD.getMat();
    if( (flags & GEMM_3_T) != 0 && C.data == D.data )
    {
        transpose( C, C );
        flags &= ~GEMM_3_T;
    }

    Mat *DProxyPtr = &D, DProxy;
    if( D.data == A.data || D.data == B.data )
    {
        DProxy = Mat(d_size.height, d_size.width, D.type());
        DProxyPtr = &DProxy;
    }

    if( type == CV_32FC1 )
        hal::gemm32f(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
                     C.ptr<float>(), C.step, static_cast<float>(beta),
                     DProxyPtr->ptr<float>(), DProxyPtr->step,
                     a_size.height, a_size.width, DProxyPtr->cols, flags);
    else if( type == CV_64FC1 )
        hal::gemm64f(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
                     C.ptr<double>(), C.step, beta,
                     DProxyPtr->ptr<double>(), DProxyPtr->step,
                     a_size.height, a_size.width, DProxyPtr->cols, flags);
    else if( type == CV_32FC2 )
        hal::gemm32fc(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
                      C.ptr<float>(), C.step, static_cast<float>(beta),
                      DProxyPtr->ptr<float>(), DProxyPtr->step,
                      a_size.height, a_size.width, DProxyPtr->cols, flags);
    else
    {
        CV_Assert( type == CV_64FC2 );
        hal::gemm64fc(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
                      C.ptr<double>(), C.step, beta,
                      D.ptr<double>(), D.step,
                      a_size.height, a_size.width, DProxyPtr->cols, flags);
    }

    if(DProxyPtr != &D)
        DProxyPtr->copyTo(D);
}
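
// Illustrative usage sketch, not part of the library: cv::gemm computes
// D = alpha*op(A)*op(B) + beta*op(C), where op() is an optional transposition
// selected with GEMM_1_T / GEMM_2_T / GEMM_3_T. All values below are made up.
//
//   Mat A = (Mat_<float>(2, 3) << 1, 2, 3,  4, 5, 6);
//   Mat B = (Mat_<float>(2, 3) << 1, 0, 0,  0, 1, 0);
//   Mat D;
//   gemm(A, B, 1.0, noArray(), 0.0, D, GEMM_2_T);  // D = A * B^T, a 2x2 matrix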



/****************************************************************************************\
*                                        Transform                                       *
\****************************************************************************************/

static TransformFunc getTransformFunc(int depth)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getTransformFunc, (depth),
        CV_CPU_DISPATCH_MODES_ALL);
}

static TransformFunc getDiagTransformFunc(int depth)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getDiagTransformFunc, (depth),
        CV_CPU_DISPATCH_MODES_ALL);
}

void transform(InputArray _src, OutputArray _dst, InputArray _mtx)
{
    CV_INSTRUMENT_REGION();

    Mat src = _src.getMat(), m = _mtx.getMat();
    int depth = src.depth(), scn = src.channels(), dcn = m.rows;
    CV_Assert( scn == m.cols || scn + 1 == m.cols );
    bool isDiag = false;

    _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
    Mat dst = _dst.getMat();

    if (src.data == dst.data)  // inplace case
    {
        CV_Assert(scn == dcn);
        src = src.clone();  // TODO Add performance warning
    }

    int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F;
    AutoBuffer<double> _mbuf;
    double* mbuf;

    if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 )
    {
        _mbuf.allocate(dcn*(scn+1));
        mbuf = _mbuf.data();
        Mat tmp(dcn, scn+1, mtype, mbuf);
        memset(tmp.ptr(), 0, tmp.total()*tmp.elemSize());
        if( m.cols == scn+1 )
            m.convertTo(tmp, mtype);
        else
        {
            Mat tmppart = tmp.colRange(0, m.cols);
            m.convertTo(tmppart, mtype);
        }
        m = tmp;
    }
    else
        mbuf = m.ptr<double>();

    if( scn == dcn )
    {
        int i, j;
        double eps = mtype == CV_32F ? FLT_EPSILON : DBL_EPSILON;

        if( scn == 1 )
        {
            double alpha, beta;
            if( mtype == CV_32F )
                alpha = m.at<float>(0), beta = m.at<float>(1);
            else
                alpha = m.at<double>(0), beta = m.at<double>(1);
            src.convertTo(dst, dst.type(), alpha, beta);
            return;
        }

        for( i = 0, isDiag = true; isDiag && i < scn; i++ )
        {
            for( j = 0; isDiag && j < scn; j++ )
            {
                double v = mtype == CV_32F ? m.at<float>(i, j) : m.at<double>(i, j);
                if( i != j && fabs(v) > eps )
                    isDiag = false;
            }
        }
    }

    TransformFunc func = isDiag ? getDiagTransformFunc(depth) : getTransformFunc(depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
}
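
// Illustrative usage sketch, not part of the library: transform() applies the
// dcn x scn (or dcn x (scn+1), affine) matrix m to every scn-channel element.
// A 1x3 matrix, for example, collapses a BGR image into one weighted channel:
//
//   Mat bgr(10, 10, CV_8UC3, Scalar(40, 120, 200));
//   Mat m = (Mat_<float>(1, 3) << 0.114f, 0.587f, 0.299f);  // B, G, R weights
//   Mat gray;
//   transform(bgr, gray, m);  // gray is CV_8UC1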



/****************************************************************************************\
*                                  Perspective Transform                                 *
\****************************************************************************************/

static TransformFunc getPerspectiveTransform(int depth)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getPerspectiveTransform, (depth),
        CV_CPU_DISPATCH_MODES_ALL);
}

void perspectiveTransform(InputArray _src, OutputArray _dst, InputArray _mtx)
{
    CV_INSTRUMENT_REGION();

    Mat src = _src.getMat(), m = _mtx.getMat();
    int depth = src.depth(), scn = src.channels(), dcn = m.rows-1;
    CV_Assert( scn + 1 == m.cols );
    CV_Assert( depth == CV_32F || depth == CV_64F );

    _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
    Mat dst = _dst.getMat();

    const int mtype = CV_64F;
    AutoBuffer<double> _mbuf;
    double* mbuf = m.ptr<double>();

    if( !m.isContinuous() || m.type() != mtype )
    {
        _mbuf.allocate((dcn+1)*(scn+1));
        mbuf = _mbuf.data();
        Mat tmp(dcn+1, scn+1, mtype, mbuf);
        m.convertTo(tmp, mtype);
        m = tmp;
    }

    TransformFunc func = getPerspectiveTransform(depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
}
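
// Illustrative usage sketch, not part of the library: every element of src is
// treated as an (m.cols-1)-channel point, extended with 1, multiplied by m and
// divided by its last coordinate. E.g. a 3x3 homography on 2D points:
//
//   std::vector<Point2f> src = { {0.f, 0.f}, {1.f, 0.f} }, dst;
//   Mat H = Mat::eye(3, 3, CV_64F);
//   H.at<double>(0, 2) = 5.0;           // pure translation by (5, 0)
//   perspectiveTransform(src, dst, H);  // dst = { {5, 0}, {6, 0} }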

/****************************************************************************************\
*                                        ScaleAdd                                        *
\****************************************************************************************/

#ifdef HAVE_OPENCL

static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{
    const ocl::Device & d = ocl::Device::getDefault();

    bool doubleSupport = d.doubleFPConfig() > 0;
    Size size = _src1.size();
    int depth = CV_MAT_DEPTH(type);
    if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
        return false;

    _dst.create(size, type);
    int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
    int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst),
        rowsPerWI = d.isIntel() ? 4 : 1;

    char cvt[2][50];
    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                  format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D DEPTH_dst=%d -D workT=%s -D convertToWT1=%s"
                         " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
                         " -D wdepth=%d%s -D rowsPerWI=%d",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), depth,
                         ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
                         ocl::convertTypeStr(depth, wdepth, kercn, cvt[0], sizeof(cvt[0])),
                         ocl::convertTypeStr(wdepth, depth, kercn, cvt[1], sizeof(cvt[1])),
                         ocl::typeToStr(wdepth), wdepth,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
            dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);

    if (wdepth == CV_32F)
        k.args(src1arg, src2arg, dstarg, (float)alpha);
    else
        k.args(src1arg, src2arg, dstarg, alpha);

    size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

static ScaleAddFunc getScaleAddFunc(int depth)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getScaleAddFunc, (depth),
        CV_CPU_DISPATCH_MODES_ALL);
}

void scaleAdd(InputArray _src1, double alpha, InputArray _src2, OutputArray _dst)
{
    CV_INSTRUMENT_REGION();

    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( type == _src2.type() );

    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
               ocl_scaleAdd(_src1, alpha, _src2, _dst, type))

    if( depth < CV_32F )
    {
        addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
        return;
    }

    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
    CV_Assert(src1.size == src2.size);

    _dst.create(src1.dims, src1.size, type);
    Mat dst = _dst.getMat();

    float falpha = (float)alpha;
    void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha;

    ScaleAddFunc func = getScaleAddFunc(depth);
    CV_Assert(func);

    if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous())
    {
        size_t len = src1.total()*cn;
        func(src1.ptr(), src2.ptr(), dst.ptr(), (int)len, palpha);
        return;
    }

    const Mat* arrays[] = {&src1, &src2, &dst, 0};
    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, len = it.size*cn;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
}
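
// Illustrative usage sketch, not part of the library: scaleAdd() is the BLAS
// AXPY operation dst = alpha*src1 + src2 for equally sized, equally typed
// arrays:
//
//   Mat x = Mat::ones(4, 4, CV_32F), y = Mat::ones(4, 4, CV_32F), dst;
//   scaleAdd(x, 2.0, y, dst);  // every element of dst equals 3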

/****************************************************************************************\
*                                    Covariance Matrix                                   *
\****************************************************************************************/

void calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype )
{
    CV_INSTRUMENT_REGION();

    CV_Assert_N( data, nsamples > 0 );
    Size size = data[0].size();
    int sz = size.width * size.height, esz = (int)data[0].elemSize();
    int type = data[0].type();
    Mat mean;
    ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

    if( (flags & CV_COVAR_USE_AVG) != 0 )
    {
        CV_Assert( _mean.size() == size );
        if( _mean.isContinuous() && _mean.type() == ctype )
            mean = _mean.reshape(1, 1);
        else
        {
            _mean.convertTo(mean, ctype);
            mean = mean.reshape(1, 1);
        }
    }

    Mat _data(nsamples, sz, type);

    for( int i = 0; i < nsamples; i++ )
    {
        CV_Assert_N( data[i].size() == size, data[i].type() == type );
        if( data[i].isContinuous() )
            memcpy( _data.ptr(i), data[i].ptr(), sz*esz );
        else
        {
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            data[i].copyTo(dataRow);
        }
    }

    calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );
    if( (flags & CV_COVAR_USE_AVG) == 0 )
        _mean = mean.reshape(1, size.height);
}

void calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype )
{
    CV_INSTRUMENT_REGION();

    if(_src.kind() == _InputArray::STD_VECTOR_MAT || _src.kind() == _InputArray::STD_ARRAY_MAT)
    {
        std::vector<cv::Mat> src;
        _src.getMatVector(src);

        CV_Assert( src.size() > 0 );

        Size size = src[0].size();
        int type = src[0].type();

        ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

        Mat _data(static_cast<int>(src.size()), size.area(), type);

        int i = 0;
        for(std::vector<cv::Mat>::iterator each = src.begin(); each != src.end(); ++each, ++i )
        {
            CV_Assert_N( (*each).size() == size, (*each).type() == type );
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            (*each).copyTo(dataRow);
        }

        Mat mean;
        if( (flags & CV_COVAR_USE_AVG) != 0 )
        {
            CV_Assert( _mean.size() == size );

            if( mean.type() != ctype )
            {
                mean = _mean.getMat();
                _mean.create(mean.size(), ctype);
                Mat tmp = _mean.getMat();
                mean.convertTo(tmp, ctype);
                mean = tmp;
            }

            mean = _mean.getMat().reshape(1, 1);
        }

        calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );

        if( (flags & CV_COVAR_USE_AVG) == 0 )
        {
            mean = mean.reshape(1, size.height);
            mean.copyTo(_mean);
        }
        return;
    }

    Mat data = _src.getMat(), mean;
    CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) );
    bool takeRows = (flags & CV_COVAR_ROWS) != 0;
    int type = data.type();
    int nsamples = takeRows ? data.rows : data.cols;
    CV_Assert( nsamples > 0 );
    Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows);

    if( (flags & CV_COVAR_USE_AVG) != 0 )
    {
        mean = _mean.getMat();
        ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F);
        CV_Assert( mean.size() == size );
        if( mean.type() != ctype )
        {
            _mean.create(mean.size(), ctype);
            Mat tmp = _mean.getMat();
            mean.convertTo(tmp, ctype);
            mean = tmp;
        }
    }
    else
    {
        ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F);
        reduce( _src, _mean, takeRows ? 0 : 1, REDUCE_AVG, ctype );
        mean = _mean.getMat();
    }

    mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows,
        mean, (flags & CV_COVAR_SCALE) != 0 ? 1./nsamples : 1, ctype );
}
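
// Illustrative usage sketch, not part of the library: with COVAR_ROWS every
// row of the input is one observation; adding COVAR_NORMAL | COVAR_SCALE
// yields the usual d x d covariance estimate scaled by 1/nsamples:
//
//   Mat samples = (Mat_<float>(3, 2) << 0, 0,  1, 1,  2, 4);
//   Mat covar, mean;
//   calcCovarMatrix(samples, covar, mean,
//                   COVAR_NORMAL | COVAR_ROWS | COVAR_SCALE, CV_32F);  // covar is 2x2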



/****************************************************************************************\
*                                        Mahalanobis                                     *
\****************************************************************************************/

static MahalanobisImplFunc getMahalanobisImplFunc(int depth)
{
#ifdef CV_MAHALANOBIS_BASELINE_ONLY
    CV_CPU_CALL_BASELINE(getMahalanobisImplFunc, (depth));
#else
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getMahalanobisImplFunc, (depth),
        CV_CPU_DISPATCH_MODES_ALL);
#endif
}


double Mahalanobis(InputArray _v1, InputArray _v2, InputArray _icovar)
{
    CV_INSTRUMENT_REGION();

    Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat();
    int type = v1.type(), depth = v1.depth();
    Size sz = v1.size();
    int len = sz.width*sz.height*v1.channels();
    AutoBuffer<double> buf(len);

    CV_Assert_N( type == v2.type(), type == icovar.type(),
        sz == v2.size(), len == icovar.rows && len == icovar.cols );

    sz.width *= v1.channels();
    if( v1.isContinuous() && v2.isContinuous() )
    {
        sz.width *= sz.height;
        sz.height = 1;
    }

    MahalanobisImplFunc func = getMahalanobisImplFunc(depth);
    CV_Assert(func);

    double result = func(v1, v2, icovar, buf.data(), len);
    return std::sqrt(result);
}
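
// Illustrative usage sketch, not part of the library: the third argument is
// the *inverse* covariance matrix, typically obtained by inverting the output
// of calcCovarMatrix (covar, v1 and v2 below are assumed to exist already):
//
//   Mat icovar;
//   invert(covar, icovar, DECOMP_SVD);
//   double dist = Mahalanobis(v1, v2, icovar);  // v1, v2 are 1 x d vectors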



/****************************************************************************************\
*                                        MulTransposed                                   *
\****************************************************************************************/

static MulTransposedFunc getMulTransposedFunc(int stype, int dtype, bool ata)
{
#ifdef CV_MULTRANSPOSED_BASELINE_ONLY
    CV_CPU_CALL_BASELINE(getMulTransposedFunc, (stype, dtype, ata));
#else
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getMulTransposedFunc, (stype, dtype, ata),
        CV_CPU_DISPATCH_MODES_ALL);
#endif
}

void mulTransposed(InputArray _src, OutputArray _dst, bool ata,
                   InputArray _delta, double scale, int dtype)
{
    CV_INSTRUMENT_REGION();

    Mat src = _src.getMat(), delta = _delta.getMat();
    const int gemm_level = 100; // boundary above which GEMM is faster.
    int stype = src.type();
    dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F);
    CV_Assert( src.channels() == 1 );

    if( !delta.empty() )
    {
        CV_Assert_N( delta.channels() == 1,
            (delta.rows == src.rows || delta.rows == 1),
            (delta.cols == src.cols || delta.cols == 1));
        if( delta.type() != dtype )
            delta.convertTo(delta, dtype);
    }

    int dsize = ata ? src.cols : src.rows;
    _dst.create( dsize, dsize, dtype );
    Mat dst = _dst.getMat();

    if( src.data == dst.data || (stype == dtype &&
        (dst.cols >= gemm_level && dst.rows >= gemm_level &&
         src.cols >= gemm_level && src.rows >= gemm_level)))
    {
        Mat src2;
        const Mat* tsrc = &src;
        if( !delta.empty() )
        {
            if( delta.size() == src.size() )
                subtract( src, delta, src2 );
            else
            {
                repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2);
                subtract( src, src2, src2 );
            }
            tsrc = &src2;
        }
        gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? GEMM_1_T : GEMM_2_T );
    }
    else
    {
        MulTransposedFunc func = getMulTransposedFunc(stype, dtype, ata);
        if( !func )
            CV_Error( CV_StsUnsupportedFormat, "" );

        func( src, dst, delta, scale );
        completeSymm( dst, false );
    }
}
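
// Illustrative usage sketch, not part of the library: mulTransposed() computes
// dst = scale*(src - delta)^T*(src - delta) when ata is true, and
// dst = scale*(src - delta)*(src - delta)^T otherwise; dst is always symmetric:
//
//   Mat src(100, 5, CV_32F), dst;
//   randu(src, 0, 1);
//   mulTransposed(src, dst, true);  // dst is 5x5 and equals src^T * src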

/****************************************************************************************\
*                                        Dot Product                                     *
\****************************************************************************************/

static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(dotProd_8u, (src1, src2, len),
        CV_CPU_DISPATCH_MODES_ALL);
}
static double dotProd_8s(const schar* src1, const schar* src2, int len)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(dotProd_8s, (src1, src2, len),
        CV_CPU_DISPATCH_MODES_ALL);
}
static double dotProd_16u(const ushort* src1, const ushort* src2, int len)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(dotProd_16u, (src1, src2, len),
        CV_CPU_DISPATCH_MODES_ALL);
}
static double dotProd_16s(const short* src1, const short* src2, int len)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(dotProd_16s, (src1, src2, len),
        CV_CPU_DISPATCH_MODES_ALL);
}
static double dotProd_32s(const int* src1, const int* src2, int len)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(dotProd_32s, (src1, src2, len),
        CV_CPU_DISPATCH_MODES_ALL);
}
static double dotProd_32f(const float* src1, const float* src2, int len)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(dotProd_32f, (src1, src2, len),
        CV_CPU_DISPATCH_MODES_ALL);
}
static double dotProd_64f(const double* src1, const double* src2, int len)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(dotProd_64f, (src1, src2, len),
        CV_CPU_DISPATCH_MODES_ALL);
}

typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);

static DotProdFunc getDotProdFunc(int depth)
{
    static DotProdFunc dotProdTab[] =
    {
        (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
        (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
        (DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f),
        (DotProdFunc)dotProd_64f, 0
    };

    return dotProdTab[depth];
}

double Mat::dot(InputArray _mat) const
{
    CV_INSTRUMENT_REGION();

    Mat mat = _mat.getMat();
    int cn = channels();
    DotProdFunc func = getDotProdFunc(depth());
    CV_Assert_N( mat.type() == type(), mat.size == size, func != 0 );

    if( isContinuous() && mat.isContinuous() )
    {
        size_t len = total()*cn;
        if( len == (size_t)(int)len )
            return func(data, mat.data, (int)len);
    }

    const Mat* arrays[] = {this, &mat, 0};
    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);
    double r = 0;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        r += func( ptrs[0], ptrs[1], len );

    return r;
}
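
// Illustrative usage sketch, not part of the library: dot() sums the products
// of corresponding elements over all channels, as if both matrices were first
// reshaped to a single row:
//
//   Mat a(2, 2, CV_32FC2, Scalar::all(1));
//   double d = a.dot(a);  // 4 elements * 2 channels = 8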


#ifdef HAVE_OPENCL

static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
{
    UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1);

    int type = src1.type(), depth = CV_MAT_DEPTH(type),
        kercn = ocl::predictOptimalVectorWidth(src1, src2);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if ( !doubleSupport && depth == CV_64F )
        return false;

    int dbsize = ocl::Device::getDefault().maxComputeUnits();
    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
    int ddepth = std::max(CV_32F, depth);

    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    char cvt[50];
    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
                  format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
                         "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth),
                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
                         ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt, sizeof(cvt)),
                         (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
    if (k.empty())
        return false;

    UMat db(1, dbsize, ddepth);

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
            dbarg = ocl::KernelArg::PtrWriteOnly(db);

    k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg);

    size_t globalsize = dbsize * wgs;
    if (k.run(1, &globalsize, &wgs, true))
    {
        res = sum(db.getMat(ACCESS_READ))[0];
        return true;
    }
    return false;
}

#endif

double UMat::dot(InputArray m) const
{
    CV_INSTRUMENT_REGION();

    CV_Assert(m.sameSize(*this) && m.type() == type());

#ifdef HAVE_OPENCL
    double r = 0;
    CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r)
#endif

    return getMat(ACCESS_READ).dot(m);
}

} // namespace cv::


#ifndef OPENCV_EXCLUDE_C_API
/****************************************************************************************\
*                                    Earlier API                                         *
\****************************************************************************************/

CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
                     const CvArr* Carr, double beta, CvArr* Darr, int flags )
{
    cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr);
    cv::Mat C, D = cv::cvarrToMat(Darr);

    if( Carr )
        C = cv::cvarrToMat(Carr);

    CV_Assert_N( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)),
                 (D.cols == ((flags & CV_GEMM_B_T) == 0 ? B.cols : B.rows)),
                 D.type() == A.type() );

    gemm( A, B, alpha, C, beta, D, flags );
}


CV_IMPL void
cvTransform( const CvArr* srcarr, CvArr* dstarr,
             const CvMat* transmat, const CvMat* shiftvec )
{
    cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    if( shiftvec )
    {
        cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1, m.rows),
            _m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0, m.cols), v1 = _m.col(m.cols);
        m.convertTo(m1, m1.type());
        v.convertTo(v1, v1.type());
        m = _m;
    }

    CV_Assert_N( dst.depth() == src.depth(), dst.channels() == m.rows );
    cv::transform( src, dst, m );
}


CV_IMPL void
cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
{
    cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert_N( dst.type() == src.type(), dst.channels() == m.rows-1 );
    cv::perspectiveTransform( src, dst, m );
}


CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
                         const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);

    CV_Assert_N( src1.size == dst.size, src1.type() == dst.type() );
    cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvCalcCovarMatrix( const CvArr** vecarr, int count,
                   CvArr* covarr, CvArr* avgarr, int flags )
{
    cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean;
    CV_Assert_N( vecarr != 0, count >= 1 );

    if( avgarr )
        mean = mean0 = cv::cvarrToMat(avgarr);

    if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 )
    {
        cv::Mat data = cv::cvarrToMat(vecarr[0]);
        cv::calcCovarMatrix( data, cov, mean, flags, cov.type() );
    }
    else
    {
        std::vector<cv::Mat> data(count);
        for( int i = 0; i < count; i++ )
            data[i] = cv::cvarrToMat(vecarr[i]);
        cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() );
    }

    if( mean.data != mean0.data && mean0.data )
        mean.convertTo(mean0, mean0.type());

    if( cov.data != cov0.data )
        cov.convertTo(cov0, cov0.type());
}


CV_IMPL double
cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr )
{
    return cv::Mahalanobis(cv::cvarrToMat(srcAarr),
                           cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr));
}

CV_IMPL void
cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
                 int order, const CvArr* deltaarr, double scale )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta;
    if( deltaarr )
        delta = cv::cvarrToMat(deltaarr);
    cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type());
    if( dst.data != dst0.data )
        dst.convertTo(dst0, dst0.type());
}

CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
{
    return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr));
}


CV_IMPL void
cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr);
    cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects);
    cv::Mat mean = mean0, evals = evals0, evects = evects0;

    cv::PCA pca;
    pca.mean = mean;
    pca.eigenvalues = evals;
    pca.eigenvectors = evects;

    pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(),
        flags, !evals.empty() ? evals.rows + evals.cols - 1 : 0);

    if( pca.mean.size() == mean.size() )
        pca.mean.convertTo( mean, mean.type() );
    else
    {
        cv::Mat temp; pca.mean.convertTo( temp, mean.type() );
        transpose( temp, mean );
    }

    evals = pca.eigenvalues;
    evects = pca.eigenvectors;
    int ecount0 = evals0.cols + evals0.rows - 1;
    int ecount = evals.cols + evals.rows - 1;

    CV_Assert_N( (evals0.cols == 1 || evals0.rows == 1),
                 ecount0 <= ecount,
                 evects0.cols == evects.cols,
                 evects0.rows == ecount0 );

    cv::Mat temp = evals0;
    if( evals.rows == 1 )
        evals.colRange(0, ecount0).convertTo(temp, evals0.type());
    else
        evals.rowRange(0, ecount0).convertTo(temp, evals0.type());
    if( temp.data != evals0.data )
        transpose(temp, evals0);
    evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() );

    // otherwise some datatypes or sizes were incorrect, so the output arrays have been reallocated
    CV_Assert( mean0.data == mean.data );
}


CV_IMPL void
cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr,
              const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
        CV_Assert_N(dst.cols <= evects.rows, dst.rows == data.rows);
        n = dst.cols;
    }
    else
    {
        CV_Assert_N(dst.rows <= evects.rows, dst.cols == data.cols);
        n = dst.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.project(data);
    if( result.cols != dst.cols )
        result = result.reshape(1, 1);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}


CV_IMPL void
cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
                  const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
        CV_Assert_N(data.cols <= evects.rows, dst.rows == data.rows);
        n = data.cols;
    }
    else
    {
        CV_Assert_N(data.rows <= evects.rows, dst.cols == data.cols);
        n = data.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.backProject(data);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}

#endif  // OPENCV_EXCLUDE_C_API

/* End of file. */