1 | // This file is part of OpenCV project. |
2 | // It is subject to the license terms in the LICENSE file found in the top-level directory |
3 | // of this distribution and at http://opencv.org/license.html. |
4 | |
5 | // This file is modified from the https://github.com/HonglinChu/NanoTrack/blob/master/ncnn_macos_nanotrack/nanotrack.cpp |
6 | // Author, HongLinChu, 1628464345@qq.com |
7 | // Adapt to OpenCV, ZihaoMu: zihaomu@outlook.com |
8 | |
9 | // Link to original inference code: https://github.com/HonglinChu/NanoTrack |
10 | // Link to original training repo: https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack |
11 | |
12 | #include "../precomp.hpp" |
13 | #ifdef HAVE_OPENCV_DNN |
14 | #include "opencv2/dnn.hpp" |
15 | #endif |
16 | |
17 | namespace cv { |
18 | |
19 | TrackerNano::TrackerNano() |
20 | { |
21 | // nothing |
22 | } |
23 | |
24 | TrackerNano::~TrackerNano() |
25 | { |
26 | // nothing |
27 | } |
28 | |
TrackerNano::Params::Params()
{
    // Default ONNX model file names; callers normally override these with
    // full paths to the downloaded NanoTrack backbone/neck-head models.
    backbone = "backbone.onnx";
    neckhead = "neckhead.onnx";
#ifdef HAVE_OPENCV_DNN
    backend = dnn::DNN_BACKEND_DEFAULT;
    target = dnn::DNN_TARGET_CPU;
#else
    backend = -1; // invalid value
    target = -1; // invalid value
#endif
}
41 | |
42 | #ifdef HAVE_OPENCV_DNN |
43 | static void softmax(const Mat& src, Mat& dst) |
44 | { |
45 | Mat maxVal; |
46 | cv::max(src1: src.row(y: 1), src2: src.row(y: 0), dst&: maxVal); |
47 | |
48 | src.row(y: 1) -= maxVal; |
49 | src.row(y: 0) -= maxVal; |
50 | |
51 | exp(src, dst); |
52 | |
53 | Mat sumVal = dst.row(y: 0) + dst.row(y: 1); |
54 | dst.row(y: 0) = dst.row(y: 0) / sumVal; |
55 | dst.row(y: 1) = dst.row(y: 1) / sumVal; |
56 | } |
57 | |
// Size measure used by the SiamFC-style penalty: the geometric mean of the
// box sides after adding half-perimeter context padding.
static float sizeCal(float w, float h)
{
    const float context = 0.5f * (w + h);
    return std::sqrt((w + context) * (h + context));
}
64 | |
65 | static Mat sizeCal(const Mat& w, const Mat& h) |
66 | { |
67 | Mat pad = (w + h) * 0.5; |
68 | Mat sz2 = (w + pad).mul(e: (h + pad)); |
69 | |
70 | cv::sqrt(src: sz2, dst: sz2); |
71 | return sz2; |
72 | } |
73 | |
74 | // Similar python code: r = np.maximum(r, 1. / r) # r is matrix |
75 | static void elementReciprocalMax(Mat& srcDst) |
76 | { |
77 | size_t totalV = srcDst.total(); |
78 | float* ptr = srcDst.ptr<float>(y: 0); |
79 | for (size_t i = 0; i < totalV; i++) |
80 | { |
81 | float val = *(ptr + i); |
82 | *(ptr + i) = std::max(a: val, b: 1.0f/val); |
83 | } |
84 | } |
85 | |
86 | class TrackerNanoImpl : public TrackerNano |
87 | { |
88 | public: |
89 | TrackerNanoImpl(const TrackerNano::Params& parameters) |
90 | : params(parameters) |
91 | { |
92 | backbone = dnn::readNet(model: params.backbone); |
93 | neckhead = dnn::readNet(model: params.neckhead); |
94 | |
95 | CV_Assert(!backbone.empty()); |
96 | CV_Assert(!neckhead.empty()); |
97 | |
98 | backbone.setPreferableBackend(params.backend); |
99 | backbone.setPreferableTarget(params.target); |
100 | neckhead.setPreferableBackend(params.backend); |
101 | neckhead.setPreferableTarget(params.target); |
102 | } |
103 | |
104 | void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE; |
105 | bool update(InputArray image, Rect& boundingBox) CV_OVERRIDE; |
106 | float getTrackingScore() CV_OVERRIDE; |
107 | |
108 | // Save the target bounding box for each frame. |
109 | std::vector<float> targetSz = {0, 0}; // H and W of bounding box |
110 | std::vector<float> targetPos = {0, 0}; // center point of bounding box (x, y) |
111 | float tracking_score; |
112 | |
113 | TrackerNano::Params params; |
114 | |
115 | struct trackerConfig |
116 | { |
117 | float windowInfluence = 0.455f; |
118 | float lr = 0.37f; |
119 | float contextAmount = 0.5; |
120 | bool swapRB = true; |
121 | int totalStride = 16; |
122 | float penaltyK = 0.055f; |
123 | }; |
124 | |
125 | protected: |
126 | const int exemplarSize = 127; |
127 | const int instanceSize = 255; |
128 | |
129 | trackerConfig trackState; |
130 | int scoreSize; |
131 | Size imgSize = {0, 0}; |
132 | Mat hanningWindow; |
133 | Mat grid2searchX, grid2searchY; |
134 | |
135 | dnn::Net backbone, neckhead; |
136 | Mat image; |
137 | |
138 | void getSubwindow(Mat& dstCrop, Mat& srcImg, int originalSz, int resizeSz); |
139 | void generateGrids(); |
140 | }; |
141 | |
142 | void TrackerNanoImpl::generateGrids() |
143 | { |
144 | int sz = scoreSize; |
145 | const int sz2 = sz / 2; |
146 | |
147 | std::vector<float> x1Vec(sz, 0); |
148 | |
149 | for (int i = 0; i < sz; i++) |
150 | { |
151 | x1Vec[i] = (float)(i - sz2); |
152 | } |
153 | |
154 | Mat x1M(1, sz, CV_32FC1, x1Vec.data()); |
155 | |
156 | cv::repeat(src: x1M, ny: sz, nx: 1, dst: grid2searchX); |
157 | cv::repeat(src: x1M.t(), ny: 1, nx: sz, dst: grid2searchY); |
158 | |
159 | grid2searchX *= trackState.totalStride; |
160 | grid2searchY *= trackState.totalStride; |
161 | |
162 | grid2searchX += instanceSize/2; |
163 | grid2searchY += instanceSize/2; |
164 | } |
165 | |
166 | void TrackerNanoImpl::init(InputArray image_, const Rect &boundingBox_) |
167 | { |
168 | scoreSize = (instanceSize - exemplarSize) / trackState.totalStride + 8; |
169 | trackState = trackerConfig(); |
170 | image = image_.getMat().clone(); |
171 | |
172 | // convert Rect2d from left-up to center. |
173 | targetPos[0] = float(boundingBox_.x) + float(boundingBox_.width) * 0.5f; |
174 | targetPos[1] = float(boundingBox_.y) + float(boundingBox_.height) * 0.5f; |
175 | |
176 | targetSz[0] = float(boundingBox_.width); |
177 | targetSz[1] = float(boundingBox_.height); |
178 | |
179 | imgSize = image.size(); |
180 | |
181 | // Extent the bounding box. |
182 | float sumSz = targetSz[0] + targetSz[1]; |
183 | float wExtent = targetSz[0] + trackState.contextAmount * (sumSz); |
184 | float hExtent = targetSz[1] + trackState.contextAmount * (sumSz); |
185 | int sz = int(cv::sqrt(x: wExtent * hExtent)); |
186 | |
187 | Mat crop; |
188 | getSubwindow(dstCrop&: crop, srcImg&: image, originalSz: sz, resizeSz: exemplarSize); |
189 | Mat blob = dnn::blobFromImage(image: crop, scalefactor: 1.0, size: Size(), mean: Scalar(), swapRB: trackState.swapRB); |
190 | |
191 | backbone.setInput(blob); |
192 | Mat out = backbone.forward(); // Feature extraction. |
193 | neckhead.setInput(blob: out, name: "input1" ); |
194 | |
195 | createHanningWindow(dst: hanningWindow, winSize: Size(scoreSize, scoreSize), CV_32F); |
196 | generateGrids(); |
197 | } |
198 | |
// Crops a square window of side 'originalSz' centered on targetPos from
// srcImg, then resizes it to resizeSz x resizeSz into dstCrop. Parts of the
// window falling outside the image are padded with the mean image color.
void TrackerNanoImpl::getSubwindow(Mat& dstCrop, Mat& srcImg, int originalSz, int resizeSz)
{
    Scalar avgChans = mean(srcImg);
    Size imgSz = srcImg.size();
    int c = (originalSz + 1) / 2;

    // Inclusive crop bounds in source-image coordinates.
    int context_xmin = (int)(targetPos[0]) - c;
    int context_xmax = context_xmin + originalSz - 1;
    int context_ymin = (int)(targetPos[1]) - c;
    int context_ymax = context_ymin + originalSz - 1;

    // How far the crop sticks out past each image edge (0 if fully inside).
    int left_pad = std::max(0, -context_xmin);
    int top_pad = std::max(0, -context_ymin);
    int right_pad = std::max(0, context_xmax - imgSz.width + 1);
    int bottom_pad = std::max(0, context_ymax - imgSz.height + 1);

    // Shift the bounds into the coordinate frame of the padded image.
    context_xmin += left_pad;
    context_xmax += left_pad;
    context_ymin += top_pad;
    context_ymax += top_pad;

    Mat cropImg;
    if (left_pad == 0 && top_pad == 0 && right_pad == 0 && bottom_pad == 0)
    {
        // Crop image without padding.
        cropImg = srcImg(cv::Rect(context_xmin, context_ymin,
                        context_xmax - context_xmin + 1, context_ymax - context_ymin + 1));
    }
    else // Crop image with padding, and the padding value is avgChans
    {
        cv::Mat tmpMat;
        cv::copyMakeBorder(srcImg, tmpMat, top_pad, bottom_pad, left_pad, right_pad, cv::BORDER_CONSTANT, avgChans);
        cropImg = tmpMat(cv::Rect(context_xmin, context_ymin, context_xmax - context_xmin + 1, context_ymax - context_ymin + 1));
    }
    resize(cropImg, dstCrop, Size(resizeSz, resizeSz));
}
235 | |
236 | bool TrackerNanoImpl::update(InputArray image_, Rect &boundingBoxRes) |
237 | { |
238 | image = image_.getMat().clone(); |
239 | int targetSzSum = (int)(targetSz[0] + targetSz[1]); |
240 | |
241 | float wc = targetSz[0] + trackState.contextAmount * targetSzSum; |
242 | float hc = targetSz[1] + trackState.contextAmount * targetSzSum; |
243 | float sz = cv::sqrt(x: wc * hc); |
244 | float scale_z = exemplarSize / sz; |
245 | float sx = sz * (instanceSize / exemplarSize); |
246 | targetSz[0] *= scale_z; |
247 | targetSz[1] *= scale_z; |
248 | |
249 | Mat crop; |
250 | getSubwindow(dstCrop&: crop, srcImg&: image, originalSz: int(sx), resizeSz: instanceSize); |
251 | |
252 | Mat blob = dnn::blobFromImage(image: crop, scalefactor: 1.0, size: Size(), mean: Scalar(), swapRB: trackState.swapRB); |
253 | backbone.setInput(blob); |
254 | Mat xf = backbone.forward(); |
255 | neckhead.setInput(blob: xf, name: "input2" ); |
256 | std::vector<String> outputName = {"output1" , "output2" }; |
257 | std::vector<Mat> outs; |
258 | neckhead.forward(outputBlobs: outs, outBlobNames: outputName); |
259 | |
260 | CV_Assert(outs.size() == 2); |
261 | |
262 | Mat clsScore = outs[0]; // 1x2x16x16 |
263 | Mat bboxPred = outs[1]; // 1x4x16x16 |
264 | |
265 | clsScore = clsScore.reshape(cn: 0, newshape: {2, scoreSize, scoreSize}); |
266 | bboxPred = bboxPred.reshape(cn: 0, newshape: {4, scoreSize, scoreSize}); |
267 | |
268 | Mat scoreSoftmax; // 2x16x16 |
269 | softmax(src: clsScore, dst&: scoreSoftmax); |
270 | |
271 | Mat score = scoreSoftmax.row(y: 1); |
272 | score = score.reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
273 | |
274 | Mat predX1 = grid2searchX - bboxPred.row(y: 0).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
275 | Mat predY1 = grid2searchY - bboxPred.row(y: 1).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
276 | Mat predX2 = grid2searchX + bboxPred.row(y: 2).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
277 | Mat predY2 = grid2searchY + bboxPred.row(y: 3).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
278 | |
279 | // size penalty |
280 | // scale penalty |
281 | Mat sc = sizeCal(w: predX2 - predX1, h: predY2 - predY1)/sizeCal(w: targetPos[0], h: targetPos[1]); |
282 | elementReciprocalMax(srcDst&: sc); |
283 | |
284 | // ratio penalty |
285 | float ratioVal = targetSz[0] / targetSz[1]; |
286 | |
287 | Mat ratioM(scoreSize, scoreSize, CV_32FC1, Scalar::all(v0: ratioVal)); |
288 | Mat rc = ratioM / ((predX2 - predX1) / (predY2 - predY1)); |
289 | elementReciprocalMax(srcDst&: rc); |
290 | |
291 | Mat penalty; |
292 | exp(src: ((rc.mul(m: sc) - 1) * trackState.penaltyK * (-1)), dst: penalty); |
293 | Mat pscore = penalty.mul(m: score); |
294 | |
295 | // Window penalty |
296 | pscore = pscore * (1.0 - trackState.windowInfluence) + hanningWindow * trackState.windowInfluence; |
297 | |
298 | // get Max |
299 | int bestID[2] = { 0, 0 }; |
300 | minMaxIdx(src: pscore, minVal: 0, maxVal: 0, minIdx: 0, maxIdx: bestID); |
301 | |
302 | tracking_score = pscore.at<float>(idx: bestID); |
303 | |
304 | float x1Val = predX1.at<float>(idx: bestID); |
305 | float x2Val = predX2.at<float>(idx: bestID); |
306 | float y1Val = predY1.at<float>(idx: bestID); |
307 | float y2Val = predY2.at<float>(idx: bestID); |
308 | |
309 | float predXs = (x1Val + x2Val)/2; |
310 | float predYs = (y1Val + y2Val)/2; |
311 | float predW = (x2Val - x1Val)/scale_z; |
312 | float predH = (y2Val - y1Val)/scale_z; |
313 | |
314 | float diffXs = (predXs - instanceSize / 2) / scale_z; |
315 | float diffYs = (predYs - instanceSize / 2) / scale_z; |
316 | |
317 | targetSz[0] /= scale_z; |
318 | targetSz[1] /= scale_z; |
319 | |
320 | float lr = penalty.at<float>(idx: bestID) * score.at<float>(idx: bestID) * trackState.lr; |
321 | |
322 | float resX = targetPos[0] + diffXs; |
323 | float resY = targetPos[1] + diffYs; |
324 | float resW = predW * lr + (1 - lr) * targetSz[0]; |
325 | float resH = predH * lr + (1 - lr) * targetSz[1]; |
326 | |
327 | resX = std::max(a: 0.f, b: std::min(a: (float)imgSize.width, b: resX)); |
328 | resY = std::max(a: 0.f, b: std::min(a: (float)imgSize.height, b: resY)); |
329 | resW = std::max(a: 10.f, b: std::min(a: (float)imgSize.width, b: resW)); |
330 | resH = std::max(a: 10.f, b: std::min(a: (float)imgSize.height, b: resH)); |
331 | |
332 | targetPos[0] = resX; |
333 | targetPos[1] = resY; |
334 | targetSz[0] = resW; |
335 | targetSz[1] = resH; |
336 | |
337 | // convert center to Rect. |
338 | boundingBoxRes = { int(resX - resW/2), int(resY - resH/2), int(resW), int(resH)}; |
339 | return true; |
340 | } |
341 | |
// Returns the penalized score-map maximum recorded by the latest update().
float TrackerNanoImpl::getTrackingScore()
{
    return tracking_score;
}
346 | |
// Factory: builds the DNN-backed NanoTrack implementation.
Ptr<TrackerNano> TrackerNano::create(const TrackerNano::Params& parameters)
{
    return makePtr<TrackerNanoImpl>(parameters);
}
351 | |
352 | #else // OPENCV_HAVE_DNN |
// Fallback when OpenCV was built without the dnn module: NanoTrack cannot
// run, so fail loudly instead of returning a broken tracker.
Ptr<TrackerNano> TrackerNano::create(const TrackerNano::Params& parameters)
{
    CV_UNUSED(parameters);
    CV_Error(cv::Error::StsNotImplemented, "to use NanoTrack, the tracking module needs to be built with opencv_dnn !");
}
358 | #endif // OPENCV_HAVE_DNN |
359 | } |
360 | |