| 1 | // This file is part of OpenCV project. |
| 2 | // It is subject to the license terms in the LICENSE file found in the top-level directory |
| 3 | // of this distribution and at http://opencv.org/license.html. |
| 4 | |
// This file is modified from https://github.com/HonglinChu/NanoTrack/blob/master/ncnn_macos_nanotrack/nanotrack.cpp
// Author: HongLinChu, 1628464345@qq.com
// Adapted to OpenCV by ZihaoMu: zihaomu@outlook.com
| 8 | |
| 9 | // Link to original inference code: https://github.com/HonglinChu/NanoTrack |
| 10 | // Link to original training repo: https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack |
| 11 | |
| 12 | #include "../precomp.hpp" |
| 13 | #ifdef HAVE_OPENCV_DNN |
| 14 | #include "opencv2/dnn.hpp" |
| 15 | #endif |
| 16 | |
| 17 | namespace cv { |
| 18 | |
| 19 | TrackerNano::TrackerNano() |
| 20 | { |
| 21 | // nothing |
| 22 | } |
| 23 | |
| 24 | TrackerNano::~TrackerNano() |
| 25 | { |
| 26 | // nothing |
| 27 | } |
| 28 | |
| 29 | TrackerNano::Params::Params() |
| 30 | { |
| 31 | backbone = "backbone.onnx" ; |
| 32 | neckhead = "neckhead.onnx" ; |
| 33 | #ifdef HAVE_OPENCV_DNN |
| 34 | backend = dnn::DNN_BACKEND_DEFAULT; |
| 35 | target = dnn::DNN_TARGET_CPU; |
| 36 | #else |
| 37 | backend = -1; // invalid value |
| 38 | target = -1; // invalid value |
| 39 | #endif |
| 40 | } |
| 41 | |
| 42 | #ifdef HAVE_OPENCV_DNN |
| 43 | static void softmax(const Mat& src, Mat& dst) |
| 44 | { |
| 45 | Mat maxVal; |
| 46 | cv::max(src1: src.row(y: 1), src2: src.row(y: 0), dst&: maxVal); |
| 47 | |
| 48 | src.row(y: 1) -= maxVal; |
| 49 | src.row(y: 0) -= maxVal; |
| 50 | |
| 51 | exp(src, dst); |
| 52 | |
| 53 | Mat sumVal = dst.row(y: 0) + dst.row(y: 1); |
| 54 | dst.row(y: 0) = dst.row(y: 0) / sumVal; |
| 55 | dst.row(y: 1) = dst.row(y: 1) / sumVal; |
| 56 | } |
| 57 | |
// SiamFC-style equivalent square size of a (w, h) box:
// sqrt((w + p) * (h + p)) with context padding p = (w + h) / 2.
static float sizeCal(float w, float h)
{
    float pad = (w + h) * 0.5f;
    float sz2 = (w + pad) * (h + pad);
    return std::sqrt(sz2);
}
| 64 | |
| 65 | static Mat sizeCal(const Mat& w, const Mat& h) |
| 66 | { |
| 67 | Mat pad = (w + h) * 0.5; |
| 68 | Mat sz2 = (w + pad).mul(e: (h + pad)); |
| 69 | |
| 70 | cv::sqrt(src: sz2, dst: sz2); |
| 71 | return sz2; |
| 72 | } |
| 73 | |
| 74 | // Similar python code: r = np.maximum(r, 1. / r) # r is matrix |
| 75 | static void elementReciprocalMax(Mat& srcDst) |
| 76 | { |
| 77 | size_t totalV = srcDst.total(); |
| 78 | float* ptr = srcDst.ptr<float>(y: 0); |
| 79 | for (size_t i = 0; i < totalV; i++) |
| 80 | { |
| 81 | float val = *(ptr + i); |
| 82 | *(ptr + i) = std::max(a: val, b: 1.0f/val); |
| 83 | } |
| 84 | } |
| 85 | |
| 86 | class TrackerNanoImpl : public TrackerNano |
| 87 | { |
| 88 | public: |
| 89 | TrackerNanoImpl(const TrackerNano::Params& parameters) |
| 90 | { |
| 91 | backbone = dnn::readNet(model: parameters.backbone); |
| 92 | neckhead = dnn::readNet(model: parameters.neckhead); |
| 93 | |
| 94 | CV_Assert(!backbone.empty()); |
| 95 | CV_Assert(!neckhead.empty()); |
| 96 | |
| 97 | backbone.setPreferableBackend(parameters.backend); |
| 98 | backbone.setPreferableTarget(parameters.target); |
| 99 | neckhead.setPreferableBackend(parameters.backend); |
| 100 | neckhead.setPreferableTarget(parameters.target); |
| 101 | } |
| 102 | |
| 103 | TrackerNanoImpl(const dnn::Net& _backbone, const dnn::Net& _neckhead) |
| 104 | { |
| 105 | CV_Assert(!_backbone.empty()); |
| 106 | CV_Assert(!_neckhead.empty()); |
| 107 | |
| 108 | backbone = _backbone; |
| 109 | neckhead = _neckhead; |
| 110 | } |
| 111 | |
| 112 | void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE; |
| 113 | bool update(InputArray image, Rect& boundingBox) CV_OVERRIDE; |
| 114 | float getTrackingScore() CV_OVERRIDE; |
| 115 | |
| 116 | // Save the target bounding box for each frame. |
| 117 | std::vector<float> targetSz = {0, 0}; // H and W of bounding box |
| 118 | std::vector<float> targetPos = {0, 0}; // center point of bounding box (x, y) |
| 119 | float tracking_score; |
| 120 | |
| 121 | struct trackerConfig |
| 122 | { |
| 123 | float windowInfluence = 0.455f; |
| 124 | float lr = 0.37f; |
| 125 | float contextAmount = 0.5; |
| 126 | bool swapRB = true; |
| 127 | int totalStride = 16; |
| 128 | float penaltyK = 0.055f; |
| 129 | }; |
| 130 | |
| 131 | protected: |
| 132 | const int exemplarSize = 127; |
| 133 | const int instanceSize = 255; |
| 134 | |
| 135 | trackerConfig trackState; |
| 136 | int scoreSize; |
| 137 | Size imgSize = {0, 0}; |
| 138 | Mat hanningWindow; |
| 139 | Mat grid2searchX, grid2searchY; |
| 140 | |
| 141 | dnn::Net backbone, neckhead; |
| 142 | Mat image; |
| 143 | |
| 144 | void getSubwindow(Mat& dstCrop, Mat& srcImg, int originalSz, int resizeSz); |
| 145 | void generateGrids(); |
| 146 | }; |
| 147 | |
| 148 | void TrackerNanoImpl::generateGrids() |
| 149 | { |
| 150 | int sz = scoreSize; |
| 151 | const int sz2 = sz / 2; |
| 152 | |
| 153 | std::vector<float> x1Vec(sz, 0); |
| 154 | |
| 155 | for (int i = 0; i < sz; i++) |
| 156 | { |
| 157 | x1Vec[i] = (float)(i - sz2); |
| 158 | } |
| 159 | |
| 160 | Mat x1M(1, sz, CV_32FC1, x1Vec.data()); |
| 161 | |
| 162 | cv::repeat(src: x1M, ny: sz, nx: 1, dst: grid2searchX); |
| 163 | cv::repeat(src: x1M.t(), ny: 1, nx: sz, dst: grid2searchY); |
| 164 | |
| 165 | grid2searchX *= trackState.totalStride; |
| 166 | grid2searchY *= trackState.totalStride; |
| 167 | |
| 168 | grid2searchX += instanceSize/2; |
| 169 | grid2searchY += instanceSize/2; |
| 170 | } |
| 171 | |
| 172 | void TrackerNanoImpl::init(InputArray image_, const Rect &boundingBox_) |
| 173 | { |
| 174 | scoreSize = (instanceSize - exemplarSize) / trackState.totalStride + 8; |
| 175 | trackState = trackerConfig(); |
| 176 | image = image_.getMat().clone(); |
| 177 | |
| 178 | // convert Rect2d from left-up to center. |
| 179 | targetPos[0] = float(boundingBox_.x) + float(boundingBox_.width) * 0.5f; |
| 180 | targetPos[1] = float(boundingBox_.y) + float(boundingBox_.height) * 0.5f; |
| 181 | |
| 182 | targetSz[0] = float(boundingBox_.width); |
| 183 | targetSz[1] = float(boundingBox_.height); |
| 184 | |
| 185 | imgSize = image.size(); |
| 186 | |
| 187 | // Extent the bounding box. |
| 188 | float sumSz = targetSz[0] + targetSz[1]; |
| 189 | float wExtent = targetSz[0] + trackState.contextAmount * (sumSz); |
| 190 | float hExtent = targetSz[1] + trackState.contextAmount * (sumSz); |
| 191 | int sz = int(cv::sqrt(x: wExtent * hExtent)); |
| 192 | |
| 193 | Mat crop; |
| 194 | getSubwindow(dstCrop&: crop, srcImg&: image, originalSz: sz, resizeSz: exemplarSize); |
| 195 | Mat blob = dnn::blobFromImage(image: crop, scalefactor: 1.0, size: Size(), mean: Scalar(), swapRB: trackState.swapRB); |
| 196 | |
| 197 | backbone.setInput(blob); |
| 198 | Mat out = backbone.forward(); // Feature extraction. |
| 199 | neckhead.setInput(blob: out, name: "input1" ); |
| 200 | |
| 201 | createHanningWindow(dst: hanningWindow, winSize: Size(scoreSize, scoreSize), CV_32F); |
| 202 | generateGrids(); |
| 203 | } |
| 204 | |
| 205 | void TrackerNanoImpl::getSubwindow(Mat& dstCrop, Mat& srcImg, int originalSz, int resizeSz) |
| 206 | { |
| 207 | Scalar avgChans = mean(src: srcImg); |
| 208 | Size imgSz = srcImg.size(); |
| 209 | int c = (originalSz + 1) / 2; |
| 210 | |
| 211 | int context_xmin = (int)(targetPos[0]) - c; |
| 212 | int context_xmax = context_xmin + originalSz - 1; |
| 213 | int context_ymin = (int)(targetPos[1]) - c; |
| 214 | int context_ymax = context_ymin + originalSz - 1; |
| 215 | |
| 216 | int left_pad = std::max(a: 0, b: -context_xmin); |
| 217 | int top_pad = std::max(a: 0, b: -context_ymin); |
| 218 | int right_pad = std::max(a: 0, b: context_xmax - imgSz.width + 1); |
| 219 | int bottom_pad = std::max(a: 0, b: context_ymax - imgSz.height + 1); |
| 220 | |
| 221 | context_xmin += left_pad; |
| 222 | context_xmax += left_pad; |
| 223 | context_ymin += top_pad; |
| 224 | context_ymax += top_pad; |
| 225 | |
| 226 | Mat cropImg; |
| 227 | if (left_pad == 0 && top_pad == 0 && right_pad == 0 && bottom_pad == 0) |
| 228 | { |
| 229 | // Crop image without padding. |
| 230 | cropImg = srcImg(cv::Rect(context_xmin, context_ymin, |
| 231 | context_xmax - context_xmin + 1, context_ymax - context_ymin + 1)); |
| 232 | } |
| 233 | else // Crop image with padding, and the padding value is avgChans |
| 234 | { |
| 235 | cv::Mat tmpMat; |
| 236 | cv::copyMakeBorder(src: srcImg, dst: tmpMat, top: top_pad, bottom: bottom_pad, left: left_pad, right: right_pad, borderType: cv::BORDER_CONSTANT, value: avgChans); |
| 237 | cropImg = tmpMat(cv::Rect(context_xmin, context_ymin, context_xmax - context_xmin + 1, context_ymax - context_ymin + 1)); |
| 238 | } |
| 239 | resize(src: cropImg, dst: dstCrop, dsize: Size(resizeSz, resizeSz)); |
| 240 | } |
| 241 | |
| 242 | bool TrackerNanoImpl::update(InputArray image_, Rect &boundingBoxRes) |
| 243 | { |
| 244 | image = image_.getMat().clone(); |
| 245 | int targetSzSum = (int)(targetSz[0] + targetSz[1]); |
| 246 | |
| 247 | float wc = targetSz[0] + trackState.contextAmount * targetSzSum; |
| 248 | float hc = targetSz[1] + trackState.contextAmount * targetSzSum; |
| 249 | float sz = cv::sqrt(x: wc * hc); |
| 250 | float scale_z = exemplarSize / sz; |
| 251 | float sx = sz * (instanceSize / exemplarSize); |
| 252 | targetSz[0] *= scale_z; |
| 253 | targetSz[1] *= scale_z; |
| 254 | |
| 255 | Mat crop; |
| 256 | getSubwindow(dstCrop&: crop, srcImg&: image, originalSz: int(sx), resizeSz: instanceSize); |
| 257 | |
| 258 | Mat blob = dnn::blobFromImage(image: crop, scalefactor: 1.0, size: Size(), mean: Scalar(), swapRB: trackState.swapRB); |
| 259 | backbone.setInput(blob); |
| 260 | Mat xf = backbone.forward(); |
| 261 | neckhead.setInput(blob: xf, name: "input2" ); |
| 262 | std::vector<String> outputName = {"output1" , "output2" }; |
| 263 | std::vector<Mat> outs; |
| 264 | neckhead.forward(outputBlobs: outs, outBlobNames: outputName); |
| 265 | |
| 266 | CV_Assert(outs.size() == 2); |
| 267 | |
| 268 | Mat clsScore = outs[0]; // 1x2x16x16 |
| 269 | Mat bboxPred = outs[1]; // 1x4x16x16 |
| 270 | |
| 271 | clsScore = clsScore.reshape(cn: 0, newshape: {2, scoreSize, scoreSize}); |
| 272 | bboxPred = bboxPred.reshape(cn: 0, newshape: {4, scoreSize, scoreSize}); |
| 273 | |
| 274 | Mat scoreSoftmax; // 2x16x16 |
| 275 | softmax(src: clsScore, dst&: scoreSoftmax); |
| 276 | |
| 277 | Mat score = scoreSoftmax.row(y: 1); |
| 278 | score = score.reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
| 279 | |
| 280 | Mat predX1 = grid2searchX - bboxPred.row(y: 0).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
| 281 | Mat predY1 = grid2searchY - bboxPred.row(y: 1).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
| 282 | Mat predX2 = grid2searchX + bboxPred.row(y: 2).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
| 283 | Mat predY2 = grid2searchY + bboxPred.row(y: 3).reshape(cn: 0, newshape: {scoreSize, scoreSize}); |
| 284 | |
| 285 | // size penalty |
| 286 | // scale penalty |
| 287 | Mat sc = sizeCal(w: predX2 - predX1, h: predY2 - predY1)/sizeCal(w: targetPos[0], h: targetPos[1]); |
| 288 | elementReciprocalMax(srcDst&: sc); |
| 289 | |
| 290 | // ratio penalty |
| 291 | float ratioVal = targetSz[0] / targetSz[1]; |
| 292 | |
| 293 | Mat ratioM(scoreSize, scoreSize, CV_32FC1, Scalar::all(v0: ratioVal)); |
| 294 | Mat rc = ratioM / ((predX2 - predX1) / (predY2 - predY1)); |
| 295 | elementReciprocalMax(srcDst&: rc); |
| 296 | |
| 297 | Mat penalty; |
| 298 | exp(src: ((rc.mul(m: sc) - 1) * trackState.penaltyK * (-1)), dst: penalty); |
| 299 | Mat pscore = penalty.mul(m: score); |
| 300 | |
| 301 | // Window penalty |
| 302 | pscore = pscore * (1.0 - trackState.windowInfluence) + hanningWindow * trackState.windowInfluence; |
| 303 | |
| 304 | // get Max |
| 305 | int bestID[2] = { 0, 0 }; |
| 306 | minMaxIdx(src: pscore, minVal: 0, maxVal: 0, minIdx: 0, maxIdx: bestID); |
| 307 | |
| 308 | tracking_score = pscore.at<float>(idx: bestID); |
| 309 | |
| 310 | float x1Val = predX1.at<float>(idx: bestID); |
| 311 | float x2Val = predX2.at<float>(idx: bestID); |
| 312 | float y1Val = predY1.at<float>(idx: bestID); |
| 313 | float y2Val = predY2.at<float>(idx: bestID); |
| 314 | |
| 315 | float predXs = (x1Val + x2Val)/2; |
| 316 | float predYs = (y1Val + y2Val)/2; |
| 317 | float predW = (x2Val - x1Val)/scale_z; |
| 318 | float predH = (y2Val - y1Val)/scale_z; |
| 319 | |
| 320 | float diffXs = (predXs - instanceSize / 2) / scale_z; |
| 321 | float diffYs = (predYs - instanceSize / 2) / scale_z; |
| 322 | |
| 323 | targetSz[0] /= scale_z; |
| 324 | targetSz[1] /= scale_z; |
| 325 | |
| 326 | float lr = penalty.at<float>(idx: bestID) * score.at<float>(idx: bestID) * trackState.lr; |
| 327 | |
| 328 | float resX = targetPos[0] + diffXs; |
| 329 | float resY = targetPos[1] + diffYs; |
| 330 | float resW = predW * lr + (1 - lr) * targetSz[0]; |
| 331 | float resH = predH * lr + (1 - lr) * targetSz[1]; |
| 332 | |
| 333 | resX = std::max(a: 0.f, b: std::min(a: (float)imgSize.width, b: resX)); |
| 334 | resY = std::max(a: 0.f, b: std::min(a: (float)imgSize.height, b: resY)); |
| 335 | resW = std::max(a: 10.f, b: std::min(a: (float)imgSize.width, b: resW)); |
| 336 | resH = std::max(a: 10.f, b: std::min(a: (float)imgSize.height, b: resH)); |
| 337 | |
| 338 | targetPos[0] = resX; |
| 339 | targetPos[1] = resY; |
| 340 | targetSz[0] = resW; |
| 341 | targetSz[1] = resH; |
| 342 | |
| 343 | // convert center to Rect. |
| 344 | boundingBoxRes = { int(resX - resW/2), int(resY - resH/2), int(resW), int(resH)}; |
| 345 | return true; |
| 346 | } |
| 347 | |
| 348 | float TrackerNanoImpl::getTrackingScore() |
| 349 | { |
| 350 | return tracking_score; |
| 351 | } |
| 352 | |
| 353 | Ptr<TrackerNano> TrackerNano::create(const TrackerNano::Params& parameters) |
| 354 | { |
| 355 | return makePtr<TrackerNanoImpl>(a1: parameters); |
| 356 | } |
| 357 | |
| 358 | Ptr<TrackerNano> TrackerNano::create(const dnn::Net& backbone, const dnn::Net& neckhead) |
| 359 | { |
| 360 | return makePtr<TrackerNanoImpl>(a1: backbone, a1: neckhead); |
| 361 | } |
| 362 | |
| 363 | #else // OPENCV_HAVE_DNN |
// Stub used when OpenCV is built without the dnn module: always raises
// StsNotImplemented, since NanoTrack inference requires cv::dnn.
Ptr<TrackerNano> TrackerNano::create(const TrackerNano::Params& parameters)
{
    CV_UNUSED(parameters);
    CV_Error(cv::Error::StsNotImplemented, "to use NanoTrack, the tracking module needs to be built with opencv_dnn !" );
}
| 369 | #endif // OPENCV_HAVE_DNN |
| 370 | } |
| 371 | |