1 | // Copyright (c) 2017-2022, The rav1e contributors. All rights reserved |
2 | // |
3 | // This source code is subject to the terms of the BSD 2 Clause License and |
4 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
5 | // was not distributed with this source code in the LICENSE file, you can |
6 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
7 | // Media Patent License 1.0 was not distributed with this source code in the |
8 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
9 | |
10 | use crate::api::InterConfig; |
11 | use crate::context::{ |
12 | BlockOffset, PlaneBlockOffset, SuperBlockOffset, TileBlockOffset, |
13 | TileSuperBlockOffset, MAX_SB_SIZE_LOG2, MIB_SIZE_LOG2, MI_SIZE, |
14 | MI_SIZE_LOG2, SB_SIZE, |
15 | }; |
16 | use crate::dist::*; |
17 | use crate::frame::*; |
18 | use crate::mc::MotionVector; |
19 | use crate::partition::*; |
20 | use crate::predict::PredictionMode; |
21 | use crate::tiling::*; |
22 | use crate::util::ILog; |
23 | use crate::util::{clamp, Pixel}; |
24 | use crate::FrameInvariants; |
25 | |
26 | use arrayvec::*; |
27 | use std::ops::{Index, IndexMut}; |
28 | use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; |
29 | |
30 | #[derive (Debug, Copy, Clone, Default)] |
31 | pub struct MEStats { |
32 | pub mv: MotionVector, |
33 | /// sad value, on the scale of a 128x128 block |
34 | pub normalized_sad: u32, |
35 | } |
36 | |
37 | #[derive (Debug, Clone)] |
38 | pub struct FrameMEStats { |
39 | stats: Box<[MEStats]>, |
40 | pub cols: usize, |
41 | pub rows: usize, |
42 | } |
43 | |
44 | /// cbindgen:ignore |
45 | pub type RefMEStats = Arc<RwLock<[FrameMEStats; REF_FRAMES]>>; |
46 | /// cbindgen:ignore |
47 | pub type ReadGuardMEStats<'a> = |
48 | RwLockReadGuard<'a, [FrameMEStats; REF_FRAMES]>; |
49 | /// cbindgen:ignore |
50 | pub type WriteGuardMEStats<'a> = |
51 | RwLockWriteGuard<'a, [FrameMEStats; REF_FRAMES]>; |
52 | |
53 | impl FrameMEStats { |
54 | #[inline ] |
55 | pub fn rows_iter(&self) -> std::slice::ChunksExact<'_, MEStats> { |
56 | self.stats.chunks_exact(self.cols) |
57 | } |
58 | |
59 | pub fn new(cols: usize, rows: usize) -> Self { |
60 | Self { |
61 | // dynamic allocation: once per frame |
62 | stats: vec![MEStats::default(); cols * rows].into_boxed_slice(), |
63 | cols, |
64 | rows, |
65 | } |
66 | } |
67 | pub fn new_arc_array(cols: usize, rows: usize) -> RefMEStats { |
68 | Arc::new(RwLock::new([ |
69 | FrameMEStats::new(cols, rows), |
70 | FrameMEStats::new(cols, rows), |
71 | FrameMEStats::new(cols, rows), |
72 | FrameMEStats::new(cols, rows), |
73 | FrameMEStats::new(cols, rows), |
74 | FrameMEStats::new(cols, rows), |
75 | FrameMEStats::new(cols, rows), |
76 | FrameMEStats::new(cols, rows), |
77 | ])) |
78 | } |
79 | } |
80 | |
81 | impl Index<usize> for FrameMEStats { |
82 | type Output = [MEStats]; |
83 | #[inline ] |
84 | fn index(&self, index: usize) -> &Self::Output { |
85 | &self.stats[index * self.cols..(index + 1) * self.cols] |
86 | } |
87 | } |
88 | |
89 | impl IndexMut<usize> for FrameMEStats { |
90 | #[inline ] |
91 | fn index_mut(&mut self, index: usize) -> &mut Self::Output { |
92 | &mut self.stats[index * self.cols..(index + 1) * self.cols] |
93 | } |
94 | } |
95 | |
96 | /// Result of motion search. |
97 | #[derive (Debug, Copy, Clone)] |
98 | pub struct MotionSearchResult { |
99 | /// Motion vector chosen by the motion search. |
100 | pub mv: MotionVector, |
101 | /// Rate distortion data associated with `mv`. |
102 | pub rd: MVCandidateRD, |
103 | } |
104 | |
105 | impl MotionSearchResult { |
106 | /// Creates an 'empty' value. |
107 | /// |
108 | /// To be considered empty, cost is set higher than any naturally occurring |
109 | /// cost value. The idea is that comparing to any valid rd output, the search |
110 | /// result will always be replaced. |
111 | #[inline (always)] |
112 | pub fn empty() -> MotionSearchResult { |
113 | MotionSearchResult { |
114 | mv: MotionVector::default(), |
115 | rd: MVCandidateRD::empty(), |
116 | } |
117 | } |
118 | |
119 | /// Check if the value should be considered to be empty. |
120 | #[inline (always)] |
121 | const fn is_empty(&self) -> bool { |
122 | self.rd.cost == u64::MAX |
123 | } |
124 | } |
125 | |
126 | /// Holds data from computing rate distortion of a motion vector. |
127 | #[derive (Debug, Copy, Clone)] |
128 | pub struct MVCandidateRD { |
129 | /// Rate distortion cost of the motion vector. |
130 | pub cost: u64, |
131 | /// Distortion metric value for the motion vector. |
132 | pub sad: u32, |
133 | } |
134 | |
135 | impl MVCandidateRD { |
136 | /// Creates an 'empty' value. |
137 | /// |
138 | /// To be considered empty, cost is set higher than any naturally occurring |
139 | /// cost value. The idea is that comparing to any valid rd output, the search |
140 | /// result will always be replaced. |
141 | #[inline (always)] |
142 | const fn empty() -> MVCandidateRD { |
143 | MVCandidateRD { sad: u32::MAX, cost: u64::MAX } |
144 | } |
145 | } |
146 | |
147 | #[derive (Debug, Copy, Clone, Eq, PartialEq)] |
148 | pub enum MVSamplingMode { |
149 | INIT, |
150 | CORNER { right: bool, bottom: bool }, |
151 | } |
152 | |
153 | pub fn estimate_tile_motion<T: Pixel>( |
154 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
155 | inter_cfg: &InterConfig, |
156 | ) { |
157 | let init_size = MIB_SIZE_LOG2; |
158 | |
159 | let mut prev_ssdec: Option<u8> = None; |
160 | for mv_size_in_b_log2 in (2..=init_size).rev() { |
161 | let init = mv_size_in_b_log2 == init_size; |
162 | |
163 | // Choose subsampling. Pass one is quarter res and pass two is at half res. |
164 | let ssdec = match init_size - mv_size_in_b_log2 { |
165 | 0 => 2, |
166 | 1 => 1, |
167 | _ => 0, |
168 | }; |
169 | |
170 | let new_subsampling = |
171 | if let Some(prev) = prev_ssdec { prev != ssdec } else { false }; |
172 | prev_ssdec = Some(ssdec); |
173 | |
174 | // 0.5 and 0.125 are a fudge factors |
175 | let lambda = (fi.me_lambda * 256.0 / (1 << (2 * ssdec)) as f64 |
176 | * if ssdec == 0 { 0.5 } else { 0.125 }) as u32; |
177 | |
178 | for sby in 0..ts.sb_height { |
179 | for sbx in 0..ts.sb_width { |
180 | let mut tested_frames_flags = 0; |
181 | for &ref_frame in inter_cfg.allowed_ref_frames() { |
182 | let frame_flag = 1 << fi.ref_frames[ref_frame.to_index()]; |
183 | if tested_frames_flags & frame_flag == frame_flag { |
184 | continue; |
185 | } |
186 | tested_frames_flags |= frame_flag; |
187 | |
188 | let tile_bo = |
189 | TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }) |
190 | .block_offset(0, 0); |
191 | |
192 | if new_subsampling { |
193 | refine_subsampled_sb_motion( |
194 | fi, |
195 | ts, |
196 | ref_frame, |
197 | mv_size_in_b_log2 + 1, |
198 | tile_bo, |
199 | ssdec, |
200 | lambda, |
201 | ); |
202 | } |
203 | |
204 | estimate_sb_motion( |
205 | fi, |
206 | ts, |
207 | ref_frame, |
208 | mv_size_in_b_log2, |
209 | tile_bo, |
210 | init, |
211 | ssdec, |
212 | lambda, |
213 | ); |
214 | } |
215 | } |
216 | } |
217 | } |
218 | } |
219 | |
220 | fn estimate_sb_motion<T: Pixel>( |
221 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, |
222 | mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, init: bool, ssdec: u8, |
223 | lambda: u32, |
224 | ) { |
225 | let pix_offset = tile_bo.to_luma_plane_offset(); |
226 | let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize); |
227 | let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize); |
228 | |
229 | let mv_size = MI_SIZE << mv_size_in_b_log2; |
230 | |
231 | // Process in blocks, cropping at edges. |
232 | for y in (0..sb_h).step_by(mv_size) { |
233 | for x in (0..sb_w).step_by(mv_size) { |
234 | let corner: MVSamplingMode = if init { |
235 | MVSamplingMode::INIT |
236 | } else { |
237 | // Processing the block a size up produces data that can be used by |
238 | // the right and bottom corners. |
239 | MVSamplingMode::CORNER { |
240 | right: x & mv_size == mv_size, |
241 | bottom: y & mv_size == mv_size, |
242 | } |
243 | }; |
244 | |
245 | let sub_bo = tile_bo |
246 | .with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2); |
247 | |
248 | // Clamp to frame edge, rounding up in the case of subsampling. |
249 | // The rounding makes some assumptions about how subsampling is done. |
250 | let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec; |
251 | let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec; |
252 | |
253 | // Run motion estimation. |
254 | // Note that the initial search (init) instructs the called function to |
255 | // perform a more extensive search. |
256 | if let Some(results) = estimate_motion( |
257 | fi, |
258 | ts, |
259 | w, |
260 | h, |
261 | sub_bo, |
262 | ref_frame, |
263 | None, |
264 | corner, |
265 | init, |
266 | ssdec, |
267 | Some(lambda), |
268 | ) { |
269 | // normalize sad to 128x128 block |
270 | let sad = (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) |
271 | / (w * h) as u64) as u32; |
272 | save_me_stats( |
273 | ts, |
274 | mv_size_in_b_log2, |
275 | sub_bo, |
276 | ref_frame, |
277 | MEStats { mv: results.mv, normalized_sad: sad }, |
278 | ); |
279 | } |
280 | } |
281 | } |
282 | } |
283 | |
284 | fn refine_subsampled_sb_motion<T: Pixel>( |
285 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, |
286 | mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ssdec: u8, lambda: u32, |
287 | ) { |
288 | let pix_offset = tile_bo.to_luma_plane_offset(); |
289 | let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize); |
290 | let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize); |
291 | |
292 | let mv_size = MI_SIZE << mv_size_in_b_log2; |
293 | |
294 | // Process in blocks, cropping at edges. |
295 | for y in (0..sb_h).step_by(mv_size) { |
296 | for x in (0..sb_w).step_by(mv_size) { |
297 | let sub_bo = tile_bo |
298 | .with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2); |
299 | |
300 | // Clamp to frame edge, rounding up in the case of subsampling. |
301 | // The rounding makes some assumptions about how subsampling is done. |
302 | let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec; |
303 | let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec; |
304 | |
305 | // Refine the existing motion estimate |
306 | if let Some(results) = refine_subsampled_motion_estimate( |
307 | fi, ts, w, h, sub_bo, ref_frame, ssdec, lambda, |
308 | ) { |
309 | // normalize sad to 128x128 block |
310 | let sad = (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) |
311 | / (w * h) as u64) as u32; |
312 | save_me_stats( |
313 | ts, |
314 | mv_size_in_b_log2, |
315 | sub_bo, |
316 | ref_frame, |
317 | MEStats { mv: results.mv, normalized_sad: sad }, |
318 | ); |
319 | } |
320 | } |
321 | } |
322 | } |
323 | |
324 | fn save_me_stats<T: Pixel>( |
325 | ts: &mut TileStateMut<'_, T>, mv_size_in_b_log2: usize, |
326 | tile_bo: TileBlockOffset, ref_frame: RefType, stats: MEStats, |
327 | ) { |
328 | let size_in_b: usize = 1 << mv_size_in_b_log2; |
329 | let tile_me_stats: &mut TileMEStatsMut<'_> = &mut ts.me_stats[ref_frame.to_index()]; |
330 | let tile_bo_x_end: usize = (tile_bo.0.x + size_in_b).min(ts.mi_width); |
331 | let tile_bo_y_end: usize = (tile_bo.0.y + size_in_b).min(ts.mi_height); |
332 | for mi_y: usize in tile_bo.0.y..tile_bo_y_end { |
333 | for a: &mut MEStats in tile_me_stats[mi_y][tile_bo.0.x..tile_bo_x_end].iter_mut() { |
334 | *a = stats; |
335 | } |
336 | } |
337 | } |
338 | |
339 | fn get_mv_range( |
340 | w_in_b: usize, h_in_b: usize, bo: PlaneBlockOffset, blk_w: usize, |
341 | blk_h: usize, |
342 | ) -> (isize, isize, isize, isize) { |
343 | let border_w: isize = 128 + blk_w as isize * 8; |
344 | let border_h: isize = 128 + blk_h as isize * 8; |
345 | let mvx_min: isize = -(bo.0.x as isize) * (8 * MI_SIZE) as isize - border_w; |
346 | let mvx_max: isize = ((w_in_b - bo.0.x) as isize - (blk_w / MI_SIZE) as isize) |
347 | * (8 * MI_SIZE) as isize |
348 | + border_w; |
349 | let mvy_min: isize = -(bo.0.y as isize) * (8 * MI_SIZE) as isize - border_h; |
350 | let mvy_max: isize = ((h_in_b - bo.0.y) as isize - (blk_h / MI_SIZE) as isize) |
351 | * (8 * MI_SIZE) as isize |
352 | + border_h; |
353 | |
354 | // <https://aomediacodec.github.io/av1-spec/#assign-mv-semantics> |
355 | use crate::context::{MV_LOW, MV_UPP}; |
356 | ( |
357 | mvx_min.max(MV_LOW as isize + 1), |
358 | mvx_max.min(MV_UPP as isize - 1), |
359 | mvy_min.max(MV_LOW as isize + 1), |
360 | mvy_max.min(MV_UPP as isize - 1), |
361 | ) |
362 | } |
363 | |
364 | struct MotionEstimationSubsets { |
365 | min_sad: u32, |
366 | median: Option<MotionVector>, |
367 | subset_b: ArrayVec<MotionVector, 5>, |
368 | subset_c: ArrayVec<MotionVector, 5>, |
369 | } |
370 | |
371 | impl MotionEstimationSubsets { |
372 | fn all_mvs(&self) -> ArrayVec<MotionVector, 11> { |
373 | let mut all: ArrayVec = ArrayVec::new(); |
374 | if let Some(median: MotionVector) = self.median { |
375 | all.push(element:median); |
376 | } |
377 | |
378 | all.extend(self.subset_b.iter().copied()); |
379 | all.extend(self.subset_c.iter().copied()); |
380 | |
381 | all |
382 | } |
383 | } |
384 | |
385 | #[profiling::function ] |
386 | fn get_subset_predictors( |
387 | tile_bo: TileBlockOffset, tile_me_stats: &TileMEStats<'_>, |
388 | frame_ref_opt: Option<ReadGuardMEStats<'_>>, ref_frame_id: usize, |
389 | pix_w: usize, pix_h: usize, mvx_min: isize, mvx_max: isize, mvy_min: isize, |
390 | mvy_max: isize, corner: MVSamplingMode, ssdec: u8, |
391 | ) -> MotionEstimationSubsets { |
392 | let mut min_sad: u32 = u32::MAX; |
393 | let mut subset_b = ArrayVec::<MotionVector, 5>::new(); |
394 | let mut subset_c = ArrayVec::<MotionVector, 5>::new(); |
395 | |
396 | // rounded up width in blocks |
397 | let w = ((pix_w << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2; |
398 | let h = ((pix_h << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2; |
399 | |
400 | // Get predictors from the same frame. |
401 | |
402 | let clipped_half_w = (w >> 1).min(tile_me_stats.cols() - 1 - tile_bo.0.x); |
403 | let clipped_half_h = (h >> 1).min(tile_me_stats.rows() - 1 - tile_bo.0.y); |
404 | |
405 | let mut process_cand = |stats: MEStats| -> MotionVector { |
406 | min_sad = min_sad.min(stats.normalized_sad); |
407 | let mv = stats.mv.quantize_to_fullpel(); |
408 | MotionVector { |
409 | col: clamp(mv.col as isize, mvx_min, mvx_max) as i16, |
410 | row: clamp(mv.row as isize, mvy_min, mvy_max) as i16, |
411 | } |
412 | }; |
413 | |
414 | // Sample the middle of all block edges bordering this one. |
415 | // Note: If motion vectors haven't been precomputed to a given blocksize, then |
416 | // the right and bottom edges will be duplicates of the center predictor when |
417 | // processing in raster order. |
418 | |
419 | // left |
420 | if tile_bo.0.x > 0 { |
421 | subset_b.push(process_cand( |
422 | tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x - 1], |
423 | )); |
424 | } |
425 | // top |
426 | if tile_bo.0.y > 0 { |
427 | subset_b.push(process_cand( |
428 | tile_me_stats[tile_bo.0.y - 1][tile_bo.0.x + clipped_half_w], |
429 | )); |
430 | } |
431 | |
432 | // Sampling far right and far bottom edges was tested, but had worse results |
433 | // without an extensive threshold test (with threshold being applied after |
434 | // checking median and the best of each subset). |
435 | |
436 | // right |
437 | if let MVSamplingMode::CORNER { right: true, bottom: _ } = corner { |
438 | if tile_bo.0.x + w < tile_me_stats.cols() { |
439 | subset_b.push(process_cand( |
440 | tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x + w], |
441 | )); |
442 | } |
443 | } |
444 | // bottom |
445 | if let MVSamplingMode::CORNER { right: _, bottom: true } = corner { |
446 | if tile_bo.0.y + h < tile_me_stats.rows() { |
447 | subset_b.push(process_cand( |
448 | tile_me_stats[tile_bo.0.y + h][tile_bo.0.x + clipped_half_w], |
449 | )); |
450 | } |
451 | } |
452 | |
453 | let median = if corner != MVSamplingMode::INIT { |
454 | // Sample the center of the current block. |
455 | Some(process_cand( |
456 | tile_me_stats[tile_bo.0.y + clipped_half_h] |
457 | [tile_bo.0.x + clipped_half_w], |
458 | )) |
459 | } else if subset_b.len() != 3 { |
460 | None |
461 | } else { |
462 | let mut rows: ArrayVec<i16, 3> = subset_b.iter().map(|&a| a.row).collect(); |
463 | let mut cols: ArrayVec<i16, 3> = subset_b.iter().map(|&a| a.col).collect(); |
464 | rows.as_mut_slice().sort_unstable(); |
465 | cols.as_mut_slice().sort_unstable(); |
466 | Some(MotionVector { row: rows[1], col: cols[1] }) |
467 | }; |
468 | |
469 | // Zero motion vector, don't use add_cand since it skips zero vectors. |
470 | subset_b.push(MotionVector::default()); |
471 | |
472 | // EPZS subset C predictors. |
473 | // Sample the middle of bordering side of the left, right, top and bottom |
474 | // blocks of the previous frame. |
475 | // Sample the middle of this block in the previous frame. |
476 | |
477 | if let Some(frame_me_stats) = frame_ref_opt { |
478 | let prev_frame = &frame_me_stats[ref_frame_id]; |
479 | |
480 | let frame_bo = PlaneBlockOffset(BlockOffset { |
481 | x: tile_me_stats.x() + tile_bo.0.x, |
482 | y: tile_me_stats.y() + tile_bo.0.y, |
483 | }); |
484 | let clipped_half_w = (w >> 1).min(prev_frame.cols - 1 - frame_bo.0.x); |
485 | let clipped_half_h = (h >> 1).min(prev_frame.rows - 1 - frame_bo.0.y); |
486 | |
487 | // left |
488 | if frame_bo.0.x > 0 { |
489 | subset_c.push(process_cand( |
490 | prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x - 1], |
491 | )); |
492 | } |
493 | // top |
494 | if frame_bo.0.y > 0 { |
495 | subset_c.push(process_cand( |
496 | prev_frame[frame_bo.0.y - 1][frame_bo.0.x + clipped_half_w], |
497 | )); |
498 | } |
499 | // right |
500 | if frame_bo.0.x + w < prev_frame.cols { |
501 | subset_c.push(process_cand( |
502 | prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + w], |
503 | )); |
504 | } |
505 | // bottom |
506 | if frame_bo.0.y + h < prev_frame.rows { |
507 | subset_c.push(process_cand( |
508 | prev_frame[frame_bo.0.y + h][frame_bo.0.x + clipped_half_w], |
509 | )); |
510 | } |
511 | |
512 | subset_c.push(process_cand( |
513 | prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + clipped_half_w], |
514 | )); |
515 | } |
516 | |
517 | // Undo normalization to 128x128 block size |
518 | let min_sad = ((min_sad as u64 * (pix_w * pix_h) as u64) |
519 | >> (MAX_SB_SIZE_LOG2 * 2)) as u32; |
520 | |
521 | let dec_mv = |mv: MotionVector| MotionVector { |
522 | col: mv.col >> ssdec, |
523 | row: mv.row >> ssdec, |
524 | }; |
525 | let median = median.map(dec_mv); |
526 | for mv in subset_b.iter_mut() { |
527 | *mv = dec_mv(*mv); |
528 | } |
529 | for mv in subset_c.iter_mut() { |
530 | *mv = dec_mv(*mv); |
531 | } |
532 | |
533 | MotionEstimationSubsets { min_sad, median, subset_b, subset_c } |
534 | } |
535 | |
536 | pub fn estimate_motion<T: Pixel>( |
537 | fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, w: usize, h: usize, |
538 | tile_bo: TileBlockOffset, ref_frame: RefType, |
539 | pmv: Option<[MotionVector; 2]>, corner: MVSamplingMode, |
540 | extensive_search: bool, ssdec: u8, lambda: Option<u32>, |
541 | ) -> Option<MotionSearchResult> { |
542 | if let Some(ref rec) = |
543 | fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] |
544 | { |
545 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
546 | let (mvx_min, mvx_max, mvy_min, mvy_max) = |
547 | get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec); |
548 | |
549 | let lambda = lambda.unwrap_or({ |
550 | // 0.5 is a fudge factor |
551 | (fi.me_lambda * 256.0 * 0.5) as u32 |
552 | }); |
553 | |
554 | let global_mv = [MotionVector { row: 0, col: 0 }; 2]; |
555 | |
556 | let po = frame_bo.to_luma_plane_offset(); |
557 | let (mvx_min, mvx_max, mvy_min, mvy_max) = |
558 | (mvx_min >> ssdec, mvx_max >> ssdec, mvy_min >> ssdec, mvy_max >> ssdec); |
559 | let po = PlaneOffset { x: po.x >> ssdec, y: po.y >> ssdec }; |
560 | let p_ref = match ssdec { |
561 | 0 => &rec.frame.planes[0], |
562 | 1 => &rec.input_hres, |
563 | 2 => &rec.input_qres, |
564 | _ => unimplemented!(), |
565 | }; |
566 | |
567 | let org_region = &match ssdec { |
568 | 0 => ts.input_tile.planes[0] |
569 | .subregion(Area::BlockStartingAt { bo: tile_bo.0 }), |
570 | 1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }), |
571 | 2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }), |
572 | _ => unimplemented!(), |
573 | }; |
574 | |
575 | let mut best: MotionSearchResult = full_pixel_me( |
576 | fi, |
577 | ts, |
578 | org_region, |
579 | p_ref, |
580 | tile_bo, |
581 | po, |
582 | lambda, |
583 | pmv.unwrap_or(global_mv), |
584 | w, |
585 | h, |
586 | mvx_min, |
587 | mvx_max, |
588 | mvy_min, |
589 | mvy_max, |
590 | ref_frame, |
591 | corner, |
592 | extensive_search, |
593 | ssdec, |
594 | ); |
595 | |
596 | if let Some(pmv) = pmv { |
597 | let use_satd: bool = fi.config.speed_settings.motion.use_satd_subpel; |
598 | if use_satd { |
599 | best.rd = get_fullpel_mv_rd( |
600 | fi, |
601 | po, |
602 | org_region, |
603 | p_ref, |
604 | fi.sequence.bit_depth, |
605 | pmv, |
606 | lambda, |
607 | use_satd, |
608 | mvx_min, |
609 | mvx_max, |
610 | mvy_min, |
611 | mvy_max, |
612 | w, |
613 | h, |
614 | best.mv, |
615 | ); |
616 | } |
617 | |
618 | sub_pixel_me( |
619 | fi, po, org_region, p_ref, lambda, pmv, mvx_min, mvx_max, mvy_min, |
620 | mvy_max, w, h, use_satd, &mut best, ref_frame, |
621 | ); |
622 | } |
623 | |
624 | // Scale motion vectors to full res size |
625 | best.mv = best.mv << ssdec; |
626 | |
627 | Some(best) |
628 | } else { |
629 | None |
630 | } |
631 | } |
632 | |
633 | /// Refine motion estimation that was computed one level of subsampling up. |
634 | fn refine_subsampled_motion_estimate<T: Pixel>( |
635 | fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, w: usize, h: usize, |
636 | tile_bo: TileBlockOffset, ref_frame: RefType, ssdec: u8, lambda: u32, |
637 | ) -> Option<MotionSearchResult> { |
638 | if let Some(ref rec) = |
639 | fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] |
640 | { |
641 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
642 | let (mvx_min, mvx_max, mvy_min, mvy_max) = |
643 | get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec); |
644 | |
645 | let pmv = [MotionVector { row: 0, col: 0 }; 2]; |
646 | |
647 | let po = frame_bo.to_luma_plane_offset(); |
648 | let (mvx_min, mvx_max, mvy_min, mvy_max) = |
649 | (mvx_min >> ssdec, mvx_max >> ssdec, mvy_min >> ssdec, mvy_max >> ssdec); |
650 | let po = PlaneOffset { x: po.x >> ssdec, y: po.y >> ssdec }; |
651 | let p_ref = match ssdec { |
652 | 0 => &rec.frame.planes[0], |
653 | 1 => &rec.input_hres, |
654 | 2 => &rec.input_qres, |
655 | _ => unimplemented!(), |
656 | }; |
657 | |
658 | let org_region = &match ssdec { |
659 | 0 => ts.input_tile.planes[0] |
660 | .subregion(Area::BlockStartingAt { bo: tile_bo.0 }), |
661 | 1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }), |
662 | 2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }), |
663 | _ => unimplemented!(), |
664 | }; |
665 | |
666 | let mv = |
667 | ts.me_stats[ref_frame.to_index()][tile_bo.0.y][tile_bo.0.x].mv >> ssdec; |
668 | |
669 | // Given a motion vector at 0 at higher subsampling: |
670 | // | -1 | 0 | 1 | |
671 | // then the vectors at -1 to 2 should be tested at the current subsampling. |
672 | // |-------------| |
673 | // | -2 -1 | 0 1 | 2 3 | |
674 | // This corresponds to a 4x4 full search. |
675 | let x_lo = po.x + (mv.col as isize / 8 - 1).max(mvx_min / 8); |
676 | let x_hi = po.x + (mv.col as isize / 8 + 2).min(mvx_max / 8); |
677 | let y_lo = po.y + (mv.row as isize / 8 - 1).max(mvy_min / 8); |
678 | let y_hi = po.y + (mv.row as isize / 8 + 2).min(mvy_max / 8); |
679 | let mut results = full_search( |
680 | fi, x_lo, x_hi, y_lo, y_hi, w, h, org_region, p_ref, po, 1, lambda, pmv, |
681 | ); |
682 | |
683 | // Scale motion vectors to full res size |
684 | results.mv = results.mv << ssdec; |
685 | |
686 | Some(results) |
687 | } else { |
688 | None |
689 | } |
690 | } |
691 | |
692 | #[profiling::function ] |
693 | fn full_pixel_me<T: Pixel>( |
694 | fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, |
695 | org_region: &PlaneRegion<T>, p_ref: &Plane<T>, tile_bo: TileBlockOffset, |
696 | po: PlaneOffset, lambda: u32, pmv: [MotionVector; 2], w: usize, h: usize, |
697 | mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, |
698 | ref_frame: RefType, corner: MVSamplingMode, extensive_search: bool, |
699 | ssdec: u8, |
700 | ) -> MotionSearchResult { |
701 | let ref_frame_id = ref_frame.to_index(); |
702 | let tile_me_stats = &ts.me_stats[ref_frame_id].as_const(); |
703 | let frame_ref = fi.rec_buffer.frames[fi.ref_frames[0] as usize] |
704 | .as_ref() |
705 | .map(|frame_ref| frame_ref.frame_me_stats.read().expect("poisoned lock" )); |
706 | let subsets = get_subset_predictors( |
707 | tile_bo, |
708 | tile_me_stats, |
709 | frame_ref, |
710 | ref_frame_id, |
711 | w, |
712 | h, |
713 | mvx_min, |
714 | mvx_max, |
715 | mvy_min, |
716 | mvy_max, |
717 | corner, |
718 | ssdec, |
719 | ); |
720 | |
721 | let try_cands = |predictors: &[MotionVector], |
722 | best: &mut MotionSearchResult| { |
723 | let mut results = get_best_predictor( |
724 | fi, |
725 | po, |
726 | org_region, |
727 | p_ref, |
728 | predictors, |
729 | fi.sequence.bit_depth, |
730 | pmv, |
731 | lambda, |
732 | mvx_min, |
733 | mvx_max, |
734 | mvy_min, |
735 | mvy_max, |
736 | w, |
737 | h, |
738 | ); |
739 | fullpel_diamond_search( |
740 | fi, |
741 | po, |
742 | org_region, |
743 | p_ref, |
744 | &mut results, |
745 | fi.sequence.bit_depth, |
746 | pmv, |
747 | lambda, |
748 | mvx_min, |
749 | mvx_max, |
750 | mvy_min, |
751 | mvy_max, |
752 | w, |
753 | h, |
754 | ); |
755 | |
756 | if results.rd.cost < best.rd.cost { |
757 | *best = results; |
758 | } |
759 | }; |
760 | |
761 | let mut best: MotionSearchResult = MotionSearchResult::empty(); |
762 | if !extensive_search { |
763 | try_cands(&subsets.all_mvs(), &mut best); |
764 | best |
765 | } else { |
766 | // Perform a more thorough search before resorting to full search. |
767 | // Search the median, the best mvs of neighboring blocks, and motion vectors |
768 | // from the previous frame. Stop once a candidate with a sad less than a |
769 | // threshold is found. |
770 | |
771 | let thresh = (subsets.min_sad as f32 * 1.2) as u32 |
772 | + (((w * h) as u32) << (fi.sequence.bit_depth - 8)); |
773 | |
774 | if let Some(median) = subsets.median { |
775 | try_cands(&[median], &mut best); |
776 | |
777 | if best.rd.sad < thresh { |
778 | return best; |
779 | } |
780 | } |
781 | |
782 | try_cands(&subsets.subset_b, &mut best); |
783 | |
784 | if best.rd.sad < thresh { |
785 | return best; |
786 | } |
787 | |
788 | try_cands(&subsets.subset_c, &mut best); |
789 | |
790 | if best.rd.sad < thresh { |
791 | return best; |
792 | } |
793 | |
794 | // Preform UMH search, either as the last possible search when full search |
795 | // is disabled, or as the last search before resorting to full search. |
796 | uneven_multi_hex_search( |
797 | fi, |
798 | po, |
799 | org_region, |
800 | p_ref, |
801 | &mut best, |
802 | fi.sequence.bit_depth, |
803 | pmv, |
804 | lambda, |
805 | mvx_min, |
806 | mvx_max, |
807 | mvy_min, |
808 | mvy_max, |
809 | w, |
810 | h, |
811 | // Use 24, since it is the largest range that x264 uses. |
812 | 24, |
813 | ); |
814 | |
815 | if !fi.config.speed_settings.motion.me_allow_full_search |
816 | || best.rd.sad < thresh |
817 | { |
818 | return best; |
819 | } |
820 | |
821 | { |
822 | let range_x = (192 * fi.me_range_scale as isize) >> ssdec; |
823 | let range_y = (64 * fi.me_range_scale as isize) >> ssdec; |
824 | let x_lo = po.x + (-range_x).max(mvx_min / 8); |
825 | let x_hi = po.x + (range_x).min(mvx_max / 8); |
826 | let y_lo = po.y + (-range_y).max(mvy_min / 8); |
827 | let y_hi = po.y + (range_y).min(mvy_max / 8); |
828 | |
829 | let results = full_search( |
830 | fi, |
831 | x_lo, |
832 | x_hi, |
833 | y_lo, |
834 | y_hi, |
835 | w, |
836 | h, |
837 | org_region, |
838 | p_ref, |
839 | po, |
840 | // Full search is run at quarter resolution, except for short edges. |
841 | // When subsampling is lower than usual, the step size is raised so that |
842 | // the number of search locations stays the same. |
843 | 4 >> ssdec, |
844 | lambda, |
845 | [MotionVector::default(); 2], |
846 | ); |
847 | |
848 | if results.rd.cost < best.rd.cost { |
849 | results |
850 | } else { |
851 | best |
852 | } |
853 | } |
854 | } |
855 | } |
856 | |
857 | fn sub_pixel_me<T: Pixel>( |
858 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
859 | p_ref: &Plane<T>, lambda: u32, pmv: [MotionVector; 2], mvx_min: isize, |
860 | mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, |
861 | use_satd: bool, best: &mut MotionSearchResult, ref_frame: RefType, |
862 | ) { |
863 | subpel_diamond_search( |
864 | fi, |
865 | po, |
866 | org_region, |
867 | p_ref, |
868 | fi.sequence.bit_depth, |
869 | pmv, |
870 | lambda, |
871 | mvx_min, |
872 | mvx_max, |
873 | mvy_min, |
874 | mvy_max, |
875 | w, |
876 | h, |
877 | use_satd, |
878 | current:best, |
879 | ref_frame, |
880 | ); |
881 | } |
882 | |
883 | #[profiling::function ] |
884 | fn get_best_predictor<T: Pixel>( |
885 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
886 | p_ref: &Plane<T>, predictors: &[MotionVector], bit_depth: usize, |
887 | pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, |
888 | mvy_min: isize, mvy_max: isize, w: usize, h: usize, |
889 | ) -> MotionSearchResult { |
890 | let mut best: MotionSearchResult = MotionSearchResult::empty(); |
891 | |
892 | for &init_mv: MotionVector in predictors.iter() { |
893 | let rd: MVCandidateRD = get_fullpel_mv_rd( |
894 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, use_satd:false, mvx_min, |
895 | mvx_max, mvy_min, mvy_max, w, h, cand_mv:init_mv, |
896 | ); |
897 | |
898 | if rd.cost < best.rd.cost { |
899 | best.mv = init_mv; |
900 | best.rd = rd; |
901 | } |
902 | } |
903 | |
904 | best |
905 | } |
906 | |
907 | /// Declares an array of motion vectors in structure of arrays syntax. |
908 | /// Compared to [`search_pattern_subpel`], this version creates motion vectors |
909 | /// in fullpel resolution (x8). |
910 | macro_rules! search_pattern { |
911 | ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => { |
912 | [ $(MotionVector { $field_a: $ll_a << 3, $field_b: $ll_b << 3 } ),*] |
913 | }; |
914 | } |
915 | |
916 | /// Declares an array of motion vectors in structure of arrays syntax. |
917 | macro_rules! search_pattern_subpel { |
918 | ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => { |
919 | [ $(MotionVector { $field_a: $ll_a, $field_b: $ll_b } ),*] |
920 | }; |
921 | } |
922 | |
923 | /// Diamond pattern of radius 1 as shown below. For fullpel search, use |
924 | /// `DIAMOND_R1_PATTERN_FULLPEL` since it has been scaled for fullpel search. |
925 | /// ```text |
926 | /// X |
927 | /// XoX |
928 | /// X |
929 | /// ``` |
930 | /// 'X's are motion candidates and the 'o' is the center. |
931 | /// |
932 | const DIAMOND_R1_PATTERN_SUBPEL: [MotionVector; 4] = search_pattern_subpel!( |
933 | col: [ 0, 1, 0, -1], |
934 | row: [ 1, 0, -1, 0] |
935 | ); |
936 | /// Diamond pattern of radius 1 as shown below. Unlike `DIAMOND_R1_PATTERN`, the |
937 | /// vectors have been shifted fullpel scale. |
938 | /// ```text |
939 | /// X |
940 | /// XoX |
941 | /// X |
942 | /// ``` |
943 | /// 'X's are motion candidates and the 'o' is the center. |
944 | const DIAMOND_R1_PATTERN: [MotionVector; 4] = search_pattern!( |
945 | col: [ 0, 1, 0, -1], |
946 | row: [ 1, 0, -1, 0] |
947 | ); |
948 | |
949 | /// Run a full pixel diamond search. The search is run on multiple step sizes. |
950 | /// |
951 | /// For each step size, candidate motion vectors are examined for improvement |
952 | /// to the current search location. The search location is moved to the best |
953 | /// candidate (if any). This is repeated until the search location stops moving. |
954 | #[profiling::function ] |
955 | fn fullpel_diamond_search<T: Pixel>( |
956 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
957 | p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize, |
958 | pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, |
959 | mvy_min: isize, mvy_max: isize, w: usize, h: usize, |
960 | ) { |
961 | // Define the initial and the final scale (log2) of the diamond. |
962 | let (mut diamond_radius_log2, diamond_radius_end_log2) = (1u8, 0u8); |
963 | |
964 | loop { |
965 | // Find the best candidate from the diamond pattern. |
966 | let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); |
967 | for &offset in &DIAMOND_R1_PATTERN { |
968 | let cand_mv = current.mv + (offset << diamond_radius_log2); |
969 | let rd = get_fullpel_mv_rd( |
970 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
971 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
972 | ); |
973 | |
974 | if rd.cost < best_cand.rd.cost { |
975 | best_cand.mv = cand_mv; |
976 | best_cand.rd = rd; |
977 | } |
978 | } |
979 | |
980 | // Continue the search at this scale until the can't find a better candidate |
981 | // to use. |
982 | if current.rd.cost <= best_cand.rd.cost { |
983 | if diamond_radius_log2 == diamond_radius_end_log2 { |
984 | break; |
985 | } else { |
986 | diamond_radius_log2 -= 1; |
987 | } |
988 | } else { |
989 | *current = best_cand; |
990 | } |
991 | } |
992 | |
993 | assert!(!current.is_empty()); |
994 | } |
995 | |
996 | /// A hexagon pattern around a center point. The pattern is ordered so that the |
997 | /// offsets circle around the center. This is done to allow pruning locations |
998 | /// covered by the last iteration. |
999 | /// ```text |
1000 | /// 21012 |
1001 | /// 2 X X |
1002 | /// 1 |
1003 | /// 0 X o X |
1004 | /// 1 |
1005 | /// 2 X X |
1006 | /// ``` |
1007 | /// 'X's are motion candidates and the 'o' is the center. |
1008 | /// |
1009 | /// The illustration below shows the process of a hexagon search. |
1010 | /// ```text |
1011 | /// Step 1 Step 2 |
1012 | /// 1 1 1 1 2 |
1013 | /// |
1014 | /// 1(0)1 => 1 0(1)2 |
1015 | /// |
1016 | /// 1 1 1 1 2 |
1017 | /// ``` |
1018 | /// The search above has gone through the following steps. |
1019 | /// 1. Search '1' elements for better candidates than the center '0'. |
1020 | /// 2. Recenter around the best candidate ('(1)') and hexagon candidates that |
1021 | /// don't overlap with the previous search step (labeled '2'). |
1022 | const HEXAGON_PATTERN: [MotionVector; 6] = search_pattern!( |
1023 | col: [ 0, 2, 2, 0, -2, -2], |
1024 | row: [ -2, -1, 1, 2, 1, -1] |
1025 | ); |
1026 | |
1027 | /// A small square pattern around a center point. |
1028 | /// ```text |
1029 | /// 101 |
1030 | /// 1 XXX |
1031 | /// 0 XoX |
1032 | /// 1 XXX |
1033 | /// ``` |
1034 | /// 'X's are motion candidates and the 'o' is the center. |
1035 | const SQUARE_REFINE_PATTERN: [MotionVector; 8] = search_pattern!( |
1036 | col: [ -1, 0, 1, -1, 1, -1, 0, 1], |
1037 | row: [ 1, 1, 1, 0, 0, -1, -1, -1] |
1038 | ); |
1039 | |
1040 | /// Perform hexagon search and refine afterwards. |
1041 | /// |
1042 | /// In the hexagon search stage, candidate motion vectors are examined for |
1043 | /// improvement to the current search location. The search location is moved to |
1044 | /// the best candidate (if any). This is repeated until the search location |
1045 | /// stops moving. |
1046 | /// |
1047 | /// Refinement uses a square pattern that fits between the hexagon candidates. |
1048 | /// |
1049 | /// The hexagon pattern is defined by [`HEXAGON_PATTERN`] and the refinement |
1050 | /// is defined by [`SQUARE_REFINE_PATTERN`]. |
1051 | /// |
1052 | /// `current` provides the initial search location and serves as |
1053 | /// the output for the final search results. |
1054 | #[profiling::function ] |
1055 | fn hexagon_search<T: Pixel>( |
1056 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
1057 | p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize, |
1058 | pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, |
1059 | mvy_min: isize, mvy_max: isize, w: usize, h: usize, |
1060 | ) { |
1061 | // The first iteration of hexagon search is implemented separate from |
1062 | // subsequent iterations, which overlap with previous iterations. |
1063 | |
1064 | // Holds what candidate is used (if any). This is used to determine which |
1065 | // candidates have already been tested in a previous iteration and can be |
1066 | // skipped. |
1067 | let mut best_cand_idx: usize = 0; |
1068 | let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); |
1069 | |
1070 | // First iteration of hexagon search. There are six candidates to consider. |
1071 | for i in 0..6 { |
1072 | let cand_mv = current.mv + HEXAGON_PATTERN[i]; |
1073 | let rd = get_fullpel_mv_rd( |
1074 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
1075 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
1076 | ); |
1077 | |
1078 | if rd.cost < best_cand.rd.cost { |
1079 | best_cand_idx = i; |
1080 | best_cand.mv = cand_mv; |
1081 | best_cand.rd = rd; |
1082 | } |
1083 | } |
1084 | |
1085 | // Run additional iterations of hexagon search until the search location |
1086 | // doesn't update. |
1087 | while best_cand.rd.cost < current.rd.cost { |
1088 | // Update the search location. |
1089 | *current = best_cand; |
1090 | best_cand = MotionSearchResult::empty(); |
1091 | |
1092 | // Save the index/direction taken in the previous iteration to the current |
1093 | // search location. |
1094 | let center_cand_idx = best_cand_idx; |
1095 | |
1096 | // Look only at candidates that don't overlap with previous iterations. This |
1097 | // corresponds with the three offsets (2D) with the closest direction to |
1098 | // that traveled by the previous iteration. HEXAGON_PATTERN has clockwise |
1099 | // order, so the last direction -1, +0, and +1 (mod 6) give the indices for |
1100 | // these offsets. |
1101 | for idx_offset_mod6 in 5..=7 { |
1102 | let i = (center_cand_idx + idx_offset_mod6) % 6; |
1103 | let cand_mv = current.mv + HEXAGON_PATTERN[i]; |
1104 | |
1105 | let rd = get_fullpel_mv_rd( |
1106 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
1107 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
1108 | ); |
1109 | |
1110 | if rd.cost < best_cand.rd.cost { |
1111 | best_cand_idx = i; |
1112 | best_cand.mv = cand_mv; |
1113 | best_cand.rd = rd; |
1114 | } |
1115 | } |
1116 | } |
1117 | |
1118 | // Refine the motion after completing hexagon search. |
1119 | let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); |
1120 | for &offset in &SQUARE_REFINE_PATTERN { |
1121 | let cand_mv = current.mv + offset; |
1122 | let rd = get_fullpel_mv_rd( |
1123 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
1124 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
1125 | ); |
1126 | |
1127 | if rd.cost < best_cand.rd.cost { |
1128 | best_cand.mv = cand_mv; |
1129 | best_cand.rd = rd; |
1130 | } |
1131 | } |
1132 | if best_cand.rd.cost < current.rd.cost { |
1133 | *current = best_cand; |
1134 | } |
1135 | |
1136 | assert!(!current.is_empty()); |
1137 | } |
1138 | |
1139 | /// Uneven multi-hexagon search pattern around a center point. Used for locating |
1140 | /// irregular movement. |
1141 | /// ```text |
1142 | /// X |
1143 | /// X X |
1144 | /// X X |
1145 | /// X X |
1146 | /// X o X |
1147 | /// X X |
1148 | /// X X |
1149 | /// X X |
1150 | /// X |
1151 | /// ``` |
1152 | /// 'X's are motion candidates and the 'o' is the center. |
1153 | const UMH_PATTERN: [MotionVector; 16] = search_pattern!( |
1154 | col: [ -2, -1, 0, 1, 2, 3, 4, 3, 2, 1, 0, -1, -2, 3, -4, -3], |
1155 | row: [ 4, 4, 4, 4, 4, 2, 0, -2, -4, -4, -4, -4, -4, -2, 0, 2] |
1156 | ); |
1157 | |
1158 | /// Perform an uneven multi-hexagon search. There are 4 stages: |
1159 | /// 1. Unsymmetrical-cross search: Search the horizontal and vertical directions |
1160 | /// for the general direction of the motion. |
1161 | /// 2. A 5x5 full search is done to refine the current candidate. |
1162 | /// 3. Uneven multi-hexagon search. See [`UMH_PATTERN`]. |
1163 | /// 4. Refinement using standard hexagon search. |
1164 | /// |
1165 | /// `current` provides the initial search location and serves as |
1166 | /// the output for the final search results. |
1167 | /// |
1168 | /// `me_range` parameter determines how far these stages can search. |
1169 | #[profiling::function ] |
1170 | fn uneven_multi_hex_search<T: Pixel>( |
1171 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
1172 | p_ref: &Plane<T>, current: &mut MotionSearchResult, bit_depth: usize, |
1173 | pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, |
1174 | mvy_min: isize, mvy_max: isize, w: usize, h: usize, me_range: i16, |
1175 | ) { |
1176 | assert!(!current.is_empty()); |
1177 | |
1178 | // Search in a cross pattern to obtain a rough approximate of motion. |
1179 | // The cross is split into a horizontal and vertical component. Video content |
1180 | // tends to have more horizontal motion, so the horizontal part of the cross |
1181 | // is twice as large as the vertical half. |
1182 | // X - |
1183 | // | <- me_range/2 |
1184 | // X | |
1185 | // X X X XoX X X X - |
1186 | // X |
1187 | // |
1188 | // X |
1189 | // |------| |
1190 | // \ |
1191 | // me_range |
1192 | let center = current.mv; |
1193 | |
1194 | // The larger, horizontal, part of the cross search. |
1195 | for i in (1..=me_range).step_by(2) { |
1196 | const HORIZONTAL_LINE: [MotionVector; 2] = search_pattern!( |
1197 | col: [ 0, 0], |
1198 | row: [-1, 1] |
1199 | ); |
1200 | |
1201 | for &offset in &HORIZONTAL_LINE { |
1202 | let cand_mv = center + offset * i; |
1203 | let rd = get_fullpel_mv_rd( |
1204 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
1205 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
1206 | ); |
1207 | |
1208 | if rd.cost < current.rd.cost { |
1209 | current.mv = cand_mv; |
1210 | current.rd = rd; |
1211 | } |
1212 | } |
1213 | } |
1214 | |
1215 | // The smaller, vertical, part of the cross search |
1216 | for i in (1..=me_range >> 1).step_by(2) { |
1217 | const VERTICAL_LINE: [MotionVector; 2] = search_pattern!( |
1218 | col: [-1, 1], |
1219 | row: [ 0, 0] |
1220 | ); |
1221 | |
1222 | for &offset in &VERTICAL_LINE { |
1223 | let cand_mv = center + offset * i; |
1224 | let rd = get_fullpel_mv_rd( |
1225 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
1226 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
1227 | ); |
1228 | |
1229 | if rd.cost < current.rd.cost { |
1230 | current.mv = cand_mv; |
1231 | current.rd = rd; |
1232 | } |
1233 | } |
1234 | } |
1235 | |
1236 | // 5x5 full search. Search a 5x5 square region around the current best mv. |
1237 | let center = current.mv; |
1238 | for row in -2..=2 { |
1239 | for col in -2..=2 { |
1240 | if row == 0 && col == 0 { |
1241 | continue; |
1242 | } |
1243 | let cand_mv = center + MotionVector { row, col }; |
1244 | let rd = get_fullpel_mv_rd( |
1245 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
1246 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
1247 | ); |
1248 | |
1249 | if rd.cost < current.rd.cost { |
1250 | current.mv = cand_mv; |
1251 | current.rd = rd; |
1252 | } |
1253 | } |
1254 | } |
1255 | |
1256 | // Run the hexagons in uneven multi-hexagon. The hexagonal pattern is tested |
1257 | // around the best vector at multiple scales. |
1258 | // Example of the UMH pattern run on a scale of 1 and 2: |
1259 | // 2 - |
1260 | // | <- me_range |
1261 | // 2 2 | |
1262 | // | |
1263 | // 2 1 2 | |
1264 | // 1 1 | |
1265 | // 2 1 1 2 | |
1266 | // 1 1 | |
1267 | // 2 1 o 1 2 | |
1268 | // 1 1 | |
1269 | // 2 1 1 2 | |
1270 | // 1 1 | |
1271 | // 2 1 2 | |
1272 | // | |
1273 | // 2 2 | |
1274 | // | |
1275 | // 2 - |
1276 | // |---------------| |
1277 | // \ |
1278 | // me_range |
1279 | let center = current.mv; |
1280 | |
1281 | // Divide by 4, the radius of the UMH's hexagon. |
1282 | let iterations = me_range >> 2; |
1283 | for i in 1..=iterations { |
1284 | for &offset in &UMH_PATTERN { |
1285 | let cand_mv = center + offset * i; |
1286 | let rd = get_fullpel_mv_rd( |
1287 | fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, |
1288 | mvx_max, mvy_min, mvy_max, w, h, cand_mv, |
1289 | ); |
1290 | |
1291 | if rd.cost < current.rd.cost { |
1292 | current.mv = cand_mv; |
1293 | current.rd = rd; |
1294 | } |
1295 | } |
1296 | } |
1297 | |
1298 | // Refine the search results using a 'normal' hexagon search. |
1299 | hexagon_search( |
1300 | fi, po, org_region, p_ref, current, bit_depth, pmv, lambda, mvx_min, |
1301 | mvx_max, mvy_min, mvy_max, w, h, |
1302 | ); |
1303 | } |
1304 | |
1305 | /// Run a subpixel diamond search. The search is run on multiple step sizes. |
1306 | /// |
1307 | /// For each step size, candidate motion vectors are examined for improvement |
1308 | /// to the current search location. The search location is moved to the best |
1309 | /// candidate (if any). This is repeated until the search location stops moving. |
1310 | #[profiling::function ] |
1311 | fn subpel_diamond_search<T: Pixel>( |
1312 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
1313 | _p_ref: &Plane<T>, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, |
1314 | mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, |
1315 | h: usize, use_satd: bool, current: &mut MotionSearchResult, |
1316 | ref_frame: RefType, |
1317 | ) { |
1318 | use crate::util::Aligned; |
1319 | |
1320 | // Motion compensation assembly has special requirements for edges |
1321 | let mc_w = w.next_power_of_two(); |
1322 | let mc_h = (h + 1) & !1; |
1323 | |
1324 | // Metadata for subpel scratch pad. |
1325 | let cfg = PlaneConfig::new(mc_w, mc_h, 0, 0, 0, 0, std::mem::size_of::<T>()); |
1326 | // Stack allocation for subpel scratch pad. |
1327 | // SAFETY: We write to the array below before reading from it. |
1328 | let mut buf: Aligned<[T; 128 * 128]> = unsafe { Aligned::uninitialized() }; |
1329 | let mut tmp_region = PlaneRegionMut::from_slice( |
1330 | &mut buf.data, |
1331 | &cfg, |
1332 | Rect { x: 0, y: 0, width: cfg.width, height: cfg.height }, |
1333 | ); |
1334 | |
1335 | // start at 1/2 pel and end at 1/4 or 1/8 pel |
1336 | let (mut diamond_radius_log2, diamond_radius_end_log2) = |
1337 | (2u8, u8::from(!fi.allow_high_precision_mv)); |
1338 | |
1339 | loop { |
1340 | // Find the best candidate from the diamond pattern. |
1341 | let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); |
1342 | for &offset in &DIAMOND_R1_PATTERN_SUBPEL { |
1343 | let cand_mv = current.mv + (offset << diamond_radius_log2); |
1344 | |
1345 | let rd = get_subpel_mv_rd( |
1346 | fi, |
1347 | po, |
1348 | org_region, |
1349 | bit_depth, |
1350 | pmv, |
1351 | lambda, |
1352 | use_satd, |
1353 | mvx_min, |
1354 | mvx_max, |
1355 | mvy_min, |
1356 | mvy_max, |
1357 | w, |
1358 | h, |
1359 | cand_mv, |
1360 | &mut tmp_region, |
1361 | ref_frame, |
1362 | ); |
1363 | |
1364 | if rd.cost < best_cand.rd.cost { |
1365 | best_cand.mv = cand_mv; |
1366 | best_cand.rd = rd; |
1367 | } |
1368 | } |
1369 | |
1370 | // Continue the search at this scale until a better candidate isn't found. |
1371 | if current.rd.cost <= best_cand.rd.cost { |
1372 | if diamond_radius_log2 == diamond_radius_end_log2 { |
1373 | break; |
1374 | } else { |
1375 | diamond_radius_log2 -= 1; |
1376 | } |
1377 | } else { |
1378 | *current = best_cand; |
1379 | } |
1380 | } |
1381 | |
1382 | assert!(!current.is_empty()); |
1383 | } |
1384 | |
1385 | #[inline ] |
1386 | fn get_fullpel_mv_rd<T: Pixel>( |
1387 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
1388 | p_ref: &Plane<T>, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, |
1389 | use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize, |
1390 | mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector, |
1391 | ) -> MVCandidateRD { |
1392 | if (cand_mv.col as isize) < mvx_min |
1393 | || (cand_mv.col as isize) > mvx_max |
1394 | || (cand_mv.row as isize) < mvy_min |
1395 | || (cand_mv.row as isize) > mvy_max |
1396 | { |
1397 | return MVCandidateRD::empty(); |
1398 | } |
1399 | |
1400 | // Convert the motion vector into an full pixel offset. |
1401 | let plane_ref: PlaneRegion<'_, T> = p_ref.region(Area::StartingAt { |
1402 | x: po.x + (cand_mv.col / 8) as isize, |
1403 | y: po.y + (cand_mv.row / 8) as isize, |
1404 | }); |
1405 | compute_mv_rd( |
1406 | fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, plane_org:org_region, |
1407 | &plane_ref, |
1408 | ) |
1409 | } |
1410 | |
1411 | fn get_subpel_mv_rd<T: Pixel>( |
1412 | fi: &FrameInvariants<T>, po: PlaneOffset, org_region: &PlaneRegion<T>, |
1413 | bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, |
1414 | mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, |
1415 | h: usize, cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut<T>, |
1416 | ref_frame: RefType, |
1417 | ) -> MVCandidateRD { |
1418 | if (cand_mv.col as isize) < mvx_min |
1419 | || (cand_mv.col as isize) > mvx_max |
1420 | || (cand_mv.row as isize) < mvy_min |
1421 | || (cand_mv.row as isize) > mvy_max |
1422 | { |
1423 | return MVCandidateRD::empty(); |
1424 | } |
1425 | |
1426 | let tmp_width = tmp_region.rect().width; |
1427 | let tmp_height = tmp_region.rect().height; |
1428 | let tile_rect = |
1429 | TileRect { x: 0, y: 0, width: tmp_width, height: tmp_height }; |
1430 | |
1431 | PredictionMode::NEWMV.predict_inter_single( |
1432 | fi, tile_rect, 0, po, tmp_region, |
1433 | // motion comp's w & h on edges can be different than distortion's |
1434 | tmp_width, tmp_height, ref_frame, cand_mv, |
1435 | ); |
1436 | let plane_ref = tmp_region.as_const(); |
1437 | compute_mv_rd( |
1438 | fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region, |
1439 | &plane_ref, |
1440 | ) |
1441 | } |
1442 | |
1443 | /// Compute the rate distortion stats for a motion vector. |
1444 | #[inline (always)] |
1445 | fn compute_mv_rd<T: Pixel>( |
1446 | fi: &FrameInvariants<T>, pmv: [MotionVector; 2], lambda: u32, |
1447 | use_satd: bool, bit_depth: usize, w: usize, h: usize, cand_mv: MotionVector, |
1448 | plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, |
1449 | ) -> MVCandidateRD { |
1450 | let sad: u32 = if use_satd { |
1451 | get_satd(plane_org, plane_ref, w, h, bit_depth, _cpu:fi.cpu_feature_level) |
1452 | } else { |
1453 | get_sad(plane_org, plane_ref, w, h, bit_depth, _cpu:fi.cpu_feature_level) |
1454 | }; |
1455 | |
1456 | let rate1: u32 = get_mv_rate(a:cand_mv, b:pmv[0], fi.allow_high_precision_mv); |
1457 | let rate2: u32 = get_mv_rate(a:cand_mv, b:pmv[1], fi.allow_high_precision_mv); |
1458 | let rate: u32 = rate1.min(rate2 + 1); |
1459 | |
1460 | MVCandidateRD { cost: 256 * sad as u64 + rate as u64 * lambda as u64, sad } |
1461 | } |
1462 | |
1463 | #[profiling::function ] |
1464 | fn full_search<T: Pixel>( |
1465 | fi: &FrameInvariants<T>, x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, |
1466 | w: usize, h: usize, org_region: &PlaneRegion<T>, p_ref: &Plane<T>, |
1467 | po: PlaneOffset, step: usize, lambda: u32, pmv: [MotionVector; 2], |
1468 | ) -> MotionSearchResult { |
1469 | let search_region = p_ref.region(Area::Rect { |
1470 | x: x_lo, |
1471 | y: y_lo, |
1472 | width: (x_hi - x_lo) as usize + w, |
1473 | height: (y_hi - y_lo) as usize + h, |
1474 | }); |
1475 | |
1476 | let mut best: MotionSearchResult = MotionSearchResult::empty(); |
1477 | |
1478 | // Select rectangular regions within search region with vert+horz windows |
1479 | for vert_window in search_region.vert_windows(h).step_by(step) { |
1480 | for ref_window in vert_window.horz_windows(w).step_by(step) { |
1481 | let &Rect { x, y, .. } = ref_window.rect(); |
1482 | |
1483 | let mv = MotionVector { |
1484 | row: 8 * (y as i16 - po.y as i16), |
1485 | col: 8 * (x as i16 - po.x as i16), |
1486 | }; |
1487 | |
1488 | let rd = compute_mv_rd( |
1489 | fi, |
1490 | pmv, |
1491 | lambda, |
1492 | false, |
1493 | fi.sequence.bit_depth, |
1494 | w, |
1495 | h, |
1496 | mv, |
1497 | org_region, |
1498 | &ref_window, |
1499 | ); |
1500 | |
1501 | if rd.cost < best.rd.cost { |
1502 | best.rd = rd; |
1503 | best.mv = mv; |
1504 | } |
1505 | } |
1506 | } |
1507 | |
1508 | best |
1509 | } |
1510 | |
1511 | #[inline (always)] |
1512 | fn get_mv_rate( |
1513 | a: MotionVector, b: MotionVector, allow_high_precision_mv: bool, |
1514 | ) -> u32 { |
1515 | #[inline (always)] |
1516 | fn diff_to_rate(diff: i16, allow_high_precision_mv: bool) -> u32 { |
1517 | let d: i16 = if allow_high_precision_mv { diff } else { diff >> 1 }; |
1518 | 2 * ILog::ilog(self:d.abs()) as u32 |
1519 | } |
1520 | |
1521 | diff_to_rate(diff:a.row - b.row, allow_high_precision_mv) |
1522 | + diff_to_rate(diff:a.col - b.col, allow_high_precision_mv) |
1523 | } |
1524 | |