1 | // Copyright (c) 2019-2022, The rav1e contributors. All rights reserved |
2 | // |
3 | // This source code is subject to the terms of the BSD 2 Clause License and |
4 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
5 | // was not distributed with this source code in the LICENSE file, you can |
6 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
7 | // Media Patent License 1.0 was not distributed with this source code in the |
8 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
9 | |
// Select the motion-compensation implementation at compile time: re-export
// the hand-written assembly bindings when the target supports them,
// otherwise fall back to the portable Rust implementations in the `rust`
// module below.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::mc::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::mc::*;
  } else {
    pub use self::rust::*;
  }
}
19 | |
20 | use crate::cpu_features::CpuFeatureLevel; |
21 | use crate::frame::*; |
22 | use crate::tiling::*; |
23 | use crate::util::*; |
24 | |
25 | use simd_helpers::cold_for_target_arch; |
26 | use std::ops; |
27 | |
/// A motion vector with components stored in units of one eighth of a
/// pixel (see [`MotionVector::quantize_to_fullpel`], which strips the
/// three fractional bits).
///
/// `row` is the vertical displacement and `col` the horizontal one.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct MotionVector {
  pub row: i16,
  pub col: i16,
}
33 | |
34 | impl MotionVector { |
35 | #[inline ] |
36 | pub const fn quantize_to_fullpel(self) -> Self { |
37 | Self { row: (self.row / 8) * 8, col: (self.col / 8) * 8 } |
38 | } |
39 | |
40 | #[inline ] |
41 | pub const fn is_zero(self) -> bool { |
42 | self.row == 0 && self.col == 0 |
43 | } |
44 | |
45 | #[inline ] |
46 | pub const fn is_valid(self) -> bool { |
47 | use crate::context::{MV_LOW, MV_UPP}; |
48 | ((MV_LOW as i16) < self.row && self.row < (MV_UPP as i16)) |
49 | && ((MV_LOW as i16) < self.col && self.col < (MV_UPP as i16)) |
50 | } |
51 | } |
52 | |
53 | impl ops::Mul<i16> for MotionVector { |
54 | type Output = MotionVector; |
55 | |
56 | #[inline ] |
57 | fn mul(self, rhs: i16) -> MotionVector { |
58 | MotionVector { row: self.row * rhs, col: self.col * rhs } |
59 | } |
60 | } |
61 | |
62 | impl ops::Mul<u16> for MotionVector { |
63 | type Output = MotionVector; |
64 | |
65 | #[inline ] |
66 | fn mul(self, rhs: u16) -> MotionVector { |
67 | MotionVector { row: self.row * rhs as i16, col: self.col * rhs as i16 } |
68 | } |
69 | } |
70 | |
71 | impl ops::Shr<u8> for MotionVector { |
72 | type Output = MotionVector; |
73 | |
74 | #[inline ] |
75 | fn shr(self, rhs: u8) -> MotionVector { |
76 | MotionVector { row: self.row >> rhs, col: self.col >> rhs } |
77 | } |
78 | } |
79 | |
80 | impl ops::Shl<u8> for MotionVector { |
81 | type Output = MotionVector; |
82 | |
83 | #[inline ] |
84 | fn shl(self, rhs: u8) -> MotionVector { |
85 | MotionVector { row: self.row << rhs, col: self.col << rhs } |
86 | } |
87 | } |
88 | |
89 | impl ops::Add<MotionVector> for MotionVector { |
90 | type Output = MotionVector; |
91 | |
92 | #[inline ] |
93 | fn add(self, rhs: MotionVector) -> MotionVector { |
94 | MotionVector { row: self.row + rhs.row, col: self.col + rhs.col } |
95 | } |
96 | } |
97 | |
/// Interpolation filter family used for sub-pixel motion compensation.
///
/// The explicit discriminants double as indices into the first dimension
/// of `SUBPEL_FILTERS` (see `rust::get_filter`).
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)]
#[allow(unused)]
pub enum FilterMode {
  REGULAR = 0,
  SMOOTH = 1,
  SHARP = 2,
  BILINEAR = 3,
  SWITCHABLE = 4,
}
107 | |
/// Number of taps in each sub-pel interpolation filter row.
pub const SUBPEL_FILTER_SIZE: usize = 8;

// Sub-pixel interpolation coefficient tables, indexed as
// `[filter set][fractional position 0..=15][tap]`.
//
// Every row of taps sums to 128, i.e. the filters carry 7 fractional bits
// of precision. `rust::get_filter` selects sets 0..=3 directly from the
// `FilterMode` discriminant for dimensions > 4 (or for BILINEAR), and
// falls back to the short-tap sets 4 and 5 for small dimensions
// (4 for REGULAR, 5 otherwise).
const SUBPEL_FILTERS: [[[i32; SUBPEL_FILTER_SIZE]; 16]; 6] = [
  // Set 0: REGULAR (8-tap)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 2, -6, 126, 8, -2, 0, 0],
    [0, 2, -10, 122, 18, -4, 0, 0],
    [0, 2, -12, 116, 28, -8, 2, 0],
    [0, 2, -14, 110, 38, -10, 2, 0],
    [0, 2, -14, 102, 48, -12, 2, 0],
    [0, 2, -16, 94, 58, -12, 2, 0],
    [0, 2, -14, 84, 66, -12, 2, 0],
    [0, 2, -14, 76, 76, -14, 2, 0],
    [0, 2, -12, 66, 84, -14, 2, 0],
    [0, 2, -12, 58, 94, -16, 2, 0],
    [0, 2, -12, 48, 102, -14, 2, 0],
    [0, 2, -10, 38, 110, -14, 2, 0],
    [0, 2, -8, 28, 116, -12, 2, 0],
    [0, 0, -4, 18, 122, -10, 2, 0],
    [0, 0, -2, 8, 126, -6, 2, 0],
  ],
  // Set 1: SMOOTH (8-tap)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 2, 28, 62, 34, 2, 0, 0],
    [0, 0, 26, 62, 36, 4, 0, 0],
    [0, 0, 22, 62, 40, 4, 0, 0],
    [0, 0, 20, 60, 42, 6, 0, 0],
    [0, 0, 18, 58, 44, 8, 0, 0],
    [0, 0, 16, 56, 46, 10, 0, 0],
    [0, -2, 16, 54, 48, 12, 0, 0],
    [0, -2, 14, 52, 52, 14, -2, 0],
    [0, 0, 12, 48, 54, 16, -2, 0],
    [0, 0, 10, 46, 56, 16, 0, 0],
    [0, 0, 8, 44, 58, 18, 0, 0],
    [0, 0, 6, 42, 60, 20, 0, 0],
    [0, 0, 4, 40, 62, 22, 0, 0],
    [0, 0, 4, 36, 62, 26, 0, 0],
    [0, 0, 2, 34, 62, 28, 2, 0],
  ],
  // Set 2: SHARP (8-tap, uses all eight taps)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [-2, 2, -6, 126, 8, -2, 2, 0],
    [-2, 6, -12, 124, 16, -6, 4, -2],
    [-2, 8, -18, 120, 26, -10, 6, -2],
    [-4, 10, -22, 116, 38, -14, 6, -2],
    [-4, 10, -22, 108, 48, -18, 8, -2],
    [-4, 10, -24, 100, 60, -20, 8, -2],
    [-4, 10, -24, 90, 70, -22, 10, -2],
    [-4, 12, -24, 80, 80, -24, 12, -4],
    [-2, 10, -22, 70, 90, -24, 10, -4],
    [-2, 8, -20, 60, 100, -24, 10, -4],
    [-2, 8, -18, 48, 108, -22, 10, -4],
    [-2, 6, -14, 38, 116, -22, 10, -4],
    [-2, 6, -10, 26, 120, -18, 8, -2],
    [-2, 4, -6, 16, 124, -12, 6, -2],
    [0, 2, -2, 8, 126, -6, 2, -2],
  ],
  // Set 3: BILINEAR (two non-zero taps per row)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, 0, 120, 8, 0, 0, 0],
    [0, 0, 0, 112, 16, 0, 0, 0],
    [0, 0, 0, 104, 24, 0, 0, 0],
    [0, 0, 0, 96, 32, 0, 0, 0],
    [0, 0, 0, 88, 40, 0, 0, 0],
    [0, 0, 0, 80, 48, 0, 0, 0],
    [0, 0, 0, 72, 56, 0, 0, 0],
    [0, 0, 0, 64, 64, 0, 0, 0],
    [0, 0, 0, 56, 72, 0, 0, 0],
    [0, 0, 0, 48, 80, 0, 0, 0],
    [0, 0, 0, 40, 88, 0, 0, 0],
    [0, 0, 0, 32, 96, 0, 0, 0],
    [0, 0, 0, 24, 104, 0, 0, 0],
    [0, 0, 0, 16, 112, 0, 0, 0],
    [0, 0, 0, 8, 120, 0, 0, 0],
  ],
  // Set 4: short-tap REGULAR, used for small block dimensions
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, -4, 126, 8, -2, 0, 0],
    [0, 0, -8, 122, 18, -4, 0, 0],
    [0, 0, -10, 116, 28, -6, 0, 0],
    [0, 0, -12, 110, 38, -8, 0, 0],
    [0, 0, -12, 102, 48, -10, 0, 0],
    [0, 0, -14, 94, 58, -10, 0, 0],
    [0, 0, -12, 84, 66, -10, 0, 0],
    [0, 0, -12, 76, 76, -12, 0, 0],
    [0, 0, -10, 66, 84, -12, 0, 0],
    [0, 0, -10, 58, 94, -14, 0, 0],
    [0, 0, -10, 48, 102, -12, 0, 0],
    [0, 0, -8, 38, 110, -12, 0, 0],
    [0, 0, -6, 28, 116, -10, 0, 0],
    [0, 0, -4, 18, 122, -8, 0, 0],
    [0, 0, -2, 8, 126, -4, 0, 0],
  ],
  // Set 5: short-tap SMOOTH, used for small block dimensions
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, 30, 62, 34, 2, 0, 0],
    [0, 0, 26, 62, 36, 4, 0, 0],
    [0, 0, 22, 62, 40, 4, 0, 0],
    [0, 0, 20, 60, 42, 6, 0, 0],
    [0, 0, 18, 58, 44, 8, 0, 0],
    [0, 0, 16, 56, 46, 10, 0, 0],
    [0, 0, 14, 54, 48, 12, 0, 0],
    [0, 0, 12, 52, 52, 12, 0, 0],
    [0, 0, 12, 48, 54, 14, 0, 0],
    [0, 0, 10, 46, 56, 16, 0, 0],
    [0, 0, 8, 44, 58, 18, 0, 0],
    [0, 0, 6, 42, 60, 20, 0, 0],
    [0, 0, 4, 40, 62, 22, 0, 0],
    [0, 0, 4, 36, 62, 26, 0, 0],
    [0, 0, 2, 34, 62, 30, 0, 0],
  ],
];
220 | |
221 | pub(crate) mod rust { |
222 | use super::*; |
223 | use num_traits::*; |
224 | |
/// Applies one 8-tap FIR filter at `src`, sampling one input element every
/// `stride` elements, and returns the raw (unshifted, unclamped) 32-bit
/// accumulator.
///
/// # Safety
///
/// `src.add(i * stride)` is dereferenced for every `i` in `0..8`, so the
/// caller must guarantee that `src` points into an allocation with at
/// least `1 + 7 * stride` valid elements starting at `src`.
unsafe fn run_filter<T: AsPrimitive<i32>>(
  src: *const T, stride: usize, filter: [i32; 8],
) -> i32 {
  filter
    .iter()
    .enumerate()
    .map(|(i, f)| {
      let p = src.add(i * stride);
      f * (*p).as_()
    })
    .sum::<i32>()
}
237 | |
238 | fn get_filter( |
239 | mode: FilterMode, frac: i32, length: usize, |
240 | ) -> [i32; SUBPEL_FILTER_SIZE] { |
241 | let filter_idx = if mode == FilterMode::BILINEAR || length > 4 { |
242 | mode as usize |
243 | } else { |
244 | (mode as usize).min(1) + 4 |
245 | }; |
246 | SUBPEL_FILTERS[filter_idx][frac as usize] |
247 | } |
248 | |
/// Produces one motion-compensated prediction block directly into `dst`.
///
/// `src` must already be positioned at the block's top-left full-pel
/// sample. `col_frac` / `row_frac` select the fractional-position rows of
/// the filters chosen per direction by `mode_x` / `mode_y` (a frac of 0
/// means no filtering is needed in that direction). Output samples are
/// clamped to `[0, (1 << bit_depth) - 1]`.
///
/// The match dispatches on which directions need filtering:
/// * `(0, 0)` — plain row-by-row copy
/// * `(0, _)` — vertical-only filtering
/// * `(_, 0)` — horizontal-only filtering
/// * `(_, _)` — separable 2-D filtering through an i16 scratch buffer,
///   processed in groups of up to 8 columns
#[cold_for_target_arch("x86_64")]
pub fn put_8tap<T: Pixel>(
  dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize,
  height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
  mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel,
) {
  // The assembly only supports even heights and valid uncropped widths
  assert_eq!(height & 1, 0);
  assert!(width.is_power_of_two() && (2..=128).contains(&width));

  let ref_stride = src.plane.cfg.stride;
  let y_filter = get_filter(mode_y, row_frac, height);
  let x_filter = get_filter(mode_x, col_frac, width);
  let max_sample_val = (1 << bit_depth) - 1;
  // 12-bit samples leave less headroom in the i32 accumulator, so keep
  // fewer extra precision bits between the two filter passes.
  let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
  match (col_frac, row_frac) {
    (0, 0) => {
      // No sub-pel offset in either direction: a straight copy.
      for r in 0..height {
        let src_slice = &src[r];
        let dst_slice = &mut dst[r];
        dst_slice[..width].copy_from_slice(&src_slice[..width]);
      }
    }
    (0, _) => {
      // Vertical-only: back the source up 3 rows so the 8-tap window is
      // centered on the output row, then apply the full 7-bit shift in
      // a single rounding step.
      let offset_slice = src.go_up(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        let dst_slice = &mut dst[r];
        for c in 0..width {
          dst_slice[c] = T::cast_from(
            round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe {
                run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter)
              },
              7,
            )
            .clamp(0, max_sample_val),
          );
        }
      }
    }
    (_, 0) => {
      // Horizontal-only: back the source up 3 columns. The 7-bit shift is
      // split into two rounding stages, presumably so the rounding matches
      // the two-pass (_, _) path — confirm against the reference decoder.
      let offset_slice = src.go_left(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        let dst_slice = &mut dst[r];
        for c in 0..width {
          dst_slice[c] = T::cast_from(
            round_shift(
              round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
                7 - intermediate_bits,
              ),
              intermediate_bits,
            )
            .clamp(0, max_sample_val),
          );
        }
      }
    }
    (_, _) => {
      // Separable 2-D case: horizontal pass into an i16 scratch buffer
      // (8 columns wide, max height 128 plus 7 rows of filter support),
      // then a vertical pass over the buffer.
      let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)];

      let offset_slice = src.go_left(3).go_up(3);
      for cg in (0..width).step_by(8) {
        // Horizontal pass over `height + 7` rows (extra rows feed the
        // vertical filter's taps), keeping `intermediate_bits` of extra
        // precision for the second pass.
        for r in 0..height + 7 {
          let src_slice = &offset_slice[r];
          for c in cg..(cg + 8).min(width) {
            intermediate[8 * r + (c - cg)] = round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
              7 - intermediate_bits,
            ) as i16;
          }
        }

        // Vertical pass: filter down the scratch buffer (stride 8) and
        // remove both the 7 filter bits and the extra precision bits.
        for r in 0..height {
          let dst_slice = &mut dst[r];
          for c in cg..(cg + 8).min(width) {
            dst_slice[c] = T::cast_from(
              round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe {
                  run_filter(
                    intermediate[8 * r + c - cg..].as_ptr(),
                    8,
                    y_filter,
                  )
                },
                7 + intermediate_bits,
              )
              .clamp(0, max_sample_val),
            );
          }
        }
      }
    }
  }
}
354 | |
// HBD output interval is [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
// Subtract PREP_BIAS to ensure result fits in i16 and matches dav1d assembly
const PREP_BIAS: i32 = 8192;

/// Filters one prediction block into the intermediate `tmp` buffer instead
/// of a pixel plane, keeping `intermediate_bits` of extra precision.
///
/// The output is a `width * height` row-major i16 buffer; for high bit
/// depths `PREP_BIAS` is subtracted so values fit in i16 (see the constant
/// above). `mc_avg` later combines two such buffers, undoing the bias and
/// the extra precision.
///
/// The dispatch mirrors `put_8tap`:
/// * `(0, 0)` — copy, shifted up by `intermediate_bits`
/// * `(0, _)` — vertical-only filtering
/// * `(_, 0)` — horizontal-only filtering
/// * `(_, _)` — separable 2-D filtering via an i16 scratch buffer in
///   groups of up to 8 columns
#[cold_for_target_arch("x86_64")]
pub fn prep_8tap<T: Pixel>(
  tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize,
  col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
  bit_depth: usize, _cpu: CpuFeatureLevel,
) {
  // The assembly only supports even heights and valid uncropped widths
  assert_eq!(height & 1, 0);
  assert!(width.is_power_of_two() && (2..=128).contains(&width));

  let ref_stride = src.plane.cfg.stride;
  let y_filter = get_filter(mode_y, row_frac, height);
  let x_filter = get_filter(mode_x, col_frac, width);
  // 12-bit samples leave less headroom in the i32 accumulator, so keep
  // fewer extra precision bits.
  let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
  // 8-bit output already fits in i16 without the bias.
  let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS };
  match (col_frac, row_frac) {
    (0, 0) => {
      // No filtering: scale samples up to intermediate precision.
      for r in 0..height {
        let src_slice = &src[r];
        for c in 0..width {
          tmp[r * width + c] = (i16::cast_from(src_slice[c])
            << intermediate_bits)
            - prep_bias as i16;
        }
      }
    }
    (0, _) => {
      // Vertical-only: back up 3 rows to center the 8-tap window; keep
      // `intermediate_bits` of extra precision in the output.
      let offset_slice = src.go_up(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        for c in 0..width {
          tmp[r * width + c] = (round_shift(
            // SAFETY: We pass this a raw pointer, but it's created from a
            // checked slice, so we are safe.
            unsafe {
              run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter)
            },
            7 - intermediate_bits,
          ) - prep_bias) as i16;
        }
      }
    }
    (_, 0) => {
      // Horizontal-only: back up 3 columns, same precision handling.
      let offset_slice = src.go_left(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        for c in 0..width {
          tmp[r * width + c] = (round_shift(
            // SAFETY: We pass this a raw pointer, but it's created from a
            // checked slice, so we are safe.
            unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
            7 - intermediate_bits,
          ) - prep_bias) as i16;
        }
      }
    }
    (_, _) => {
      // Separable 2-D case: horizontal pass into an i16 scratch buffer
      // (8 columns wide, max height 128 plus 7 rows of filter support),
      // then a vertical pass over the buffer.
      let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)];

      let offset_slice = src.go_left(3).go_up(3);
      for cg in (0..width).step_by(8) {
        // Horizontal pass over `height + 7` rows; the extra rows feed the
        // vertical filter's taps.
        for r in 0..height + 7 {
          let src_slice = &offset_slice[r];
          for c in cg..(cg + 8).min(width) {
            intermediate[8 * r + (c - cg)] = round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
              7 - intermediate_bits,
            ) as i16;
          }
        }

        // Vertical pass: only the 7 filter bits are removed here, leaving
        // `intermediate_bits` of precision in `tmp` for `mc_avg`.
        for r in 0..height {
          for c in cg..(cg + 8).min(width) {
            tmp[r * width + c] = (round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe {
                run_filter(
                  intermediate[8 * r + c - cg..].as_ptr(),
                  8,
                  y_filter,
                )
              },
              7,
            ) - prep_bias) as i16;
          }
        }
      }
    }
  }
}
452 | |
453 | #[cold_for_target_arch ("x86_64" )] |
454 | pub fn mc_avg<T: Pixel>( |
455 | dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize, |
456 | height: usize, bit_depth: usize, _cpu: CpuFeatureLevel, |
457 | ) { |
458 | // The assembly only supports even heights and valid uncropped widths |
459 | assert_eq!(height & 1, 0); |
460 | assert!(width.is_power_of_two() && (2..=128).contains(&width)); |
461 | |
462 | let max_sample_val = (1 << bit_depth) - 1; |
463 | let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; |
464 | let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS * 2 }; |
465 | for r in 0..height { |
466 | let dst_slice = &mut dst[r]; |
467 | for c in 0..width { |
468 | dst_slice[c] = T::cast_from( |
469 | round_shift( |
470 | tmp1[r * width + c] as i32 |
471 | + tmp2[r * width + c] as i32 |
472 | + prep_bias, |
473 | intermediate_bits + 1, |
474 | ) |
475 | .clamp(0, max_sample_val), |
476 | ); |
477 | } |
478 | } |
479 | } |
480 | } |
481 | |