// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::mc::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::mc::*;
  } else {
    pub use self::rust::*;
  }
}

use crate::cpu_features::CpuFeatureLevel;
use crate::frame::*;
use crate::tiling::*;
use crate::util::*;

use simd_helpers::cold_for_target_arch;
use std::ops;

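/// A two-dimensional motion vector stored as `(row, col)` offsets.
///
/// Offsets are in eighth-pel units; `quantize_to_fullpel` rounds them to
/// multiples of 8, i.e. to whole-pixel positions.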
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct MotionVector {
  pub row: i16,
  pub col: i16,
}

impl MotionVector {
  #[inline]
  pub const fn quantize_to_fullpel(self) -> Self {
    Self { row: (self.row / 8) * 8, col: (self.col / 8) * 8 }
  }

  #[inline]
  pub const fn is_zero(self) -> bool {
    self.row == 0 && self.col == 0
  }

  #[inline]
  pub const fn is_valid(self) -> bool {
    use crate::context::{MV_LOW, MV_UPP};
    ((MV_LOW as i16) < self.row && self.row < (MV_UPP as i16))
      && ((MV_LOW as i16) < self.col && self.col < (MV_UPP as i16))
  }
}

impl ops::Mul<i16> for MotionVector {
  type Output = MotionVector;

  #[inline]
  fn mul(self, rhs: i16) -> MotionVector {
    MotionVector { row: self.row * rhs, col: self.col * rhs }
  }
}

impl ops::Mul<u16> for MotionVector {
  type Output = MotionVector;

  #[inline]
  fn mul(self, rhs: u16) -> MotionVector {
    MotionVector { row: self.row * rhs as i16, col: self.col * rhs as i16 }
  }
}

impl ops::Shr<u8> for MotionVector {
  type Output = MotionVector;

  #[inline]
  fn shr(self, rhs: u8) -> MotionVector {
    MotionVector { row: self.row >> rhs, col: self.col >> rhs }
  }
}

impl ops::Shl<u8> for MotionVector {
  type Output = MotionVector;

  #[inline]
  fn shl(self, rhs: u8) -> MotionVector {
    MotionVector { row: self.row << rhs, col: self.col << rhs }
  }
}

impl ops::Add<MotionVector> for MotionVector {
  type Output = MotionVector;

  #[inline]
  fn add(self, rhs: MotionVector) -> MotionVector {
    MotionVector { row: self.row + rhs.row, col: self.col + rhs.col }
  }
}

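/// Subpel interpolation filter families. The numeric values match the
/// `interpolation_filter` codes signalled in the AV1 bitstream.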
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)]
#[allow(unused)]
pub enum FilterMode {
  REGULAR = 0,
  SMOOTH = 1,
  SHARP = 2,
  BILINEAR = 3,
  SWITCHABLE = 4,
}

pub const SUBPEL_FILTER_SIZE: usize = 8;

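/// Filter coefficient tables, indexed by `[filter][fractional position][tap]`.
/// Tables 0-3 are the 8-tap regular, smooth, and sharp filters plus bilinear;
/// tables 4 and 5 are 4-tap variants of the regular and smooth filters used
/// for small block dimensions (see `get_filter`). Each row of taps sums to
/// 128, so filtering is followed by a 7-bit rounding shift.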
const SUBPEL_FILTERS: [[[i32; SUBPEL_FILTER_SIZE]; 16]; 6] = [
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 2, -6, 126, 8, -2, 0, 0],
    [0, 2, -10, 122, 18, -4, 0, 0],
    [0, 2, -12, 116, 28, -8, 2, 0],
    [0, 2, -14, 110, 38, -10, 2, 0],
    [0, 2, -14, 102, 48, -12, 2, 0],
    [0, 2, -16, 94, 58, -12, 2, 0],
    [0, 2, -14, 84, 66, -12, 2, 0],
    [0, 2, -14, 76, 76, -14, 2, 0],
    [0, 2, -12, 66, 84, -14, 2, 0],
    [0, 2, -12, 58, 94, -16, 2, 0],
    [0, 2, -12, 48, 102, -14, 2, 0],
    [0, 2, -10, 38, 110, -14, 2, 0],
    [0, 2, -8, 28, 116, -12, 2, 0],
    [0, 0, -4, 18, 122, -10, 2, 0],
    [0, 0, -2, 8, 126, -6, 2, 0],
  ],
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 2, 28, 62, 34, 2, 0, 0],
    [0, 0, 26, 62, 36, 4, 0, 0],
    [0, 0, 22, 62, 40, 4, 0, 0],
    [0, 0, 20, 60, 42, 6, 0, 0],
    [0, 0, 18, 58, 44, 8, 0, 0],
    [0, 0, 16, 56, 46, 10, 0, 0],
    [0, -2, 16, 54, 48, 12, 0, 0],
    [0, -2, 14, 52, 52, 14, -2, 0],
    [0, 0, 12, 48, 54, 16, -2, 0],
    [0, 0, 10, 46, 56, 16, 0, 0],
    [0, 0, 8, 44, 58, 18, 0, 0],
    [0, 0, 6, 42, 60, 20, 0, 0],
    [0, 0, 4, 40, 62, 22, 0, 0],
    [0, 0, 4, 36, 62, 26, 0, 0],
    [0, 0, 2, 34, 62, 28, 2, 0],
  ],
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [-2, 2, -6, 126, 8, -2, 2, 0],
    [-2, 6, -12, 124, 16, -6, 4, -2],
    [-2, 8, -18, 120, 26, -10, 6, -2],
    [-4, 10, -22, 116, 38, -14, 6, -2],
    [-4, 10, -22, 108, 48, -18, 8, -2],
    [-4, 10, -24, 100, 60, -20, 8, -2],
    [-4, 10, -24, 90, 70, -22, 10, -2],
    [-4, 12, -24, 80, 80, -24, 12, -4],
    [-2, 10, -22, 70, 90, -24, 10, -4],
    [-2, 8, -20, 60, 100, -24, 10, -4],
    [-2, 8, -18, 48, 108, -22, 10, -4],
    [-2, 6, -14, 38, 116, -22, 10, -4],
    [-2, 6, -10, 26, 120, -18, 8, -2],
    [-2, 4, -6, 16, 124, -12, 6, -2],
    [0, 2, -2, 8, 126, -6, 2, -2],
  ],
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, 0, 120, 8, 0, 0, 0],
    [0, 0, 0, 112, 16, 0, 0, 0],
    [0, 0, 0, 104, 24, 0, 0, 0],
    [0, 0, 0, 96, 32, 0, 0, 0],
    [0, 0, 0, 88, 40, 0, 0, 0],
    [0, 0, 0, 80, 48, 0, 0, 0],
    [0, 0, 0, 72, 56, 0, 0, 0],
    [0, 0, 0, 64, 64, 0, 0, 0],
    [0, 0, 0, 56, 72, 0, 0, 0],
    [0, 0, 0, 48, 80, 0, 0, 0],
    [0, 0, 0, 40, 88, 0, 0, 0],
    [0, 0, 0, 32, 96, 0, 0, 0],
    [0, 0, 0, 24, 104, 0, 0, 0],
    [0, 0, 0, 16, 112, 0, 0, 0],
    [0, 0, 0, 8, 120, 0, 0, 0],
  ],
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, -4, 126, 8, -2, 0, 0],
    [0, 0, -8, 122, 18, -4, 0, 0],
    [0, 0, -10, 116, 28, -6, 0, 0],
    [0, 0, -12, 110, 38, -8, 0, 0],
    [0, 0, -12, 102, 48, -10, 0, 0],
    [0, 0, -14, 94, 58, -10, 0, 0],
    [0, 0, -12, 84, 66, -10, 0, 0],
    [0, 0, -12, 76, 76, -12, 0, 0],
    [0, 0, -10, 66, 84, -12, 0, 0],
    [0, 0, -10, 58, 94, -14, 0, 0],
    [0, 0, -10, 48, 102, -12, 0, 0],
    [0, 0, -8, 38, 110, -12, 0, 0],
    [0, 0, -6, 28, 116, -10, 0, 0],
    [0, 0, -4, 18, 122, -8, 0, 0],
    [0, 0, -2, 8, 126, -4, 0, 0],
  ],
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, 30, 62, 34, 2, 0, 0],
    [0, 0, 26, 62, 36, 4, 0, 0],
    [0, 0, 22, 62, 40, 4, 0, 0],
    [0, 0, 20, 60, 42, 6, 0, 0],
    [0, 0, 18, 58, 44, 8, 0, 0],
    [0, 0, 16, 56, 46, 10, 0, 0],
    [0, 0, 14, 54, 48, 12, 0, 0],
    [0, 0, 12, 52, 52, 12, 0, 0],
    [0, 0, 12, 48, 54, 14, 0, 0],
    [0, 0, 10, 46, 56, 16, 0, 0],
    [0, 0, 8, 44, 58, 18, 0, 0],
    [0, 0, 6, 42, 60, 20, 0, 0],
    [0, 0, 4, 40, 62, 22, 0, 0],
    [0, 0, 4, 36, 62, 26, 0, 0],
    [0, 0, 2, 34, 62, 30, 0, 0],
  ],
];

pub(crate) mod rust {
  use super::*;
  use num_traits::*;

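  /// Computes one output sample of an 8-tap filter: the dot product of
  /// `filter` with the eight input samples starting at `src`, spaced
  /// `stride` elements apart.
  ///
  /// # Safety
  ///
  /// `src` must be valid to read for at least `7 * stride + 1` elements.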
  unsafe fn run_filter<T: AsPrimitive<i32>>(
    src: *const T, stride: usize, filter: [i32; 8],
  ) -> i32 {
    filter
      .iter()
      .enumerate()
      .map(|(i, f)| {
        let p = src.add(i * stride);
        f * (*p).as_()
      })
      .sum::<i32>()
  }

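  /// Looks up the filter coefficients for `mode` at fractional position
  /// `frac`. Bilinear always uses its own table; otherwise, block dimensions
  /// of 4 or less select the 4-tap variants (tables 4 and 5).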
  fn get_filter(
    mode: FilterMode, frac: i32, length: usize,
  ) -> [i32; SUBPEL_FILTER_SIZE] {
    let filter_idx = if mode == FilterMode::BILINEAR || length > 4 {
      mode as usize
    } else {
      (mode as usize).min(1) + 4
    };
    SUBPEL_FILTERS[filter_idx][frac as usize]
  }

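  /// Motion-compensates a `width`×`height` block for a single reference:
  /// applies the horizontal filter selected by `mode_x`/`col_frac` and the
  /// vertical filter selected by `mode_y`/`row_frac`, clamps to the valid
  /// sample range for `bit_depth`, and writes the result to `dst`. When both
  /// fractional offsets are zero the block is copied directly.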
  #[cold_for_target_arch("x86_64")]
  pub fn put_8tap<T: Pixel>(
    dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize,
    height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
    mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel,
  ) {
    // The assembly only supports even heights and valid uncropped widths
    assert_eq!(height & 1, 0);
    assert!(width.is_power_of_two() && (2..=128).contains(&width));

    let ref_stride = src.plane.cfg.stride;
    let y_filter = get_filter(mode_y, row_frac, height);
    let x_filter = get_filter(mode_x, col_frac, width);
    let max_sample_val = (1 << bit_depth) - 1;
    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
    match (col_frac, row_frac) {
      (0, 0) => {
        for r in 0..height {
          let src_slice = &src[r];
          let dst_slice = &mut dst[r];
          dst_slice[..width].copy_from_slice(&src_slice[..width]);
        }
      }
      (0, _) => {
        let offset_slice = src.go_up(3);
        for r in 0..height {
          let src_slice = &offset_slice[r];
          let dst_slice = &mut dst[r];
          for c in 0..width {
            dst_slice[c] = T::cast_from(
              round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe {
                  run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter)
                },
                7,
              )
              .clamp(0, max_sample_val),
            );
          }
        }
      }
      (_, 0) => {
        let offset_slice = src.go_left(3);
        for r in 0..height {
          let src_slice = &offset_slice[r];
          let dst_slice = &mut dst[r];
          for c in 0..width {
            dst_slice[c] = T::cast_from(
              round_shift(
                round_shift(
                  // SAFETY: We pass this a raw pointer, but it's created from a
                  // checked slice, so we are safe.
                  unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
                  7 - intermediate_bits,
                ),
                intermediate_bits,
              )
              .clamp(0, max_sample_val),
            );
          }
        }
      }
      (_, _) => {
        let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)];

        let offset_slice = src.go_left(3).go_up(3);
        for cg in (0..width).step_by(8) {
          for r in 0..height + 7 {
            let src_slice = &offset_slice[r];
            for c in cg..(cg + 8).min(width) {
              intermediate[8 * r + (c - cg)] = round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
                7 - intermediate_bits,
              ) as i16;
            }
          }

          for r in 0..height {
            let dst_slice = &mut dst[r];
            for c in cg..(cg + 8).min(width) {
              dst_slice[c] = T::cast_from(
                round_shift(
                  // SAFETY: We pass this a raw pointer, but it's created from a
                  // checked slice, so we are safe.
                  unsafe {
                    run_filter(
                      intermediate[8 * r + c - cg..].as_ptr(),
                      8,
                      y_filter,
                    )
                  },
                  7 + intermediate_bits,
                )
                .clamp(0, max_sample_val),
              );
            }
          }
        }
      }
    }
  }

  // HBD output interval is [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
  // Subtract PREP_BIAS to ensure result fits in i16 and matches dav1d assembly
  const PREP_BIAS: i32 = 8192;

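  /// Same filtering as `put_8tap`, but keeps the result at intermediate
  /// precision: each output is written to `tmp` as an `i16`, scaled up by
  /// `intermediate_bits` and offset by `PREP_BIAS` for high bit depths, so a
  /// pair of such buffers can later be combined by `mc_avg`.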
  #[cold_for_target_arch("x86_64")]
  pub fn prep_8tap<T: Pixel>(
    tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize,
    col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
    bit_depth: usize, _cpu: CpuFeatureLevel,
  ) {
    // The assembly only supports even heights and valid uncropped widths
    assert_eq!(height & 1, 0);
    assert!(width.is_power_of_two() && (2..=128).contains(&width));

    let ref_stride = src.plane.cfg.stride;
    let y_filter = get_filter(mode_y, row_frac, height);
    let x_filter = get_filter(mode_x, col_frac, width);
    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
    let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS };
    match (col_frac, row_frac) {
      (0, 0) => {
        for r in 0..height {
          let src_slice = &src[r];
          for c in 0..width {
            tmp[r * width + c] = (i16::cast_from(src_slice[c])
              << intermediate_bits)
              - prep_bias as i16;
          }
        }
      }
      (0, _) => {
        let offset_slice = src.go_up(3);
        for r in 0..height {
          let src_slice = &offset_slice[r];
          for c in 0..width {
            tmp[r * width + c] = (round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe {
                run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter)
              },
              7 - intermediate_bits,
            ) - prep_bias) as i16;
          }
        }
      }
      (_, 0) => {
        let offset_slice = src.go_left(3);
        for r in 0..height {
          let src_slice = &offset_slice[r];
          for c in 0..width {
            tmp[r * width + c] = (round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
              7 - intermediate_bits,
            ) - prep_bias) as i16;
          }
        }
      }
      (_, _) => {
        let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)];

        let offset_slice = src.go_left(3).go_up(3);
        for cg in (0..width).step_by(8) {
          for r in 0..height + 7 {
            let src_slice = &offset_slice[r];
            for c in cg..(cg + 8).min(width) {
              intermediate[8 * r + (c - cg)] = round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
                7 - intermediate_bits,
              ) as i16;
            }
          }

          for r in 0..height {
            for c in cg..(cg + 8).min(width) {
              tmp[r * width + c] = (round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe {
                  run_filter(
                    intermediate[8 * r + c - cg..].as_ptr(),
                    8,
                    y_filter,
                  )
                },
                7,
              ) - prep_bias) as i16;
            }
          }
        }
      }
    }
  }

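  /// Averages two intermediate predictions produced by `prep_8tap`, undoing
  /// the `PREP_BIAS` offset and the extra `intermediate_bits` of precision,
  /// and writes the clamped result to `dst`.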
  #[cold_for_target_arch("x86_64")]
  pub fn mc_avg<T: Pixel>(
    dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize,
    height: usize, bit_depth: usize, _cpu: CpuFeatureLevel,
  ) {
    // The assembly only supports even heights and valid uncropped widths
    assert_eq!(height & 1, 0);
    assert!(width.is_power_of_two() && (2..=128).contains(&width));

    let max_sample_val = (1 << bit_depth) - 1;
    let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
    let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS * 2 };
    for r in 0..height {
      let dst_slice = &mut dst[r];
      for c in 0..width {
        dst_slice[c] = T::cast_from(
          round_shift(
            tmp1[r * width + c] as i32
              + tmp2[r * width + c] as i32
              + prep_bias,
            intermediate_bits + 1,
          )
          .clamp(0, max_sample_val),
        );
      }
    }
  }
}