// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

#![allow(non_upper_case_globals)]

mod tables;

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::quantize::*;
  } else {
    pub use self::rust::*;
  }
}

pub use tables::*;

use crate::scan_order::av1_scan_orders;
use crate::transform::{TxSize, TxType};
use crate::util::*;
use std::convert::Into;
use std::mem;
use std::num::{NonZeroU16, NonZeroU32, NonZeroU64};

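/// Returns the base-2 scaling applied to coefficients of large transforms
/// (shifted up before quantization, back down after dequantization):
/// 0 for transforms of up to 256 pixels, 1 for up to 1024, 2 above that.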
pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
  let num_pixels: usize = tx_size.area();

  Into::<usize>::into(num_pixels > 256)
    + Into::<usize>::into(num_pixels > 1024)
}

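// Both lookups index one of the 8-, 10- and 12-bit tables:
// `(bit_depth ^ 8) >> 1` maps a bit depth of 8 to index 0, 10 to 1 and
// 12 to 2, and `.min(2)` clamps any other depth into range.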
pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let dc_q: [&[NonZeroU16; 256]; 3] =
    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let ac_q: [&[NonZeroU16; 256]; 3] =
    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

// TODO: Handle lossless properly.
fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
  if quantizer < qlookup[MINQ].get() as i64 {
    MINQ as u8
  } else if quantizer >= qlookup[MAXQ].get() as i64 {
    MAXQ as u8
  } else {
    match qlookup
      .binary_search(&NonZeroU16::new(quantizer as u16).expect("Not zero"))
    {
      Ok(qi) => qi as u8,
      Err(qi) => {
        debug_assert!(qi > MINQ);
        debug_assert!(qi <= MAXQ);
        // Pick the closest quantizer in the log domain: quantizer is closer
        // to qlookup[qi - 1] than to qlookup[qi] exactly when its square is
        // below their product (the square of their geometric mean).
        let qthresh =
          (qlookup[qi - 1].get() as i32) * (qlookup[qi].get() as i32);
        let q2_i32 = (quantizer as i32) * (quantizer as i32);
        if q2_i32 < qthresh {
          (qi - 1) as u8
        } else {
          qi as u8
        }
      }
    }
  }
}


pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &dc_qlookup_Q3,
    10 => &dc_qlookup_10_Q3,
    12 => &dc_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &ac_qlookup_Q3,
    10 => &ac_qlookup_10_Q3,
    12 => &ac_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

#[derive(Debug, Clone, Copy)]
pub struct QuantizationContext {
  log_tx_scale: usize,
  dc_quant: NonZeroU16,
  dc_offset: u32,
  dc_mul_add: (u32, u32, u32),

  ac_quant: NonZeroU16,
  ac_offset_eob: u32,
  ac_offset0: u32,
  ac_offset1: u32,
  ac_mul_add: (u32, u32, u32),
}

impl Default for QuantizationContext {
  fn default() -> Self {
    QuantizationContext {
      dc_quant: NonZeroU16::new(1).expect("Not zero"),
      ac_quant: NonZeroU16::new(1).expect("Not zero"),
      log_tx_scale: Default::default(),
      dc_offset: Default::default(),
      dc_mul_add: Default::default(),
      ac_offset_eob: Default::default(),
      ac_offset0: Default::default(),
      ac_offset1: Default::default(),
      ac_mul_add: Default::default(),
    }
  }
}

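// Precompute a (multiplier, addend, shift) triple so that dividing a u32 by
// the constant `d` needs only a widening multiply, an add, and two shifts
// (the usual division-by-invariant-integer trick); `divu_pair` applies it.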
fn divu_gen(d: NonZeroU32) -> (u32, u32, u32) {
  let nbits: u64 = (mem::size_of_val(&d) as u64) * 8;
  let m: u64 = nbits - d.leading_zeros() as u64 - 1;
  if d.is_power_of_two() {
    (0xFFFF_FFFF, 0xFFFF_FFFF, m as u32)
  } else {
    let d = NonZeroU64::from(d);
    let t: u64 = (1u64 << (m + nbits)) / d;

    let d: u64 = d.get();
    let r: u64 = (t * d + d) & ((1 << nbits) - 1);
    if r <= 1u64 << m {
      (t as u32 + 1, 0u32, m as u32)
    } else {
      (t as u32, t as u32, m as u32)
    }
  }
}

#[inline]
const fn divu_pair(x: u32, d: (u32, u32, u32)) -> u32 {
  let x: u64 = x as u64;
  let (a, b, shift) = d;
  let shift: u64 = shift as u64;
  let a: u64 = a as u64;
  let b: u64 = b as u64;

  (((a * x + b) >> 32) >> shift) as u32
}
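
// Worked example of the pair: for d = 3, `divu_gen` computes m = 1 and
// t = (1 << 33) / 3 = 2_863_311_530, and the remainder check selects
// (2_863_311_531, 0, 1). Then divu_pair(10, ...) evaluates
// ((2_863_311_531 * 10) >> 32) >> 1 = 6 >> 1 = 3, matching 10 / 3.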

#[inline]
const fn copysign(value: u32, signed: i32) -> i32 {
  if signed < 0 {
    -(value as i32)
  } else {
    value as i32
  }
}

#[cfg(test)]
mod test {
  use super::*;
  use crate::transform::TxSize::*;

  #[test]
  fn test_divu_pair() {
    for d in 1..1024 {
      for x in 0..1000 {
        let ab = divu_gen(NonZeroU32::new(d).unwrap());
        assert_eq!(x / d, divu_pair(x, ab));
      }
    }
  }

  #[test]
  fn gen_divu_table() {
    let b: Vec<(u32, u32, u32)> =
      dc_qlookup_Q3.iter().map(|&v| divu_gen(v.into())).collect();

    println!("{:?}", b);
  }

  #[test]
  fn test_tx_log_scale() {
    let tx_sizes = [
      (TX_4X4, 0),
      (TX_8X8, 0),
      (TX_16X16, 0),
      (TX_32X32, 1),
      (TX_64X64, 2),
      (TX_4X8, 0),
      (TX_8X4, 0),
      (TX_8X16, 0),
      (TX_16X8, 0),
      (TX_16X32, 1),
      (TX_32X16, 1),
      (TX_32X64, 2),
      (TX_64X32, 2),
      (TX_4X16, 0),
      (TX_16X4, 0),
      (TX_8X32, 0),
      (TX_32X8, 0),
      (TX_16X64, 1),
      (TX_64X16, 1),
    ];
    for &tx_size in tx_sizes.iter() {
      assert!(tx_size.1 == get_log_tx_scale(tx_size.0));
    }
  }
}

impl QuantizationContext {
  pub fn update(
    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
    dc_delta_q: i8, ac_delta_q: i8,
  ) {
    self.log_tx_scale = get_log_tx_scale(tx_size);

    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
    self.dc_mul_add = divu_gen(self.dc_quant.into());

    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
    self.ac_mul_add = divu_gen(self.ac_quant.into());

    // All of these biases were derived by measuring the cost of coding
    // a zero vs coding a one on any given coefficient position, or, in
    // the case of the EOB bias, the cost of coding the block with
    // the chosen EOB (rounding to one) vs rounding to zero and continuing
    // to choose a new EOB. This was done over several clips, with the
    // average of the bit costs taken over all blocks in the set, and a new
    // bias derived via the method outlined in Jean-Marc Valin's
    // Journal of Dubious Theoretical Results[1], aka:
    //
    // lambda = ln(2) / 6.0
    // threshold = 0.5 + (lambda * avg_rate_diff) / 2.0
    // bias = 1 - threshold
    //
    // lambda is a constant since our offsets are already adjusted for the
    // quantizer.
    //
    // Biases were then updated, and cost collection was re-run, until
    // the calculated biases started to converge after 2-4 iterations.
    //
    // In theory, the rounding biases for inter should be somewhat smaller
    // than the biases for intra, but this turns out to only be the case
    // for EOB optimization, or at least, is covered by EOB optimization.
    // The RD-optimal rounding biases for the actual coefficients seem
    // to be quite close (+/- 1/256), for both inter and intra,
    // post-deadzoning.
    //
    // [1] https://jmvalin.ca/notes/theoretical_results.pdf
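    //
    // As a worked example: with lambda = ln(2) / 6 ~= 0.1155, the intra DC
    // bias of 109/256 ~= 0.426 corresponds to a threshold of ~0.574, i.e.
    // an average measured rate difference of roughly 1.3 bits.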
    self.dc_offset =
      self.dc_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset0 =
      self.ac_quant.get() as u32 * (if is_intra { 98 } else { 97 }) / 256;
    self.ac_offset1 =
      self.ac_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset_eob =
      self.ac_quant.get() as u32 * (if is_intra { 88 } else { 44 }) / 256;
  }

  #[inline]
  pub fn quantize<T: Coefficient>(
    &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType,
  ) -> u16 {
    let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
    let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan;

    qcoeffs[0] = {
      let coeff: i32 = i32::cast_from(coeffs[0]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();
      T::cast_from(copysign(
        divu_pair(abs_coeff + self.dc_offset, self.dc_mul_add),
        coeff,
      ))
    };

    // Find the last non-zero coefficient using our smaller biases and
    // zero everything else.
    // This threshold is such that `abs(coeff) < deadzone` implies:
    // (abs(coeff << log_tx_scale) + ac_offset_eob) / ac_quant == 0
    let deadzone = T::cast_from(
      (self.ac_quant.get() as usize - self.ac_offset_eob as usize)
        .align_power_of_two_and_shift(self.log_tx_scale),
    );
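    // For example (hypothetical numbers, assuming align_power_of_two_and_shift
    // rounds up before shifting): with ac_quant = 32, an inter ac_offset_eob
    // of 32 * 44 / 256 = 5 and log_tx_scale = 1, the deadzone is
    // (32 - 5 + 1) >> 1 = 14: abs(coeff) = 13 gives (26 + 5) / 32 == 0,
    // while abs(coeff) = 14 gives (28 + 5) / 32 == 1.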
    let eob = {
      let eob_minus_one = iscan
        .iter()
        .zip(coeffs)
        .map(|(&i, &c)| if c.abs() >= deadzone { i } else { 0 })
        .max()
        .unwrap_or(0);
      // We skip the DC coefficient since it has its own quantizer index.
      if eob_minus_one > 0 {
        eob_minus_one + 1
      } else {
        u16::from(qcoeffs[0] != T::cast_from(0))
      }
    };

    // Here we use different rounding biases depending on whether we've
    // had recent coefficients that are larger than one, or less than
    // one. The reason for this is that a block usually has a chunk of
    // large coefficients and a tail of zeroes and ones, and the tradeoffs
    // for coding these two are different. In the tail of zeroes and ones,
    // you'll likely end up spending most bits just saying where that
    // coefficient is in the block, whereas in the chunk of larger
    // coefficients, most bits will be spent on coding its magnitude.
    // To that end, we want to bias more toward rounding to zero for
    // that tail of zeroes and ones than we do for the larger coefficients.
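    // level_mode tracks this: it drops to 0 at the first coefficient that
    // quantizes to zero and returns to 1 once one quantizes above one. In
    // mode 1 any nonzero raw level gets the larger ac_offset1; in mode 0
    // only raw levels above one do.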
    let mut level_mode = 1;
    let ac_quant = self.ac_quant.get() as u32;
    for &pos in scan.iter().take(usize::from(eob)).skip(1) {
      let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();

      let level0 = divu_pair(abs_coeff, self.ac_mul_add);
      let offset = if level0 > 1 - level_mode {
        self.ac_offset1
      } else {
        self.ac_offset0
      };

      let abs_qcoeff: u32 =
        level0 + (abs_coeff + offset >= (level0 + 1) * ac_quant) as u32;
      if level_mode != 0 && abs_qcoeff == 0 {
        level_mode = 0;
      } else if abs_qcoeff > 1 {
        level_mode = 1;
      }

      qcoeffs[pos as usize] = T::cast_from(copysign(abs_qcoeff, coeff));
    }

    // Rather than zeroing the tail in scan order, assume that qcoeffs is
    // pre-filled with zeros.

    // Check that the eob is correct.
    debug_assert_eq!(
      usize::from(eob),
      scan
        .iter()
        .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0))
        .map(|n| n + 1)
        .unwrap_or(0)
    );

    eob
  }
}

pub mod rust {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use std::mem::MaybeUninit;

  pub fn dequantize<T: Coefficient>(
    qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit<T>],
    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
    _cpu: CpuFeatureLevel,
  ) {
    let log_tx_scale = get_log_tx_scale(tx_size) as i32;
    let offset = (1 << log_tx_scale) - 1;

    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;

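    // `(c >> 31) & offset` is 0 for non-negative c and `offset` for negative
    // c (the arithmetic shift replicates the sign bit), so the final right
    // shift rounds the scaled value toward zero instead of toward negative
    // infinity.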
    for (i, (r, c)) in rcoeffs
      .iter_mut()
      .zip(coeffs.iter().map(|&c| i32::cast_from(c)))
      .enumerate()
    {
      let quant = if i == 0 { dc_quant } else { ac_quant };
      r.write(T::cast_from(
        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
      ));
    }
  }
}