// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

#![allow(non_upper_case_globals)]

mod tables;

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::quantize::*;
  } else {
    pub use self::rust::*;
  }
}

pub use tables::*;

use crate::scan_order::av1_scan_orders;
use crate::transform::{TxSize, TxType};
use crate::util::*;
use std::convert::Into;
use std::mem;
use std::num::{NonZeroU16, NonZeroU32, NonZeroU64};

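/// Returns the base-2 coefficient scale for a transform size: 0 for areas up
/// to 256 pixels, 1 for areas up to 1024, and 2 for anything larger (see
/// `test_tx_log_scale` for the full mapping).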
pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
  let num_pixels: usize = tx_size.area();

  Into::<usize>::into(num_pixels > 256)
    + Into::<usize>::into(num_pixels > 1024)
}

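/// Looks up the DC quantizer step size from the Q3 tables for the given
/// qindex, DC delta-q, and bit depth (8, 10, or 12); the summed index is
/// clamped to the 0..=255 range.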
pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let dc_q: [&[NonZeroU16; 256]; 3] =
    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let ac_q: [&[NonZeroU16; 256]; 3] =
    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

// TODO: Handle lossless properly.
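/// Maps a quantizer step size back to the closest qindex in `qlookup`,
/// clamping to `MINQ`/`MAXQ` and resolving in-between values in the log
/// domain by comparing `quantizer * quantizer` against the product of the
/// two neighboring table entries.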
fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
  if quantizer < qlookup[MINQ].get() as i64 {
    MINQ as u8
  } else if quantizer >= qlookup[MAXQ].get() as i64 {
    MAXQ as u8
  } else {
    match qlookup
      .binary_search(&NonZeroU16::new(quantizer as u16).expect("Not zero"))
    {
      Ok(qi) => qi as u8,
      Err(qi) => {
        debug_assert!(qi > MINQ);
        debug_assert!(qi <= MAXQ);
        // Pick the closest quantizer in the log domain.
        let qthresh =
          (qlookup[qi - 1].get() as i32) * (qlookup[qi].get() as i32);
        let q2_i32 = (quantizer as i32) * (quantizer as i32);
        if q2_i32 < qthresh {
          (qi - 1) as u8
        } else {
          qi as u8
        }
      }
    }
  }
}

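/// Returns the qindex whose DC quantizer step size is closest to `quantizer`
/// for the given bit depth (8, 10, or 12).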
pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &dc_qlookup_Q3,
    10 => &dc_qlookup_10_Q3,
    12 => &dc_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &ac_qlookup_Q3,
    10 => &ac_qlookup_10_Q3,
    12 => &ac_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

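/// Precomputed per-quantizer state: the DC/AC step sizes, their multiply-add
/// reciprocals for division, the rounding offsets derived in `update`, and
/// the transform scaling factor.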
#[derive(Debug, Clone, Copy)]
pub struct QuantizationContext {
  log_tx_scale: usize,
  dc_quant: NonZeroU16,
  dc_offset: u32,
  dc_mul_add: (u32, u32, u32),

  ac_quant: NonZeroU16,
  ac_offset_eob: u32,
  ac_offset0: u32,
  ac_offset1: u32,
  ac_mul_add: (u32, u32, u32),
}

impl Default for QuantizationContext {
  fn default() -> Self {
    QuantizationContext {
      dc_quant: NonZeroU16::new(1).expect("Not zero"),
      ac_quant: NonZeroU16::new(1).expect("Not zero"),
      log_tx_scale: Default::default(),
      dc_offset: Default::default(),
      dc_mul_add: Default::default(),
      ac_offset_eob: Default::default(),
      ac_offset0: Default::default(),
      ac_offset1: Default::default(),
      ac_mul_add: Default::default(),
    }
  }
}

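/// Generates (multiplier, addend, shift) constants so that division by `d`
/// can later be evaluated as `((a * x + b) >> 32) >> shift` in `divu_pair`,
/// the usual division-by-invariant-multiplication trick; `test_divu_pair`
/// below checks the generated constants against plain `x / d`.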
fn divu_gen(d: NonZeroU32) -> (u32, u32, u32) {
  let nbits: u64 = (mem::size_of_val(&d) as u64) * 8;
  let m: u64 = nbits - d.leading_zeros() as u64 - 1;
  if d.is_power_of_two() {
    (0xFFFF_FFFF, 0xFFFF_FFFF, m as u32)
  } else {
    let d: NonZeroU64 = NonZeroU64::from(d);
    let t: u64 = (1u64 << (m + nbits)) / d;

    let d: u64 = d.get();
    let r: u64 = (t * d + d) & ((1 << nbits) - 1);
    if r <= 1u64 << m {
      (t as u32 + 1, 0u32, m as u32)
    } else {
      (t as u32, t as u32, m as u32)
    }
  }
}

#[inline]
const fn divu_pair(x: u32, d: (u32, u32, u32)) -> u32 {
  let x: u64 = x as u64;
  let (a, b, shift) = d;
  let shift: u64 = shift as u64;
  let a: u64 = a as u64;
  let b: u64 = b as u64;

  (((a * x + b) >> 32) >> shift) as u32
}

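/// Applies the sign of `signed` to `value`.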
#[inline]
const fn copysign(value: u32, signed: i32) -> i32 {
  if signed < 0 {
    -(value as i32)
  } else {
    value as i32
  }
}

#[cfg(test)]
mod test {
  use super::*;
  use crate::transform::TxSize::*;

  #[test]
  fn test_divu_pair() {
    for d in 1..1024 {
      for x in 0..1000 {
        let ab = divu_gen(NonZeroU32::new(d).unwrap());
        assert_eq!(x / d, divu_pair(x, ab));
      }
    }
  }
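  // Spot checks for the sign-transfer helper used by `quantize`.
  #[test]
  fn test_copysign() {
    assert_eq!(copysign(5, 3), 5);
    assert_eq!(copysign(5, -3), -5);
    assert_eq!(copysign(0, -1), 0);
  }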
  #[test]
  fn gen_divu_table() {
    let b: Vec<(u32, u32, u32)> =
      dc_qlookup_Q3.iter().map(|&v| divu_gen(v.into())).collect();

    println!("{:?}", b);
  }
  #[test]
  fn test_tx_log_scale() {
    let tx_sizes = [
      (TX_4X4, 0),
      (TX_8X8, 0),
      (TX_16X16, 0),
      (TX_32X32, 1),
      (TX_64X64, 2),
      (TX_4X8, 0),
      (TX_8X4, 0),
      (TX_8X16, 0),
      (TX_16X8, 0),
      (TX_16X32, 1),
      (TX_32X16, 1),
      (TX_32X64, 2),
      (TX_64X32, 2),
      (TX_4X16, 0),
      (TX_16X4, 0),
      (TX_8X32, 0),
      (TX_32X8, 0),
      (TX_16X64, 1),
      (TX_64X16, 1),
    ];
    for &tx_size in tx_sizes.iter() {
      assert!(tx_size.1 == get_log_tx_scale(tx_size.0));
    }
  }
}

impl QuantizationContext {
  pub fn update(
    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
    dc_delta_q: i8, ac_delta_q: i8,
  ) {
    self.log_tx_scale = get_log_tx_scale(tx_size);

    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
    self.dc_mul_add = divu_gen(self.dc_quant.into());

    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
    self.ac_mul_add = divu_gen(self.ac_quant.into());

    // All of these biases were derived by measuring the cost of coding
    // a zero vs coding a one on any given coefficient position, or, in
    // the case of the EOB bias, the cost of coding the block with
    // the chosen EOB (rounding to one) vs rounding to zero and continuing
    // to choose a new EOB. This was done over several clips, with the
    // average of the bit costs taken over all blocks in the set, and a new
    // bias derived via the method outlined in Jean-Marc Valin's
    // Journal of Dubious Theoretical Results[1], aka:
    //
    // lambda = ln(2) / 6.0
    // threshold = 0.5 + (lambda * avg_rate_diff) / 2.0
    // bias = 1 - threshold
    //
    // lambda is a constant since our offsets are already adjusted for the
    // quantizer.
    //
    // Biases were then updated, and cost collection was re-run, until
    // the calculated biases started to converge after 2-4 iterations.
    //
    // In theory, the rounding biases for inter should be somewhat smaller
    // than the biases for intra, but this turns out to only be the case
    // for EOB optimization, or at least, is covered by EOB optimization.
    // The RD-optimal rounding biases for the actual coefficients seem
    // to be quite close (+/- 1/256), for both inter and intra,
    // post-deadzoning.
    //
    // [1] https://jmvalin.ca/notes/theoretical_results.pdf
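    //
    // For example, the intra DC bias below is 109/256, so a scaled residual
    // only rounds away from zero once its remainder modulo dc_quant reaches
    // (256 - 109)/256, i.e. about 0.57 of a step size.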
    self.dc_offset =
      self.dc_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset0 =
      self.ac_quant.get() as u32 * (if is_intra { 98 } else { 97 }) / 256;
    self.ac_offset1 =
      self.ac_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset_eob =
      self.ac_quant.get() as u32 * (if is_intra { 88 } else { 44 }) / 256;
  }

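  /// Quantizes `coeffs` into `qcoeffs` (which is assumed to be zero-filled)
  /// for the given transform size and type, and returns the end-of-block
  /// position: one past the last non-zero quantized coefficient, or 0 when
  /// every coefficient quantizes to zero.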
  #[inline]
  pub fn quantize<T: Coefficient>(
    &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType,
  ) -> u16 {
    let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
    let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan;

    qcoeffs[0] = {
      let coeff: i32 = i32::cast_from(coeffs[0]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();
      T::cast_from(copysign(
        divu_pair(abs_coeff + self.dc_offset, self.dc_mul_add),
        coeff,
      ))
    };

    // Find the last non-zero coefficient using our smaller biases and
    // zero everything else.
    // This threshold is such that `abs(coeff) < deadzone` implies:
    // (abs(coeff << log_tx_scale) + ac_offset_eob) / ac_quant == 0
    let deadzone = T::cast_from(
      (self.ac_quant.get() as usize - self.ac_offset_eob as usize)
        .align_power_of_two_and_shift(self.log_tx_scale),
    );
    let eob = {
      let eob_minus_one = iscan
        .iter()
        .zip(coeffs)
        .map(|(&i, &c)| if c.abs() >= deadzone { i } else { 0 })
        .max()
        .unwrap_or(0);
      // We skip the DC coefficient since it has its own quantizer index.
      if eob_minus_one > 0 {
        eob_minus_one + 1
      } else {
        u16::from(qcoeffs[0] != T::cast_from(0))
      }
    };

    // Here we use different rounding biases depending on whether we've
    // had recent coefficients that are larger than one, or less than
    // one. The reason for this is that a block usually has a chunk of
    // large coefficients and a tail of zeroes and ones, and the tradeoffs
    // for coding these two are different. In the tail of zeroes and ones,
    // you'll likely end up spending most bits just saying where that
    // coefficient is in the block, whereas in the chunk of larger
    // coefficients, most bits will be spent on coding its magnitude.
    // To that end, we want to bias more toward rounding to zero for
    // that tail of zeroes and ones than we do for the larger coefficients.
    let mut level_mode = 1;
    let ac_quant = self.ac_quant.get() as u32;
    for &pos in scan.iter().take(usize::from(eob)).skip(1) {
      let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();

      let level0 = divu_pair(abs_coeff, self.ac_mul_add);
      let offset = if level0 > 1 - level_mode {
        self.ac_offset1
      } else {
        self.ac_offset0
      };

      let abs_qcoeff: u32 =
        level0 + (abs_coeff + offset >= (level0 + 1) * ac_quant) as u32;
      if level_mode != 0 && abs_qcoeff == 0 {
        level_mode = 0;
      } else if abs_qcoeff > 1 {
        level_mode = 1;
      }

      qcoeffs[pos as usize] = T::cast_from(copysign(abs_qcoeff, coeff));
    }

    // Rather than zeroing the tail in scan order, assume that qcoeffs is
    // pre-filled with zeros.

    // Check the eob is correct
    debug_assert_eq!(
      usize::from(eob),
      scan
        .iter()
        .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0))
        .map(|n| n + 1)
        .unwrap_or(0)
    );

    eob
  }
}

pub mod rust {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use std::mem::MaybeUninit;

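  /// Dequantizes `coeffs` into `rcoeffs`: each coefficient is multiplied by
  /// the DC or AC quantizer step size and shifted back down by the transform
  /// scale, with the `(c >> 31) & offset` term rounding negative values
  /// toward zero.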
  pub fn dequantize<T: Coefficient>(
    qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit<T>],
    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
    _cpu: CpuFeatureLevel,
  ) {
    let log_tx_scale = get_log_tx_scale(tx_size) as i32;
    let offset = (1 << log_tx_scale) - 1;

    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;

    for (i, (r, c)) in rcoeffs
      .iter_mut()
      .zip(coeffs.iter().map(|&c| i32::cast_from(c)))
      .enumerate()
    {
      let quant = if i == 0 { dc_quant } else { ac_quant };
      r.write(T::cast_from(
        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
      ));
    }
  }
}