// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

#![allow(non_upper_case_globals)]

mod tables;

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::quantize::*;
  } else {
    pub use self::rust::*;
  }
}

pub use tables::*;

use crate::scan_order::av1_scan_orders;
use crate::transform::{TxSize, TxType};
use crate::util::*;
use std::convert::Into;
use std::mem;
use std::num::{NonZeroU16, NonZeroU32, NonZeroU64};

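/// Returns the base-2 logarithm of the extra scaling applied to coefficients
/// of large transforms: 0 for transforms of up to 256 pixels (e.g. 16x16),
/// 1 for up to 1024 pixels (e.g. 32x32), and 2 for anything larger
/// (e.g. 64x64). See `test_tx_log_scale` below for the full mapping.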
pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
  let num_pixels: usize = tx_size.area();

  Into::<usize>::into(num_pixels > 256)
    + Into::<usize>::into(num_pixels > 1024)
}

pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let dc_q: [&[NonZeroU16; 256]; 3] =
    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let ac_q: [&[NonZeroU16; 256]; 3] =
    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

// TODO: Handle lossless properly.
fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
  if quantizer < qlookup[MINQ].get() as i64 {
    MINQ as u8
  } else if quantizer >= qlookup[MAXQ].get() as i64 {
    MAXQ as u8
  } else {
    match qlookup
      .binary_search(&NonZeroU16::new(quantizer as u16).expect("Not zero"))
    {
      Ok(qi) => qi as u8,
      Err(qi) => {
        debug_assert!(qi > MINQ);
        debug_assert!(qi <= MAXQ);
        // Pick the closest quantizer in the log domain.
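        // Comparing quantizer^2 against qlookup[qi - 1] * qlookup[qi] is the
        // same as comparing quantizer against the geometric mean of the two
        // candidates, i.e. the midpoint between them in the log domain.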
        let qthresh =
          (qlookup[qi - 1].get() as i32) * (qlookup[qi].get() as i32);
        let q2_i32 = (quantizer as i32) * (quantizer as i32);
        if q2_i32 < qthresh {
          (qi - 1) as u8
        } else {
          qi as u8
        }
      }
    }
  }
}

pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &dc_qlookup_Q3,
    10 => &dc_qlookup_10_Q3,
    12 => &dc_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &ac_qlookup_Q3,
    10 => &ac_qlookup_10_Q3,
    12 => &ac_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

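/// Quantizer state precomputed by `update` and consumed by `quantize`:
/// the DC/AC quantizer step sizes, the rounding offsets derived from them,
/// and the `(mul, add, shift)` triples produced by `divu_gen` that let
/// `divu_pair` divide by those step sizes without an integer division.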
#[derive(Debug, Clone, Copy)]
pub struct QuantizationContext {
  log_tx_scale: usize,
  dc_quant: NonZeroU16,
  dc_offset: u32,
  dc_mul_add: (u32, u32, u32),

  ac_quant: NonZeroU16,
  ac_offset_eob: u32,
  ac_offset0: u32,
  ac_offset1: u32,
  ac_mul_add: (u32, u32, u32),
}

impl Default for QuantizationContext {
  fn default() -> Self {
    QuantizationContext {
      dc_quant: NonZeroU16::new(1).expect("Not zero"),
      ac_quant: NonZeroU16::new(1).expect("Not zero"),
      log_tx_scale: Default::default(),
      dc_offset: Default::default(),
      dc_mul_add: Default::default(),
      ac_offset_eob: Default::default(),
      ac_offset0: Default::default(),
      ac_offset1: Default::default(),
      ac_mul_add: Default::default(),
    }
  }
}

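// `divu_gen` precomputes constants `(a, b, shift)` for a divisor `d` such
// that, for any 32-bit `x`, `divu_pair(x, (a, b, shift))` evaluates `x / d`
// using only a widening multiply, an add and shifts; `test_divu_pair` below
// checks this for small divisors and dividends. This is a multiply-and-shift
// reciprocal scheme; the power-of-two case is handled separately with an
// all-ones multiplier so that only the final shift matters.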
fn divu_gen(d: NonZeroU32) -> (u32, u32, u32) {
  let nbits: u64 = (mem::size_of_val(&d) as u64) * 8;
  let m: u64 = nbits - d.leading_zeros() as u64 - 1;
  if d.is_power_of_two() {
    (0xFFFF_FFFF, 0xFFFF_FFFF, m as u32)
  } else {
    let d = NonZeroU64::from(d);
    let t: u64 = (1u64 << (m + nbits)) / d;

    let d: u64 = d.get();
    let r: u64 = (t * d + d) & ((1 << nbits) - 1);
    if r <= 1u64 << m {
      (t as u32 + 1, 0u32, m as u32)
    } else {
      (t as u32, t as u32, m as u32)
    }
  }
}

#[inline]
const fn divu_pair(x: u32, d: (u32, u32, u32)) -> u32 {
  let x = x as u64;
  let (a, b, shift) = d;
  let shift = shift as u64;
  let a = a as u64;
  let b = b as u64;

  (((a * x + b) >> 32) >> shift) as u32
}

#[inline]
const fn copysign(value: u32, signed: i32) -> i32 {
  if signed < 0 {
    -(value as i32)
  } else {
    value as i32
  }
}

#[cfg(test)]
mod test {
  use super::*;
  use crate::transform::TxSize::*;

  #[test]
  fn test_divu_pair() {
    for d in 1..1024 {
      for x in 0..1000 {
        let ab = divu_gen(NonZeroU32::new(d).unwrap());
        assert_eq!(x / d, divu_pair(x, ab));
      }
    }
  }
  #[test]
  fn gen_divu_table() {
    let b: Vec<(u32, u32, u32)> =
      dc_qlookup_Q3.iter().map(|&v| divu_gen(v.into())).collect();

    println!("{:?}", b);
  }
  #[test]
  fn test_tx_log_scale() {
    let tx_sizes = [
      (TX_4X4, 0),
      (TX_8X8, 0),
      (TX_16X16, 0),
      (TX_32X32, 1),
      (TX_64X64, 2),
      (TX_4X8, 0),
      (TX_8X4, 0),
      (TX_8X16, 0),
      (TX_16X8, 0),
      (TX_16X32, 1),
      (TX_32X16, 1),
      (TX_32X64, 2),
      (TX_64X32, 2),
      (TX_4X16, 0),
      (TX_16X4, 0),
      (TX_8X32, 0),
      (TX_32X8, 0),
      (TX_16X64, 1),
      (TX_64X16, 1),
    ];
    for &tx_size in tx_sizes.iter() {
      assert!(tx_size.1 == get_log_tx_scale(tx_size.0));
    }
  }
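  // A minimal consistency check for select_dc_qi / select_ac_qi, sketched
  // under one assumption: the lookup tables are sorted in non-decreasing
  // order, as `select_qi`'s binary search already requires. The tables
  // contain duplicate entries, so the index itself need not round-trip, but
  // mapping a table value back through select_*_qi must land on an index
  // that holds the same value.
  #[test]
  fn test_select_qi_consistency() {
    for &bit_depth in &[8usize, 10, 12] {
      for qi in 0..=255u8 {
        let dc = dc_q(qi, 0, bit_depth).get();
        let dc_qi = select_dc_qi(dc as i64, bit_depth);
        assert_eq!(dc_q(dc_qi, 0, bit_depth).get(), dc);

        let ac = ac_q(qi, 0, bit_depth).get();
        let ac_qi = select_ac_qi(ac as i64, bit_depth);
        assert_eq!(ac_q(ac_qi, 0, bit_depth).get(), ac);
      }
    }
  }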
}

impl QuantizationContext {
  pub fn update(
    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
    dc_delta_q: i8, ac_delta_q: i8,
  ) {
    self.log_tx_scale = get_log_tx_scale(tx_size);

    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
    self.dc_mul_add = divu_gen(self.dc_quant.into());

    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
    self.ac_mul_add = divu_gen(self.ac_quant.into());

    // All of these biases were derived by measuring the cost of coding
    // a zero vs coding a one on any given coefficient position, or, in
    // the case of the EOB bias, the cost of coding the block with
    // the chosen EOB (rounding to one) vs rounding to zero and continuing
    // to choose a new EOB. This was done over several clips, with the
    // average of the bit costs taken over all blocks in the set, and a new
    // bias derived via the method outlined in Jean-Marc Valin's
    // Journal of Dubious Theoretical Results[1], aka:
    //
    // lambda = ln(2) / 6.0
    // threshold = 0.5 + (lambda * avg_rate_diff) / 2.0
    // bias = 1 - threshold
    //
    // lambda is a constant since our offsets are already adjusted for the
    // quantizer.
    //
    // Biases were then updated, and cost collection was re-run, until
    // the calculated biases started to converge after 2-4 iterations.
    //
    // In theory, the rounding biases for inter should be somewhat smaller
    // than the biases for intra, but this turns out to only be the case
    // for EOB optimization, or at least, is covered by EOB optimization.
    // The RD-optimal rounding biases for the actual coefficients seem
    // to be quite close (+/- 1/256), for both inter and intra,
    // post-deadzoning.
    //
    // [1] https://jmvalin.ca/notes/theoretical_results.pdf
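    //
    // As a rough back-of-the-envelope reading of the constants below (not
    // taken from the original measurements): the intra DC bias of 109/256
    // ~= 0.426 corresponds to threshold ~= 0.574, which with
    // lambda = ln(2) / 6.0 ~= 0.116 implies an average rate difference of
    // roughly (0.574 - 0.5) * 2 / 0.116 ~= 1.3 bits, while the intra EOB
    // bias of 88/256 ~= 0.344 corresponds to about 2.7 bits.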
    self.dc_offset =
      self.dc_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset0 =
      self.ac_quant.get() as u32 * (if is_intra { 98 } else { 97 }) / 256;
    self.ac_offset1 =
      self.ac_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset_eob =
      self.ac_quant.get() as u32 * (if is_intra { 88 } else { 44 }) / 256;
  }

  #[inline]
  pub fn quantize<T: Coefficient>(
    &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType,
  ) -> u16 {
    let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
    let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan;

    qcoeffs[0] = {
      let coeff: i32 = i32::cast_from(coeffs[0]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();
      T::cast_from(copysign(
        divu_pair(abs_coeff + self.dc_offset, self.dc_mul_add),
        coeff,
      ))
    };

    // Find the last non-zero coefficient using our smaller biases and
    // zero everything else.
    // This threshold is such that `abs(coeff) < deadzone` implies:
    // (abs(coeff << log_tx_scale) + ac_offset_eob) / ac_quant == 0
    let deadzone = T::cast_from(
      (self.ac_quant.get() as usize - self.ac_offset_eob as usize)
        .align_power_of_two_and_shift(self.log_tx_scale),
    );
    let eob = {
      let eob_minus_one = iscan
        .iter()
        .zip(coeffs)
        .map(|(&i, &c)| if c.abs() >= deadzone { i } else { 0 })
        .max()
        .unwrap_or(0);
      // We skip the DC coefficient since it has its own quantizer index.
      if eob_minus_one > 0 {
        eob_minus_one + 1
      } else {
        u16::from(qcoeffs[0] != T::cast_from(0))
      }
    };

    // Here we use different rounding biases depending on whether we've
    // had recent coefficients that are larger than one, or less than
    // one. The reason for this is that a block usually has a chunk of
    // large coefficients and a tail of zeroes and ones, and the tradeoffs
    // for coding these two are different. In the tail of zeroes and ones,
    // you'll likely end up spending most bits just saying where that
    // coefficient is in the block, whereas in the chunk of larger
    // coefficients, most bits will be spent on coding its magnitude.
    // To that end, we want to bias more toward rounding to zero for
    // that tail of zeroes and ones than we do for the larger coefficients.
    let mut level_mode = 1;
    let ac_quant = self.ac_quant.get() as u32;
    for &pos in scan.iter().take(usize::from(eob)).skip(1) {
      let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();

      let level0 = divu_pair(abs_coeff, self.ac_mul_add);
      let offset = if level0 > 1 - level_mode {
        self.ac_offset1
      } else {
        self.ac_offset0
      };

      let abs_qcoeff: u32 =
        level0 + (abs_coeff + offset >= (level0 + 1) * ac_quant) as u32;
      if level_mode != 0 && abs_qcoeff == 0 {
        level_mode = 0;
      } else if abs_qcoeff > 1 {
        level_mode = 1;
      }

      qcoeffs[pos as usize] = T::cast_from(copysign(abs_qcoeff, coeff));
    }

    // Rather than zeroing the tail in scan order, assume that qcoeffs is
    // pre-filled with zeros.

    // Check the eob is correct
    debug_assert_eq!(
      usize::from(eob),
      scan
        .iter()
        .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0))
        .map(|n| n + 1)
        .unwrap_or(0)
    );

    eob
  }
}

pub mod rust {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use std::mem::MaybeUninit;

  pub fn dequantize<T: Coefficient>(
    qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit<T>],
    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
    _cpu: CpuFeatureLevel,
  ) {
    let log_tx_scale = get_log_tx_scale(tx_size) as i32;
    let offset = (1 << log_tx_scale) - 1;

    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;

    for (i, (r, c)) in rcoeffs
      .iter_mut()
      .zip(coeffs.iter().map(|&c| i32::cast_from(c)))
      .enumerate()
    {
      let quant = if i == 0 { dc_quant } else { ac_quant };
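      // `(c >> 31) & offset` adds the rounding offset only for negative
      // coefficients, so the arithmetic right shift by `log_tx_scale`
      // rounds toward zero instead of toward negative infinity.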
      r.write(T::cast_from(
        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
      ));
    }
  }
}