// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

#![allow(non_upper_case_globals)]

mod tables;

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::quantize::*;
  } else {
    pub use self::rust::*;
  }
}

pub use tables::*;

use crate::scan_order::av1_scan_orders;
use crate::transform::{TxSize, TxType};
use crate::util::*;
use std::convert::Into;
use std::mem;
use std::num::{NonZeroU16, NonZeroU32, NonZeroU64};

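/// Returns the base-2 coefficient scale for a transform size: 0 for areas up
/// to 256 pixels, 1 for areas up to 1024, and 2 for anything larger (see
/// `test_tx_log_scale` for the full mapping).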
pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
  let num_pixels: usize = tx_size.area();

  Into::<usize>::into(num_pixels > 256)
    + Into::<usize>::into(num_pixels > 1024)
}

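/// Looks up the DC quantizer step size from the Q3 tables for the given
/// qindex, DC delta-q, and bit depth (8, 10, or 12); the summed index is
/// clamped to the 0..=255 range.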
pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let dc_q: [&[NonZeroU16; 256]; 3] =
    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
  let ac_q: [&[NonZeroU16; 256]; 3] =
    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
  let bd: usize = ((bit_depth ^ 8) >> 1).min(2);
  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
}

// TODO: Handle lossless properly.
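/// Maps a quantizer step size back to the closest qindex in `qlookup`,
/// clamping to `MINQ`/`MAXQ` and resolving in-between values in the log
/// domain by comparing `quantizer * quantizer` against the product of the
/// two neighboring table entries.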
fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
  if quantizer < qlookup[MINQ].get() as i64 {
    MINQ as u8
  } else if quantizer >= qlookup[MAXQ].get() as i64 {
    MAXQ as u8
  } else {
    match qlookup
      .binary_search(&NonZeroU16::new(quantizer as u16).expect("Not zero"))
    {
      Ok(qi) => qi as u8,
      Err(qi) => {
        debug_assert!(qi > MINQ);
        debug_assert!(qi <= MAXQ);
        // Pick the closest quantizer in the log domain.
        let qthresh =
          (qlookup[qi - 1].get() as i32) * (qlookup[qi].get() as i32);
        let q2_i32 = (quantizer as i32) * (quantizer as i32);
        if q2_i32 < qthresh {
          (qi - 1) as u8
        } else {
          qi as u8
        }
      }
    }
  }
}

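/// Returns the qindex whose DC quantizer step size is closest to `quantizer`
/// for the given bit depth (8, 10, or 12).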
pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &dc_qlookup_Q3,
    10 => &dc_qlookup_10_Q3,
    12 => &dc_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
  let qlookup: &[NonZeroU16; QINDEX_RANGE] = match bit_depth {
    8 => &ac_qlookup_Q3,
    10 => &ac_qlookup_10_Q3,
    12 => &ac_qlookup_12_Q3,
    _ => unimplemented!(),
  };
  select_qi(quantizer, qlookup)
}

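/// Precomputed per-quantizer state: the DC/AC step sizes, their multiply-add
/// reciprocals for division, the rounding offsets derived in `update`, and
/// the transform scaling factor.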
#[derive(Debug, Clone, Copy)]
pub struct QuantizationContext {
  log_tx_scale: usize,
  dc_quant: NonZeroU16,
  dc_offset: u32,
  dc_mul_add: (u32, u32, u32),

  ac_quant: NonZeroU16,
  ac_offset_eob: u32,
  ac_offset0: u32,
  ac_offset1: u32,
  ac_mul_add: (u32, u32, u32),
}

impl Default for QuantizationContext {
  fn default() -> Self {
    QuantizationContext {
      dc_quant: NonZeroU16::new(1).expect("Not zero"),
      ac_quant: NonZeroU16::new(1).expect("Not zero"),
      log_tx_scale: Default::default(),
      dc_offset: Default::default(),
      dc_mul_add: Default::default(),
      ac_offset_eob: Default::default(),
      ac_offset0: Default::default(),
      ac_offset1: Default::default(),
      ac_mul_add: Default::default(),
    }
  }
}

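/// Generates (multiplier, addend, shift) constants so that division by `d`
/// can later be evaluated as `((a * x + b) >> 32) >> shift` in `divu_pair`,
/// the usual division-by-invariant-multiplication trick; `test_divu_pair`
/// below checks the generated constants against plain `x / d`.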
fn divu_gen(d: NonZeroU32) -> (u32, u32, u32) {
  let nbits: u64 = (mem::size_of_val(&d) as u64) * 8;
  let m: u64 = nbits - d.leading_zeros() as u64 - 1;
  if d.is_power_of_two() {
    (0xFFFF_FFFF, 0xFFFF_FFFF, m as u32)
  } else {
    let d: NonZeroU64 = NonZeroU64::from(d);
    let t: u64 = (1u64 << (m + nbits)) / d;

    let d: u64 = d.get();
    let r: u64 = (t * d + d) & ((1 << nbits) - 1);
    if r <= 1u64 << m {
      (t as u32 + 1, 0u32, m as u32)
    } else {
      (t as u32, t as u32, m as u32)
    }
  }
}

#[inline]
const fn divu_pair(x: u32, d: (u32, u32, u32)) -> u32 {
  let x: u64 = x as u64;
  let (a, b, shift) = d;
  let shift: u64 = shift as u64;
  let a: u64 = a as u64;
  let b: u64 = b as u64;

  (((a * x + b) >> 32) >> shift) as u32
}

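/// Applies the sign of `signed` to `value`.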
#[inline]
const fn copysign(value: u32, signed: i32) -> i32 {
  if signed < 0 {
    -(value as i32)
  } else {
    value as i32
  }
}

#[cfg(test)]
mod test {
  use super::*;
  use crate::transform::TxSize::*;

  #[test]
  fn test_divu_pair() {
    for d in 1..1024 {
      for x in 0..1000 {
        let ab = divu_gen(NonZeroU32::new(d).unwrap());
        assert_eq!(x / d, divu_pair(x, ab));
      }
    }
  }
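  // Spot checks for the sign-transfer helper used by `quantize`.
  #[test]
  fn test_copysign() {
    assert_eq!(copysign(5, 3), 5);
    assert_eq!(copysign(5, -3), -5);
    assert_eq!(copysign(0, -1), 0);
  }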
  #[test]
  fn gen_divu_table() {
    let b: Vec<(u32, u32, u32)> =
      dc_qlookup_Q3.iter().map(|&v| divu_gen(v.into())).collect();

    println!("{:?}", b);
  }
  #[test]
  fn test_tx_log_scale() {
    let tx_sizes = [
      (TX_4X4, 0),
      (TX_8X8, 0),
      (TX_16X16, 0),
      (TX_32X32, 1),
      (TX_64X64, 2),
      (TX_4X8, 0),
      (TX_8X4, 0),
      (TX_8X16, 0),
      (TX_16X8, 0),
      (TX_16X32, 1),
      (TX_32X16, 1),
      (TX_32X64, 2),
      (TX_64X32, 2),
      (TX_4X16, 0),
      (TX_16X4, 0),
      (TX_8X32, 0),
      (TX_32X8, 0),
      (TX_16X64, 1),
      (TX_64X16, 1),
    ];
    for &tx_size in tx_sizes.iter() {
      assert!(tx_size.1 == get_log_tx_scale(tx_size.0));
    }
  }
}

impl QuantizationContext {
  pub fn update(
    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
    dc_delta_q: i8, ac_delta_q: i8,
  ) {
    self.log_tx_scale = get_log_tx_scale(tx_size);

    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
    self.dc_mul_add = divu_gen(self.dc_quant.into());

    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
    self.ac_mul_add = divu_gen(self.ac_quant.into());

    // All of these biases were derived by measuring the cost of coding
    // a zero vs coding a one on any given coefficient position, or, in
    // the case of the EOB bias, the cost of coding the block with
    // the chosen EOB (rounding to one) vs rounding to zero and continuing
    // to choose a new EOB. This was done over several clips, with the
    // average of the bit costs taken over all blocks in the set, and a new
    // bias derived via the method outlined in Jean-Marc Valin's
    // Journal of Dubious Theoretical Results[1], aka:
    //
    // lambda = ln(2) / 6.0
    // threshold = 0.5 + (lambda * avg_rate_diff) / 2.0
    // bias = 1 - threshold
    //
    // lambda is a constant since our offsets are already adjusted for the
    // quantizer.
    //
    // Biases were then updated, and cost collection was re-run, until
    // the calculated biases started to converge after 2-4 iterations.
    //
    // In theory, the rounding biases for inter should be somewhat smaller
    // than the biases for intra, but this turns out to only be the case
    // for EOB optimization, or at least, is covered by EOB optimization.
    // The RD-optimal rounding biases for the actual coefficients seem
    // to be quite close (+/- 1/256), for both inter and intra,
    // post-deadzoning.
    //
    // [1] https://jmvalin.ca/notes/theoretical_results.pdf
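    //
    // For example, the intra DC bias below is 109/256, so a scaled residual
    // only rounds away from zero once its remainder modulo dc_quant reaches
    // (256 - 109)/256, i.e. about 0.57 of a step size.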
    self.dc_offset =
      self.dc_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset0 =
      self.ac_quant.get() as u32 * (if is_intra { 98 } else { 97 }) / 256;
    self.ac_offset1 =
      self.ac_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset_eob =
      self.ac_quant.get() as u32 * (if is_intra { 88 } else { 44 }) / 256;
  }

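  /// Quantizes `coeffs` into `qcoeffs` (which is assumed to be zero-filled)
  /// for the given transform size and type, and returns the end-of-block
  /// position: one past the last non-zero quantized coefficient, or 0 when
  /// every coefficient quantizes to zero.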
  #[inline]
  pub fn quantize<T: Coefficient>(
    &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType,
  ) -> u16 {
    let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
    let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan;

    qcoeffs[0] = {
      let coeff: i32 = i32::cast_from(coeffs[0]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();
      T::cast_from(copysign(
        divu_pair(abs_coeff + self.dc_offset, self.dc_mul_add),
        coeff,
      ))
    };

    // Find the last non-zero coefficient using our smaller biases and
    // zero everything else.
    // This threshold is such that `abs(coeff) < deadzone` implies:
    // (abs(coeff << log_tx_scale) + ac_offset_eob) / ac_quant == 0
    let deadzone = T::cast_from(
      (self.ac_quant.get() as usize - self.ac_offset_eob as usize)
        .align_power_of_two_and_shift(self.log_tx_scale),
    );
    let eob = {
      let eob_minus_one = iscan
        .iter()
        .zip(coeffs)
        .map(|(&i, &c)| if c.abs() >= deadzone { i } else { 0 })
        .max()
        .unwrap_or(0);
      // We skip the DC coefficient since it has its own quantizer index.
      if eob_minus_one > 0 {
        eob_minus_one + 1
      } else {
        u16::from(qcoeffs[0] != T::cast_from(0))
      }
    };

    // Here we use different rounding biases depending on whether we've
    // had recent coefficients that are larger than one, or less than
    // one. The reason for this is that a block usually has a chunk of
    // large coefficients and a tail of zeroes and ones, and the tradeoffs
    // for coding these two are different. In the tail of zeroes and ones,
    // you'll likely end up spending most bits just saying where that
    // coefficient is in the block, whereas in the chunk of larger
    // coefficients, most bits will be spent on coding its magnitude.
    // To that end, we want to bias more toward rounding to zero for
    // that tail of zeroes and ones than we do for the larger coefficients.
    let mut level_mode = 1;
    let ac_quant = self.ac_quant.get() as u32;
    for &pos in scan.iter().take(usize::from(eob)).skip(1) {
      let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();

      let level0 = divu_pair(abs_coeff, self.ac_mul_add);
      let offset = if level0 > 1 - level_mode {
        self.ac_offset1
      } else {
        self.ac_offset0
      };

      let abs_qcoeff: u32 =
        level0 + (abs_coeff + offset >= (level0 + 1) * ac_quant) as u32;
      if level_mode != 0 && abs_qcoeff == 0 {
        level_mode = 0;
      } else if abs_qcoeff > 1 {
        level_mode = 1;
      }

      qcoeffs[pos as usize] = T::cast_from(copysign(abs_qcoeff, coeff));
    }

    // Rather than zeroing the tail in scan order, assume that qcoeffs is
    // pre-filled with zeros.

    // Check the eob is correct
    debug_assert_eq!(
      usize::from(eob),
      scan
        .iter()
        .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0))
        .map(|n| n + 1)
        .unwrap_or(0)
    );

    eob
  }
}

pub mod rust {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use std::mem::MaybeUninit;

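  /// Dequantizes `coeffs` into `rcoeffs`: each coefficient is multiplied by
  /// the DC or AC quantizer step size and shifted back down by the transform
  /// scale, with the `(c >> 31) & offset` term rounding negative values
  /// toward zero.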
  pub fn dequantize<T: Coefficient>(
    qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit<T>],
    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
    _cpu: CpuFeatureLevel,
  ) {
    let log_tx_scale = get_log_tx_scale(tx_size) as i32;
    let offset = (1 << log_tx_scale) - 1;

    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;

    for (i, (r, c)) in rcoeffs
      .iter_mut()
      .zip(coeffs.iter().map(|&c| i32::cast_from(c)))
      .enumerate()
    {
      let quant = if i == 0 { dc_quant } else { ac_quant };
      r.write(T::cast_from(
        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
      ));
    }
  }
}