1 | // Copyright (c) 2019-2022, The rav1e contributors. All rights reserved |
2 | // |
3 | // This source code is subject to the terms of the BSD 2 Clause License and |
4 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
5 | // was not distributed with this source code in the LICENSE file, you can |
6 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
7 | // Media Patent License 1.0 was not distributed with this source code in the |
8 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
9 | |
// Select the distortion implementation at compile time: x86-64 NASM
// assembly when available, AArch64 NEON assembly otherwise, and the
// portable Rust fallback in `self::rust` when neither applies.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::dist::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::dist::*;
  } else {
    pub use self::rust::*;
  }
}
19 | |
20 | pub(crate) mod rust { |
21 | use crate::activity::apply_ssim_boost; |
22 | use crate::cpu_features::CpuFeatureLevel; |
23 | use crate::tiling::*; |
24 | use crate::util::*; |
25 | |
26 | use crate::encoder::IMPORTANCE_BLOCK_SIZE; |
27 | use crate::rdo::DistortionScale; |
28 | |
29 | /// Compute the sum of absolute differences over a block. |
30 | /// w and h can be at most 128, the size of the largest block. |
31 | pub fn get_sad<T: Pixel>( |
32 | plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, |
33 | h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel, |
34 | ) -> u32 { |
35 | debug_assert!(w <= 128 && h <= 128); |
36 | let plane_org = |
37 | plane_org.subregion(Area::Rect { x: 0, y: 0, width: w, height: h }); |
38 | let plane_ref = |
39 | plane_ref.subregion(Area::Rect { x: 0, y: 0, width: w, height: h }); |
40 | |
41 | plane_org |
42 | .rows_iter() |
43 | .zip(plane_ref.rows_iter()) |
44 | .map(|(src, dst)| { |
45 | src |
46 | .iter() |
47 | .zip(dst) |
48 | .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2))) |
49 | .sum::<u32>() |
50 | }) |
51 | .sum() |
52 | } |
53 | |
54 | #[inline (always)] |
55 | const fn butterfly(a: i32, b: i32) -> (i32, i32) { |
56 | ((a + b), (a - b)) |
57 | } |
58 | |
59 | #[inline (always)] |
60 | #[allow (clippy::identity_op, clippy::erasing_op)] |
61 | fn hadamard4_1d< |
62 | const LEN: usize, |
63 | const N: usize, |
64 | const STRIDE0: usize, |
65 | const STRIDE1: usize, |
66 | >( |
67 | data: &mut [i32; LEN], |
68 | ) { |
69 | for i in 0..N { |
70 | let sub: &mut [i32] = &mut data[i * STRIDE0..]; |
71 | let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]); |
72 | let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]); |
73 | let (b0, b2) = butterfly(a0, a2); |
74 | let (b1, b3) = butterfly(a1, a3); |
75 | sub[0 * STRIDE1] = b0; |
76 | sub[1 * STRIDE1] = b1; |
77 | sub[2 * STRIDE1] = b2; |
78 | sub[3 * STRIDE1] = b3; |
79 | } |
80 | } |
81 | |
82 | #[inline (always)] |
83 | #[allow (clippy::identity_op, clippy::erasing_op)] |
84 | fn hadamard8_1d< |
85 | const LEN: usize, |
86 | const N: usize, |
87 | const STRIDE0: usize, |
88 | const STRIDE1: usize, |
89 | >( |
90 | data: &mut [i32; LEN], |
91 | ) { |
92 | for i in 0..N { |
93 | let sub: &mut [i32] = &mut data[i * STRIDE0..]; |
94 | |
95 | let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]); |
96 | let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]); |
97 | let (a4, a5) = butterfly(sub[4 * STRIDE1], sub[5 * STRIDE1]); |
98 | let (a6, a7) = butterfly(sub[6 * STRIDE1], sub[7 * STRIDE1]); |
99 | |
100 | let (b0, b2) = butterfly(a0, a2); |
101 | let (b1, b3) = butterfly(a1, a3); |
102 | let (b4, b6) = butterfly(a4, a6); |
103 | let (b5, b7) = butterfly(a5, a7); |
104 | |
105 | let (c0, c4) = butterfly(b0, b4); |
106 | let (c1, c5) = butterfly(b1, b5); |
107 | let (c2, c6) = butterfly(b2, b6); |
108 | let (c3, c7) = butterfly(b3, b7); |
109 | |
110 | sub[0 * STRIDE1] = c0; |
111 | sub[1 * STRIDE1] = c1; |
112 | sub[2 * STRIDE1] = c2; |
113 | sub[3 * STRIDE1] = c3; |
114 | sub[4 * STRIDE1] = c4; |
115 | sub[5 * STRIDE1] = c5; |
116 | sub[6 * STRIDE1] = c6; |
117 | sub[7 * STRIDE1] = c7; |
118 | } |
119 | } |
120 | |
121 | #[inline (always)] |
122 | fn hadamard2d<const LEN: usize, const W: usize, const H: usize>( |
123 | data: &mut [i32; LEN], |
124 | ) { |
125 | /*Vertical transform.*/ |
126 | let vert_func = if H == 4 { |
127 | hadamard4_1d::<LEN, W, 1, H> |
128 | } else { |
129 | hadamard8_1d::<LEN, W, 1, H> |
130 | }; |
131 | vert_func(data); |
132 | /*Horizontal transform.*/ |
133 | let horz_func = if W == 4 { |
134 | hadamard4_1d::<LEN, H, W, 1> |
135 | } else { |
136 | hadamard8_1d::<LEN, H, W, 1> |
137 | }; |
138 | horz_func(data); |
139 | } |
140 | |
  /// In-place 4x4 Hadamard transform of `data`.
  ///
  /// # Safety
  ///
  /// `data` is reinterpreted as a `[i32; 16]`, so it must contain at
  /// least 16 valid elements.
  // SAFETY: The length of data must be 16.
  unsafe fn hadamard4x4(data: &mut [i32]) {
    hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16]));
  }
145 | |
  /// In-place 8x8 Hadamard transform of `data`.
  ///
  /// # Safety
  ///
  /// `data` is reinterpreted as a `[i32; 64]`, so it must contain at
  /// least 64 valid elements.
  // SAFETY: The length of data must be 64.
  unsafe fn hadamard8x8(data: &mut [i32]) {
    hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64]));
  }
150 | |
  /// Sum of absolute transformed differences over a block.
  /// w and h can be at most 128, the size of the largest block.
  /// Use the sum of 4x4 and 8x8 hadamard transforms for the transform, but
  /// revert to sad on edges when these transforms do not fit into w and h.
  /// 4x4 transforms instead of 8x8 transforms when width or height < 8.
  pub fn get_satd<T: Pixel>(
    plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    assert!(w <= 128 && h <= 128);
    assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
    assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h);

    // Size of hadamard transform should be 4x4 or 8x8
    // 4x* and *x4 use 4x4 and all other use 8x8
    let size: usize = w.min(h).min(8);
    let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 };

    // Accumulate in 64 bits; the per-chunk sums are added for up to
    // 128x128 pixels.
    let mut sum: u64 = 0;

    // Loop over chunks the size of the chosen transform
    for chunk_y in (0..h).step_by(size) {
      let chunk_h = (h - chunk_y).min(size);
      for chunk_x in (0..w).step_by(size) {
        let chunk_w = (w - chunk_x).min(size);
        let chunk_area: Area = Area::Rect {
          x: chunk_x as isize,
          y: chunk_y as isize,
          width: chunk_w,
          height: chunk_h,
        };
        let chunk_org = plane_org.subregion(chunk_area);
        let chunk_ref = plane_ref.subregion(chunk_area);

        // Revert to sad on edge blocks (frame edges)
        if chunk_w != size || chunk_h != size {
          sum += get_sad(
            &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu,
          ) as u64;
          continue;
        }

        // Stack buffer sized for the largest (8x8) transform; only the
        // first size * size entries are used for the 4x4 transform.
        let buf: &mut [i32] = &mut [0; 8 * 8][..size * size];

        // Move the difference of the transforms to a buffer
        for (row_diff, (row_org, row_ref)) in buf
          .chunks_mut(size)
          .zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter()))
        {
          for (diff, (a, b)) in
            row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter()))
          {
            *diff = i32::cast_from(*a) - i32::cast_from(*b);
          }
        }

        // Perform the hadamard transform on the differences
        // SAFETY: A sufficient number of elements exist for the size of the
        // transform: buf has exactly size * size elements and tx2d was
        // chosen to match size.
        unsafe {
          tx2d(buf);
        }

        // Sum the absolute values of the transformed differences
        sum += buf.iter().map(|a| a.unsigned_abs() as u64).sum::<u64>();
      }
    }

    // Normalize the results: divide (with rounding) by the 1-D transform
    // length, i.e. shift right by log2(size).
    let ln = msb(size as i32) as u64;
    ((sum + (1 << ln >> 1)) >> ln) as u32
  }
222 | |
  /// Number of bits rounded off before summing in `get_weighted_sse`.
  /// Each scaled per-chunk SSE is rounded to nearest before being shifted
  /// down by this many bits.
  pub const GET_WEIGHTED_SSE_SHIFT: u8 = 8;
225 | |
  /// Computes weighted sum of squared error.
  ///
  /// Each scale is applied to a 4x4 region in the provided inputs. Each scale
  /// value is a fixed point number, currently [`DistortionScale`].
  ///
  /// Implementations can require alignment (`bw` (block width) for [`src1`] and
  /// [`src2`] and `bw/4` for `scale`).
  #[inline (never)]
  pub fn get_weighted_sse<T: Pixel>(
    src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
    scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
    _cpu: CpuFeatureLevel,
  ) -> u64 {
    let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
    // Always chunk and apply scaling on the sse of squares the size of
    // decimated/sub-sampled importance block sizes.
    // Warning: Changing this will require changing/disabling assembly.
    let chunk_size: usize = IMPORTANCE_BLOCK_SIZE >> 1;

    // Iterator of a row of scales, stretched out to be per row:
    // each chunks_exact item is one row of the scale grid, reused for all
    // pixel rows of the corresponding chunk-row of the planes.
    let scales = scale.chunks_exact(scale_stride);

    let sse = src1
      .vert_windows(chunk_size)
      .step_by(chunk_size)
      .zip(src2.vert_windows(chunk_size).step_by(chunk_size))
      .zip(scales)
      .map(|((row1, row2), scales)| {
        row1
          .horz_windows(chunk_size)
          .step_by(chunk_size)
          .zip(row2.horz_windows(chunk_size).step_by(chunk_size))
          .zip(scales)
          .map(|((chunk1, chunk2), &scale)| {
            // Unweighted SSE of one chunk_size x chunk_size square.
            let sum = chunk1
              .rows_iter()
              .zip(chunk2.rows_iter())
              .map(|(chunk_row1, chunk_row2)| {
                chunk_row1
                  .iter()
                  .zip(chunk_row2)
                  .map(|(&a, &b)| {
                    let c = i32::cast_from(a) - i32::cast_from(b);
                    (c * c) as u32
                  })
                  .sum::<u32>()
              })
              .sum::<u32>();
            // Apply the fixed-point scale, rounding to nearest before
            // dropping GET_WEIGHTED_SSE_SHIFT fractional bits.
            (sum as u64 * scale as u64 + (1 << GET_WEIGHTED_SSE_SHIFT >> 1))
              >> GET_WEIGHTED_SSE_SHIFT
          })
          .sum::<u64>()
      })
      .sum::<u64>();

    // Rounded division by the fixed-point value of a unit (1x) scale —
    // presumably normalizing the result back to unscaled SSE units;
    // verify against DistortionScale::new.
    let den = DistortionScale::new(1, 1 << GET_WEIGHTED_SSE_SHIFT).0 as u64;
    (sse + (den >> 1)) / den
  }
284 | |
  /// Number of bits of precision used in `AREA_DIVISORS`
  const AREA_DIVISOR_BITS: u8 = 14;

  /// Lookup table for 2^`AREA_DIVISOR_BITS` / (1 + x), rounded to the
  /// nearest integer. Indexed by area - 1, for areas up to 8x8.
  #[rustfmt::skip]
  const AREA_DIVISORS: [u16; 64] = [
    16384, 8192, 5461, 4096, 3277, 2731, 2341, 2048, 1820, 1638, 1489, 1365,
    1260, 1170, 1092, 1024, 964, 910, 862, 819, 780, 745, 712, 683,
    655, 630, 607, 585, 565, 546, 529, 512, 496, 482, 468, 455,
    443, 431, 420, 410, 400, 390, 381, 372, 364, 356, 349, 341,
    334, 328, 321, 315, 309, 303, 298, 293, 287, 282, 278, 273,
    269, 264, 260, 256,
  ];
298 | |
  /// Computes a distortion metric of the sum of squares weighted by activity.
  /// w and h should be <= 8.
  #[inline (never)]
  pub fn cdef_dist_kernel<T: Pixel>(
    src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
    bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    // TODO: Investigate using different constants in ssim boost for block sizes
    // smaller than 8x8.

    // This kernel only supports non-subsampled planes.
    debug_assert!(src.plane_cfg.xdec == 0);
    debug_assert!(src.plane_cfg.ydec == 0);
    debug_assert!(dst.plane_cfg.xdec == 0);
    debug_assert!(dst.plane_cfg.ydec == 0);

    // Limit kernel to 8x8
    debug_assert!(w <= 8);
    debug_assert!(h <= 8);

    // Compute the following summations.
    let mut sum_s: u32 = 0; // sum(src_{i,j})
    let mut sum_d: u32 = 0; // sum(dst_{i,j})
    let mut sum_s2: u32 = 0; // sum(src_{i,j}^2)
    let mut sum_d2: u32 = 0; // sum(dst_{i,j}^2)
    let mut sum_sd: u32 = 0; // sum(src_{i,j} * dst_{i,j})
    for (row1, row2) in src.rows_iter().take(h).zip(dst.rows_iter()) {
      for (s, d) in row1[..w].iter().zip(row2) {
        let s: u32 = u32::cast_from(*s);
        let d: u32 = u32::cast_from(*d);
        sum_s += s;
        sum_d += d;

        sum_s2 += s * s;
        sum_d2 += d * d;
        sum_sd += s * d;
      }
    }

    // To get the distortion, compute sum of squared error and apply a weight
    // based on the variance of the two planes.
    // sum((s - d)^2) = sum(s^2) + sum(d^2) - 2 * sum(s * d)
    let sse = sum_d2 + sum_s2 - 2 * sum_sd;

    // Convert to 64-bits to avoid overflow when squaring
    let sum_s = sum_s as u64;
    let sum_d = sum_d as u64;

    // Calculate the variance (more accurately variance*area) of each plane.
    // var[iance] = avg(X^2) - avg(X)^2 = sum(X^2) / n - sum(X)^2 / n^2
    // (n = # samples i.e. area)
    // var * n = sum(X^2) - sum(X)^2 / n
    // When w and h are powers of two, this can be done via shifting.
    // `div` is the Q14 fixed-point reciprocal of the area.
    let div = AREA_DIVISORS[w * h - 1] as u64;
    let div_shift = AREA_DIVISOR_BITS;
    // Due to rounding, negative values can occur when w or h aren't powers of
    // two. Saturate to avoid underflow.
    let mut svar = sum_s2.saturating_sub(
      ((sum_s * sum_s * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );
    let mut dvar = sum_d2.saturating_sub(
      ((sum_d * sum_d * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );

    // Scale variances up to 8x8 size.
    // scaled variance = var * (8x8) / wxh
    // For 8x8, this is a nop. For powers of 2, this is doable with shifting.
    // TODO: It should be possible and faster to do this adjustment in ssim boost
    // Since div = 2^14 / area, `* div >> (14 - 6)` multiplies by 64 / area.
    let scale_shift = AREA_DIVISOR_BITS - 6;
    svar =
      ((svar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
    dvar =
      ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;

    apply_ssim_boost(sse, svar, dvar, bit_depth)
  }
373 | } |
374 | |
375 | #[cfg (test)] |
376 | pub mod test { |
377 | use super::*; |
378 | use crate::cpu_features::CpuFeatureLevel; |
379 | use crate::frame::*; |
380 | use crate::tiling::Area; |
381 | use crate::util::Pixel; |
382 | |
383 | // Generate plane data for get_sad_same() |
384 | fn setup_planes<T: Pixel>() -> (Plane<T>, Plane<T>) { |
385 | // Two planes with different strides |
386 | let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); |
387 | let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); |
388 | |
389 | // Make the test pattern robust to data alignment |
390 | let xpad_off = |
391 | (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32; |
392 | |
393 | for (i, row) in |
394 | input_plane.data.chunks_mut(input_plane.cfg.stride).enumerate() |
395 | { |
396 | for (j, pixel) in row.iter_mut().enumerate() { |
397 | let val = ((j + i) as i32 - xpad_off) & 255i32; |
398 | assert!(val >= u8::MIN.into() && val <= u8::MAX.into()); |
399 | *pixel = T::cast_from(val); |
400 | } |
401 | } |
402 | |
403 | for (i, row) in rec_plane.data.chunks_mut(rec_plane.cfg.stride).enumerate() |
404 | { |
405 | for (j, pixel) in row.iter_mut().enumerate() { |
406 | let val = (j as i32 - i as i32 - xpad_off) & 255i32; |
407 | assert!(val >= u8::MIN.into() && val <= u8::MAX.into()); |
408 | *pixel = T::cast_from(val); |
409 | } |
410 | } |
411 | |
412 | (input_plane, rec_plane) |
413 | } |
414 | |
415 | // Regression and validation test for SAD computation |
416 | fn get_sad_same_inner<T: Pixel>() { |
417 | // dynamic allocation: test |
418 | let blocks: Vec<(usize, usize, u32)> = vec![ |
419 | (4, 4, 1912), |
420 | (4, 8, 4296), |
421 | (8, 4, 3496), |
422 | (8, 8, 7824), |
423 | (8, 16, 16592), |
424 | (16, 8, 14416), |
425 | (16, 16, 31136), |
426 | (16, 32, 60064), |
427 | (32, 16, 59552), |
428 | (32, 32, 120128), |
429 | (32, 64, 186688), |
430 | (64, 32, 250176), |
431 | (64, 64, 438912), |
432 | (64, 128, 654272), |
433 | (128, 64, 1016768), |
434 | (128, 128, 1689792), |
435 | (4, 16, 8680), |
436 | (16, 4, 6664), |
437 | (8, 32, 31056), |
438 | (32, 8, 27600), |
439 | (16, 64, 93344), |
440 | (64, 16, 116384), |
441 | ]; |
442 | |
443 | let bit_depth: usize = 8; |
444 | let (input_plane, rec_plane) = setup_planes::<T>(); |
445 | |
446 | for (w, h, distortion) in blocks { |
447 | let area = Area::StartingAt { x: 32, y: 40 }; |
448 | |
449 | let input_region = input_plane.region(area); |
450 | let rec_region = rec_plane.region(area); |
451 | |
452 | assert_eq!( |
453 | distortion, |
454 | get_sad( |
455 | &input_region, |
456 | &rec_region, |
457 | w, |
458 | h, |
459 | bit_depth, |
460 | CpuFeatureLevel::default() |
461 | ) |
462 | ); |
463 | } |
464 | } |
465 | |
  // SAD regression test for the 8-bit pixel type.
  #[test ]
  fn get_sad_same_u8() {
    get_sad_same_inner::<u8>();
  }
470 | |
  // SAD regression test for the 16-bit pixel type.
  #[test ]
  fn get_sad_same_u16() {
    get_sad_same_inner::<u16>();
  }
475 | |
476 | fn get_satd_same_inner<T: Pixel>() { |
477 | let blocks: Vec<(usize, usize, u32)> = vec![ |
478 | (4, 4, 1408), |
479 | (4, 8, 2016), |
480 | (8, 4, 1816), |
481 | (8, 8, 3984), |
482 | (8, 16, 5136), |
483 | (16, 8, 4864), |
484 | (16, 16, 9984), |
485 | (16, 32, 13824), |
486 | (32, 16, 13760), |
487 | (32, 32, 27952), |
488 | (32, 64, 37168), |
489 | (64, 32, 45104), |
490 | (64, 64, 84176), |
491 | (64, 128, 127920), |
492 | (128, 64, 173680), |
493 | (128, 128, 321456), |
494 | (4, 16, 3136), |
495 | (16, 4, 2632), |
496 | (8, 32, 7056), |
497 | (32, 8, 6624), |
498 | (16, 64, 18432), |
499 | (64, 16, 21312), |
500 | ]; |
501 | |
502 | let bit_depth: usize = 8; |
503 | let (input_plane, rec_plane) = setup_planes::<T>(); |
504 | |
505 | for (w, h, distortion) in blocks { |
506 | let area = Area::StartingAt { x: 32, y: 40 }; |
507 | |
508 | let input_region = input_plane.region(area); |
509 | let rec_region = rec_plane.region(area); |
510 | |
511 | assert_eq!( |
512 | distortion, |
513 | get_satd( |
514 | &input_region, |
515 | &rec_region, |
516 | w, |
517 | h, |
518 | bit_depth, |
519 | CpuFeatureLevel::default() |
520 | ) |
521 | ); |
522 | } |
523 | } |
524 | |
  // SATD regression test for the 8-bit pixel type.
  #[test ]
  fn get_satd_same_u8() {
    get_satd_same_inner::<u8>();
  }
529 | |
  // SATD regression test for the 16-bit pixel type.
  #[test ]
  fn get_satd_same_u16() {
    get_satd_same_inner::<u16>();
  }
534 | } |
535 | |