lrf.rs source code [crates/rav1e/src/lrf.rs]

1	// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
2	//
3	// This source code is subject to the terms of the BSD 2 Clause License and
4	// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5	// was not distributed with this source code in the LICENSE file, you can
6	// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7	// Media Patent License 1.0 was not distributed with this source code in the
8	// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10	cfg_if::cfg_if! {
11	if #[cfg(nasm_x86_64)] {
12	use crate::asm::x86::lrf::*;
13	} else {
14	use self::rust::*;
15	}
16	}
17
18	use crate::api::SGRComplexityLevel;
19	use crate::color::ChromaSampling::Cs400;
20	use crate::context::{MAX_PLANES, SB_SIZE};
21	use crate::encoder::FrameInvariants;
22	use crate::frame::{
23	AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice,
24	};
25	use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect};
26	use crate::util::{clamp, CastFromPrimitive, ILog, Pixel};
27	use std::cmp;
28	use std::iter::FusedIterator;
29	use std::ops::{Index, IndexMut};
30
/// Log2 of the maximum restoration tile dimension; the solve and stripe image
/// sizes in this file are derived from it.
pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8;

// Restoration type identifiers.
pub const RESTORE_NONE: u8 = 0;
pub const RESTORE_SWITCHABLE: u8 = 1;
pub const RESTORE_WIENER: u8 = 2;
pub const RESTORE_SGRPROJ: u8 = 3;

// Per-tap minimum, midpoint and maximum values for the three Wiener taps.
pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17];
pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15];
pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46];
#[allow(unused)]
pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3];
pub const WIENER_BITS: usize = 7;

// Minimum, midpoint and maximum values for the two sgrproj `xqd` weights.
pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32];
pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31];
pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95];
pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4;
/// Precision of the projection weights; the three weights sum to
/// `1 << SGRPROJ_PRJ_BITS`.
pub const SGRPROJ_PRJ_BITS: u8 = 7;
/// Number of bits in a parameter-set index (`1 << 4` = 16 sets).
pub const SGRPROJ_PARAMS_BITS: u8 = 4;
pub const SGRPROJ_MTABLE_BITS: u8 = 20;
pub const SGRPROJ_SGR_BITS: u8 = 8;
pub const SGRPROJ_RECIP_BITS: u8 = 12;
/// Extra fractional bits carried by the intermediate box-filter outputs.
pub const SGRPROJ_RST_BITS: u8 = 4;
/// Strength parameters indexed by parameter set. Each row is `[s_r2, s_r1]`:
/// element 0 is used for the radius-2 pass and element 1 for the radius-1
/// pass. A zero disables the corresponding pass.
pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [
  [140, 3236],
  [112, 2158],
  [93, 1618],
  [80, 1438],
  [70, 1295],
  [58, 1177],
  [47, 1079],
  [37, 996],
  [30, 925],
  [25, 863],
  [0, 2589],
  [0, 1618],
  [0, 1177],
  [0, 925],
  [56, 0],
  [22, 0],
];
73
// Lists of indices into SGRPROJ_PARAMS_S that are searched at a given
// complexity level.
// SGRPROJ_ALL_SETS contains every possible index.
const SGRPROJ_ALL_SETS: &[u8] =
  &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
// SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives
// most of the gains from sgr. The decision of which values to use is somewhat
// arbitrary. The sgr parameters have 3 discontinuous groups. The first has
// both parameters as non-zero. The other two are distinguishable by which of
// the two parameters is zero. There are an even number of each of these
// groups and the non-zero parameters grow as the indices increase. This array
// uses the 1st, 3rd, ... smallest params of each group.
const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15];
86
87	pub const fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] {
88	match complexity {
89	SGRComplexityLevel::Full => SGRPROJ_ALL_SETS,
90	SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS,
91	}
92	}
93
94	pub const SOLVE_IMAGE_MAX: usize = `1` << RESTORATION_TILESIZE_MAX_LOG2;
95	pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + `6` + `2`;
96	pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE;
97	pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT;
98
99	pub const STRIPE_IMAGE_MAX: usize = (`1` << RESTORATION_TILESIZE_MAX_LOG2)
100	+ (`1` << (RESTORATION_TILESIZE_MAX_LOG2 - `1`));
101	pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + `6` + `2`;
102	pub const STRIPE_IMAGE_HEIGHT: usize = `64` + `6` + `2`;
103	pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT;
104
105	pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX]
106	[(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize];
107
/// The buffer used in `sgrproj_stripe_filter()` and `sgrproj_solve()`.
///
/// Holds two equally sized tables: the running sums of pixel values and the
/// running sums of squared pixel values.
#[derive(Debug)]
pub struct IntegralImageBuffer {
  /// Integral image of the pixel values.
  pub integral_image: Vec<u32>,
  /// Integral image of the squared pixel values.
  pub sq_integral_image: Vec<u32>,
}

impl IntegralImageBuffer {
  /// Creates a new buffer with the given size, filled with zeros.
  ///
  /// `size` is the number of `u32` elements in each of the two tables.
  #[inline]
  pub fn zeroed(size: usize) -> Self {
    Self { integral_image: vec![0; size], sq_integral_image: vec![0; size] }
  }
}
122
/// The restoration filter chosen for a restoration unit, together with its
/// parameters.
#[allow(unused)] // Wiener coming soon!
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub enum RestorationFilter {
  /// No restoration filtering.
  #[default]
  None,
  /// Wiener filter with two sets of three coefficients.
  Wiener {
    coeffs: [[i8; 3]; 2],
  },
  /// Self-guided projection filter: parameter set index plus the two `xqd`
  /// weights.
  Sgrproj {
    set: u8,
    xqd: [i8; 2],
  },
}

impl RestorationFilter {
  /// Returns `true` when `self` and `cmp` differ in variant or in any
  /// parameter.
  ///
  /// This is a `const fn`, so fields are compared element by element rather
  /// than through the derived `PartialEq`.
  pub const fn notequal(self, cmp: RestorationFilter) -> bool {
    match (self, cmp) {
      (RestorationFilter::None, RestorationFilter::None) => false,
      (
        RestorationFilter::Sgrproj { set, xqd },
        RestorationFilter::Sgrproj { set: set2, xqd: xqd2 },
      ) => !(set == set2 && xqd[0] == xqd2[0] && xqd[1] == xqd2[1]),
      (
        RestorationFilter::Wiener { coeffs },
        RestorationFilter::Wiener { coeffs: coeffs2 },
      ) => {
        !(coeffs[0][0] == coeffs2[0][0]
          && coeffs[0][1] == coeffs2[0][1]
          && coeffs[0][2] == coeffs2[0][2]
          && coeffs[1][0] == coeffs2[1][0]
          && coeffs[1][1] == coeffs2[1][1]
          && coeffs[1][2] == coeffs2[1][2])
      }
      // Different variants always compare as not equal.
      _ => true,
    }
  }
}
163
164	pub(crate) mod rust {
165	use crate::cpu_features::CpuFeatureLevel;
166	use crate::frame::PlaneSlice;
167	use crate::lrf::{
168	get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS,
169	SGRPROJ_SGR_BITS,
170	};
171	use crate::util::CastFromPrimitive;
172	use crate::Pixel;
173
174	#[inline(always)]
175	pub(crate) fn sgrproj_box_ab_internal<const BD: usize>(
176	r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
177	iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32,
178	) {
179	let d: usize = r * `2` + `1`;
180	let n: usize = d * d;
181	let one_over_n = if r == `1` { `455` } else { `164` };
182
183	assert!(iimg.len() > (y + d) * iimg_stride + stripe_w + `1` + d);
184	assert!(iimg_sq.len() > (y + d) * iimg_stride + stripe_w + `1` + d);
185	assert!(af.len() > stripe_w + `1`);
186	assert!(bf.len() > stripe_w + `1`);
187
188	for x in start_x..stripe_w + `2` {
189	// SAFETY: We perform the bounds checks above, once for the whole loop
190	unsafe {
191	let sum = get_integral_square(iimg, iimg_stride, x, y, d);
192	let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d);
193	let (reta, retb) =
194	sgrproj_sum_finish::<BD>(ssq, sum, n as u32, one_over_n, s);
195	*af.get_unchecked_mut(x) = reta;
196	*bf.get_unchecked_mut(x) = retb;
197	}
198	}
199	}
200
201	// computes an intermediate (ab) row for stripe_w + 2 columns at row y
202	pub(crate) fn sgrproj_box_ab_r1<const BD: usize>(
203	af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
204	iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
205	_cpu: CpuFeatureLevel,
206	) {
207	sgrproj_box_ab_internal::<BD>(
208	`1`,
209	af,
210	bf,
211	iimg,
212	iimg_sq,
213	iimg_stride,
214	`0`,
215	y,
216	stripe_w,
217	s,
218	);
219	}
220
221	// computes an intermediate (ab) row for stripe_w + 2 columns at row y
222	pub(crate) fn sgrproj_box_ab_r2<const BD: usize>(
223	af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
224	iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
225	_cpu: CpuFeatureLevel,
226	) {
227	sgrproj_box_ab_internal::<BD>(
228	`2`,
229	af,
230	bf,
231	iimg,
232	iimg_sq,
233	iimg_stride,
234	`0`,
235	y,
236	stripe_w,
237	s,
238	);
239	}
240
241	pub(crate) fn sgrproj_box_f_r0<T: Pixel>(
242	f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
243	_cpu: CpuFeatureLevel,
244	) {
245	sgrproj_box_f_r0_internal(f, `0`, y, w, cdeffed);
246	}
247
248	#[inline(always)]
249	pub(crate) fn sgrproj_box_f_r0_internal<T: Pixel>(
250	f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
251	) {
252	let line = cdeffed.row(y);
253	for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) {
254	fp = u32*::cast_from(v) << SGRPROJ_RST_BITS;
255	}
256	}
257
258	pub(crate) fn sgrproj_box_f_r1<T: Pixel>(
259	af: &[&[u32]; `3`], bf: &[&[u32]; `3`], f: &mut [u32], y: usize, w: usize,
260	cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
261	) {
262	sgrproj_box_f_r1_internal(af, bf, f, `0`, y, w, cdeffed);
263	}
264
265	#[inline(always)]
266	pub(crate) fn sgrproj_box_f_r1_internal<T: Pixel>(
267	af: &[&[u32]; `3`], bf: &[&[u32]; `3`], f: &mut [u32], start_x: usize,
268	y: usize, w: usize, cdeffed: &PlaneSlice<T>,
269	) {
270	let shift = `5` + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
271	let line = cdeffed.row(y);
272	for x in start_x..w {
273	let a = `3` * (af[`0`][x] + af[`2`][x] + af[`0`][x + `2`] + af[`2`][x + `2`])
274	+ `4`
275	* (af[`1`][x]
276	+ af[`0`][x + `1`]
277	+ af[`1`][x + `1`]
278	+ af[`2`][x + `1`]
279	+ af[`1`][x + `2`]);
280	let b = `3` * (bf[`0`][x] + bf[`2`][x] + bf[`0`][x + `2`] + bf[`2`][x + `2`])
281	+ `4`
282	* (bf[`1`][x]
283	+ bf[`0`][x + `1`]
284	+ bf[`1`][x + `1`]
285	+ bf[`2`][x + `1`]
286	+ bf[`1`][x + `2`]);
287	let v = a * u32::cast_from(line[x]) + b;
288	f[x] = (v + (`1` << shift >> `1`)) >> shift;
289	}
290	}
291
292	pub(crate) fn sgrproj_box_f_r2<T: Pixel>(
293	af: &[&[u32]; `2`], bf: &[&[u32]; `2`], f0: &mut [u32], f1: &mut [u32],
294	y: usize, w: usize, cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
295	) {
296	sgrproj_box_f_r2_internal(af, bf, f0, f1, `0`, y, w, cdeffed);
297	}
298
299	#[inline(always)]
300	pub(crate) fn sgrproj_box_f_r2_internal<T: Pixel>(
301	af: &[&[u32]; `2`], bf: &[&[u32]; `2`], f0: &mut [u32], f1: &mut [u32],
302	start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
303	) {
304	let shift = `5` + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
305	let shifto = `4` + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
306	let line = cdeffed.row(y);
307	let line1 = cdeffed.row(y + `1`);
308
309	let af0 = af[`0`][start_x..w + `3`].windows(`3`);
310	let af1 = af[`1`][start_x..w + `3`].windows(`3`);
311	let bf0 = bf[`0`][start_x..w + `3`].windows(`3`);
312	let bf1 = bf[`1`][start_x..w + `3`].windows(`3`);
313
314	let af_it = af0.zip(af1);
315	let bf_it = bf0.zip(bf1);
316
317	let in0 = line[start_x..w].iter();
318	let in1 = line1[start_x..w].iter();
319
320	let o0 = f0[start_x..w].iter_mut();
321	let o1 = f1[start_x..w].iter_mut();
322
323	let in_iter = in0.zip(in1);
324	let out_iter = o0.zip(o1);
325
326	let io_iter = out_iter.zip(in_iter);
327
328	for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in
329	io_iter.zip(af_it.zip(bf_it))
330	{
331	let a = `5` * (af_0[`0`] + af_0[`2`]) + `6` * af_0[`1`];
332	let b = `5` * (bf_0[`0`] + bf_0[`2`]) + `6` * bf_0[`1`];
333	let ao = `5` * (af_1[`0`] + af_1[`2`]) + `6` * af_1[`1`];
334	let bo = `5` * (bf_1[`0`] + bf_1[`2`]) + `6` * bf_1[`1`];
335	let v = (a + ao) * u32::cast_from(p0) + b + bo;
336	*o0 = (v + (`1` << shift >> `1`)) >> shift;
337	let vo = ao * u32::cast_from(p1) + bo;
338	*o1 = (vo + (`1` << shifto >> `1`)) >> shifto;
339	}
340	}
341	}
342
343	#[inline(always)]
344	fn sgrproj_sum_finish<const BD: usize>(
345	ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32,
346	) -> (u32, u32) {
347	let bdm8: usize = BD - `8`;
348	let scaled_ssq: u32 = (ssq + (`1` << (`2` * bdm8) >> `1`)) >> (`2` * bdm8);
349	let scaled_sum: u32 = (sum + (`1` << bdm8 >> `1`)) >> bdm8;
350	let p: u32 = (scaled_ssq * n).saturating_sub(scaled_sum * scaled_sum);
351	let z: u32 = (p * s + (`1` << SGRPROJ_MTABLE_BITS >> `1`)) >> SGRPROJ_MTABLE_BITS;
352	let a: u32 = if z >= `255` {
353	`256`
354	} else if z == `0` {
355	`1`
356	} else {
357	((z << SGRPROJ_SGR_BITS) + z / `2`) / (z + `1`)
358	};
359	let b: u32 = ((`1` << SGRPROJ_SGR_BITS) - a) * sum * one_over_n;
360	(a, (b + (`1` << SGRPROJ_RECIP_BITS >> `1`)) >> SGRPROJ_RECIP_BITS)
361	}
362
/// Using an integral image, compute the sum of a square region.
///
/// The four corners read are `(x, y)`, `(x + size, y)`, `(x, y + size)` and
/// `(x + size, y + size)`, addressed as `row * stride + column`.
///
/// # Safety
///
/// `iimg.len()` must be greater than `(y + size) * stride + x + size`, which
/// is the largest index read.
#[inline(always)]
unsafe fn get_integral_square(
  iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
) -> u32 {
  // Cancel out overflow in iimg by using wrapping arithmetic
  let top_left = *iimg.get_unchecked(y * stride + x);
  let top_right = *iimg.get_unchecked(y * stride + x + size);
  let bottom_left = *iimg.get_unchecked((y + size) * stride + x);
  let bottom_right = *iimg.get_unchecked((y + size) * stride + x + size);
  top_left
    .wrapping_add(bottom_right)
    .wrapping_sub(bottom_left)
    .wrapping_sub(top_right)
}
379
380	struct VertPaddedIter<'a, T: Pixel> {
381	// The two sources that can be selected when clipping
382	deblocked: &'a Plane<T>,
383	cdeffed: &'a Plane<T>,
384	// x index to choice where on the row to start
385	x: isize,
386	// y index that will be mutated
387	y: isize,
388	// The index at which to terminate. Can be larger than the slice length.
389	end: isize,
390	// Used for source buffer choice/clipping. May (and regularly will)
391	// be negative.
392	stripe_begin: isize,
393	// Also used for source buffer choice/clipping. May specify a stripe boundary
394	// less than, equal to, or larger than the buffers we're accessing.
395	stripe_end: isize,
396	// Active area cropping is done by specifying a value smaller than the height
397	// of the plane.
398	crop: isize,
399	}
400
401	impl<'a, T: Pixel> VertPaddedIter<'a, T> {
402	fn new(
403	cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>,
404	stripe_h: usize, crop: usize,
405	) -> VertPaddedIter<'a, T> {
406	// cdeffed and deblocked must start at the same coordinates from their
407	// underlying planes. Since cropping is provided via a separate params, the
408	// height of the underlying planes do not need to match.
409	assert_eq!(cdeffed.x, deblocked.x);
410	assert_eq!(cdeffed.y, deblocked.y);
411
412	// To share integral images, always use the max box filter radius of 2
413	let r = `2`;
414
415	// The number of rows outside the stripe are needed
416	let rows_above = r + `2`;
417	let rows_below = `2`;
418
419	// Offset crop and stripe_h so they are relative to the underlying plane
420	// and not the plane slice.
421	let crop = crop as isize + deblocked.y;
422	let stripe_end = stripe_h as isize + deblocked.y;
423
424	// Move y up the number rows above.
425	// If y is negative we repeat the first row
426	let y = deblocked.y - rows_above as isize;
427
428	VertPaddedIter {
429	deblocked: deblocked.plane,
430	cdeffed: cdeffed.plane,
431	x: deblocked.x,
432	y,
433	end: (rows_above + stripe_h + rows_below) as isize + y,
434	stripe_begin: deblocked.y,
435	stripe_end,
436	crop,
437	}
438	}
439	}
440
441	impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> {
442	type Item = &'a [T];
443
444	#[inline(always)]
445	fn next(&mut self) -> Option<Self::Item> {
446	if self.end > self.y {
447	// clamp before deciding the source
448	// clamp vertically to storage at top and passed-in height at bottom
449	let cropped_y = clamp(self.y, `0`, self.crop - `1`);
450	// clamp vertically to stripe limits
451	let ly = clamp(cropped_y, self.stripe_begin - `2`, self.stripe_end + `1`);
452
453	// decide if we're vertically inside or outside the strip
454	let src_plane = if ly >= self.stripe_begin && ly < self.stripe_end {
455	self.cdeffed
456	} else {
457	self.deblocked
458	};
459	// cannot directly return self.ps.row(row) due to lifetime issue
460	let range = src_plane.row_range(self.x, ly);
461	self.y += `1`;
462	Some(&src_plane.data[range])
463	} else {
464	None
465	}
466	}
467
468	fn size_hint(&self) -> (usize, Option<usize>) {
469	let remaining = self.end - self.y;
470	debug_assert!(remaining >= `0`);
471	let remaining = remaining as usize;
472
473	(remaining, Some(remaining))
474	}
475	}
476
477	impl<T: Pixel> ExactSizeIterator for VertPaddedIter<'_, T> {}
478	impl<T: Pixel> FusedIterator for VertPaddedIter<'_, T> {}
479
480	struct HorzPaddedIter<'a, T: Pixel> {
481	// Active area cropping is done using the length of the slice
482	slice: &'a [T],
483	// x index of the iterator
484	// When less than 0, repeat the first element. When greater than end, repeat
485	// the last element
486	index: isize,
487	// The index at which to terminate. Can be larger than the slice length.
488	end: usize,
489	}
490
491	impl<'a, T: Pixel> HorzPaddedIter<'a, T> {
492	fn new(
493	slice: &'a [T], start_index: isize, width: usize,
494	) -> HorzPaddedIter<'a, T> {
495	HorzPaddedIter {
496	slice,
497	index: start_index,
498	end: (width as isize + start_index) as usize,
499	}
500	}
501	}
502
503	impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> {
504	type Item = &'a T;
505
506	#[inline(always)]
507	fn next(&mut self) -> Option<Self::Item> {
508	if self.index < self.end as isize {
509	// clamp to the edges of the frame
510	let x: usize = clamp(self.index, min:`0`, self.slice.len() as isize - `1`) as usize;
511	self.index += `1`;
512	Some(&self.slice[x])
513	} else {
514	None
515	}
516	}
517
518	#[inline(always)]
519	fn size_hint(&self) -> (usize, Option<usize>) {
520	let size: usize = (self.end as isize - self.index) as usize;
521	(size, Some(size))
522	}
523	}
524
525	impl<T: Pixel> ExactSizeIterator for HorzPaddedIter<'_, T> {}
526	impl<T: Pixel> FusedIterator for HorzPaddedIter<'_, T> {}
527
528	#[profiling::function]
529	pub fn setup_integral_image<T: Pixel>(
530	integral_image_buffer: &mut IntegralImageBuffer,
531	integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize,
532	stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>,
533	) {
534	let integral_image = &mut integral_image_buffer.integral_image;
535	let sq_integral_image = &mut integral_image_buffer.sq_integral_image;
536
537	// Number of elements outside the stripe
538	let left_w = `4`; // max radius of 2 + 2 padding
539	let right_w = `3`; // max radius of 2 + 1 padding
540
541	assert_eq!(cdeffed.x, deblocked.x);
542
543	// Find how many unique elements to use to the left and right
544	let left_uniques = if cdeffed.x == `0` { `0` } else { left_w };
545	let right_uniques = right_w.min(crop_w - stripe_w);
546
547	// Find the total number of unique elements used
548	let row_uniques = left_uniques + stripe_w + right_uniques;
549
550	// Negative start indices result in repeating the first element of the row
551	let start_index_x = if cdeffed.x == `0` { -(left_w as isize) } else { `0` };
552
553	let mut rows_iter = VertPaddedIter::new(
554	// Move left to encompass all the used data
555	&cdeffed.go_left(left_uniques),
556	&deblocked.go_left(left_uniques),
557	// since r2 uses every other row, we need an extra row if stripe_h is odd
558	stripe_h + (stripe_h & `1`),
559	crop_h,
560	)
561	.map(\|row: &[T]\| {
562	HorzPaddedIter::new(
563	// Limit how many unique elements we use
564	&row[..row_uniques],
565	start_index_x,
566	left_w + stripe_w + right_w,
567	)
568	});
569
570	// Setup the first row
571	{
572	let mut sum: u32 = `0`;
573	let mut sq_sum: u32 = `0`;
574	// Remove the first row and use it outside of the main loop
575	let row = rows_iter.next().unwrap();
576	for (src, (integral, sq_integral)) in
577	row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut()))
578	{
579	let current = u32::cast_from(*src);
580
581	// Wrap adds to prevent undefined behaviour on overflow. Overflow is
582	// cancelled out when calculating the sum of a region.
583	sum = sum.wrapping_add(current);
584	*integral = sum;
585	sq_sum = sq_sum.wrapping_add(current * current);
586	*sq_integral = sq_sum;
587	}
588	}
589	// Calculate all other rows
590	let mut integral_slice = &mut integral_image[..];
591	let mut sq_integral_slice = &mut sq_integral_image[..];
592	for row in rows_iter {
593	let mut sum: u32 = `0`;
594	let mut sq_sum: u32 = `0`;
595
596	// Split the data between the previous row and future rows.
597	// This allows us to mutate the current row while accessing the
598	// previous row.
599	let (integral_row_prev, integral_row) =
600	integral_slice.split_at_mut(integral_image_stride);
601	let (sq_integral_row_prev, sq_integral_row) =
602	sq_integral_slice.split_at_mut(integral_image_stride);
603	for (
604	src,
605	((integral_above, sq_integral_above), (integral, sq_integral)),
606	) in row.zip(
607	integral_row_prev
608	.iter()
609	.zip(sq_integral_row_prev.iter())
610	.zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())),
611	) {
612	let current = u32::cast_from(*src);
613	// Wrap adds to prevent undefined behaviour on overflow. Overflow is
614	// cancelled out when calculating the sum of a region.
615	sum = sum.wrapping_add(current);
616	integral = sum.wrapping_add(integral_above);
617	sq_sum = sq_sum.wrapping_add(current * current);
618	sq_integral = sq_sum.wrapping_add(sq_integral_above);
619	}
620
621	// The current row also contains all future rows. Replacing the slice with
622	// it moves down a row.
623	integral_slice = integral_row;
624	sq_integral_slice = sq_integral_row;
625	}
626	}
627
628	#[profiling::function]
629	pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
630	set: u8, xqd: [i8; `2`], fi: &FrameInvariants<T>,
631	integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
632	cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
633	) {
634	let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect();
635	let mut a_r2: [[u32; IMAGE_WIDTH_MAX + `2`]; `2`] =
636	[[`0`; IMAGE_WIDTH_MAX + `2`]; `2`];
637	let mut b_r2: [[u32; IMAGE_WIDTH_MAX + `2`]; `2`] =
638	[[`0`; IMAGE_WIDTH_MAX + `2`]; `2`];
639	let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [`0`; IMAGE_WIDTH_MAX];
640	let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [`0`; IMAGE_WIDTH_MAX];
641	let mut a_r1: [[u32; IMAGE_WIDTH_MAX + `2`]; `3`] =
642	[[`0`; IMAGE_WIDTH_MAX + `2`]; `3`];
643	let mut b_r1: [[u32; IMAGE_WIDTH_MAX + `2`]; `3`] =
644	[[`0`; IMAGE_WIDTH_MAX + `2`]; `3`];
645	let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [`0`; IMAGE_WIDTH_MAX];
646
647	let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][`0`];
648	let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][`1`];
649
650	let fn_ab_r1 = match fi.sequence.bit_depth {
651	`8` => sgrproj_box_ab_r1::<`8`>,
652	`10` => sgrproj_box_ab_r1::<`10`>,
653	`12` => sgrproj_box_ab_r1::<`12`>,
654	_ => unimplemented!(),
655	};
656	let fn_ab_r2 = match fi.sequence.bit_depth {
657	`8` => sgrproj_box_ab_r2::<`8`>,
658	`10` => sgrproj_box_ab_r2::<`10`>,
659	`12` => sgrproj_box_ab_r2::<`12`>,
660	_ => unimplemented!(),
661	};
662
663	/ prime the intermediate arrays /
664	// One oddness about the radius=2 intermediate array computations that
665	// the spec doesn't make clear: Although the spec defines computation
666	// of every row (of a, b and f), only half of the rows (every-other
667	// row) are actually used.
668	let integral_image = &integral_image_buffer.integral_image;
669	let sq_integral_image = &integral_image_buffer.sq_integral_image;
670	if s_r2 > `0` {
671	fn_ab_r2(
672	&mut a_r2[`0`],
673	&mut b_r2[`0`],
674	integral_image,
675	sq_integral_image,
676	integral_image_stride,
677	`0`,
678	stripe_w,
679	s_r2,
680	fi.cpu_feature_level,
681	);
682	}
683	if s_r1 > `0` {
684	let integral_image_offset = integral_image_stride + `1`;
685	fn_ab_r1(
686	&mut a_r1[`0`],
687	&mut b_r1[`0`],
688	&integral_image[integral_image_offset..],
689	&sq_integral_image[integral_image_offset..],
690	integral_image_stride,
691	`0`,
692	stripe_w,
693	s_r1,
694	fi.cpu_feature_level,
695	);
696	fn_ab_r1(
697	&mut a_r1[`1`],
698	&mut b_r1[`1`],
699	&integral_image[integral_image_offset..],
700	&sq_integral_image[integral_image_offset..],
701	integral_image_stride,
702	`1`,
703	stripe_w,
704	s_r1,
705	fi.cpu_feature_level,
706	);
707	}
708
709	/ iterate by row /
710	// Increment by two to handle the use of even rows by r=2 and run a nested
711	// loop to handle increments of one.
712	for y in (`0`..stripe_h).step_by(`2`) {
713	// get results to use y and y+1
714	let f_r2_ab: [&[u32]; `2`] = if s_r2 > `0` {
715	fn_ab_r2(
716	&mut a_r2[(y / `2` + `1`) % `2`],
717	&mut b_r2[(y / `2` + `1`) % `2`],
718	integral_image,
719	sq_integral_image,
720	integral_image_stride,
721	y + `2`,
722	stripe_w,
723	s_r2,
724	fi.cpu_feature_level,
725	);
726	let ap0: [&[u32]; `2`] = [&a_r2[(y / `2`) % `2`], &a_r2[(y / `2` + `1`) % `2`]];
727	let bp0: [&[u32]; `2`] = [&b_r2[(y / `2`) % `2`], &b_r2[(y / `2` + `1`) % `2`]];
728	sgrproj_box_f_r2(
729	&ap0,
730	&bp0,
731	&mut f_r2_0,
732	&mut f_r2_1,
733	y,
734	stripe_w,
735	cdeffed,
736	fi.cpu_feature_level,
737	);
738	[&f_r2_0, &f_r2_1]
739	} else {
740	sgrproj_box_f_r0(
741	&mut f_r2_0,
742	y,
743	stripe_w,
744	cdeffed,
745	fi.cpu_feature_level,
746	);
747	// share results for both rows
748	[&f_r2_0, &f_r2_0]
749	};
750	for dy in `0`..(`2`.min(stripe_h - y)) {
751	let y = y + dy;
752	if s_r1 > `0` {
753	let integral_image_offset = integral_image_stride + `1`;
754	fn_ab_r1(
755	&mut a_r1[(y + `2`) % `3`],
756	&mut b_r1[(y + `2`) % `3`],
757	&integral_image[integral_image_offset..],
758	&sq_integral_image[integral_image_offset..],
759	integral_image_stride,
760	y + `2`,
761	stripe_w,
762	s_r1,
763	fi.cpu_feature_level,
764	);
765	let ap1: [&[u32]; `3`] =
766	[&a_r1[y % `3`], &a_r1[(y + `1`) % `3`], &a_r1[(y + `2`) % `3`]];
767	let bp1: [&[u32]; `3`] =
768	[&b_r1[y % `3`], &b_r1[(y + `1`) % `3`], &b_r1[(y + `2`) % `3`]];
769	sgrproj_box_f_r1(
770	&ap1,
771	&bp1,
772	&mut f_r1,
773	y,
774	stripe_w,
775	cdeffed,
776	fi.cpu_feature_level,
777	);
778	} else {
779	sgrproj_box_f_r0(
780	&mut f_r1,
781	y,
782	stripe_w,
783	cdeffed,
784	fi.cpu_feature_level,
785	);
786	}
787
788	/ apply filter /
789	let w0 = xqd[`0`] as i32;
790	let w1 = xqd[`1`] as i32;
791	let w2 = (`1` << SGRPROJ_PRJ_BITS) - w0 - w1;
792
793	let line = &cdeffed[y];
794
795	#[inline(always)]
796	fn apply_filter<U: Pixel>(
797	out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
798	stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
799	) {
800	let line_it = line[..stripe_w].iter();
801	let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
802	let f_r1_it = f_r1[..stripe_w].iter();
803	let out_it = out[..stripe_w].iter_mut();
804
805	for ((o, &u), (&f_r2_ab, &f_r1)) in
806	out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it))
807	{
808	let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
809	let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
810	let s = (v + (`1` << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> `1`))
811	>> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
812	*o = U::cast_from(clamp(s, `0`, (`1` << bit_depth) - `1`));
813	}
814	}
815
816	apply_filter(
817	&mut out[y],
818	line,
819	&f_r1,
820	f_r2_ab[dy],
821	stripe_w,
822	fi.sequence.bit_depth,
823	w0,
824	w1,
825	w2,
826	);
827	}
828	}
829	}
830
831	// Frame inputs below aren't all equal, and will change as work
832	// continues. There's no deblocked reconstruction available at this
833	// point of RDO, so we use the non-deblocked reconstruction, cdef and
834	// input. The input can be a full-sized frame. Cdef input is a partial
835	// frame constructed specifically for RDO.
836
837	// For simplicity, this ignores stripe segmentation (it's possible the
838	// extra complexity isn't worth it and we'll ignore stripes
839	// permanently during RDO, but that's not been tested yet). Data
840	// access inside the cdef frame is monolithic and clipped to the cdef
841	// borders.
842
843	// Input params follow the same rules as sgrproj_stripe_filter.
844	// Inputs are relative to the colocated slice views.
845	#[profiling::function]
846	pub fn sgrproj_solve<T: Pixel>(
847	set: u8, fi: &FrameInvariants<T>,
848	integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
849	cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
850	) -> (i8, i8) {
851	let mut a_r2: [[u32; IMAGE_WIDTH_MAX + `2`]; `2`] =
852	[[`0`; IMAGE_WIDTH_MAX + `2`]; `2`];
853	let mut b_r2: [[u32; IMAGE_WIDTH_MAX + `2`]; `2`] =
854	[[`0`; IMAGE_WIDTH_MAX + `2`]; `2`];
855	let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [`0`; IMAGE_WIDTH_MAX];
856	let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [`0`; IMAGE_WIDTH_MAX];
857	let mut a_r1: [[u32; IMAGE_WIDTH_MAX + `2`]; `3`] =
858	[[`0`; IMAGE_WIDTH_MAX + `2`]; `3`];
859	let mut b_r1: [[u32; IMAGE_WIDTH_MAX + `2`]; `3`] =
860	[[`0`; IMAGE_WIDTH_MAX + `2`]; `3`];
861	let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [`0`; IMAGE_WIDTH_MAX];
862
863	let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][`0`];
864	let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][`1`];
865
866	let mut h: [[f64; `2`]; `2`] = [[`0.`, `0.`], [`0.`, `0.`]];
867	let mut c: [f64; `2`] = [`0.`, `0.`];
868
869	let fn_ab_r1 = match fi.sequence.bit_depth {
870	`8` => sgrproj_box_ab_r1::<`8`>,
871	`10` => sgrproj_box_ab_r1::<`10`>,
872	`12` => sgrproj_box_ab_r1::<`12`>,
873	_ => unimplemented!(),
874	};
875	let fn_ab_r2 = match fi.sequence.bit_depth {
876	`8` => sgrproj_box_ab_r2::<`8`>,
877	`10` => sgrproj_box_ab_r2::<`10`>,
878	`12` => sgrproj_box_ab_r2::<`12`>,
879	_ => unimplemented!(),
880	};
881
882	/ prime the intermediate arrays /
883	// One oddness about the radius=2 intermediate array computations that
884	// the spec doesn't make clear: Although the spec defines computation
885	// of every row (of a, b and f), only half of the rows (every-other
886	// row) are actually used.
887	let integral_image = &integral_image_buffer.integral_image;
888	let sq_integral_image = &integral_image_buffer.sq_integral_image;
889	if s_r2 > `0` {
890	fn_ab_r2(
891	&mut a_r2[`0`],
892	&mut b_r2[`0`],
893	integral_image,
894	sq_integral_image,
895	SOLVE_IMAGE_STRIDE,
896	`0`,
897	cdef_w,
898	s_r2,
899	fi.cpu_feature_level,
900	);
901	}
902	if s_r1 > `0` {
903	let integral_image_offset = SOLVE_IMAGE_STRIDE + `1`;
904	fn_ab_r1(
905	&mut a_r1[`0`],
906	&mut b_r1[`0`],
907	&integral_image[integral_image_offset..],
908	&sq_integral_image[integral_image_offset..],
909	SOLVE_IMAGE_STRIDE,
910	`0`,
911	cdef_w,
912	s_r1,
913	fi.cpu_feature_level,
914	);
915	fn_ab_r1(
916	&mut a_r1[`1`],
917	&mut b_r1[`1`],
918	&integral_image[integral_image_offset..],
919	&sq_integral_image[integral_image_offset..],
920	SOLVE_IMAGE_STRIDE,
921	`1`,
922	cdef_w,
923	s_r1,
924	fi.cpu_feature_level,
925	);
926	}
927
928	/ iterate by row /
929	// Increment by two to handle the use of even rows by r=2 and run a nested
930	// loop to handle increments of one.
931	for y in (`0`..cdef_h).step_by(`2`) {
932	// get results to use y and y+1
933	let f_r2_01: [&[u32]; `2`] = if s_r2 > `0` {
934	fn_ab_r2(
935	&mut a_r2[(y / `2` + `1`) % `2`],
936	&mut b_r2[(y / `2` + `1`) % `2`],
937	integral_image,
938	sq_integral_image,
939	SOLVE_IMAGE_STRIDE,
940	y + `2`,
941	cdef_w,
942	s_r2,
943	fi.cpu_feature_level,
944	);
945	let ap0: [&[u32]; `2`] = [&a_r2[(y / `2`) % `2`], &a_r2[(y / `2` + `1`) % `2`]];
946	let bp0: [&[u32]; `2`] = [&b_r2[(y / `2`) % `2`], &b_r2[(y / `2` + `1`) % `2`]];
947	sgrproj_box_f_r2(
948	&ap0,
949	&bp0,
950	&mut f_r2_0,
951	&mut f_r2_1,
952	y,
953	cdef_w,
954	cdeffed,
955	fi.cpu_feature_level,
956	);
957	[&f_r2_0, &f_r2_1]
958	} else {
959	sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level);
960	// share results for both rows
961	[&f_r2_0, &f_r2_0]
962	};
963	for dy in `0`..(`2`.min(cdef_h - y)) {
964	let y = y + dy;
965	if s_r1 > `0` {
966	let integral_image_offset = SOLVE_IMAGE_STRIDE + `1`;
967	fn_ab_r1(
968	&mut a_r1[(y + `2`) % `3`],
969	&mut b_r1[(y + `2`) % `3`],
970	&integral_image[integral_image_offset..],
971	&sq_integral_image[integral_image_offset..],
972	SOLVE_IMAGE_STRIDE,
973	y + `2`,
974	cdef_w,
975	s_r1,
976	fi.cpu_feature_level,
977	);
978	let ap1: [&[u32]; `3`] =
979	[&a_r1[y % `3`], &a_r1[(y + `1`) % `3`], &a_r1[(y + `2`) % `3`]];
980	let bp1: [&[u32]; `3`] =
981	[&b_r1[y % `3`], &b_r1[(y + `1`) % `3`], &b_r1[(y + `2`) % `3`]];
982	sgrproj_box_f_r1(
983	&ap1,
984	&bp1,
985	&mut f_r1,
986	y,
987	cdef_w,
988	cdeffed,
989	fi.cpu_feature_level,
990	);
991	} else {
992	sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level);
993	}
994
995	#[inline(always)]
996	fn process_line<T: Pixel>(
997	h: &mut [[f64; `2`]; `2`], c: &mut [f64; `2`], cdeffed: &[T], input: &[T],
998	f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize,
999	) {
1000	let cdeffed_it = cdeffed[..cdef_w].iter();
1001	let input_it = input[..cdef_w].iter();
1002	let f_r2_ab_it = f_r2_ab[..cdef_w].iter();
1003	let f_r1_it = f_r1[..cdef_w].iter();
1004
1005	#[derive(Debug, Copy, Clone)]
1006	struct Sums {
1007	h: [[i64; `2`]; `2`],
1008	c: [i64; `2`],
1009	}
1010
1011	let sums: Sums = cdeffed_it
1012	.zip(input_it)
1013	.zip(f_r2_ab_it.zip(f_r1_it))
1014	.map(\|((&u, &i), (&f2, &f1))\| {
1015	let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
1016	let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u;
1017	let f2 = f2 as i32 - u;
1018	let f1 = f1 as i32 - u;
1019	(s as i64, f1 as i64, f2 as i64)
1020	})
1021	.fold(Sums { h: [[`0`; `2`]; `2`], c: [`0`; `2`] }, \|sums, (s, f1, f2)\| {
1022	let mut ret: Sums = sums;
1023	ret.h[`0`][`0`] += f2 * f2;
1024	ret.h[`1`][`1`] += f1 * f1;
1025	ret.h[`0`][`1`] += f1 * f2;
1026	ret.c[`0`] += f2 * s;
1027	ret.c[`1`] += f1 * s;
1028	ret
1029	});
1030
1031	h[`0`][`0`] += sums.h[`0`][`0`] as f64;
1032	h[`1`][`1`] += sums.h[`1`][`1`] as f64;
1033	h[`0`][`1`] += sums.h[`0`][`1`] as f64;
1034	c[`0`] += sums.c[`0`] as f64;
1035	c[`1`] += sums.c[`1`] as f64;
1036	}
1037
1038	process_line(
1039	&mut h,
1040	&mut c,
1041	&cdeffed[y],
1042	&input[y],
1043	&f_r1,
1044	f_r2_01[dy],
1045	cdef_w,
1046	);
1047	}
1048	}
1049
  // this is lifted almost intact from libaom
1051	let n = cdef_w as f64 * cdef_h as f64;
1052	h[`0`][`0`] /= n;
1053	h[`0`][`1`] /= n;
1054	h[`1`][`1`] /= n;
1055	h[`1`][`0`] = h[`0`][`1`];
1056	c[`0`] = (`1` << SGRPROJ_PRJ_BITS) as f64* / n;
1057	c[`1`] = (`1` << SGRPROJ_PRJ_BITS) as f64* / n;
1058	let (xq0, xq1) = if s_r2 == `0` {
1059	// H matrix is now only the scalar h[1][1]
1060	// C vector is now only the scalar c[1]
1061	if h[`1`][`1`] == `0.` {
1062	(`0`, `0`)
1063	} else {
1064	(`0`, (c[`1`] / h[`1`][`1`]).round() as i32)
1065	}
1066	} else if s_r1 == `0` {
1067	// H matrix is now only the scalar h[0][0]
1068	// C vector is now only the scalar c[0]
1069	if h[`0`][`0`] == `0.` {
1070	(`0`, `0`)
1071	} else {
1072	((c[`0`] / h[`0`][`0`]).round() as i32, `0`)
1073	}
1074	} else {
1075	let det = h[`0`][`0`].mul_add(h[`1`][`1`], -h[`0`][`1`] * h[`1`][`0`]);
1076	if det == `0.` {
1077	(`0`, `0`)
1078	} else {
1079	// If scaling up dividend would overflow, instead scale down the divisor
1080	let div1 = h[`1`][`1`].mul_add(c[`0`], -h[`0`][`1`] * c[`1`]);
1081	let div2 = h[`0`][`0`].mul_add(c[`1`], -h[`1`][`0`] * c[`0`]);
1082	((div1 / det).round() as i32, (div2 / det).round() as i32)
1083	}
1084	};
1085	{
1086	let xqd0 =
1087	clamp(xq0, SGRPROJ_XQD_MIN[`0`] as i32, SGRPROJ_XQD_MAX[`0`] as i32);
1088	let xqd1 = clamp(
1089	(`1` << SGRPROJ_PRJ_BITS) - xqd0 - xq1,
1090	SGRPROJ_XQD_MIN[`1`] as i32,
1091	SGRPROJ_XQD_MAX[`1`] as i32,
1092	);
1093	(xqd0 as i8, xqd1 as i8)
1094	}
1095	}
1096
1097	#[profiling::function]
1098	fn wiener_stripe_filter<T: Pixel>(
1099	coeffs: [[i8; `3`]; `2`], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
1100	stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
1101	cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
1102	) {
1103	let bit_depth = fi.sequence.bit_depth;
1104	let round_h = if bit_depth == `12` { `5` } else { `3` };
1105	let round_v = if bit_depth == `12` { `9` } else { `11` };
1106	let offset = `1` << (bit_depth + WIENER_BITS - round_h - `1`);
1107	let limit = (`1` << (bit_depth + `1` + WIENER_BITS - round_h)) - `1`;
1108
1109	let mut coeffs_ = [[`0`; `3`]; `2`];
1110	for i in `0`..`2` {
1111	for j in `0`..`3` {
1112	coeffs_[i][j] = i32::from(coeffs[i][j]);
1113	}
1114	}
1115
1116	let mut work: [i32; SB_SIZE + `7`] = [`0`; SB_SIZE + `7`];
1117	let vfilter: [i32; `7`] = [
1118	coeffs_[`0`][`0`],
1119	coeffs_[`0`][`1`],
1120	coeffs_[`0`][`2`],
1121	`128` - `2` * (coeffs_[`0`][`0`] + coeffs_[`0`][`1`] + coeffs_[`0`][`2`]),
1122	coeffs_[`0`][`2`],
1123	coeffs_[`0`][`1`],
1124	coeffs_[`0`][`0`],
1125	];
1126	let hfilter: [i32; `7`] = [
1127	coeffs_[`1`][`0`],
1128	coeffs_[`1`][`1`],
1129	coeffs_[`1`][`2`],
1130	`128` - `2` * (coeffs_[`1`][`0`] + coeffs_[`1`][`1`] + coeffs_[`1`][`2`]),
1131	coeffs_[`1`][`2`],
1132	coeffs_[`1`][`1`],
1133	coeffs_[`1`][`0`],
1134	];
1135
1136	// unlike x, our y can be negative to start as the first stripe
1137	// starts off the top of the frame by 8 pixels, and can also run off the end of the frame
1138	let start_wi = if stripe_y < `0` { -stripe_y } else { `0` } as usize;
1139	let start_yi = if stripe_y < `0` { `0` } else { stripe_y } as usize;
1140	let end_i = cmp::max(
1141	`0`,
1142	if stripe_h as isize + stripe_y > crop_h as isize {
1143	crop_h as isize - stripe_y - start_wi as isize
1144	} else {
1145	stripe_h as isize - start_wi as isize
1146	},
1147	) as usize;
1148
1149	let mut out_slice =
1150	out.mut_slice(PlaneOffset { x: `0`, y: start_yi as isize });
1151
1152	for xi in stripe_x..stripe_x + stripe_w {
1153	let n = cmp::min(`7`, crop_w as isize + `3` - xi as isize);
1154	for yi in stripe_y - `3`..stripe_y + stripe_h as isize + `4` {
1155	let mut acc = `0`;
1156	let src = if yi < stripe_y {
1157	let ly = cmp::max(clamp(yi, `0`, crop_h as isize - `1`), stripe_y - `2`);
1158	deblocked.row(ly)
1159	} else if yi < stripe_y + stripe_h as isize {
1160	let ly = clamp(yi, `0`, crop_h as isize - `1`);
1161	cdeffed.row(ly)
1162	} else {
1163	let ly = cmp::min(
1164	clamp(yi, `0`, crop_h as isize - `1`),
1165	stripe_y + stripe_h as isize + `1`,
1166	);
1167	deblocked.row(ly)
1168	};
1169	let start = i32::cast_from(src[`0`]);
1170	let end = i32::cast_from(src[crop_w - `1`]);
1171	for i in `0`..`3` - xi as isize {
1172	acc += hfilter[i as usize] * start;
1173	}
1174
1175	let off = `3` - (xi as isize);
1176	let s = cmp::max(`0`, off) as usize;
1177	let s1 = (s as isize - off) as usize;
1178	let n1 = (n - off) as usize;
1179
1180	for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) {
1181	acc += hf * i32::cast_from(v);
1182	}
1183
1184	for i in n..`7` {
1185	acc += hfilter[i as usize] * end;
1186	}
1187
1188	acc = (acc + (`1` << round_h >> `1`)) >> round_h;
1189	work[(yi - stripe_y + `3`) as usize] = clamp(acc, -offset, limit - offset);
1190	}
1191
1192	for (wi, dst) in (start_wi..start_wi + end_i)
1193	.zip(out_slice.rows_iter_mut().map(\|row\| &mut row[xi]).take(end_i))
1194	{
1195	let mut acc = `0`;
1196	for (i, src) in (`0`..`7`).zip(work[wi..wi + `7`].iter_mut()) {
1197	acc += vfilter[i] * *src;
1198	}
1199	*dst = T::cast_from(clamp(
1200	(acc + (`1` << round_v >> `1`)) >> round_v,
1201	`0`,
1202	(`1` << bit_depth) - `1`,
1203	));
1204	}
1205	}
1206	}
1207
/// Loop-restoration parameters for a single restoration unit.
#[derive(Copy, Clone, Debug, Default)]
pub struct RestorationUnit {
  /// Filter selected for this unit.
  pub filter: RestorationFilter,
}
1212
/// Grid of [`RestorationUnit`]s stored row by row in one boxed slice.
///
/// Indexing with a row number yields that row as a slice of `cols` units.
#[derive(Clone, Debug)]
pub struct FrameRestorationUnits {
  // Flat storage; row `r` occupies `units[r * cols..(r + 1) * cols]`.
  units: Box<[RestorationUnit]>,
  /// Number of units in each row.
  pub cols: usize,
  /// Number of rows.
  pub rows: usize,
}
1219
1220	impl FrameRestorationUnits {
1221	pub fn new(cols: usize, rows: usize) -> Self {
1222	Self {
1223	units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(),
1224	cols,
1225	rows,
1226	}
1227	}
1228	}
1229
1230	impl Index<usize> for FrameRestorationUnits {
1231	type Output = [RestorationUnit];
1232	#[inline(always)]
1233	fn index(&self, index: usize) -> &Self::Output {
1234	&self.units[index * self.cols..(index + `1`) * self.cols]
1235	}
1236	}
1237
1238	impl IndexMut<usize> for FrameRestorationUnits {
1239	#[inline(always)]
1240	fn index_mut(&mut self, index: usize) -> &mut Self::Output {
1241	&mut self.units[index * self.cols..(index + `1`) * self.cols]
1242	}
1243	}
1244
/// Loop-restoration layout parameters for one plane.
#[derive(Clone, Debug)]
pub struct RestorationPlaneConfig {
  /// Restoration type of the plane, expressed with the `RESTORE_*` constants.
  pub lrf_type: u8,
  /// Size of a restoration unit in pixels of this plane.
  pub unit_size: usize,
  // (1 << sb_x_shift) gives the number of superblocks horizontally or
  // vertically in a restoration unit, not accounting for RU stretching
  pub sb_h_shift: usize,
  pub sb_v_shift: usize,
  pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop)
  pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop)
  // stripe height is 64 in all cases except 4:2:0 chroma planes where
  // it is 32. This is independent of all other setup parameters
  pub stripe_height: usize,
  /// Number of restoration units per row of the plane.
  pub cols: usize,
  /// Number of restoration unit rows in the plane.
  pub rows: usize,
}
1261
/// Loop-restoration state of one plane: its layout plus the per-unit
/// filter choices.
#[derive(Clone, Debug)]
pub struct RestorationPlane {
  /// Layout parameters of the plane.
  pub cfg: RestorationPlaneConfig,
  /// Grid of restoration units, `cfg.cols` wide and `cfg.rows` high.
  pub units: FrameRestorationUnits,
}
1267
/// A (row, column) position.
// NOTE(review): presumably addresses a unit inside a restoration plane's
// grid; no use of this type is visible in this part of the file - confirm.
#[derive(Clone, Default)]
pub struct RestorationPlaneOffset {
  /// Row component of the position.
  pub row: usize,
  /// Column component of the position.
  pub col: usize,
}
1273
1274	impl RestorationPlane {
1275	pub fn new(
1276	lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize,
1277	sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize,
1278	rows: usize,
1279	) -> RestorationPlane {
1280	let stripe_height = if stripe_decimate != `0` { `32` } else { `64` };
1281	RestorationPlane {
1282	cfg: RestorationPlaneConfig {
1283	lrf_type,
1284	unit_size,
1285	sb_h_shift,
1286	sb_v_shift,
1287	sb_cols,
1288	sb_rows,
1289	stripe_height,
1290	cols,
1291	rows,
1292	},
1293	units: FrameRestorationUnits::new(cols, rows),
1294	}
1295	}
1296
1297	// Stripes are always 64 pixels high in a non-subsampled
1298	// frame, and decimated from 64 pixels in chroma. When
1299	// filtering, they are not co-located on Y with superblocks.
1300	fn restoration_unit_index_by_stripe(
1301	&self, stripenum: usize, rux: usize,
1302	) -> (usize, usize) {
1303	(
1304	cmp::min(rux, self.cfg.cols - `1`),
1305	cmp::min(
1306	stripenum * self.cfg.stripe_height / self.cfg.unit_size,
1307	self.cfg.rows - `1`,
1308	),
1309	)
1310	}
1311
1312	pub fn restoration_unit_by_stripe(
1313	&self, stripenum: usize, rux: usize,
1314	) -> &RestorationUnit {
1315	let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux);
1316	&self.units[y][x]
1317	}
1318	}
1319
/// Loop-restoration state of a frame, one [`RestorationPlane`] per plane.
#[derive(Clone, Debug)]
pub struct RestorationState {
  /// Per-plane restoration state, indexed by plane number.
  pub planes: [RestorationPlane; MAX_PLANES],
}
1324
1325	impl RestorationState {
1326	pub fn new<T: Pixel>(fi: &FrameInvariants<T>, input: &Frame<T>) -> Self {
1327	let PlaneConfig { xdec, ydec, .. } = input.planes[`1`].cfg;
1328	// stripe size is decimated in 4:2:0 (and only 4:2:0)
1329	let stripe_uv_decimate = usize::from(xdec > `0` && ydec > `0`);
1330	let y_sb_log2 = if fi.sequence.use_128x128_superblock { `7` } else { `6` };
1331	let uv_sb_h_log2 = y_sb_log2 - xdec;
1332	let uv_sb_v_log2 = y_sb_log2 - ydec;
1333
1334	let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru
1335	&& fi.sequence.enable_restoration
1336	{
1337	assert!(
1338	fi.width > `1` && fi.height > `1`,
1339	"Width and height must be higher than 1 for LRF setup"
1340	);
1341
1342	// Specific content does affect optimal LRU size choice, but the
1343	// quantizer in use is a surprisingly strong selector.
1344	let lrf_base_shift = if fi.base_q_idx > `200` {
1345	`0` // big
1346	} else if fi.base_q_idx > `160` {
1347	`1`
1348	} else {
1349	`2` // small
1350	};
1351	let lrf_chroma_shift = if stripe_uv_decimate > `0` {
1352	// 4:2:0 only
1353	if lrf_base_shift == `2` {
1354	`1` // smallest chroma LRU is a win at low quant
1355	} else {
1356	// Will a down-shifted chroma LRU eliminate stretch in chroma?
1357	// If so, that's generally a win.
1358	let lrf_unit_size =
1359	`1` << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift);
1360	let unshifted_stretch = ((fi.width >> xdec) - `1`) % lrf_unit_size
1361	<= lrf_unit_size / `2`
1362	\|\| ((fi.height >> ydec) - `1`) % lrf_unit_size <= lrf_unit_size / `2`;
1363	let shifted_stretch = ((fi.width >> xdec) - `1`)
1364	% (lrf_unit_size >> `1`)
1365	<= lrf_unit_size / `4`
1366	\|\| ((fi.height >> ydec) - `1`) % (lrf_unit_size >> `1`)
1367	<= lrf_unit_size / `4`;
1368	// shift to eliminate stretch if needed,
1369	// otherwise do not shift and save the signaling bits
1370	usize::from(unshifted_stretch && !shifted_stretch)
1371	}
1372	} else {
1373	`0`
1374	};
1375	(lrf_base_shift, lrf_base_shift + lrf_chroma_shift)
1376	} else {
1377	// Explicit request to tie LRU size to superblock size ==
1378	// smallest possible LRU size
1379	let lrf_y_shift = if fi.sequence.use_128x128_superblock { `1` } else { `2` };
1380	(lrf_y_shift, lrf_y_shift + stripe_uv_decimate)
1381	};
1382
1383	let mut y_unit_size = `1` << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift);
1384	let mut uv_unit_size = `1` << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift);
1385
1386	let tiling = fi.sequence.tiling;
1387	// Right now we defer to tiling setup: don't choose an LRU size
1388	// large enough that a tile is not an integer number of LRUs
1389	// wide/high.
1390	if tiling.cols > `1` \|\| tiling.rows > `1` {
1391	// despite suggestions to the contrary, tiles can be
1392	// non-powers-of-2.
1393	let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize;
1394	let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize;
1395	let tile_aligned_y_unit_size =
1396	`1` << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros));
1397	let tile_aligned_uv_h_unit_size = `1` << (uv_sb_h_log2 + trailing_h_zeros);
1398	let tile_aligned_uv_v_unit_size = `1` << (uv_sb_v_log2 + trailing_v_zeros);
1399	y_unit_size = y_unit_size.min(tile_aligned_y_unit_size);
1400	uv_unit_size = uv_unit_size
1401	.min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size));
1402
1403	// But it's actually worse: LRUs can't span tiles (in our
1404	// one-pass design that is, spec allows it). However, the spec
1405	// mandates the last LRU stretches forward into any
1406	// less-than-half-LRU span of superblocks at the right and
1407	// bottom of a frame. These superblocks may well be in a
1408	// different tile! Even if LRUs are minimum size (one
1409	// superblock), when the right or bottom edge of the frame is a
1410	// superblock that's less than half the width/height of a normal
1411	// superblock, the LRU is forced by the spec to span into it
1412	// (and thus a different tile). Tiling is under no such
1413	// restriction; it could decide the right/left sliver will be in
1414	// its own tile row/column. We can't disallow the combination
1415	// here. The tiling code will have to either prevent it or
1416	// tolerate it. (prayer mechanic == Issue #1629).
1417	}
1418
1419	// When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes
1420	// to be the same. If they differ at this*
1421	// point, it's due to a tiling restriction enforcing a maximum
1422	// size, so force both to the smaller value.
1423	//
1424	// see sec 5.9.20, "Loop restoration params syntax". The*
1425	// bitstream provides means of coding a different UV LRU size only
1426	// when chroma is in use and both x and y are subsampled in the
1427	// chroma planes.
1428	if ydec == `0` && y_unit_size != uv_unit_size {
1429	y_unit_size = uv_unit_size.min(y_unit_size);
1430	uv_unit_size = y_unit_size;
1431	}
1432
1433	// derive the rest
1434	let y_unit_log2 = y_unit_size.ilog() - `1`;
1435	let uv_unit_log2 = uv_unit_size.ilog() - `1`;
1436	let y_cols = ((fi.width + (y_unit_size >> `1`)) / y_unit_size).max(`1`);
1437	let y_rows = ((fi.height + (y_unit_size >> `1`)) / y_unit_size).max(`1`);
1438	let uv_cols = ((((fi.width + (`1` << xdec >> `1`)) >> xdec)
1439	+ (uv_unit_size >> `1`))
1440	/ uv_unit_size)
1441	.max(`1`);
1442	let uv_rows = ((((fi.height + (`1` << ydec >> `1`)) >> ydec)
1443	+ (uv_unit_size >> `1`))
1444	/ uv_unit_size)
1445	.max(`1`);
1446
1447	RestorationState {
1448	planes: [
1449	RestorationPlane::new(
1450	RESTORE_SWITCHABLE,
1451	y_unit_size,
1452	y_unit_log2 - y_sb_log2,
1453	y_unit_log2 - y_sb_log2,
1454	fi.sb_width,
1455	fi.sb_height,
1456	`0`,
1457	y_cols,
1458	y_rows,
1459	),
1460	RestorationPlane::new(
1461	RESTORE_SWITCHABLE,
1462	uv_unit_size,
1463	uv_unit_log2 - uv_sb_h_log2,
1464	uv_unit_log2 - uv_sb_v_log2,
1465	fi.sb_width,
1466	fi.sb_height,
1467	stripe_uv_decimate,
1468	uv_cols,
1469	uv_rows,
1470	),
1471	RestorationPlane::new(
1472	RESTORE_SWITCHABLE,
1473	uv_unit_size,
1474	uv_unit_log2 - uv_sb_h_log2,
1475	uv_unit_log2 - uv_sb_v_log2,
1476	fi.sb_width,
1477	fi.sb_height,
1478	stripe_uv_decimate,
1479	uv_cols,
1480	uv_rows,
1481	),
1482	],
1483	}
1484	}
1485
1486	#[profiling::function]
1487	pub fn lrf_filter_frame<T: Pixel>(
1488	&mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
1489	fi: &FrameInvariants<T>,
1490	) {
1491	let cdeffed = out.clone();
1492	let planes =
1493	if fi.sequence.chroma_sampling == Cs400 { `1` } else { MAX_PLANES };
1494
1495	// unlike the other loop filters that operate over the padded
1496	// frame dimensions, restoration filtering and source pixel
1497	// accesses are clipped to the original frame dimensions
1498	// that's why we use fi.width and fi.height instead of PlaneConfig fields
1499
1500	// number of stripes (counted according to colocated Y luma position)
1501	let stripe_n = (fi.height + `7`) / `64` + `1`;
1502
1503	// Buffers for the stripe filter.
1504	let mut stripe_filter_buffer =
1505	IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE);
1506
1507	for pli in `0`..planes {
1508	let rp = &self.planes[pli];
1509	let xdec = out.planes[pli].cfg.xdec;
1510	let ydec = out.planes[pli].cfg.ydec;
1511	let crop_w = (fi.width + (`1` << xdec >> `1`)) >> xdec;
1512	let crop_h = (fi.height + (`1` << ydec >> `1`)) >> ydec;
1513
1514	for si in `0`..stripe_n {
1515	let (stripe_start_y, stripe_size) = if si == `0` {
1516	(`0`, (`64` - `8`) >> ydec)
1517	} else {
1518	let start = (si * `64` - `8`) >> ydec;
1519	(
1520	start as isize,
1521	// one past, unlike spec
1522	(`64` >> ydec).min(crop_h - start),
1523	)
1524	};
1525
1526	// horizontally, go rdu-by-rdu
1527	for rux in `0`..rp.cfg.cols {
1528	// stripe x pixel locations must be clipped to frame, last may need to stretch
1529	let x = rux * rp.cfg.unit_size;
1530	let size =
1531	if rux == rp.cfg.cols - `1` { crop_w - x } else { rp.cfg.unit_size };
1532	let ru = rp.restoration_unit_by_stripe(si, rux);
1533	match ru.filter {
1534	RestorationFilter::Wiener { coeffs } => {
1535	wiener_stripe_filter(
1536	coeffs,
1537	fi,
1538	crop_w,
1539	crop_h,
1540	size,
1541	stripe_size,
1542	x,
1543	stripe_start_y,
1544	&cdeffed.planes[pli],
1545	&pre_cdef.planes[pli],
1546	&mut out.planes[pli],
1547	);
1548	}
1549	RestorationFilter::Sgrproj { set, xqd } => {
1550	if !fi.sequence.enable_cdef {
1551	continue;
1552	}
1553
1554	setup_integral_image(
1555	&mut stripe_filter_buffer,
1556	STRIPE_IMAGE_STRIDE,
1557	crop_w - x,
1558	(crop_h as isize - stripe_start_y) as usize,
1559	size,
1560	stripe_size,
1561	&cdeffed.planes[pli]
1562	.slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
1563	&pre_cdef.planes[pli]
1564	.slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
1565	);
1566
1567	sgrproj_stripe_filter(
1568	set,
1569	xqd,
1570	fi,
1571	&stripe_filter_buffer,
1572	STRIPE_IMAGE_STRIDE,
1573	&cdeffed.planes[pli]
1574	.slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
1575	&mut out.planes[pli].region_mut(Area::Rect {
1576	x: x as isize,
1577	y: stripe_start_y,
1578	width: size,
1579	height: stripe_size,
1580	}),
1581	);
1582	}
1583	RestorationFilter::None => {
1584	// do nothing
1585	}
1586	}
1587	}
1588	}
1589	}
1590	}
1591	}
1592