// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    use crate::asm::x86::lrf::*;
  } else {
    use self::rust::*;
  }
}

use crate::api::SGRComplexityLevel;
use crate::color::ChromaSampling::Cs400;
use crate::context::{MAX_PLANES, SB_SIZE};
use crate::encoder::FrameInvariants;
use crate::frame::{
  AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice,
};
use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect};
use crate::util::{clamp, CastFromPrimitive, ILog, Pixel};
use std::cmp;
use std::iter::FusedIterator;
use std::ops::{Index, IndexMut};

pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8;

pub const RESTORE_NONE: u8 = 0;
pub const RESTORE_SWITCHABLE: u8 = 1;
pub const RESTORE_WIENER: u8 = 2;
pub const RESTORE_SGRPROJ: u8 = 3;

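// The Wiener filter stores three taps per direction; the center (fourth) tap
// of the symmetric 7-tap kernel is not stored but derived so that all taps
// sum to 128 == 1 << WIENER_BITS (see the vfilter/hfilter construction in
// wiener_stripe_filter below). MIN/MID/MAX give per-tap lower bounds,
// defaults, and upper bounds for the coded taps.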
pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17];
pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15];
pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46];
#[allow(unused)]
pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3];
pub const WIENER_BITS: usize = 7;

pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32];
pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31];
pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95];
pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4;
pub const SGRPROJ_PRJ_BITS: u8 = 7;
pub const SGRPROJ_PARAMS_BITS: u8 = 4;
pub const SGRPROJ_MTABLE_BITS: u8 = 20;
pub const SGRPROJ_SGR_BITS: u8 = 8;
pub const SGRPROJ_RECIP_BITS: u8 = 12;
pub const SGRPROJ_RST_BITS: u8 = 4;
pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [
  [140, 3236],
  [112, 2158],
  [93, 1618],
  [80, 1438],
  [70, 1295],
  [58, 1177],
  [47, 1079],
  [37, 996],
  [30, 925],
  [25, 863],
  [0, 2589],
  [0, 1618],
  [0, 1177],
  [0, 925],
  [56, 0],
  [22, 0],
];

// Lists of indices into SGRPROJ_PARAMS_S used at a given complexity level.
// SGRPROJ_ALL_SETS contains every possible index.
const SGRPROJ_ALL_SETS: &[u8] =
  &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
// SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives
// most of the gains from sgr. The decision of which values to use is somewhat
// arbitrary. The sgr parameters form 3 discontinuous groups. The first group
// has both parameters non-zero. The other two are distinguished by which of
// the two parameters is zero. Each group has an even number of entries, and
// the non-zero parameters shrink as the indices increase. This array uses the
// 1st, 3rd, ... smallest params of each group.
const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15];

pub const fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] {
  match complexity {
    SGRComplexityLevel::Full => SGRPROJ_ALL_SETS,
    SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS,
  }
}
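
// A minimal sketch (not from the original source) checking the documented
// relationship between the two set lists: the reduced list is half the size
// of, and a strict subset of, the full list.
#[cfg(test)]
mod sgr_sets_test {
  use super::*;

  #[test]
  fn reduced_sets_are_a_subset_of_all_sets() {
    let full = get_sgr_sets(SGRComplexityLevel::Full);
    let reduced = get_sgr_sets(SGRComplexityLevel::Reduced);
    assert_eq!(reduced.len() * 2, full.len());
    assert!(reduced.iter().all(|i| full.contains(i)));
  }
}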

pub const SOLVE_IMAGE_MAX: usize = 1 << RESTORATION_TILESIZE_MAX_LOG2;
pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + 6 + 2;
pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE;
pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT;

pub const STRIPE_IMAGE_MAX: usize = (1 << RESTORATION_TILESIZE_MAX_LOG2)
  + (1 << (RESTORATION_TILESIZE_MAX_LOG2 - 1));
pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + 6 + 2;
pub const STRIPE_IMAGE_HEIGHT: usize = 64 + 6 + 2;
pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT;

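// A const-evaluable max(): indexing the two-element array with the result of
// the comparison selects the larger of STRIPE_IMAGE_MAX and SOLVE_IMAGE_MAX.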
pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX]
  [(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize];

/// The buffer used in `sgrproj_stripe_filter()` and `sgrproj_solve()`.
#[derive(Debug)]
pub struct IntegralImageBuffer {
  pub integral_image: Vec<u32>,
  pub sq_integral_image: Vec<u32>,
}

impl IntegralImageBuffer {
  /// Creates a new buffer with the given size, filled with zeros.
  #[inline]
  pub fn zeroed(size: usize) -> Self {
    Self { integral_image: vec![0; size], sq_integral_image: vec![0; size] }
  }
}

#[allow(unused)] // Wiener coming soon!
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub enum RestorationFilter {
  #[default]
  None,
  Wiener {
    coeffs: [[i8; 3]; 2],
  },
  Sgrproj {
    set: u8,
    xqd: [i8; 2],
  },
}

impl RestorationFilter {
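  // A const-fn counterpart of `!=`: trait methods such as PartialEq::ne
  // cannot be called in const contexts, so the comparison is spelled out
  // field by field.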
  pub const fn notequal(self, cmp: RestorationFilter) -> bool {
    match self {
      RestorationFilter::None {} => !matches!(cmp, RestorationFilter::None {}),
      RestorationFilter::Sgrproj { set, xqd } => {
        if let RestorationFilter::Sgrproj { set: set2, xqd: xqd2 } = cmp {
          !(set == set2 && xqd[0] == xqd2[0] && xqd[1] == xqd2[1])
        } else {
          true
        }
      }
      RestorationFilter::Wiener { coeffs } => {
        if let RestorationFilter::Wiener { coeffs: coeffs2 } = cmp {
          !(coeffs[0][0] == coeffs2[0][0]
            && coeffs[0][1] == coeffs2[0][1]
            && coeffs[0][2] == coeffs2[0][2]
            && coeffs[1][0] == coeffs2[1][0]
            && coeffs[1][1] == coeffs2[1][1]
            && coeffs[1][2] == coeffs2[1][2])
        } else {
          true
        }
      }
    }
  }
}

pub(crate) mod rust {
  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::PlaneSlice;
  use crate::lrf::{
    get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS,
    SGRPROJ_SGR_BITS,
  };
  use crate::util::CastFromPrimitive;
  use crate::Pixel;

  #[inline(always)]
  pub(crate) fn sgrproj_box_ab_internal<const BD: usize>(
    r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32,
  ) {
    let d: usize = r * 2 + 1;
    let n: usize = d * d;
    let one_over_n = if r == 1 { 455 } else { 164 };

    assert!(iimg.len() > (y + d) * iimg_stride + stripe_w + 1 + d);
    assert!(iimg_sq.len() > (y + d) * iimg_stride + stripe_w + 1 + d);
    assert!(af.len() > stripe_w + 1);
    assert!(bf.len() > stripe_w + 1);

    for x in start_x..stripe_w + 2 {
      // SAFETY: We perform the bounds checks above, once for the whole loop
      unsafe {
        let sum = get_integral_square(iimg, iimg_stride, x, y, d);
        let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d);
        let (reta, retb) =
          sgrproj_sum_finish::<BD>(ssq, sum, n as u32, one_over_n, s);
        *af.get_unchecked_mut(x) = reta;
        *bf.get_unchecked_mut(x) = retb;
      }
    }
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
  pub(crate) fn sgrproj_box_ab_r1<const BD: usize>(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal::<BD>(
      1,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
    );
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
  pub(crate) fn sgrproj_box_ab_r2<const BD: usize>(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal::<BD>(
      2,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
    );
  }

  pub(crate) fn sgrproj_box_f_r0<T: Pixel>(
    f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r0_internal<T: Pixel>(
    f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let line = cdeffed.row(y);
    for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) {
      *fp = u32::cast_from(v) << SGRPROJ_RST_BITS;
    }
  }

  pub(crate) fn sgrproj_box_f_r1<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
    cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed);
  }

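  // The 3x3 neighborhood of a/b values is weighted 3 at the four corners and
  // 4 at the edge and center positions; the weights sum to 32, which the
  // extra 5 in `shift` below divides back out.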
  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r1_internal<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize,
    y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    for x in start_x..w {
      let a = 3 * (af[0][x] + af[2][x] + af[0][x + 2] + af[2][x + 2])
        + 4
          * (af[1][x]
            + af[0][x + 1]
            + af[1][x + 1]
            + af[2][x + 1]
            + af[1][x + 2]);
      let b = 3 * (bf[0][x] + bf[2][x] + bf[0][x + 2] + bf[2][x + 2])
        + 4
          * (bf[1][x]
            + bf[0][x + 1]
            + bf[1][x + 1]
            + bf[2][x + 1]
            + bf[1][x + 2]);
      let v = a * u32::cast_from(line[x]) + b;
      f[x] = (v + (1 << shift >> 1)) >> shift;
    }
  }

  pub(crate) fn sgrproj_box_f_r2<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    y: usize, w: usize, cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r2_internal<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    let line1 = cdeffed.row(y + 1);

    let af0 = af[0][start_x..w + 3].windows(3);
    let af1 = af[1][start_x..w + 3].windows(3);
    let bf0 = bf[0][start_x..w + 3].windows(3);
    let bf1 = bf[1][start_x..w + 3].windows(3);

    let af_it = af0.zip(af1);
    let bf_it = bf0.zip(bf1);

    let in0 = line[start_x..w].iter();
    let in1 = line1[start_x..w].iter();

    let o0 = f0[start_x..w].iter_mut();
    let o1 = f1[start_x..w].iter_mut();

    let in_iter = in0.zip(in1);
    let out_iter = o0.zip(o1);

    let io_iter = out_iter.zip(in_iter);

    for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in
      io_iter.zip(af_it.zip(bf_it))
    {
      let a = 5 * (af_0[0] + af_0[2]) + 6 * af_0[1];
      let b = 5 * (bf_0[0] + bf_0[2]) + 6 * bf_0[1];
      let ao = 5 * (af_1[0] + af_1[2]) + 6 * af_1[1];
      let bo = 5 * (bf_1[0] + bf_1[2]) + 6 * bf_1[1];
      let v = (a + ao) * u32::cast_from(p0) + b + bo;
      *o0 = (v + (1 << shift >> 1)) >> shift;
      let vo = ao * u32::cast_from(p1) + bo;
      *o1 = (vo + (1 << shifto >> 1)) >> shifto;
    }
  }
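
  // A minimal sketch (not from the original source) exercising the flat-image
  // case: with zero variance, sgrproj_sum_finish yields a == 1 and b reduces
  // to round((256 - 1) * sum * one_over_n / 2^SGRPROJ_RECIP_BITS).
  #[cfg(test)]
  mod flat_region_test {
    use super::sgrproj_box_ab_internal;

    #[test]
    fn ab_of_flat_region() {
      // Integral images of an 8-wide all-ones image: iimg[y][x] = (y+1)*(x+1).
      // The squared image of all ones is identical.
      let mut iimg = [0u32; 40];
      for y in 0..5 {
        for x in 0..8 {
          iimg[y * 8 + x] = ((y + 1) * (x + 1)) as u32;
        }
      }
      let iimg_sq = iimg;
      let mut af = [0u32; 8];
      let mut bf = [0u32; 8];
      // r = 1 (so d = 3, n = 9, one_over_n = 455), stride 8, row y = 0,
      // stripe_w = 2, arbitrary s = 25.
      sgrproj_box_ab_internal::<8>(
        1, &mut af, &mut bf, &iimg, &iimg_sq, 8, 0, 0, 2, 25,
      );
      for x in 0..4 {
        // p = 9 * 9 - 9 * 9 = 0, so z = 0 and a saturates to 1.
        assert_eq!(af[x], 1);
        // b = (255 * 9 * 455 + 2048) >> 12 = 255.
        assert_eq!(bf[x], 255);
      }
    }
  }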
}

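// For each window, p below is a fixed-point variance estimate, z = p * s its
// strength-scaled version, a ~= 256 * z / (z + 1), and b the complementary
// weight applied to the window mean (one_over_n is approximately
// 2^SGRPROJ_RECIP_BITS / n, so sum * one_over_n approximates the mean in
// fixed point).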
#[inline(always)]
fn sgrproj_sum_finish<const BD: usize>(
  ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32,
) -> (u32, u32) {
  let bdm8: usize = BD - 8;
  let scaled_ssq: u32 = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8);
  let scaled_sum: u32 = (sum + (1 << bdm8 >> 1)) >> bdm8;
  let p: u32 = (scaled_ssq * n).saturating_sub(scaled_sum * scaled_sum);
  let z: u32 = (p * s + (1 << SGRPROJ_MTABLE_BITS >> 1)) >> SGRPROJ_MTABLE_BITS;
  let a: u32 = if z >= 255 {
    256
  } else if z == 0 {
    1
  } else {
    ((z << SGRPROJ_SGR_BITS) + z / 2) / (z + 1)
  };
  let b: u32 = ((1 << SGRPROJ_SGR_BITS) - a) * sum * one_over_n;
  (a, (b + (1 << SGRPROJ_RECIP_BITS >> 1)) >> SGRPROJ_RECIP_BITS)
}

// Using an integral image, compute the sum of a square region
// SAFETY: The size of `iimg` must be at least `(y + size) * stride + x + size`
#[inline(always)]
unsafe fn get_integral_square(
  iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
) -> u32 {
  // Cancel out overflow in iimg by using wrapping arithmetic
  let top_left: u32 = *iimg.get_unchecked(y * stride + x);
  let top_right: u32 = *iimg.get_unchecked(y * stride + x + size);
  let bottom_left: u32 = *iimg.get_unchecked((y + size) * stride + x);
  let bottom_right: u32 = *iimg.get_unchecked((y + size) * stride + x + size);
  top_left
    .wrapping_add(bottom_right)
    .wrapping_sub(bottom_left)
    .wrapping_sub(top_right)
}
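
// A minimal sketch (not from the original source) of the integral-image
// identity used above: the sum over any axis-aligned square falls out of
// four corner lookups.
#[cfg(test)]
mod integral_square_test {
  use super::get_integral_square;

  #[test]
  fn box_sum_of_ones() {
    // Integral image of a 5x5 all-ones image: iimg[y][x] = (y + 1) * (x + 1).
    let mut iimg = [0u32; 25];
    for y in 0..5 {
      for x in 0..5 {
        iimg[y * 5 + x] = ((y + 1) * (x + 1)) as u32;
      }
    }
    // SAFETY: (y + size) * stride + x + size == 18 < 25.
    let sum = unsafe { get_integral_square(&iimg, 5, 0, 0, 3) };
    // A 3x3 box of ones sums to 9.
    assert_eq!(sum, 9);
  }
}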

struct VertPaddedIter<'a, T: Pixel> {
  // The two sources that can be selected when clipping
  deblocked: &'a Plane<T>,
  cdeffed: &'a Plane<T>,
  // x index to choose where on the row to start
  x: isize,
  // y index that will be mutated
  y: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: isize,
  // Used for source buffer choice/clipping. May (and regularly will)
  // be negative.
  stripe_begin: isize,
  // Also used for source buffer choice/clipping. May specify a stripe boundary
  // less than, equal to, or larger than the buffers we're accessing.
  stripe_end: isize,
  // Active area cropping is done by specifying a value smaller than the height
  // of the plane.
  crop: isize,
}

impl<'a, T: Pixel> VertPaddedIter<'a, T> {
  fn new(
    cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>,
    stripe_h: usize, crop: usize,
  ) -> VertPaddedIter<'a, T> {
    // cdeffed and deblocked must start at the same coordinates from their
    // underlying planes. Since cropping is provided via a separate parameter,
    // the heights of the underlying planes do not need to match.
    assert_eq!(cdeffed.x, deblocked.x);
    assert_eq!(cdeffed.y, deblocked.y);

    // To share integral images, always use the max box filter radius of 2
    let r = 2;

    // The number of rows outside the stripe that are needed
    let rows_above = r + 2;
    let rows_below = 2;

    // Offset crop and stripe_h so they are relative to the underlying plane
    // and not the plane slice.
    let crop = crop as isize + deblocked.y;
    let stripe_end = stripe_h as isize + deblocked.y;

    // Move y up by the number of rows above.
    // If y is negative we repeat the first row
    let y = deblocked.y - rows_above as isize;

    VertPaddedIter {
      deblocked: deblocked.plane,
      cdeffed: cdeffed.plane,
      x: deblocked.x,
      y,
      end: (rows_above + stripe_h + rows_below) as isize + y,
      stripe_begin: deblocked.y,
      stripe_end,
      crop,
    }
  }
}

impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> {
  type Item = &'a [T];

  #[inline(always)]
  fn next(&mut self) -> Option<Self::Item> {
    if self.end > self.y {
      // clamp before deciding the source
      // clamp vertically to storage at top and passed-in height at bottom
      let cropped_y = clamp(self.y, 0, self.crop - 1);
      // clamp vertically to stripe limits
      let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1);

      // decide if we're vertically inside or outside the stripe
      let src_plane = if ly >= self.stripe_begin && ly < self.stripe_end {
        self.cdeffed
      } else {
        self.deblocked
      };
      // cannot directly return src_plane.row(ly) due to lifetime issues
      let range = src_plane.row_range(self.x, ly);
      self.y += 1;
      Some(&src_plane.data[range])
    } else {
      None
    }
  }

  fn size_hint(&self) -> (usize, Option<usize>) {
    let remaining = self.end - self.y;
    debug_assert!(remaining >= 0);
    let remaining = remaining as usize;

    (remaining, Some(remaining))
  }
}

impl<T: Pixel> ExactSizeIterator for VertPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for VertPaddedIter<'_, T> {}

struct HorzPaddedIter<'a, T: Pixel> {
  // Active area cropping is done using the length of the slice
  slice: &'a [T],
  // x index of the iterator
  // When less than 0, repeat the first element. When greater than end, repeat
  // the last element
  index: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: usize,
}

impl<'a, T: Pixel> HorzPaddedIter<'a, T> {
  fn new(
    slice: &'a [T], start_index: isize, width: usize,
  ) -> HorzPaddedIter<'a, T> {
    HorzPaddedIter {
      slice,
      index: start_index,
      end: (width as isize + start_index) as usize,
    }
  }
}

impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> {
  type Item = &'a T;

  #[inline(always)]
  fn next(&mut self) -> Option<Self::Item> {
    if self.index < self.end as isize {
      // clamp to the edges of the frame
      let x: usize =
        clamp(self.index, 0, self.slice.len() as isize - 1) as usize;
      self.index += 1;
      Some(&self.slice[x])
    } else {
      None
    }
  }

  #[inline(always)]
  fn size_hint(&self) -> (usize, Option<usize>) {
    let size: usize = (self.end as isize - self.index) as usize;
    (size, Some(size))
  }
}

impl<T: Pixel> ExactSizeIterator for HorzPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for HorzPaddedIter<'_, T> {}
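
// A minimal sketch (not from the original source) of the horizontal padding
// behavior: out-of-range indices repeat the nearest edge element.
#[cfg(test)]
mod horz_padded_iter_test {
  use super::HorzPaddedIter;

  #[test]
  fn pads_by_repeating_edges() {
    let data: [u16; 3] = [1, 2, 3];
    // Start one element left of the slice and ask for five elements total.
    let it = HorzPaddedIter::new(&data, -1, 5);
    assert_eq!(it.copied().collect::<Vec<u16>>(), vec![1, 1, 2, 3, 3]);
  }
}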

#[profiling::function]
pub fn setup_integral_image<T: Pixel>(
  integral_image_buffer: &mut IntegralImageBuffer,
  integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize,
  stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>,
) {
  let integral_image = &mut integral_image_buffer.integral_image;
  let sq_integral_image = &mut integral_image_buffer.sq_integral_image;

  // Number of elements outside the stripe
  let left_w = 4; // max radius of 2 + 2 padding
  let right_w = 3; // max radius of 2 + 1 padding

  assert_eq!(cdeffed.x, deblocked.x);

  // Find how many unique elements to use to the left and right
  let left_uniques = if cdeffed.x == 0 { 0 } else { left_w };
  let right_uniques = right_w.min(crop_w - stripe_w);

  // Find the total number of unique elements used
  let row_uniques = left_uniques + stripe_w + right_uniques;

  // Negative start indices result in repeating the first element of the row
  let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 };

  let mut rows_iter = VertPaddedIter::new(
    // Move left to encompass all the used data
    &cdeffed.go_left(left_uniques),
    &deblocked.go_left(left_uniques),
    // since r2 uses every other row, we need an extra row if stripe_h is odd
    stripe_h + (stripe_h & 1),
    crop_h,
  )
  .map(|row| {
    HorzPaddedIter::new(
      // Limit how many unique elements we use
      &row[..row_uniques],
      start_index_x,
      left_w + stripe_w + right_w,
    )
  });

  // Setup the first row
  {
    let mut sum: u32 = 0;
    let mut sq_sum: u32 = 0;
    // Remove the first row and use it outside of the main loop
    let row = rows_iter.next().unwrap();
    for (src, (integral, sq_integral)) in
      row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut()))
    {
      let current = u32::cast_from(*src);

      // Wrapping adds avoid overflow panics in debug builds; any overflow
      // cancels out when calculating the sum of a region.
      sum = sum.wrapping_add(current);
      *integral = sum;
      sq_sum = sq_sum.wrapping_add(current * current);
      *sq_integral = sq_sum;
    }
  }
  // Calculate all other rows
  let mut integral_slice = &mut integral_image[..];
  let mut sq_integral_slice = &mut sq_integral_image[..];
  for row in rows_iter {
    let mut sum: u32 = 0;
    let mut sq_sum: u32 = 0;

    // Split the data between the previous row and future rows.
    // This allows us to mutate the current row while accessing the
    // previous row.
    let (integral_row_prev, integral_row) =
      integral_slice.split_at_mut(integral_image_stride);
    let (sq_integral_row_prev, sq_integral_row) =
      sq_integral_slice.split_at_mut(integral_image_stride);
    for (
      src,
      ((integral_above, sq_integral_above), (integral, sq_integral)),
    ) in row.zip(
      integral_row_prev
        .iter()
        .zip(sq_integral_row_prev.iter())
        .zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())),
    ) {
      let current = u32::cast_from(*src);
      // Wrapping adds avoid overflow panics in debug builds; any overflow
      // cancels out when calculating the sum of a region.
      sum = sum.wrapping_add(current);
      *integral = sum.wrapping_add(*integral_above);
      sq_sum = sq_sum.wrapping_add(current * current);
      *sq_integral = sq_sum.wrapping_add(*sq_integral_above);
    }

    // The current row also contains all future rows. Replacing the slice with
    // it moves down a row.
    integral_slice = integral_row;
    sq_integral_slice = sq_integral_row;
  }
}

#[profiling::function]
pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
  set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>,
  integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
  cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
) {
  let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect();
  let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
    [[0; IMAGE_WIDTH_MAX + 2]; 2];
  let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
    [[0; IMAGE_WIDTH_MAX + 2]; 2];
  let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
  let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
  let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
    [[0; IMAGE_WIDTH_MAX + 2]; 3];
  let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
    [[0; IMAGE_WIDTH_MAX + 2]; 3];
  let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];

  let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
  let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];

  let fn_ab_r1 = match fi.sequence.bit_depth {
    8 => sgrproj_box_ab_r1::<8>,
    10 => sgrproj_box_ab_r1::<10>,
    12 => sgrproj_box_ab_r1::<12>,
    _ => unimplemented!(),
  };
  let fn_ab_r2 = match fi.sequence.bit_depth {
    8 => sgrproj_box_ab_r2::<8>,
    10 => sgrproj_box_ab_r2::<10>,
    12 => sgrproj_box_ab_r2::<12>,
    _ => unimplemented!(),
  };

  /* prime the intermediate arrays */
  // One oddness about the radius=2 intermediate array computations that
  // the spec doesn't make clear: Although the spec defines computation
  // of every row (of a, b and f), only half of the rows (every-other
  // row) are actually used.
  let integral_image = &integral_image_buffer.integral_image;
  let sq_integral_image = &integral_image_buffer.sq_integral_image;
  if s_r2 > 0 {
    fn_ab_r2(
      &mut a_r2[0],
      &mut b_r2[0],
      integral_image,
      sq_integral_image,
      integral_image_stride,
      0,
      stripe_w,
      s_r2,
      fi.cpu_feature_level,
    );
  }
  if s_r1 > 0 {
    let integral_image_offset = integral_image_stride + 1;
    fn_ab_r1(
      &mut a_r1[0],
      &mut b_r1[0],
      &integral_image[integral_image_offset..],
      &sq_integral_image[integral_image_offset..],
      integral_image_stride,
      0,
      stripe_w,
      s_r1,
      fi.cpu_feature_level,
    );
    fn_ab_r1(
      &mut a_r1[1],
      &mut b_r1[1],
      &integral_image[integral_image_offset..],
      &sq_integral_image[integral_image_offset..],
      integral_image_stride,
      1,
      stripe_w,
      s_r1,
      fi.cpu_feature_level,
    );
  }

  /* iterate by row */
  // Increment by two to handle the use of even rows by r=2 and run a nested
  // loop to handle increments of one.
  for y in (0..stripe_h).step_by(2) {
    // get results for rows y and y + 1
    let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 {
      fn_ab_r2(
        &mut a_r2[(y / 2 + 1) % 2],
        &mut b_r2[(y / 2 + 1) % 2],
        integral_image,
        sq_integral_image,
        integral_image_stride,
        y + 2,
        stripe_w,
        s_r2,
        fi.cpu_feature_level,
      );
      let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
      let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
      sgrproj_box_f_r2(
        &ap0,
        &bp0,
        &mut f_r2_0,
        &mut f_r2_1,
        y,
        stripe_w,
        cdeffed,
        fi.cpu_feature_level,
      );
      [&f_r2_0, &f_r2_1]
    } else {
      sgrproj_box_f_r0(
        &mut f_r2_0,
        y,
        stripe_w,
        cdeffed,
        fi.cpu_feature_level,
      );
      // share results for both rows
      [&f_r2_0, &f_r2_0]
    };
    for dy in 0..(2.min(stripe_h - y)) {
      let y = y + dy;
      if s_r1 > 0 {
        let integral_image_offset = integral_image_stride + 1;
        fn_ab_r1(
          &mut a_r1[(y + 2) % 3],
          &mut b_r1[(y + 2) % 3],
          &integral_image[integral_image_offset..],
          &sq_integral_image[integral_image_offset..],
          integral_image_stride,
          y + 2,
          stripe_w,
          s_r1,
          fi.cpu_feature_level,
        );
        let ap1: [&[u32]; 3] =
          [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
        let bp1: [&[u32]; 3] =
          [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
        sgrproj_box_f_r1(
          &ap1,
          &bp1,
          &mut f_r1,
          y,
          stripe_w,
          cdeffed,
          fi.cpu_feature_level,
        );
      } else {
        sgrproj_box_f_r0(
          &mut f_r1,
          y,
          stripe_w,
          cdeffed,
          fi.cpu_feature_level,
        );
      }

      /* apply filter */
      let w0 = xqd[0] as i32;
      let w1 = xqd[1] as i32;
      let w2 = (1 << SGRPROJ_PRJ_BITS) - w0 - w1;

      let line = &cdeffed[y];

      #[inline(always)]
      fn apply_filter<U: Pixel>(
        out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
        stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
      ) {
        let line_it = line[..stripe_w].iter();
        let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
        let f_r1_it = f_r1[..stripe_w].iter();
        let out_it = out[..stripe_w].iter_mut();

        for ((o, &u), (&f_r2_ab, &f_r1)) in
          out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it))
        {
          let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
          let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
          let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1))
            >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
          *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1));
        }
      }

      apply_filter(
        &mut out[y],
        line,
        &f_r1,
        f_r2_ab[dy],
        stripe_w,
        fi.sequence.bit_depth,
        w0,
        w1,
        w2,
      );
    }
  }
}

// Frame inputs below aren't all equal, and will change as work
// continues. There's no deblocked reconstruction available at this
// point of RDO, so we use the non-deblocked reconstruction, cdef and
// input. The input can be a full-sized frame. Cdef input is a partial
// frame constructed specifically for RDO.

// For simplicity, this ignores stripe segmentation (it's possible the
// extra complexity isn't worth it and we'll ignore stripes
// permanently during RDO, but that's not been tested yet). Data
// access inside the cdef frame is monolithic and clipped to the cdef
// borders.

// Input params follow the same rules as sgrproj_stripe_filter.
// Inputs are relative to the colocated slice views.
#[profiling::function]
pub fn sgrproj_solve<T: Pixel>(
  set: u8, fi: &FrameInvariants<T>,
  integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
  cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
) -> (i8, i8) {
  let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
    [[0; IMAGE_WIDTH_MAX + 2]; 2];
  let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
    [[0; IMAGE_WIDTH_MAX + 2]; 2];
  let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
  let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
  let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
    [[0; IMAGE_WIDTH_MAX + 2]; 3];
  let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
    [[0; IMAGE_WIDTH_MAX + 2]; 3];
  let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];

  let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
  let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];

  let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]];
  let mut c: [f64; 2] = [0., 0.];

  let fn_ab_r1 = match fi.sequence.bit_depth {
    8 => sgrproj_box_ab_r1::<8>,
    10 => sgrproj_box_ab_r1::<10>,
    12 => sgrproj_box_ab_r1::<12>,
    _ => unimplemented!(),
  };
  let fn_ab_r2 = match fi.sequence.bit_depth {
    8 => sgrproj_box_ab_r2::<8>,
    10 => sgrproj_box_ab_r2::<10>,
    12 => sgrproj_box_ab_r2::<12>,
    _ => unimplemented!(),
  };

  /* prime the intermediate arrays */
  // One oddness about the radius=2 intermediate array computations that
  // the spec doesn't make clear: Although the spec defines computation
  // of every row (of a, b and f), only half of the rows (every-other
  // row) are actually used.
  let integral_image = &integral_image_buffer.integral_image;
  let sq_integral_image = &integral_image_buffer.sq_integral_image;
  if s_r2 > 0 {
    fn_ab_r2(
      &mut a_r2[0],
      &mut b_r2[0],
      integral_image,
      sq_integral_image,
      SOLVE_IMAGE_STRIDE,
      0,
      cdef_w,
      s_r2,
      fi.cpu_feature_level,
    );
  }
  if s_r1 > 0 {
    let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
    fn_ab_r1(
      &mut a_r1[0],
      &mut b_r1[0],
      &integral_image[integral_image_offset..],
      &sq_integral_image[integral_image_offset..],
      SOLVE_IMAGE_STRIDE,
      0,
      cdef_w,
      s_r1,
      fi.cpu_feature_level,
    );
    fn_ab_r1(
      &mut a_r1[1],
      &mut b_r1[1],
      &integral_image[integral_image_offset..],
      &sq_integral_image[integral_image_offset..],
      SOLVE_IMAGE_STRIDE,
      1,
      cdef_w,
      s_r1,
      fi.cpu_feature_level,
    );
  }

  /* iterate by row */
  // Increment by two to handle the use of even rows by r=2 and run a nested
  // loop to handle increments of one.
  for y in (0..cdef_h).step_by(2) {
    // get results for rows y and y + 1
    let f_r2_01: [&[u32]; 2] = if s_r2 > 0 {
      fn_ab_r2(
        &mut a_r2[(y / 2 + 1) % 2],
        &mut b_r2[(y / 2 + 1) % 2],
        integral_image,
        sq_integral_image,
        SOLVE_IMAGE_STRIDE,
        y + 2,
        cdef_w,
        s_r2,
        fi.cpu_feature_level,
      );
      let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
      let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
      sgrproj_box_f_r2(
        &ap0,
        &bp0,
        &mut f_r2_0,
        &mut f_r2_1,
        y,
        cdef_w,
        cdeffed,
        fi.cpu_feature_level,
      );
      [&f_r2_0, &f_r2_1]
    } else {
      sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level);
      // share results for both rows
      [&f_r2_0, &f_r2_0]
    };
    for dy in 0..(2.min(cdef_h - y)) {
      let y = y + dy;
      if s_r1 > 0 {
        let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
        fn_ab_r1(
          &mut a_r1[(y + 2) % 3],
          &mut b_r1[(y + 2) % 3],
          &integral_image[integral_image_offset..],
          &sq_integral_image[integral_image_offset..],
          SOLVE_IMAGE_STRIDE,
          y + 2,
          cdef_w,
          s_r1,
          fi.cpu_feature_level,
        );
        let ap1: [&[u32]; 3] =
          [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
        let bp1: [&[u32]; 3] =
          [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
        sgrproj_box_f_r1(
          &ap1,
          &bp1,
          &mut f_r1,
          y,
          cdef_w,
          cdeffed,
          fi.cpu_feature_level,
        );
      } else {
        sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level);
      }

      #[inline(always)]
      fn process_line<T: Pixel>(
        h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T],
        f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize,
      ) {
        let cdeffed_it = cdeffed[..cdef_w].iter();
        let input_it = input[..cdef_w].iter();
        let f_r2_ab_it = f_r2_ab[..cdef_w].iter();
        let f_r1_it = f_r1[..cdef_w].iter();

        #[derive(Debug, Copy, Clone)]
        struct Sums {
          h: [[i64; 2]; 2],
          c: [i64; 2],
        }

        let sums: Sums = cdeffed_it
          .zip(input_it)
          .zip(f_r2_ab_it.zip(f_r1_it))
          .map(|((&u, &i), (&f2, &f1))| {
            let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
            let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u;
            let f2 = f2 as i32 - u;
            let f1 = f1 as i32 - u;
            (s as i64, f1 as i64, f2 as i64)
          })
          .fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| {
            let mut ret: Sums = sums;
            ret.h[0][0] += f2 * f2;
            ret.h[1][1] += f1 * f1;
            ret.h[0][1] += f1 * f2;
            ret.c[0] += f2 * s;
            ret.c[1] += f1 * s;
            ret
          });

        h[0][0] += sums.h[0][0] as f64;
        h[1][1] += sums.h[1][1] as f64;
        h[0][1] += sums.h[0][1] as f64;
        c[0] += sums.c[0] as f64;
        c[1] += sums.c[1] as f64;
      }

      process_line(
        &mut h,
        &mut c,
        &cdeffed[y],
        &input[y],
        &f_r1,
        f_r2_01[dy],
        cdef_w,
      );
    }
  }

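  // The accumulation above forms the least-squares normal equations
  // H * xq = c, where H collects the autocorrelation of the DC-removed
  // filter outputs (f2, f1) and c their correlation with the source
  // residual; xq holds the projection weights for the r=2 and r=1 outputs.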
  // this is lifted almost intact from libaom
  let n = cdef_w as f64 * cdef_h as f64;
  h[0][0] /= n;
  h[0][1] /= n;
  h[1][1] /= n;
  h[1][0] = h[0][1];
  c[0] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
  c[1] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
  let (xq0, xq1) = if s_r2 == 0 {
    // H matrix is now only the scalar h[1][1]
    // C vector is now only the scalar c[1]
    if h[1][1] == 0. {
      (0, 0)
    } else {
      (0, (c[1] / h[1][1]).round() as i32)
    }
  } else if s_r1 == 0 {
    // H matrix is now only the scalar h[0][0]
    // C vector is now only the scalar c[0]
    if h[0][0] == 0. {
      (0, 0)
    } else {
      ((c[0] / h[0][0]).round() as i32, 0)
    }
  } else {
    let det = h[0][0].mul_add(h[1][1], -h[0][1] * h[1][0]);
    if det == 0. {
      (0, 0)
    } else {
      // Solve the 2x2 system by Cramer's rule
      let div1 = h[1][1].mul_add(c[0], -h[0][1] * c[1]);
      let div2 = h[0][0].mul_add(c[1], -h[1][0] * c[0]);
      ((div1 / det).round() as i32, (div2 / det).round() as i32)
    }
  };
  {
    let xqd0 =
      clamp(xq0, SGRPROJ_XQD_MIN[0] as i32, SGRPROJ_XQD_MAX[0] as i32);
    let xqd1 = clamp(
      (1 << SGRPROJ_PRJ_BITS) - xqd0 - xq1,
      SGRPROJ_XQD_MIN[1] as i32,
      SGRPROJ_XQD_MAX[1] as i32,
    );
    (xqd0 as i8, xqd1 as i8)
  }
}

#[profiling::function]
fn wiener_stripe_filter<T: Pixel>(
  coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
  stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
  cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
) {
  let bit_depth = fi.sequence.bit_depth;
  let round_h = if bit_depth == 12 { 5 } else { 3 };
  let round_v = if bit_depth == 12 { 9 } else { 11 };
  let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1);
  let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1;

  let mut coeffs_ = [[0; 3]; 2];
  for i in 0..2 {
    for j in 0..3 {
      coeffs_[i][j] = i32::from(coeffs[i][j]);
    }
  }

  let mut work: [i32; SB_SIZE + 7] = [0; SB_SIZE + 7];
  let vfilter: [i32; 7] = [
    coeffs_[0][0],
    coeffs_[0][1],
    coeffs_[0][2],
    128 - 2 * (coeffs_[0][0] + coeffs_[0][1] + coeffs_[0][2]),
    coeffs_[0][2],
    coeffs_[0][1],
    coeffs_[0][0],
  ];
  let hfilter: [i32; 7] = [
    coeffs_[1][0],
    coeffs_[1][1],
    coeffs_[1][2],
    128 - 2 * (coeffs_[1][0] + coeffs_[1][1] + coeffs_[1][2]),
    coeffs_[1][2],
    coeffs_[1][1],
    coeffs_[1][0],
  ];

  // unlike x, our y can be negative to start, as the first stripe begins
  // 8 pixels above the top of the frame; it can also run off the end of
  // the frame
  let start_wi = if stripe_y < 0 { -stripe_y } else { 0 } as usize;
  let start_yi = if stripe_y < 0 { 0 } else { stripe_y } as usize;
  let end_i = cmp::max(
    0,
    if stripe_h as isize + stripe_y > crop_h as isize {
      crop_h as isize - stripe_y - start_wi as isize
    } else {
      stripe_h as isize - start_wi as isize
    },
  ) as usize;

  let mut out_slice =
    out.mut_slice(PlaneOffset { x: 0, y: start_yi as isize });

  for xi in stripe_x..stripe_x + stripe_w {
    let n = cmp::min(7, crop_w as isize + 3 - xi as isize);
    for yi in stripe_y - 3..stripe_y + stripe_h as isize + 4 {
      let mut acc = 0;
      let src = if yi < stripe_y {
        let ly = cmp::max(clamp(yi, 0, crop_h as isize - 1), stripe_y - 2);
        deblocked.row(ly)
      } else if yi < stripe_y + stripe_h as isize {
        let ly = clamp(yi, 0, crop_h as isize - 1);
        cdeffed.row(ly)
      } else {
        let ly = cmp::min(
          clamp(yi, 0, crop_h as isize - 1),
          stripe_y + stripe_h as isize + 1,
        );
        deblocked.row(ly)
      };
      let start = i32::cast_from(src[0]);
      let end = i32::cast_from(src[crop_w - 1]);
      for i in 0..3 - xi as isize {
        acc += hfilter[i as usize] * start;
      }

      let off = 3 - (xi as isize);
      let s = cmp::max(0, off) as usize;
      let s1 = (s as isize - off) as usize;
      let n1 = (n - off) as usize;

      for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) {
        acc += hf * i32::cast_from(v);
      }

      for i in n..7 {
        acc += hfilter[i as usize] * end;
      }

      acc = (acc + (1 << round_h >> 1)) >> round_h;
      work[(yi - stripe_y + 3) as usize] = clamp(acc, -offset, limit - offset);
    }

    for (wi, dst) in (start_wi..start_wi + end_i)
      .zip(out_slice.rows_iter_mut().map(|row| &mut row[xi]).take(end_i))
    {
      let mut acc = 0;
      for (i, src) in (0..7).zip(work[wi..wi + 7].iter()) {
        acc += vfilter[i] * *src;
      }
      *dst = T::cast_from(clamp(
        (acc + (1 << round_v >> 1)) >> round_v,
        0,
        (1 << bit_depth) - 1,
      ));
    }
  }
}

#[derive(Copy, Clone, Debug, Default)]
pub struct RestorationUnit {
  pub filter: RestorationFilter,
}

#[derive(Clone, Debug)]
pub struct FrameRestorationUnits {
  units: Box<[RestorationUnit]>,
  pub cols: usize,
  pub rows: usize,
}

impl FrameRestorationUnits {
  pub fn new(cols: usize, rows: usize) -> Self {
    Self {
      units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(),
      cols,
      rows,
    }
  }
}

impl Index<usize> for FrameRestorationUnits {
  type Output = [RestorationUnit];
  #[inline(always)]
  fn index(&self, index: usize) -> &Self::Output {
    &self.units[index * self.cols..(index + 1) * self.cols]
  }
}

impl IndexMut<usize> for FrameRestorationUnits {
  #[inline(always)]
  fn index_mut(&mut self, index: usize) -> &mut Self::Output {
    &mut self.units[index * self.cols..(index + 1) * self.cols]
  }
}

#[derive(Clone, Debug)]
pub struct RestorationPlaneConfig {
  pub lrf_type: u8,
  pub unit_size: usize,
  // (1 << sb_h_shift) and (1 << sb_v_shift) give the number of superblocks
  // horizontally and vertically in a restoration unit, not accounting for
  // RU stretching
  pub sb_h_shift: usize,
  pub sb_v_shift: usize,
  pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop)
  pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop)
  // stripe height is 64 in all cases except 4:2:0 chroma planes where
  // it is 32. This is independent of all other setup parameters
  pub stripe_height: usize,
  pub cols: usize,
  pub rows: usize,
}

#[derive(Clone, Debug)]
pub struct RestorationPlane {
  pub cfg: RestorationPlaneConfig,
  pub units: FrameRestorationUnits,
}

#[derive(Clone, Default)]
pub struct RestorationPlaneOffset {
  pub row: usize,
  pub col: usize,
}

impl RestorationPlane {
  pub fn new(
    lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize,
    sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize,
    rows: usize,
  ) -> RestorationPlane {
    let stripe_height = if stripe_decimate != 0 { 32 } else { 64 };
    RestorationPlane {
      cfg: RestorationPlaneConfig {
        lrf_type,
        unit_size,
        sb_h_shift,
        sb_v_shift,
        sb_cols,
        sb_rows,
        stripe_height,
        cols,
        rows,
      },
      units: FrameRestorationUnits::new(cols, rows),
    }
  }

  // Stripes are always 64 pixels high in a non-subsampled
  // frame, and decimated from 64 pixels in chroma. When
  // filtering, they are not co-located on Y with superblocks.
  fn restoration_unit_index_by_stripe(
    &self, stripenum: usize, rux: usize,
  ) -> (usize, usize) {
    (
      cmp::min(rux, self.cfg.cols - 1),
      cmp::min(
        stripenum * self.cfg.stripe_height / self.cfg.unit_size,
        self.cfg.rows - 1,
      ),
    )
  }

  pub fn restoration_unit_by_stripe(
    &self, stripenum: usize, rux: usize,
  ) -> &RestorationUnit {
    let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux);
    &self.units[y][x]
  }
}

#[derive(Clone, Debug)]
pub struct RestorationState {
  pub planes: [RestorationPlane; MAX_PLANES],
}

impl RestorationState {
  pub fn new<T: Pixel>(fi: &FrameInvariants<T>, input: &Frame<T>) -> Self {
    let PlaneConfig { xdec, ydec, .. } = input.planes[1].cfg;
    // stripe size is decimated in 4:2:0 (and only 4:2:0)
    let stripe_uv_decimate = usize::from(xdec > 0 && ydec > 0);
    let y_sb_log2 = if fi.sequence.use_128x128_superblock { 7 } else { 6 };
    let uv_sb_h_log2 = y_sb_log2 - xdec;
    let uv_sb_v_log2 = y_sb_log2 - ydec;

    let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru
      && fi.sequence.enable_restoration
    {
      assert!(
        fi.width > 1 && fi.height > 1,
        "Width and height must be higher than 1 for LRF setup"
      );

      // Specific content does affect optimal LRU size choice, but the
      // quantizer in use is a surprisingly strong selector.
      let lrf_base_shift = if fi.base_q_idx > 200 {
        0 // big
      } else if fi.base_q_idx > 160 {
        1
      } else {
        2 // small
      };
      let lrf_chroma_shift = if stripe_uv_decimate > 0 {
        // 4:2:0 only
        if lrf_base_shift == 2 {
          1 // smallest chroma LRU is a win at low quant
        } else {
          // Will a down-shifted chroma LRU eliminate stretch in chroma?
          // If so, that's generally a win.
          let lrf_unit_size =
            1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift);
          let unshifted_stretch = ((fi.width >> xdec) - 1) % lrf_unit_size
            <= lrf_unit_size / 2
            || ((fi.height >> ydec) - 1) % lrf_unit_size <= lrf_unit_size / 2;
          let shifted_stretch = ((fi.width >> xdec) - 1)
            % (lrf_unit_size >> 1)
            <= lrf_unit_size / 4
            || ((fi.height >> ydec) - 1) % (lrf_unit_size >> 1)
              <= lrf_unit_size / 4;
          // shift to eliminate stretch if needed,
          // otherwise do not shift and save the signaling bits
          usize::from(unshifted_stretch && !shifted_stretch)
        }
      } else {
        0
      };
      (lrf_base_shift, lrf_base_shift + lrf_chroma_shift)
    } else {
      // Explicit request to tie LRU size to superblock size ==
      // smallest possible LRU size
      let lrf_y_shift = if fi.sequence.use_128x128_superblock { 1 } else { 2 };
      (lrf_y_shift, lrf_y_shift + stripe_uv_decimate)
    };

    let mut y_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift);
    let mut uv_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift);

    let tiling = fi.sequence.tiling;
    // Right now we defer to tiling setup: don't choose an LRU size
    // large enough that a tile is not an integer number of LRUs
    // wide/high.
    if tiling.cols > 1 || tiling.rows > 1 {
      // despite suggestions to the contrary, tiles can be
      // non-powers-of-2.
      let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize;
      let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize;
      let tile_aligned_y_unit_size =
        1 << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros));
      let tile_aligned_uv_h_unit_size = 1 << (uv_sb_h_log2 + trailing_h_zeros);
      let tile_aligned_uv_v_unit_size = 1 << (uv_sb_v_log2 + trailing_v_zeros);
      y_unit_size = y_unit_size.min(tile_aligned_y_unit_size);
      uv_unit_size = uv_unit_size
        .min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size));

      // But it's actually worse: LRUs can't span tiles (in our
      // one-pass design that is, spec allows it). However, the spec
      // mandates the last LRU stretches forward into any
      // less-than-half-LRU span of superblocks at the right and
      // bottom of a frame. These superblocks may well be in a
      // different tile! Even if LRUs are minimum size (one
      // superblock), when the right or bottom edge of the frame is a
      // superblock that's less than half the width/height of a normal
      // superblock, the LRU is forced by the spec to span into it
      // (and thus a different tile). Tiling is under no such
      // restriction; it could decide the right/left sliver will be in
      // its own tile row/column. We can't disallow the combination
      // here. The tiling code will have to either prevent it or
      // tolerate it. (prayer mechanic == Issue #1629).
    }

    // When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes
    // to be the same*. If they differ at this
    // point, it's due to a tiling restriction enforcing a maximum
    // size, so force both to the smaller value.
    //
    // *see sec 5.9.20, "Loop restoration params syntax". The
    // bitstream provides means of coding a different UV LRU size only
    // when chroma is in use and both x and y are subsampled in the
    // chroma planes.
    if ydec == 0 && y_unit_size != uv_unit_size {
      y_unit_size = uv_unit_size.min(y_unit_size);
      uv_unit_size = y_unit_size;
    }

    // derive the rest
    let y_unit_log2 = y_unit_size.ilog() - 1;
    let uv_unit_log2 = uv_unit_size.ilog() - 1;
    let y_cols = ((fi.width + (y_unit_size >> 1)) / y_unit_size).max(1);
    let y_rows = ((fi.height + (y_unit_size >> 1)) / y_unit_size).max(1);
    let uv_cols = ((((fi.width + (1 << xdec >> 1)) >> xdec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);
    let uv_rows = ((((fi.height + (1 << ydec >> 1)) >> ydec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);

    RestorationState {
      planes: [
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          y_unit_size,
          y_unit_log2 - y_sb_log2,
          y_unit_log2 - y_sb_log2,
          fi.sb_width,
          fi.sb_height,
          0,
          y_cols,
          y_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
      ],
    }
  }

  #[profiling::function]
  pub fn lrf_filter_frame<T: Pixel>(
    &mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
    fi: &FrameInvariants<T>,
  ) {
    let cdeffed = out.clone();
    let planes =
      if fi.sequence.chroma_sampling == Cs400 { 1 } else { MAX_PLANES };

    // unlike the other loop filters that operate over the padded
    // frame dimensions, restoration filtering and source pixel
    // accesses are clipped to the original frame dimensions;
    // that's why we use fi.width and fi.height instead of PlaneConfig fields

    // number of stripes (counted according to colocated Y luma position)
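    // (fi.height + 7) / 64 + 1 == ceil((fi.height + 8) / 64): stripes span
    // 64 luma rows each, with the first stripe shifted up by 8 rows.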
    let stripe_n = (fi.height + 7) / 64 + 1;

    // Buffers for the stripe filter.
    let mut stripe_filter_buffer =
      IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE);

    for pli in 0..planes {
      let rp = &self.planes[pli];
      let xdec = out.planes[pli].cfg.xdec;
      let ydec = out.planes[pli].cfg.ydec;
      let crop_w = (fi.width + (1 << xdec >> 1)) >> xdec;
      let crop_h = (fi.height + (1 << ydec >> 1)) >> ydec;

      for si in 0..stripe_n {
        let (stripe_start_y, stripe_size) = if si == 0 {
          (0, (64 - 8) >> ydec)
        } else {
          let start = (si * 64 - 8) >> ydec;
          (
            start as isize,
            // one past, unlike spec
            (64 >> ydec).min(crop_h - start),
          )
        };

        // horizontally, go RU-by-RU
        for rux in 0..rp.cfg.cols {
          // stripe x pixel locations must be clipped to frame, last may need to stretch
          let x = rux * rp.cfg.unit_size;
          let size =
            if rux == rp.cfg.cols - 1 { crop_w - x } else { rp.cfg.unit_size };
          let ru = rp.restoration_unit_by_stripe(si, rux);
          match ru.filter {
            RestorationFilter::Wiener { coeffs } => {
              wiener_stripe_filter(
                coeffs,
                fi,
                crop_w,
                crop_h,
                size,
                stripe_size,
                x,
                stripe_start_y,
                &cdeffed.planes[pli],
                &pre_cdef.planes[pli],
                &mut out.planes[pli],
              );
            }
            RestorationFilter::Sgrproj { set, xqd } => {
              if !fi.sequence.enable_cdef {
                continue;
              }

              setup_integral_image(
                &mut stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                crop_w - x,
                (crop_h as isize - stripe_start_y) as usize,
                size,
                stripe_size,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &pre_cdef.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
              );

              sgrproj_stripe_filter(
                set,
                xqd,
                fi,
                &stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &mut out.planes[pli].region_mut(Area::Rect {
                  x: x as isize,
                  y: stripe_start_y,
                  width: size,
                  height: stripe_size,
                }),
              );
            }
            RestorationFilter::None => {
              // do nothing
            }
          }
        }
      }
    }
  }
}