// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::dist::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::dist::*;
  } else {
    pub use self::rust::*;
  }
}

pub(crate) mod rust {
  use crate::activity::apply_ssim_boost;
  use crate::cpu_features::CpuFeatureLevel;
  use crate::tiling::*;
  use crate::util::*;

  use crate::encoder::IMPORTANCE_BLOCK_SIZE;
  use crate::rdo::DistortionScale;

  /// Compute the sum of absolute differences over a block.
  /// w and h can be at most 128, the size of the largest block.
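  ///
  /// A minimal usage sketch (illustrative only: the plane dimensions and the
  /// `Area` anchor are arbitrary; `Plane::new` and `region` are used as in
  /// the tests at the bottom of this file):
  ///
  /// ```ignore
  /// let plane_a: Plane<u8> = Plane::new(64, 64, 0, 0, 0, 0);
  /// let plane_b: Plane<u8> = Plane::new(64, 64, 0, 0, 0, 0);
  /// let sad = get_sad(
  ///   &plane_a.region(Area::StartingAt { x: 0, y: 0 }),
  ///   &plane_b.region(Area::StartingAt { x: 0, y: 0 }),
  ///   64, 64, 8, CpuFeatureLevel::default(),
  /// );
  /// ```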
  pub fn get_sad<T: Pixel>(
    plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    debug_assert!(w <= 128 && h <= 128);
    let plane_org =
      plane_org.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
    let plane_ref =
      plane_ref.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });

    plane_org
      .rows_iter()
      .zip(plane_ref.rows_iter())
      .map(|(src, dst)| {
        src
          .iter()
          .zip(dst)
          .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2)))
          .sum::<u32>()
      })
      .sum()
  }

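  /// One butterfly stage of the Hadamard transform: maps `(a, b)` to
  /// `(a + b, a - b)`.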
  #[inline(always)]
  const fn butterfly(a: i32, b: i32) -> (i32, i32) {
    ((a + b), (a - b))
  }

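  /// In-place 1-D 4-point Hadamard transform over `N` lanes of `data`:
  /// lane `i` starts at `data[i * STRIDE0]`, and the four transform inputs
  /// within a lane are `STRIDE1` elements apart.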
  #[inline(always)]
  #[allow(clippy::identity_op, clippy::erasing_op)]
  fn hadamard4_1d<
    const LEN: usize,
    const N: usize,
    const STRIDE0: usize,
    const STRIDE1: usize,
  >(
    data: &mut [i32; LEN],
  ) {
    for i in 0..N {
      let sub: &mut [i32] = &mut data[i * STRIDE0..];
      let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
      let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
      let (b0, b2) = butterfly(a0, a2);
      let (b1, b3) = butterfly(a1, a3);
      sub[0 * STRIDE1] = b0;
      sub[1 * STRIDE1] = b1;
      sub[2 * STRIDE1] = b2;
      sub[3 * STRIDE1] = b3;
    }
  }

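  /// As `hadamard4_1d`, but an 8-point transform (three butterfly stages
  /// instead of two).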
  #[inline(always)]
  #[allow(clippy::identity_op, clippy::erasing_op)]
  fn hadamard8_1d<
    const LEN: usize,
    const N: usize,
    const STRIDE0: usize,
    const STRIDE1: usize,
  >(
    data: &mut [i32; LEN],
  ) {
    for i in 0..N {
      let sub: &mut [i32] = &mut data[i * STRIDE0..];

      let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
      let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
      let (a4, a5) = butterfly(sub[4 * STRIDE1], sub[5 * STRIDE1]);
      let (a6, a7) = butterfly(sub[6 * STRIDE1], sub[7 * STRIDE1]);

      let (b0, b2) = butterfly(a0, a2);
      let (b1, b3) = butterfly(a1, a3);
      let (b4, b6) = butterfly(a4, a6);
      let (b5, b7) = butterfly(a5, a7);

      let (c0, c4) = butterfly(b0, b4);
      let (c1, c5) = butterfly(b1, b5);
      let (c2, c6) = butterfly(b2, b6);
      let (c3, c7) = butterfly(b3, b7);

      sub[0 * STRIDE1] = c0;
      sub[1 * STRIDE1] = c1;
      sub[2 * STRIDE1] = c2;
      sub[3 * STRIDE1] = c3;
      sub[4 * STRIDE1] = c4;
      sub[5 * STRIDE1] = c5;
      sub[6 * STRIDE1] = c6;
      sub[7 * STRIDE1] = c7;
    }
  }

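  /// 2-D `W`x`H` Hadamard transform: a vertical 1-D pass down each column
  /// followed by a horizontal 1-D pass across each row.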
  #[inline(always)]
  fn hadamard2d<const LEN: usize, const W: usize, const H: usize>(
    data: &mut [i32; LEN],
  ) {
    /*Vertical transform.*/
    let vert_func = if H == 4 {
      hadamard4_1d::<LEN, W, 1, H>
    } else {
      hadamard8_1d::<LEN, W, 1, H>
    };
    vert_func(data);
    /*Horizontal transform.*/
    let horz_func = if W == 4 {
      hadamard4_1d::<LEN, H, W, 1>
    } else {
      hadamard8_1d::<LEN, H, W, 1>
    };
    horz_func(data);
  }

  // SAFETY: The length of data must be 16.
  unsafe fn hadamard4x4(data: &mut [i32]) {
    hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16]));
  }

  // SAFETY: The length of data must be 64.
  unsafe fn hadamard8x8(data: &mut [i32]) {
    hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64]));
  }

  /// Sum of absolute transformed differences over a block.
  /// w and h can be at most 128, the size of the largest block.
  /// Uses the sum of 4x4 and 8x8 Hadamard transforms, but reverts to SAD on
  /// edge blocks where these transforms do not fit into w and h.
  /// 4x4 transforms are used instead of 8x8 when width or height < 8.
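  ///
  /// A usage sketch on the planes built by `setup_planes` in the tests below
  /// (block size and offset are illustrative):
  ///
  /// ```ignore
  /// let (input_plane, rec_plane) = setup_planes::<u8>();
  /// let satd = get_satd(
  ///   &input_plane.region(Area::StartingAt { x: 32, y: 40 }),
  ///   &rec_plane.region(Area::StartingAt { x: 32, y: 40 }),
  ///   16, 16, 8, CpuFeatureLevel::default(),
  /// );
  /// ```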
  pub fn get_satd<T: Pixel>(
    plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    assert!(w <= 128 && h <= 128);
    assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
    assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h);

    // Size of hadamard transform should be 4x4 or 8x8.
    // 4x* and *x4 blocks use 4x4; all others use 8x8.
    let size: usize = w.min(h).min(8);
    let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 };

    let mut sum: u64 = 0;

    // Loop over chunks the size of the chosen transform.
    for chunk_y in (0..h).step_by(size) {
      let chunk_h = (h - chunk_y).min(size);
      for chunk_x in (0..w).step_by(size) {
        let chunk_w = (w - chunk_x).min(size);
        let chunk_area: Area = Area::Rect {
          x: chunk_x as isize,
          y: chunk_y as isize,
          width: chunk_w,
          height: chunk_h,
        };
        let chunk_org = plane_org.subregion(chunk_area);
        let chunk_ref = plane_ref.subregion(chunk_area);

        // Revert to SAD on edge blocks (frame edges).
        if chunk_w != size || chunk_h != size {
          sum += get_sad(
            &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu,
          ) as u64;
          continue;
        }

        let buf: &mut [i32] = &mut [0; 8 * 8][..size * size];

        // Move the pixel differences into a buffer.
        for (row_diff, (row_org, row_ref)) in buf
          .chunks_mut(size)
          .zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter()))
        {
          for (diff, (a, b)) in
            row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter()))
          {
            *diff = i32::cast_from(*a) - i32::cast_from(*b);
          }
        }

        // Perform the hadamard transform on the differences.
        // SAFETY: A sufficient number of elements exist for the size of the
        // transform.
        unsafe {
          tx2d(buf);
        }

        // Sum the absolute values of the transformed differences.
        sum += buf.iter().map(|a| a.unsigned_abs() as u64).sum::<u64>();
      }
    }

    // Normalize the results: a round-to-nearest right shift by log2(size),
    // adding half the divisor (1 << ln >> 1) before shifting.
    let ln = msb(size as i32) as u64;
    ((sum + (1 << ln >> 1)) >> ln) as u32
  }

  /// Number of bits rounded off before summing in `get_weighted_sse`.
  pub const GET_WEIGHTED_SSE_SHIFT: u8 = 8;

  /// Computes weighted sum of squared error.
  ///
  /// Each scale is applied to a 4x4 region in the provided inputs. Each scale
  /// value is a fixed point number, currently [`DistortionScale`].
  ///
  /// Implementations can require alignment (`bw` (block width) for [`src1`]
  /// and [`src2`] and `bw/4` for `scale`).
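  ///
  /// A hedged usage sketch: an 8x8 comparison covers a 2x2 grid of 4x4 scale
  /// chunks, so `scale` holds four entries and `scale_stride` is 2. It is
  /// assumed here that `DistortionScale::new(1, 1)` encodes a unity weight;
  /// `src1` and `src2` stand for any two matching plane regions.
  ///
  /// ```ignore
  /// let scale = [DistortionScale::new(1, 1).0; 4];
  /// let wsse = get_weighted_sse(
  ///   &src1, &src2, &scale, 2, 8, 8, 8, CpuFeatureLevel::default(),
  /// );
  /// ```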
  #[inline(never)]
  pub fn get_weighted_sse<T: Pixel>(
    src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
    scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
    _cpu: CpuFeatureLevel,
  ) -> u64 {
    let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
    // Always chunk and apply scaling on the sse of squares the size of
    // decimated/sub-sampled importance block sizes.
    // Warning: Changing this will require changing/disabling assembly.
    let chunk_size: usize = IMPORTANCE_BLOCK_SIZE >> 1;

    // Iterator of a row of scales, stretched out to be per row
    let scales = scale.chunks_exact(scale_stride);

    let sse = src1
      .vert_windows(chunk_size)
      .step_by(chunk_size)
      .zip(src2.vert_windows(chunk_size).step_by(chunk_size))
      .zip(scales)
      .map(|((row1, row2), scales)| {
        row1
          .horz_windows(chunk_size)
          .step_by(chunk_size)
          .zip(row2.horz_windows(chunk_size).step_by(chunk_size))
          .zip(scales)
          .map(|((chunk1, chunk2), &scale)| {
            let sum = chunk1
              .rows_iter()
              .zip(chunk2.rows_iter())
              .map(|(chunk_row1, chunk_row2)| {
                chunk_row1
                  .iter()
                  .zip(chunk_row2)
                  .map(|(&a, &b)| {
                    let c = i32::cast_from(a) - i32::cast_from(b);
                    (c * c) as u32
                  })
                  .sum::<u32>()
              })
              .sum::<u32>();
            (sum as u64 * scale as u64 + (1 << GET_WEIGHTED_SSE_SHIFT >> 1))
              >> GET_WEIGHTED_SSE_SHIFT
          })
          .sum::<u64>()
      })
      .sum::<u64>();

    let den = DistortionScale::new(1, 1 << GET_WEIGHTED_SSE_SHIFT).0 as u64;
    (sse + (den >> 1)) / den
  }

  /// Number of bits of precision used in `AREA_DIVISORS`.
  const AREA_DIVISOR_BITS: u8 = 14;

  /// Lookup table for 2^`AREA_DIVISOR_BITS` / (1 + x).
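  ///
  /// Indexed by area - 1, e.g. `AREA_DIVISORS[8 * 8 - 1] == 256`, which is
  /// 2^14 / 64.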
  #[rustfmt::skip]
  const AREA_DIVISORS: [u16; 64] = [
    16384, 8192, 5461, 4096, 3277, 2731, 2341, 2048, 1820, 1638, 1489, 1365,
     1260, 1170, 1092, 1024,  964,  910,  862,  819,  780,  745,  712,  683,
      655,  630,  607,  585,  565,  546,  529,  512,  496,  482,  468,  455,
      443,  431,  420,  410,  400,  390,  381,  372,  364,  356,  349,  341,
      334,  328,  321,  315,  309,  303,  298,  293,  287,  282,  278,  273,
      269,  264,  260,  256,
  ];

  /// Computes a distortion metric of the sum of squares weighted by activity.
  /// w and h should be <= 8.
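  ///
  /// A usage sketch (parameters illustrative; `src_plane` and `dst_plane`
  /// stand for any non-subsampled planes, per the debug asserts below):
  ///
  /// ```ignore
  /// let dist = cdef_dist_kernel(
  ///   &src_plane.region(Area::StartingAt { x: 0, y: 0 }),
  ///   &dst_plane.region(Area::StartingAt { x: 0, y: 0 }),
  ///   8, 8, 8, CpuFeatureLevel::default(),
  /// );
  /// ```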
  #[inline(never)]
  pub fn cdef_dist_kernel<T: Pixel>(
    src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
    bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    // TODO: Investigate using different constants in ssim boost for block
    // sizes smaller than 8x8.

    debug_assert!(src.plane_cfg.xdec == 0);
    debug_assert!(src.plane_cfg.ydec == 0);
    debug_assert!(dst.plane_cfg.xdec == 0);
    debug_assert!(dst.plane_cfg.ydec == 0);

    // Limit kernel to 8x8
    debug_assert!(w <= 8);
    debug_assert!(h <= 8);

    // Compute the following summations.
    let mut sum_s: u32 = 0; // sum(src_{i,j})
    let mut sum_d: u32 = 0; // sum(dst_{i,j})
    let mut sum_s2: u32 = 0; // sum(src_{i,j}^2)
    let mut sum_d2: u32 = 0; // sum(dst_{i,j}^2)
    let mut sum_sd: u32 = 0; // sum(src_{i,j} * dst_{i,j})
    for (row1, row2) in src.rows_iter().take(h).zip(dst.rows_iter()) {
      for (s, d) in row1[..w].iter().zip(row2) {
        let s: u32 = u32::cast_from(*s);
        let d: u32 = u32::cast_from(*d);
        sum_s += s;
        sum_d += d;

        sum_s2 += s * s;
        sum_d2 += d * d;
        sum_sd += s * d;
      }
    }

    // To get the distortion, compute sum of squared error and apply a weight
    // based on the variance of the two planes.
    let sse = sum_d2 + sum_s2 - 2 * sum_sd;

    // Convert to 64-bits to avoid overflow when squaring.
    let sum_s = sum_s as u64;
    let sum_d = sum_d as u64;

    // Calculate the variance (more accurately variance*area) of each plane.
    // var[iance] = avg(X^2) - avg(X)^2 = sum(X^2) / n - sum(X)^2 / n^2
    // (n = # samples i.e. area)
    // var * n = sum(X^2) - sum(X)^2 / n
    // When w and h are powers of two, this can be done via shifting.
    let div = AREA_DIVISORS[w * h - 1] as u64;
    let div_shift = AREA_DIVISOR_BITS;
    // Due to rounding, negative values can occur when w or h aren't powers of
    // two. Saturate to avoid underflow.
    let mut svar = sum_s2.saturating_sub(
      ((sum_s * sum_s * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );
    let mut dvar = sum_d2.saturating_sub(
      ((sum_d * sum_d * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );

    // Scale variances up to 8x8 size.
    //   scaled variance = var * (8x8) / wxh
    // For 8x8, this is a nop. For powers of 2, this is doable with shifting.
    // Multiplying by div ~= 2^14 / (w * h) and shifting right by 14 - 6
    // leaves a net factor of 2^6 / (w * h) = 64 / (w * h).
    // TODO: It should be possible and faster to do this adjustment in ssim
    // boost.
    let scale_shift = AREA_DIVISOR_BITS - 6;
    svar =
      ((svar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
    dvar =
      ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;

    apply_ssim_boost(sse, svar, dvar, bit_depth)
  }
}

#[cfg(test)]
pub mod test {
  use super::*;
  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::*;
  use crate::tiling::Area;
  use crate::util::Pixel;

  // Generate plane data for get_sad_same() and get_satd_same()
  fn setup_planes<T: Pixel>() -> (Plane<T>, Plane<T>) {
    // Two planes with different strides
    let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8);
    let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8);

    // Make the test pattern robust to data alignment
    let xpad_off =
      (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32;

    for (i, row) in
      input_plane.data.chunks_mut(input_plane.cfg.stride).enumerate()
    {
      for (j, pixel) in row.iter_mut().enumerate() {
        let val = ((j + i) as i32 - xpad_off) & 255i32;
        assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
        *pixel = T::cast_from(val);
      }
    }

    for (i, row) in rec_plane.data.chunks_mut(rec_plane.cfg.stride).enumerate()
    {
      for (j, pixel) in row.iter_mut().enumerate() {
        let val = (j as i32 - i as i32 - xpad_off) & 255i32;
        assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
        *pixel = T::cast_from(val);
      }
    }

    (input_plane, rec_plane)
  }

  // Regression and validation test for SAD computation
  fn get_sad_same_inner<T: Pixel>() {
    // dynamic allocation: test
    let blocks: Vec<(usize, usize, u32)> = vec![
      (4, 4, 1912),
      (4, 8, 4296),
      (8, 4, 3496),
      (8, 8, 7824),
      (8, 16, 16592),
      (16, 8, 14416),
      (16, 16, 31136),
      (16, 32, 60064),
      (32, 16, 59552),
      (32, 32, 120128),
      (32, 64, 186688),
      (64, 32, 250176),
      (64, 64, 438912),
      (64, 128, 654272),
      (128, 64, 1016768),
      (128, 128, 1689792),
      (4, 16, 8680),
      (16, 4, 6664),
      (8, 32, 31056),
      (32, 8, 27600),
      (16, 64, 93344),
      (64, 16, 116384),
    ];

    let bit_depth: usize = 8;
    let (input_plane, rec_plane) = setup_planes::<T>();

    for (w, h, distortion) in blocks {
      let area = Area::StartingAt { x: 32, y: 40 };

      let input_region = input_plane.region(area);
      let rec_region = rec_plane.region(area);

      assert_eq!(
        distortion,
        get_sad(
          &input_region,
          &rec_region,
          w,
          h,
          bit_depth,
          CpuFeatureLevel::default()
        )
      );
    }
  }

  #[test]
  fn get_sad_same_u8() {
    get_sad_same_inner::<u8>();
  }

  #[test]
  fn get_sad_same_u16() {
    get_sad_same_inner::<u16>();
  }

  // Regression and validation test for SATD computation
  fn get_satd_same_inner<T: Pixel>() {
    let blocks: Vec<(usize, usize, u32)> = vec![
      (4, 4, 1408),
      (4, 8, 2016),
      (8, 4, 1816),
      (8, 8, 3984),
      (8, 16, 5136),
      (16, 8, 4864),
      (16, 16, 9984),
      (16, 32, 13824),
      (32, 16, 13760),
      (32, 32, 27952),
      (32, 64, 37168),
      (64, 32, 45104),
      (64, 64, 84176),
      (64, 128, 127920),
      (128, 64, 173680),
      (128, 128, 321456),
      (4, 16, 3136),
      (16, 4, 2632),
      (8, 32, 7056),
      (32, 8, 6624),
      (16, 64, 18432),
      (64, 16, 21312),
    ];

    let bit_depth: usize = 8;
    let (input_plane, rec_plane) = setup_planes::<T>();

    for (w, h, distortion) in blocks {
      let area = Area::StartingAt { x: 32, y: 40 };

      let input_region = input_plane.region(area);
      let rec_region = rec_plane.region(area);

      assert_eq!(
        distortion,
        get_satd(
          &input_region,
          &rec_region,
          w,
          h,
          bit_depth,
          CpuFeatureLevel::default()
        )
      );
    }
  }

  #[test]
  fn get_satd_same_u8() {
    get_satd_same_inner::<u8>();
  }

  #[test]
  fn get_satd_same_u16() {
    get_satd_same_inner::<u16>();
  }
}