1//! Optimized alpha blending routines based on libwebp
2//!
3//! <https://github.com/webmproject/libwebp/blob/e4f7a9f0c7c9fbfae1568bc7fa5c94b989b50872/src/demux/anim_decode.c#L215-L267>
4
/// Returns the bit offset of the `i`-th 8-bit channel inside a packed 32-bit pixel.
const fn channel_shift(i: u32) -> u32 {
    const BITS_PER_CHANNEL: u32 = 8;
    i * BITS_PER_CHANNEL
}
8
9/// Blend a single channel of `src` over `dst`, given their alpha channel values.
10/// `src` and `dst` are assumed to be NOT pre-multiplied by alpha.
11fn blend_channel_nonpremult(
12 src: u32,
13 src_a: u8,
14 dst: u32,
15 dst_a: u8,
16 scale: u32,
17 shift: u32,
18) -> u8 {
19 let src_channel: u8 = ((src >> shift) & 0xff) as u8;
20 let dst_channel: u8 = ((dst >> shift) & 0xff) as u8;
21 let blend_unscaled: u32 =
22 (u32::from(src_channel) * u32::from(src_a)) + (u32::from(dst_channel) * u32::from(dst_a));
23 debug_assert!(u64::from(blend_unscaled) < (1u64 << 32) / u64::from(scale));
24 ((blend_unscaled * scale) >> channel_shift(3)) as u8
25}
26
27/// Blend `src` over `dst` assuming they are NOT pre-multiplied by alpha.
28fn blend_pixel_nonpremult(src: u32, dst: u32) -> u32 {
29 let src_a = ((src >> channel_shift(3)) & 0xff) as u8;
30
31 if src_a == 0 {
32 dst
33 } else {
34 let dst_a = ((dst >> channel_shift(3)) & 0xff) as u8;
35 // Approximate integer arithmetic for: dst_factor_a = (dst_a * (255 - src_a)) / 255
36 // libwebp used the following formula here:
37 //let dst_factor_a = (dst_a as u32 * (256 - src_a as u32)) >> 8;
38 // however, we've found that we can use a more precise approximation without losing performance:
39 let dst_factor_a = div_by_255(u32::from(dst_a) * (255 - u32::from(src_a)));
40 let blend_a = u32::from(src_a) + dst_factor_a;
41 let scale = (1u32 << 24) / blend_a;
42
43 let blend_r =
44 blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(0));
45 let blend_g =
46 blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(1));
47 let blend_b =
48 blend_channel_nonpremult(src, src_a, dst, dst_factor_a as u8, scale, channel_shift(2));
49 debug_assert!(u32::from(src_a) + dst_factor_a < 256);
50
51 (u32::from(blend_r) << channel_shift(0))
52 | (u32::from(blend_g) << channel_shift(1))
53 | (u32::from(blend_b) << channel_shift(2))
54 | (blend_a << channel_shift(3))
55 }
56}
57
58pub(crate) fn do_alpha_blending(buffer: [u8; 4], canvas: [u8; 4]) -> [u8; 4] {
59 // The original C code contained different shift functions for different endianness,
60 // but they didn't work when ported to Rust directly (and probably didn't work in C either).
61 // So instead we reverse the order of bytes on big-endian here, at the interface.
62 // `from_le_bytes` is a no-op on little endian (most systems) and a cheap shuffle on big endian.
63 blend_pixel_nonpremult(src:u32::from_le_bytes(buffer), dst:u32::from_le_bytes(canvas)).to_le_bytes()
64}
65
/// Computes `v / 255` rounded to the nearest integer, whereas plain integer division
/// rounds down.
///
/// TODO: for inputs up to `255 * 255` the result never exceeds 255, so the output is
/// effectively a `u8`. Plumb that through the code.
//
// Sources:
// https://arxiv.org/pdf/2202.02864
// https://github.com/image-rs/image-webp/issues/119#issuecomment-2544007820
#[inline]
const fn div_by_255(v: u32) -> u32 {
    // Adding 0x80 first is what turns truncation into round-to-nearest.
    let biased = v + 0x80;
    ((biased >> 8) + biased) >> 8
}
76
#[cfg(test)]
mod tests {
    use super::*;

    /// Floating-point reference for blending `buffer` over `canvas`.
    ///
    /// Works on non-premultiplied pixels with alpha in the last byte and truncates every
    /// result towards zero. A blended alpha of zero yields all-zero color channels.
    fn do_alpha_blending_reference(buffer: [u8; 4], canvas: [u8; 4]) -> [u8; 4] {
        let src_alpha = f64::from(buffer[3]);
        let dst_alpha = f64::from(canvas[3]);
        let out_alpha_exact = src_alpha + dst_alpha * (1.0 - src_alpha / 255.0);
        // Value should be between 0 and 255; the cast truncates the fractional part.
        let out_alpha = out_alpha_exact as u8;

        if out_alpha == 0 {
            return [0, 0, 0, out_alpha];
        }

        let [red, green, blue] = [0usize, 1, 2].map(|i| {
            let src = f64::from(buffer[i]);
            let dst = f64::from(canvas[i]);
            let blended = (src * src_alpha + dst * dst_alpha * (1.0 - src_alpha / 255.0))
                / out_alpha_exact;
            // Value should be between 0 and 255; the cast truncates the fractional part.
            blended as u8
        });

        [red, green, blue, out_alpha]
    }

    /// Compares the optimized blend with the floating-point reference over a grid of
    /// single-channel colors and alpha values, tolerating a difference of at most 3.
    // NOTE(review): the ranges are half-open, so 255 is never exercised, and alpha starts
    // at 11 — confirm both are intentional.
    #[test]
    #[ignore] // takes too long to run on CI. Run this locally when changing the function.
    fn alpha_blending_optimization() {
        for r1 in 0..u8::MAX {
            for a1 in 11..u8::MAX {
                for r2 in 0..u8::MAX {
                    for a2 in 11..u8::MAX {
                        let src = [r1, 0, 0, a1];
                        let dst = [r2, 0, 0, a2];
                        let opt = do_alpha_blending(src, dst);
                        let slow = do_alpha_blending_reference(src, dst);
                        // libwebp doesn't do exact blending and so we don't either
                        assert!(
                            opt.iter().zip(&slow).all(|(o, s)| o.abs_diff(*s) <= 3),
                            "Mismatch in results! opt: {opt:?}, slow: {slow:?}, blended values: [{r1}, 0, 0, {a1}], [{r2}, 0, 0, {a2}]"
                        );
                    }
                }
            }
        }
    }
}
131