#[cfg(target_arch = "wasm32")]
use std::arch::wasm32::*;

#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
fn idct8(data: &mut [v128; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // i16x8_q15mulr_sat(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).
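    //
    // For example, the first constant below is round(0.5411961 * (1 << 15)) = 17734.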

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.

    let p2 = data[2];
    let p3 = data[6];
    let p1 = i16x8_q15mulr_sat(i16x8_add_sat(p2, p3), i16x8_splat(17734)); // 0.5411961
    let t2 = i16x8_sub_sat(
        i16x8_sub_sat(p1, p3),
        i16x8_q15mulr_sat(p3, i16x8_splat(27779)), // 0.847759065
    );
    let t3 = i16x8_add_sat(p1, i16x8_q15mulr_sat(p2, i16x8_splat(25079))); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = i16x8_add_sat(p2, p3);
    let t1 = i16x8_sub_sat(p2, p3);

    let x0 = i16x8_add_sat(t0, t3);
    let x3 = i16x8_sub_sat(t0, t3);
    let x1 = i16x8_add_sat(t1, t2);
    let x2 = i16x8_sub_sat(t1, t2);

    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = i16x8_add_sat(t0, t2);
    let p4 = i16x8_add_sat(t1, t3);
    let p1 = i16x8_add_sat(t0, t3);
    let p2 = i16x8_add_sat(t1, t2);
    let p5 = i16x8_add_sat(p3, p4);
    let p5 = i16x8_add_sat(p5, i16x8_q15mulr_sat(p5, i16x8_splat(5763))); // 0.175875602

    let t0 = i16x8_q15mulr_sat(t0, i16x8_splat(9786)); // 0.298631336
    let t1 = i16x8_add_sat(
        i16x8_add_sat(t1, t1),
        i16x8_q15mulr_sat(t1, i16x8_splat(1741)), // 0.053119869
    );
    let t2 = i16x8_add_sat(
        i16x8_add_sat(t2, i16x8_add_sat(t2, t2)),
        i16x8_q15mulr_sat(t2, i16x8_splat(2383)), // 0.072711026
    );
    let t3 = i16x8_add_sat(t3, i16x8_q15mulr_sat(t3, i16x8_splat(16427))); // 0.501321110

    let p1 = i16x8_sub_sat(p5, i16x8_q15mulr_sat(p1, i16x8_splat(29490))); // 0.899976223
    let p2 = i16x8_sub_sat(
        i16x8_sub_sat(i16x8_sub_sat(p5, p2), p2),
        i16x8_q15mulr_sat(p2, i16x8_splat(18446)), // 0.562915447
    );

    let p3 = i16x8_sub_sat(
        i16x8_q15mulr_sat(p3, i16x8_splat(-31509)), // -0.961570560
        p3,
    );
    let p4 = i16x8_q15mulr_sat(p4, i16x8_splat(-12785)); // -0.390180644

    let t3 = i16x8_add_sat(i16x8_add_sat(p1, p4), t3);
    let t2 = i16x8_add_sat(i16x8_add_sat(p2, p3), t2);
    let t1 = i16x8_add_sat(i16x8_add_sat(p2, p4), t1);
    let t0 = i16x8_add_sat(i16x8_add_sat(p1, p3), t0);

    data[0] = i16x8_add_sat(x0, t3);
    data[7] = i16x8_sub_sat(x0, t3);
    data[1] = i16x8_add_sat(x1, t2);
    data[6] = i16x8_sub_sat(x1, t2);
    data[2] = i16x8_add_sat(x2, t1);
    data[5] = i16x8_sub_sat(x2, t1);
    data[3] = i16x8_add_sat(x3, t0);
    data[4] = i16x8_sub_sat(x3, t0);
}

#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
fn transpose8(data: &mut [v128; 8]) {
    // Transpose an 8x8 matrix with a sequence of interleaving operations.
    // Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
    // A0 B0 A1 B1 ...
    // dABCDll contains elements from the lower quarters (ll) of vectors A, B, C, D, interleaved:
    // A0 B0 C0 D0 A1 B1 C1 D1 ...
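    //
    // The three interleaving rounds below double the run length of consecutive source elements
    // each time (16-bit lanes, then 32-bit pairs, then 64-bit quadruples); after the last round,
    // vector i holds column i of the original matrix.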
    let d01l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[0], data[1]);
    let d23l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[2], data[3]);
    let d45l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[4], data[5]);
    let d67l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[6], data[7]);
    let d01h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[0], data[1]);
    let d23h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[2], data[3]);
    let d45h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[4], data[5]);
    let d67h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[6], data[7]);

    // Operating on 32 bits will interleave *consecutive pairs* of 16-bit integers.
    let d0123ll = i32x4_shuffle::<0, 4, 1, 5>(d01l, d23l);
    let d0123lh = i32x4_shuffle::<2, 6, 3, 7>(d01l, d23l);
    let d4567ll = i32x4_shuffle::<0, 4, 1, 5>(d45l, d67l);
    let d4567lh = i32x4_shuffle::<2, 6, 3, 7>(d45l, d67l);
    let d0123hl = i32x4_shuffle::<0, 4, 1, 5>(d01h, d23h);
    let d0123hh = i32x4_shuffle::<2, 6, 3, 7>(d01h, d23h);
    let d4567hl = i32x4_shuffle::<0, 4, 1, 5>(d45h, d67h);
    let d4567hh = i32x4_shuffle::<2, 6, 3, 7>(d45h, d67h);

    // Operating on 64 bits will interleave *consecutive quadruples* of 16-bit integers.
    data[0] = i64x2_shuffle::<0, 2>(d0123ll, d4567ll);
    data[1] = i64x2_shuffle::<1, 3>(d0123ll, d4567ll);
    data[2] = i64x2_shuffle::<0, 2>(d0123lh, d4567lh);
    data[3] = i64x2_shuffle::<1, 3>(d0123lh, d4567lh);
    data[4] = i64x2_shuffle::<0, 2>(d0123hl, d4567hl);
    data[5] = i64x2_shuffle::<1, 3>(d0123hl, d4567hl);
    data[6] = i64x2_shuffle::<0, 2>(d0123hh, d4567hh);
    data[7] = i64x2_shuffle::<1, 3>(d0123hh, d4567hh);
}

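/// Dequantizes an 8x8 block of DCT coefficients with `quantization_table`, applies the inverse
/// DCT, and writes the resulting 8x8 block of 128-centered u8 samples to `output`, one row every
/// `output_linestride` bytes.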
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
pub fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0 <= i < 8. Thus, the last accessed position is at an offset of
    // output_linestride * 7 + 7, and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    const SHIFT: u32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    let mut data = [i16x8_splat(0); 8];
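    // SAFETY: `coefficients` and `quantization_table` are exactly 64 elements long, so each
    // 16-byte load below (at element offsets 0, 8, ..., 56) is in-bounds.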
    unsafe {
        for i in 0..8 {
            data[i] = i16x8_shl(
                i16x8_mul(
                    v128_load(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
                    v128_load(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
                ),
                SHIFT,
            );
        }
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for i in 0..8 {
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want a rounding right shift, so we add (1/2) << (SHIFT + 3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
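        // With SHIFT = 3 these evaluate to OFFSET = 128 << 6 = 8192 and ROUNDING_BIAS = 32.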

        let data_with_offset = i16x8_add_sat(data[i], i16x8_splat(OFFSET + ROUNDING_BIAS));

        // SAFETY: the assert at the start of this function ensures
        // `output_linestride * i + 7 < output.len()`, so all accesses are in-bounds.
        unsafe {
            v128_store64_lane::<0>(
                u8x16_narrow_i16x8(
                    i16x8_shr(data_with_offset, SHIFT + 3),
                    i16x8_splat(0),
                ),
                output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
            );
        }
    }
}

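/// Converts a line of YCbCr samples to interleaved RGB, 8 pixels at a time. Returns the number
/// of pixels converted; any remainder (fewer than 8 pixels) is left for the caller to convert
/// with scalar code.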
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
pub fn color_convert_line_ycbcr(
    y_slice: &[u8],
    cb_slice: &[u8],
    cr_slice: &[u8],
    output: &mut [u8],
) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y_slice.len());
    assert!(num <= cb_slice.len());
    assert!(num <= cr_slice.len());

    let num_vecs = num / 8;

    for i in 0..num_vecs {
        const SHIFT: u32 = 6;
        // Load.
        let y: v128;
        let cb: v128;
        let cr: v128;
        // SAFETY: i is at most `num / 8 - 1`, so the highest v128_load64_zero reads the eight
        // bytes ending at offset `8 * num_vecs <= num`. The asserts above ensure this is
        // in-bounds for all three slices.
        unsafe {
            y = v128_load64_zero(y_slice.as_ptr().wrapping_add(i * 8) as *const _);
            cb = v128_load64_zero(cb_slice.as_ptr().wrapping_add(i * 8) as *const _);
            cr = v128_load64_zero(cr_slice.as_ptr().wrapping_add(i * 8) as *const _);
        }

        // Convert to 16 bit.
        let y = i16x8_shl(i16x8_extend_low_u8x16(y), SHIFT);
        let cb = i16x8_shl(i16x8_extend_low_u8x16(cb), SHIFT);
        let cr = i16x8_shl(i16x8_extend_low_u8x16(cr), SHIFT);

        // Add offsets: center cb and cr on zero, and pre-add to y the rounding bias for the
        // final right shift.
        let c128 = i16x8_splat(128 << SHIFT);
        let y = i16x8_add_sat(y, i16x8_splat((1 << SHIFT) >> 1));
        let cb = i16x8_sub_sat(cb, c128);
        let cr = i16x8_sub_sat(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772.
        let cr_140200 = i16x8_add_sat(i16x8_q15mulr_sat(cr, i16x8_splat(13173)), cr);
        let cb_034414 = i16x8_q15mulr_sat(cb, i16x8_splat(11276));
        let cr_071414 = i16x8_q15mulr_sat(cr, i16x8_splat(23401));
        let cb_177200 = i16x8_add_sat(i16x8_q15mulr_sat(cb, i16x8_splat(25297)), cb);
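        // As in idct8, factors of at least 1 are split into an integer part that is added
        // directly and a fractional part scaled by 1 << 15, e.g. 1.402 = 1 + 0.402 with
        // round(0.402 * (1 << 15)) = 13173.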

        // Last conversion step.
        let r = i16x8_add_sat(y, cr_140200);
        let g = i16x8_sub_sat(y, i16x8_add_sat(cb_034414, cr_071414));
        let b = i16x8_add_sat(y, cb_177200);

        // Shift back and convert to u8.
        let zero = u8x16_splat(0);
        let r = u8x16_narrow_i16x8(i16x8_shr(r, SHIFT), zero);
        let g = u8x16_narrow_i16x8(i16x8_shr(g, SHIFT), zero);
        let b = u8x16_narrow_i16x8(i16x8_shr(b, SHIFT), zero);

        // Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
        let rg_lanes = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(r, g);

        let rgb_low = i8x16_shuffle::<
            0, 1, 16, // r0, g0, b0
            2, 3, 17, // r1, g1, b1
            4, 5, 18, // r2, g2, b2
            6, 7, 19, // r3, g3, b3
            8, 9, 20, // r4, g4, b4
            10, // r5
        >(rg_lanes, b);

        let rgb_hi = i8x16_shuffle::<
            11, 21, 12, // g5, b5, r6
            13, 22, 14, // g6, b6, r7
            15, 23, 0, // g7, b7, --
            0, 0, 0, // --, --, --
            0, 0, 0, // --, --, --
            0, // --
        >(rg_lanes, b);

        // SAFETY: i is at most `output.len() / 24 - 1`, so the highest byte written is at
        // offset `24 * i + 23 <= output.len() - 1`.
        unsafe {
            v128_store(output.as_mut_ptr().wrapping_add(24 * i) as *mut _, rgb_low);
            v128_store64_lane::<0>(
                rgb_hi,
                output.as_mut_ptr().wrapping_add(24 * i + 16) as *mut _,
            );
        }
    }

    num_vecs * 8
}
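
// A minimal smoke-test sketch, not part of the original module: it assumes the crate is tested
// on wasm32 with simd128 enabled at compile time (and a wasm-capable test runner such as
// wasm-bindgen-test; plain `#[test]` is used here for illustration). On wasm targets,
// `#[target_feature]` functions are safe to call, so no `unsafe` block is needed here.
#[cfg(all(test, target_arch = "wasm32", target_feature = "simd128"))]
mod tests {
    use super::*;

    // The IDCT of a DC-only coefficient block is constant, so with a flat quantization table
    // every output sample should be identical.
    #[test]
    fn dc_only_block_decodes_flat() {
        let mut coefficients = [0i16; 64];
        coefficients[0] = 80; // DC term only.
        let quantization_table = [1u16; 64];
        let mut output = [0u8; 64];
        dequantize_and_idct_block_8x8(&coefficients, &quantization_table, 8, &mut output);
        assert!(output.iter().all(|&v| v == output[0]));
    }

    // Neutral chroma (cb = cr = 128) should leave r = g = b = y.
    #[test]
    fn neutral_ycbcr_maps_to_gray() {
        let y = [128u8; 8];
        let cb = [128u8; 8];
        let cr = [128u8; 8];
        let mut output = [0u8; 24];
        let converted = color_convert_line_ycbcr(&y, &cb, &cr, &mut output);
        assert_eq!(converted, 8);
        assert!(output.iter().all(|&v| v == 128));
    }
}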
278 | |