// neon.rs source code [crates/jpeg_decoder/src/arch/neon.rs]

#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
use core::arch::aarch64::*;
3
/// Runs one 1-D, 8-point inverse DCT pass over `data`, in place.
///
/// `data[k]` is treated as input `k` of the transform; each of the eight `i16` lanes of the
/// vectors is an independent transform, so a single call processes eight of them at once.
/// All additions and subtractions saturate (`vqaddq_s16` / `vqsubq_s16`) instead of wrapping.
///
/// # Safety
///
/// The function is compiled with `#[target_feature(enable = "neon")]`; the caller must make
/// sure the executing CPU supports NEON.
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
unsafe fn idct8(data: &mut [int16x8_t; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // vqrdmulhq_n_s16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).
    //
    // Multipliers with a magnitude of 1 or more do not fit in that representation, so their
    // integer part is applied with explicit saturating adds/subtracts of the operand itself.

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.

    // Even part: inputs 2 and 6.
    let p2 = data[2];
    let p3 = data[6];
    let p1 = vqrdmulhq_n_s16(vqaddq_s16(p2, p3), 17734); // 0.5411961
    // t2 = p1 - p3 * 1.847759065
    let t2 = vqsubq_s16(
        vqsubq_s16(p1, p3),
        vqrdmulhq_n_s16(p3, 27779), // 0.847759065
    );
    let t3 = vqaddq_s16(p1, vqrdmulhq_n_s16(p2, 25079)); // 0.765366865

    // Even part: inputs 0 and 4.
    let p2 = data[0];
    let p3 = data[4];
    let t0 = vqaddq_s16(p2, p3);
    let t1 = vqsubq_s16(p2, p3);

    let x0 = vqaddq_s16(t0, t3);
    let x3 = vqsubq_s16(t0, t3);
    let x1 = vqaddq_s16(t1, t2);
    let x2 = vqsubq_s16(t1, t2);

    // Odd part: inputs 7, 5, 3 and 1.
    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = vqaddq_s16(t0, t2);
    let p4 = vqaddq_s16(t1, t3);
    let p1 = vqaddq_s16(t0, t3);
    let p2 = vqaddq_s16(t1, t2);
    let p5 = vqaddq_s16(p3, p4);
    // p5 *= 1.175875602
    let p5 = vqaddq_s16(p5, vqrdmulhq_n_s16(p5, 5763)); // 0.175875602

    let t0 = vqrdmulhq_n_s16(t0, 9786); // 0.298631336
    // t1 *= 2.053119869
    let t1 = vqaddq_s16(
        vqaddq_s16(t1, t1),
        vqrdmulhq_n_s16(t1, 1741), // 0.053119869
    );
    // t2 *= 3.072711026
    let t2 = vqaddq_s16(
        vqaddq_s16(t2, vqaddq_s16(t2, t2)),
        vqrdmulhq_n_s16(t2, 2383), // 0.072711026
    );
    // t3 *= 1.501321110
    let t3 = vqaddq_s16(t3, vqrdmulhq_n_s16(t3, 16427)); // 0.501321110

    // p1 = p5 - p1 * 0.899976223
    let p1 = vqsubq_s16(p5, vqrdmulhq_n_s16(p1, 29490)); // 0.899976223
    // p2 = p5 - p2 * 2.562915447
    let p2 = vqsubq_s16(
        vqsubq_s16(vqsubq_s16(p5, p2), p2),
        vqrdmulhq_n_s16(p2, 18446), // 0.562915447
    );

    // p3 *= -1.961570560
    let p3 = vqsubq_s16(
        vqrdmulhq_n_s16(p3, -31509), // -0.961570560
        p3,
    );
    let p4 = vqrdmulhq_n_s16(p4, -12785); // -0.390180644

    let t3 = vqaddq_s16(vqaddq_s16(p1, p4), t3);
    let t2 = vqaddq_s16(vqaddq_s16(p2, p3), t2);
    let t1 = vqaddq_s16(vqaddq_s16(p2, p4), t1);
    let t0 = vqaddq_s16(vqaddq_s16(p1, p3), t0);

    // Final butterflies: output k and output 7-k are the sum and difference of the same pair.
    data[0] = vqaddq_s16(x0, t3);
    data[7] = vqsubq_s16(x0, t3);
    data[1] = vqaddq_s16(x1, t2);
    data[6] = vqsubq_s16(x1, t2);
    data[2] = vqaddq_s16(x2, t1);
    data[5] = vqsubq_s16(x2, t1);
    data[3] = vqaddq_s16(x3, t0);
    data[4] = vqsubq_s16(x3, t0);
}
83
/// Transposes, in place, the 8x8 matrix of `i16` values whose row `r` is `data[r]`.
///
/// # Safety
///
/// The function is compiled with `#[target_feature(enable = "neon")]`; the caller must make
/// sure the executing CPU supports NEON.
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
unsafe fn transpose8(data: &mut [int16x8_t; 8]) {
    // Use NEON's 2x2 matrix transposes (vtrn) to do the transposition in each 4x4 block, then
    // combine the 4x4 blocks.

    // 16-bit stage on rows 0..4: `.0` interleaves the even columns of a row pair, `.1` the odd
    // columns.
    let a01 = vtrnq_s16(data[0], data[1]);
    let a23 = vtrnq_s16(data[2], data[3]);

    // 32-bit stage on rows 0..4: four0.0 / four0.1 hold columns (0, 4) / (2, 6) and
    // four1.0 / four1.1 hold columns (1, 5) / (3, 7), each as two 4-element halves.
    let four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.0), vreinterpretq_s32_s16(a23.0));
    let four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.1), vreinterpretq_s32_s16(a23.1));

    // Same two stages for rows 4..8.
    let a45 = vtrnq_s16(data[4], data[5]);
    let a67 = vtrnq_s16(data[6], data[7]);

    let four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.0), vreinterpretq_s32_s16(a67.0));
    let four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.1), vreinterpretq_s32_s16(a67.1));

    // Stitch the halves together: the low halves give output rows 0..4, the high halves give
    // output rows 4..8; the first operand comes from input rows 0..4, the second from 4..8.
    data[0] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.0), vget_low_s32(four2.0)));
    data[1] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.0), vget_low_s32(four3.0)));
    data[2] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.1), vget_low_s32(four2.1)));
    data[3] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.1), vget_low_s32(four3.1)));
    data[4] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.0), vget_high_s32(four2.0)));
    data[5] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.0), vget_high_s32(four3.0)));
    data[6] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.1), vget_high_s32(four2.1)));
    data[7] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.1), vget_high_s32(four3.1)));
}
110
/// Dequantizes one 8x8 block of DCT coefficients, applies the 2-D inverse DCT and stores the
/// result as 8 rows of 8 `u8` samples in `output`.
///
/// Row `i` of the result is written to `output[output_linestride * i..][..8]`. Each
/// coefficient is multiplied by the matching entry of `quantization_table` (16-bit wrapping
/// multiply) before the transform; the final samples are offset by 128 and saturated to the
/// `u8` range.
///
/// # Panics
///
/// Panics if `output.len() <= output_linestride * 7 + 7`, or if computing that bound
/// overflows `usize`.
///
/// # Safety
///
/// The function is compiled with `#[target_feature(enable = "neon")]`; the caller must make
/// sure the executing CPU supports NEON.
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
pub unsafe fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0<=i<8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
    // and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    // Number of fractional bits carried through the transform.
    const SHIFT: i32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    // Both arrays hold exactly 64 elements, so every 8-element load at offset i * 8 (i < 8)
    // stays inside them.
    let mut data = [vdupq_n_s16(0); 8];
    for i in 0..8 {
        data[i] = vshlq_n_s16(
            vmulq_s16(
                vld1q_s16(coefficients.as_ptr().wrapping_add(i * 8)),
                vreinterpretq_s16_u16(vld1q_u16(quantization_table.as_ptr().wrapping_add(i * 8))),
            ),
            SHIFT,
        );
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for i in 0..8 {
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;

        let data_with_offset = vqaddq_s16(data[i], vdupq_n_s16(OFFSET + ROUNDING_BIAS));

        // Saturating narrowing shift: negative values clamp to 0, large ones to 255.
        vst1_u8(
            output.as_mut_ptr().wrapping_add(output_linestride * i),
            vqshrun_n_s16(data_with_offset, SHIFT + 3),
        );
    }
}
168
/// Converts a line of planar Y / Cb / Cr samples into interleaved 3-byte R, G, B pixels.
///
/// The number of pixels requested is `output.len() / 3`. Pixels are processed in groups of
/// eight; a trailing group of fewer than eight pixels is left untouched.
///
/// Returns the number of pixels actually converted, which is the requested count rounded down
/// to a multiple of 8. The caller has to convert the remaining pixels by other means.
///
/// # Panics
///
/// Panics if `output.len()` is not a multiple of 3, or if any of `y`, `cb`, `cr` holds fewer
/// than `output.len() / 3` samples.
///
/// # Safety
///
/// The function is compiled with `#[target_feature(enable = "neon")]`; the caller must make
/// sure the executing CPU supports NEON.
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    // The asserts above guarantee that every 8-sample load and 24-byte store below is in bounds.
    let num_vecs = num / 8;

    for i in 0..num_vecs {
        // Number of fractional bits used for the fixed-point arithmetic.
        const SHIFT: i32 = 6;
        // Load.
        let y = vld1_u8(y.as_ptr().wrapping_add(i * 8));
        let cb = vld1_u8(cb.as_ptr().wrapping_add(i * 8));
        let cr = vld1_u8(cr.as_ptr().wrapping_add(i * 8));

        // Convert to 16 bit and shift.
        let y = vreinterpretq_s16_u16(vshll_n_u8(y, SHIFT));
        let cb = vreinterpretq_s16_u16(vshll_n_u8(cb, SHIFT));
        let cr = vreinterpretq_s16_u16(vshll_n_u8(cr, SHIFT));

        // Add offsets: half a unit on y so the final truncating shift rounds to nearest, and
        // -128 on the chroma channels to center them on zero.
        let y = vqaddq_s16(y, vdupq_n_s16((1 << SHIFT) >> 1));
        let c128 = vdupq_n_s16(128 << SHIFT);
        let cb = vqsubq_s16(cb, c128);
        let cr = vqsubq_s16(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
        // Factors above 1 are split into "fractional part via vqrdmulh" + "the value itself".
        let cr_140200 = vqaddq_s16(vqrdmulhq_n_s16(cr, 13173), cr);
        let cb_034414 = vqrdmulhq_n_s16(cb, 11276);
        let cr_071414 = vqrdmulhq_n_s16(cr, 23401);
        let cb_177200 = vqaddq_s16(vqrdmulhq_n_s16(cb, 25297), cb);

        // Last conversion step.
        let r = vqaddq_s16(y, cr_140200);
        let g = vqsubq_s16(y, vqaddq_s16(cb_034414, cr_071414));
        let b = vqaddq_s16(y, cb_177200);

        // Shift back and convert to u8.
        let r = vqshrun_n_s16(r, SHIFT);
        let g = vqshrun_n_s16(g, SHIFT);
        let b = vqshrun_n_s16(b, SHIFT);

        // Shuffle + store.
        vst3_u8(
            output.as_mut_ptr().wrapping_add(24 * i),
            uint8x8x3_t(r, g, b),
        );
    }

    num_vecs * 8
}
222