#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
use core::arch::aarch64::*;

#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
unsafe fn idct8(data: &mut [int16x8_t; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // vqrdmulhq_n_s16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).
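    // For example, 0.5411961 * (1 << 15) ≈ 17734 and 0.847759065 * (1 << 15) ≈ 27779, the
    // first two constants used below.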

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.

    let p2 = data[2];
    let p3 = data[6];
    let p1 = vqrdmulhq_n_s16(vqaddq_s16(p2, p3), 17734); // 0.5411961
    let t2 = vqsubq_s16(
        vqsubq_s16(p1, p3),
        vqrdmulhq_n_s16(p3, 27779), // 0.847759065
    );
    let t3 = vqaddq_s16(p1, vqrdmulhq_n_s16(p2, 25079)); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = vqaddq_s16(p2, p3);
    let t1 = vqsubq_s16(p2, p3);

    let x0 = vqaddq_s16(t0, t3);
    let x3 = vqsubq_s16(t0, t3);
    let x1 = vqaddq_s16(t1, t2);
    let x2 = vqsubq_s16(t1, t2);

    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = vqaddq_s16(t0, t2);
    let p4 = vqaddq_s16(t1, t3);
    let p1 = vqaddq_s16(t0, t3);
    let p2 = vqaddq_s16(t1, t2);
    let p5 = vqaddq_s16(p3, p4);
    let p5 = vqaddq_s16(p5, vqrdmulhq_n_s16(p5, 5763)); // 0.175875602

    let t0 = vqrdmulhq_n_s16(t0, 9786); // 0.298631336
    let t1 = vqaddq_s16(
        vqaddq_s16(t1, t1),
        vqrdmulhq_n_s16(t1, 1741), // 0.053119869
    );
    let t2 = vqaddq_s16(
        vqaddq_s16(t2, vqaddq_s16(t2, t2)),
        vqrdmulhq_n_s16(t2, 2383), // 0.072711026
    );
    let t3 = vqaddq_s16(t3, vqrdmulhq_n_s16(t3, 16427)); // 0.501321110

    let p1 = vqsubq_s16(p5, vqrdmulhq_n_s16(p1, 29490)); // 0.899976223
    let p2 = vqsubq_s16(
        vqsubq_s16(vqsubq_s16(p5, p2), p2),
        vqrdmulhq_n_s16(p2, 18446), // 0.562915447
    );

    let p3 = vqsubq_s16(
        vqrdmulhq_n_s16(p3, -31509), // -0.961570560
        p3,
    );
    let p4 = vqrdmulhq_n_s16(p4, -12785); // -0.390180644

    let t3 = vqaddq_s16(vqaddq_s16(p1, p4), t3);
    let t2 = vqaddq_s16(vqaddq_s16(p2, p3), t2);
    let t1 = vqaddq_s16(vqaddq_s16(p2, p4), t1);
    let t0 = vqaddq_s16(vqaddq_s16(p1, p3), t0);

    data[0] = vqaddq_s16(x0, t3);
    data[7] = vqsubq_s16(x0, t3);
    data[1] = vqaddq_s16(x1, t2);
    data[6] = vqsubq_s16(x1, t2);
    data[2] = vqaddq_s16(x2, t1);
    data[5] = vqsubq_s16(x2, t1);
    data[3] = vqaddq_s16(x3, t0);
    data[4] = vqsubq_s16(x3, t0);
}

#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
unsafe fn transpose8(data: &mut [int16x8_t; 8]) {
    // Use NEON's 2x2 matrix transposes (vtrn) to do the transposition in each 4x4 block, then
    // combine the 4x4 blocks.
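    // After the 16-bit and 32-bit trn passes, four0/four1 hold the transposed data coming from
    // input rows 0-3 and four2/four3 the data from rows 4-7; each vcombine below glues the two
    // matching 64-bit halves together into one full row of the transposed matrix.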
    let a01 = vtrnq_s16(data[0], data[1]);
    let a23 = vtrnq_s16(data[2], data[3]);

    let four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.0), vreinterpretq_s32_s16(a23.0));
    let four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.1), vreinterpretq_s32_s16(a23.1));

    let a45 = vtrnq_s16(data[4], data[5]);
    let a67 = vtrnq_s16(data[6], data[7]);

    let four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.0), vreinterpretq_s32_s16(a67.0));
    let four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.1), vreinterpretq_s32_s16(a67.1));

    data[0] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.0), vget_low_s32(four2.0)));
    data[1] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.0), vget_low_s32(four3.0)));
    data[2] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.1), vget_low_s32(four2.1)));
    data[3] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.1), vget_low_s32(four3.1)));
    data[4] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.0), vget_high_s32(four2.0)));
    data[5] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.0), vget_high_s32(four3.0)));
    data[6] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.1), vget_high_s32(four2.1)));
    data[7] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.1), vget_high_s32(four3.1)));
}

#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
pub unsafe fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0 <= i < 8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
    // and if that position is in-bounds, so are all other accesses.
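    // For example, with a tightly packed output (output_linestride == 8) this requires
    // output.len() >= 64.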
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    const SHIFT: i32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
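    // The left shift by SHIFT keeps three extra bits of precision through the fixed-point IDCT;
    // it is removed again, together with the factor of 8 introduced by the two IDCT passes, by
    // the final right shift of SHIFT + 3 in the store loop below.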
    let mut data = [vdupq_n_s16(0); 8];
    for i in 0..8 {
        data[i] = vshlq_n_s16(
            vmulq_s16(
                vld1q_s16(coefficients.as_ptr().wrapping_add(i * 8)),
                vreinterpretq_s16_u16(vld1q_u16(quantization_table.as_ptr().wrapping_add(i * 8))),
            ),
            SHIFT,
        );
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);
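    // Each idct8 call performs eight 1-D IDCTs in parallel, one per SIMD lane; the transposes
    // switch between column and row orientation and leave the block row-major again for the
    // store loop below.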

    for i in 0..8 {
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
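        // With SHIFT == 3 this makes OFFSET == 8192 and ROUNDING_BIAS == 32.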

        let data_with_offset = vqaddq_s16(data[i], vdupq_n_s16(OFFSET + ROUNDING_BIAS));

        vst1_u8(
            output.as_mut_ptr().wrapping_add(output_linestride * i),
            vqshrun_n_s16(data_with_offset, SHIFT + 3),
        );
    }
}

#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    let num_vecs = num / 8;

    for i in 0..num_vecs {
        const SHIFT: i32 = 6;
        // Load.
        let y = vld1_u8(y.as_ptr().wrapping_add(i * 8));
        let cb = vld1_u8(cb.as_ptr().wrapping_add(i * 8));
        let cr = vld1_u8(cr.as_ptr().wrapping_add(i * 8));

        // Convert to 16 bit and shift.
        let y = vreinterpretq_s16_u16(vshll_n_u8(y, SHIFT));
        let cb = vreinterpretq_s16_u16(vshll_n_u8(cb, SHIFT));
        let cr = vreinterpretq_s16_u16(vshll_n_u8(cr, SHIFT));

        // Add offsets
        let y = vqaddq_s16(y, vdupq_n_s16((1 << SHIFT) >> 1));
        let c128 = vdupq_n_s16(128 << SHIFT);
        let cb = vqsubq_s16(cb, c128);
        let cr = vqsubq_s16(cr, c128);
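        // (Cb and Cr are stored with a +128 offset in JFIF, so recenter them around zero; the
        // small bias added to Y makes the final right shift round to nearest.)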

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
        let cr_140200 = vqaddq_s16(vqrdmulhq_n_s16(cr, 13173), cr);
        let cb_034414 = vqrdmulhq_n_s16(cb, 11276);
        let cr_071414 = vqrdmulhq_n_s16(cr, 23401);
        let cb_177200 = vqaddq_s16(vqrdmulhq_n_s16(cb, 25297), cb);
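        // vqrdmulhq_n_s16 can only represent multipliers below 1.0 in Q15, so the factors
        // 1.402 and 1.772 are split into x + x * (factor - 1), e.g. 0.402 * (1 << 15) ≈ 13173.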

        // Last conversion step.
        let r = vqaddq_s16(y, cr_140200);
        let g = vqsubq_s16(y, vqaddq_s16(cb_034414, cr_071414));
        let b = vqaddq_s16(y, cb_177200);

        // Shift back and convert to u8.
        let r = vqshrun_n_s16(r, SHIFT);
        let g = vqshrun_n_s16(g, SHIFT);
        let b = vqshrun_n_s16(b, SHIFT);

        // Shuffle + store.
        vst3_u8(
            output.as_mut_ptr().wrapping_add(24 * i),
            uint8x8x3_t(r, g, b),
        );
    }

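    // Only whole 8-pixel vectors are converted; the return value tells the caller how many
    // pixels were actually processed.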
    num_vecs * 8
}
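
// A minimal sketch of how the IDCT entry point can be exercised; the module and test names and
// the all-ones quantization table are illustrative assumptions, not part of the code above.
// With all-zero coefficients the output must be a flat mid-gray (128) block: the data vectors
// stay zero through both IDCT passes, and the final step adds OFFSET + ROUNDING_BIAS = 8224,
// which shifted right by 6 gives 128 in every lane.
#[cfg(all(test, feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
mod neon_tests {
    #[test]
    fn zero_block_decodes_to_mid_gray() {
        let coefficients = [0i16; 64];
        let quantization_table = [1u16; 64];
        let mut output = [0u8; 64];
        // Safety: the buffer satisfies the length assertion inside the function
        // (64 > 8 * 7 + 7), and NEON is always available on aarch64.
        unsafe {
            super::dequantize_and_idct_block_8x8(&coefficients, &quantization_table, 8, &mut output);
        }
        assert!(output.iter().all(|&v| v == 128));
    }
}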