#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
use core::arch::aarch64::*;

#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
unsafe fn idct8(data: &mut [int16x8_t; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1 << 15. This is because
    // vqrdmulhq_n_s16(a, b) is effectively equivalent to (a * b) >> 15 (except for possibly
    // slight differences in rounding).

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and the fixed-point constants have a different precision.

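    // For example, where the scalar kernel multiplies by 1.847759065, the integer part is
    // handled below by an extra subtraction of p3 and the fractional part becomes
    // vqrdmulhq_n_s16(p3, 27779), since 0.847759065 * (1 << 15) rounds to 27779.

    // Even part: rows 0, 2, 4 and 6.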
    let p2 = data[2];
    let p3 = data[6];
    let p1 = vqrdmulhq_n_s16(vqaddq_s16(p2, p3), 17734); // 0.5411961
    let t2 = vqsubq_s16(
        vqsubq_s16(p1, p3),
        vqrdmulhq_n_s16(p3, 27779), // 0.847759065
    );
    let t3 = vqaddq_s16(p1, vqrdmulhq_n_s16(p2, 25079)); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = vqaddq_s16(p2, p3);
    let t1 = vqsubq_s16(p2, p3);

    let x0 = vqaddq_s16(t0, t3);
    let x3 = vqsubq_s16(t0, t3);
    let x1 = vqaddq_s16(t1, t2);
    let x2 = vqsubq_s16(t1, t2);

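    // Odd part: rows 1, 3, 5 and 7.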
    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = vqaddq_s16(t0, t2);
    let p4 = vqaddq_s16(t1, t3);
    let p1 = vqaddq_s16(t0, t3);
    let p2 = vqaddq_s16(t1, t2);
    let p5 = vqaddq_s16(p3, p4);
    let p5 = vqaddq_s16(p5, vqrdmulhq_n_s16(p5, 5763)); // 0.175875602
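    // Net effect: p5 = (p3 + p4) * 1.175875602, with the integer part added separately.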

    let t0 = vqrdmulhq_n_s16(t0, 9786); // 0.298631336
    let t1 = vqaddq_s16(
        vqaddq_s16(t1, t1),
        vqrdmulhq_n_s16(t1, 1741), // 0.053119869
    );
    let t2 = vqaddq_s16(
        vqaddq_s16(t2, vqaddq_s16(t2, t2)),
        vqrdmulhq_n_s16(t2, 2383), // 0.072711026
    );
    let t3 = vqaddq_s16(t3, vqrdmulhq_n_s16(t3, 16427)); // 0.501321110
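    // Effective multipliers: t0 * 0.298631336, t1 * 2.053119869, t2 * 3.072711026 and
    // t3 * 1.501321110; factors above 1 are built from whole additions plus the
    // fractional vqrdmulhq_n_s16 product.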

    let p1 = vqsubq_s16(p5, vqrdmulhq_n_s16(p1, 29490)); // 0.899976223
    let p2 = vqsubq_s16(
        vqsubq_s16(vqsubq_s16(p5, p2), p2),
        vqrdmulhq_n_s16(p2, 18446), // 0.562915447
    );

    let p3 = vqsubq_s16(
        vqrdmulhq_n_s16(p3, -31509), // -0.961570560
        p3,
    );
    let p4 = vqrdmulhq_n_s16(p4, -12785); // -0.390180644

    let t3 = vqaddq_s16(vqaddq_s16(p1, p4), t3);
    let t2 = vqaddq_s16(vqaddq_s16(p2, p3), t2);
    let t1 = vqaddq_s16(vqaddq_s16(p2, p4), t1);
    let t0 = vqaddq_s16(vqaddq_s16(p1, p3), t0);

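    // Final butterfly stage combining the even and odd halves.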
    data[0] = vqaddq_s16(x0, t3);
    data[7] = vqsubq_s16(x0, t3);
    data[1] = vqaddq_s16(x1, t2);
    data[6] = vqsubq_s16(x1, t2);
    data[2] = vqaddq_s16(x2, t1);
    data[5] = vqsubq_s16(x2, t1);
    data[3] = vqaddq_s16(x3, t0);
    data[4] = vqsubq_s16(x3, t0);
}

#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
unsafe fn transpose8(data: &mut [int16x8_t; 8]) {
    // Use NEON's 2x2 matrix transposes (vtrn) to do the transposition in each 4x4 block, then
    // combine the 4x4 blocks.
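    // vtrnq_s16 interleaves two rows so that each 2x2 block of 16-bit lanes is transposed;
    // vtrnq_s32 does the same on 32-bit pairs, completing the transpose of each 4x4
    // quadrant; the vcombine_s32 calls below then swap the two off-diagonal 4x4 quadrants.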
    let a01 = vtrnq_s16(data[0], data[1]);
    let a23 = vtrnq_s16(data[2], data[3]);

    let four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.0), vreinterpretq_s32_s16(a23.0));
    let four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.1), vreinterpretq_s32_s16(a23.1));

    let a45 = vtrnq_s16(data[4], data[5]);
    let a67 = vtrnq_s16(data[6], data[7]);

    let four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.0), vreinterpretq_s32_s16(a67.0));
    let four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.1), vreinterpretq_s32_s16(a67.1));

    data[0] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.0), vget_low_s32(four2.0)));
    data[1] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.0), vget_low_s32(four3.0)));
    data[2] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.1), vget_low_s32(four2.1)));
    data[3] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.1), vget_low_s32(four3.1)));
    data[4] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.0), vget_high_s32(four2.0)));
    data[5] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.0), vget_high_s32(four3.0)));
    data[6] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.1), vget_high_s32(four2.1)));
    data[7] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.1), vget_high_s32(four3.1)));
}

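/// Dequantizes an 8x8 block of DCT `coefficients` with `quantization_table`, applies the
/// inverse DCT and writes the 128-recentered result row by row, `output_linestride` bytes
/// apart, into `output`.
///
/// # Safety
///
/// The caller must ensure the `neon` target feature is available (NEON is mandatory on
/// AArch64, so this holds on all supported targets).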
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
pub unsafe fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0 <= i < 8. Thus, the last accessed position is at an offset of
    // output_linestride * 7 + 7, and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

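    // Scale up by 1 << 3 = 8 before the IDCT so three extra fractional bits are carried
    // through the fixed-point arithmetic; the final shift in the output loop removes them.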
    const SHIFT: i32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    let mut data = [vdupq_n_s16(0); 8];
    for i in 0..8 {
        data[i] = vshlq_n_s16(
            vmulq_s16(
                vld1q_s16(coefficients.as_ptr().wrapping_add(i * 8)),
                vreinterpretq_s16_u16(vld1q_u16(quantization_table.as_ptr().wrapping_add(i * 8))),
            ),
            SHIFT,
        );
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for i in 0..8 {
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want a rounding right shift, so we add half of the divisor,
        // (1 << (SHIFT + 3)) >> 1, before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;

        let data_with_offset = vqaddq_s16(data[i], vdupq_n_s16(OFFSET + ROUNDING_BIAS));

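        // vqshrun_n_s16 shifts right, narrows to u8 and saturates to [0, 255], so the
        // store below needs no explicit clamping.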
        vst1_u8(
            output.as_mut_ptr().wrapping_add(output_linestride * i),
            vqshrun_n_s16(data_with_offset, SHIFT + 3),
        );
    }
}

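/// Converts a line of YCbCr samples to interleaved RGB, eight pixels at a time, and
/// returns the number of pixels converted; `output` receives three bytes per pixel.
///
/// # Safety
///
/// The caller must ensure the `neon` target feature is available (NEON is mandatory on
/// AArch64, so this holds on all supported targets).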
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    let num_vecs = num / 8;

    for i in 0..num_vecs {
        const SHIFT: i32 = 6;
        // Load.
        let y = vld1_u8(y.as_ptr().wrapping_add(i * 8));
        let cb = vld1_u8(cb.as_ptr().wrapping_add(i * 8));
        let cr = vld1_u8(cr.as_ptr().wrapping_add(i * 8));

        // Convert to 16 bit and shift.
        let y = vreinterpretq_s16_u16(vshll_n_u8(y, SHIFT));
        let cb = vreinterpretq_s16_u16(vshll_n_u8(cb, SHIFT));
        let cr = vreinterpretq_s16_u16(vshll_n_u8(cr, SHIFT));

        // Add offsets: a rounding bias for y, and -128 recentering for cb and cr.
        let y = vqaddq_s16(y, vdupq_n_s16((1 << SHIFT) >> 1));
        let c128 = vdupq_n_s16(128 << SHIFT);
        let cb = vqsubq_s16(cb, c128);
        let cr = vqsubq_s16(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772.
        let cr_140200 = vqaddq_s16(vqrdmulhq_n_s16(cr, 13173), cr);
        let cb_034414 = vqrdmulhq_n_s16(cb, 11276);
        let cr_071414 = vqrdmulhq_n_s16(cr, 23401);
        let cb_177200 = vqaddq_s16(vqrdmulhq_n_s16(cb, 25297), cb);
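        // As in the IDCT, the constants are fractional parts scaled by 1 << 15: e.g.
        // 0.402 * (1 << 15) rounds to 13173, and the integer part of 1.402 is restored
        // by the surrounding vqaddq_s16.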

        // Last conversion step.
        let r = vqaddq_s16(y, cr_140200);
        let g = vqsubq_s16(y, vqaddq_s16(cb_034414, cr_071414));
        let b = vqaddq_s16(y, cb_177200);

        // Shift back and convert to u8.
        let r = vqshrun_n_s16(r, SHIFT);
        let g = vqshrun_n_s16(g, SHIFT);
        let b = vqshrun_n_s16(b, SHIFT);

        // Shuffle + store.
        vst3_u8(
            output.as_mut_ptr().wrapping_add(24 * i),
            uint8x8x3_t(r, g, b),
        );
    }

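    // Pixels beyond the last full vector of 8 (num % 8 of them) are not converted here;
    // the return value tells the caller how many were handled.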
    num_vecs * 8
}