// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// No need to use explicit 256-bit AVX2 SIMD.
// `-C target-cpu=native` will autovectorize it better than we can by hand.
// Not even sure why explicit instructions are so slow...
//
// On ARM AArch64 we can actually get up to a 2x performance boost by using SIMD.
//
// We also have to inline all the methods. They are pretty large,
// but without inlining the performance plummets.

// The scalar fallback in `impl_u16x16_op!` calls operator methods like `.add()`
// directly, so the corresponding traits have to be in scope.
use core::ops::{Add, BitAnd, BitOr, Div, Mul, Shr, Sub};

#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use bytemuck::cast;
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use core::arch::aarch64::uint16x8_t;

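/// A 16-lane vector of `u16` values.
///
/// Comparison methods produce per-lane masks (each lane all ones or all zeros),
/// which `blend` then uses to select lanes from one of two vectors.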
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, PartialEq, Default, Debug)]
pub struct u16x16(pub [u16; 16]);

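// Scalar fallback: applies a `u16` method (`min`, `add`, `shr`, ...) to each of the 16 lanes.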
macro_rules! impl_u16x16_op {
    ($a:expr, $op:ident, $b:expr) => {
        u16x16([
            $a.0[0].$op($b.0[0]),
            $a.0[1].$op($b.0[1]),
            $a.0[2].$op($b.0[2]),
            $a.0[3].$op($b.0[3]),
            $a.0[4].$op($b.0[4]),
            $a.0[5].$op($b.0[5]),
            $a.0[6].$op($b.0[6]),
            $a.0[7].$op($b.0[7]),
            $a.0[8].$op($b.0[8]),
            $a.0[9].$op($b.0[9]),
            $a.0[10].$op($b.0[10]),
            $a.0[11].$op($b.0[11]),
            $a.0[12].$op($b.0[12]),
            $a.0[13].$op($b.0[13]),
            $a.0[14].$op($b.0[14]),
            $a.0[15].$op($b.0[15]),
        ])
    };
}

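// NEON path: splits both operands into two `uint16x8_t` halves, applies the given
// AArch64 intrinsic to each half and reassembles the two results into a `u16x16`.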
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
macro_rules! impl_aarch64_call {
    ($f:ident, $a:expr, $b:expr) => {{
        let a = $a.split();
        let b = $b.split();
        Self(bytemuck::cast([
            unsafe { core::arch::aarch64::$f(a.0, b.0) },
            unsafe { core::arch::aarch64::$f(a.1, b.1) },
        ]))
    }};
}

impl u16x16 {
    #[inline]
    pub fn splat(n: u16) -> Self {
        Self([n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n])
    }

    #[inline]
    pub fn as_slice(&self) -> &[u16; 16] {
        &self.0
    }

    #[inline]
    pub fn min(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vminq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, min, rhs)
            }
        }
    }

    #[inline]
    pub fn max(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmaxq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, max, rhs)
            }
        }
    }

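    // Produces a per-lane mask: a lane is `!0` where `self <= rhs` and `0` otherwise,
    // matching the mask layout returned by the NEON `vcleq_u16` intrinsic.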
    #[inline]
    pub fn cmp_le(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vcleq_u16, self, rhs)
            } else {
                Self([
                    if self.0[ 0] <= rhs.0[ 0] { !0 } else { 0 },
                    if self.0[ 1] <= rhs.0[ 1] { !0 } else { 0 },
                    if self.0[ 2] <= rhs.0[ 2] { !0 } else { 0 },
                    if self.0[ 3] <= rhs.0[ 3] { !0 } else { 0 },
                    if self.0[ 4] <= rhs.0[ 4] { !0 } else { 0 },
                    if self.0[ 5] <= rhs.0[ 5] { !0 } else { 0 },
                    if self.0[ 6] <= rhs.0[ 6] { !0 } else { 0 },
                    if self.0[ 7] <= rhs.0[ 7] { !0 } else { 0 },
                    if self.0[ 8] <= rhs.0[ 8] { !0 } else { 0 },
                    if self.0[ 9] <= rhs.0[ 9] { !0 } else { 0 },
                    if self.0[10] <= rhs.0[10] { !0 } else { 0 },
                    if self.0[11] <= rhs.0[11] { !0 } else { 0 },
                    if self.0[12] <= rhs.0[12] { !0 } else { 0 },
                    if self.0[13] <= rhs.0[13] { !0 } else { 0 },
                    if self.0[14] <= rhs.0[14] { !0 } else { 0 },
                    if self.0[15] <= rhs.0[15] { !0 } else { 0 },
                ])
            }
        }
    }

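    // `self` is expected to be a per-lane mask (e.g. the result of `cmp_le`):
    // lanes are taken from `t` where the mask is set and from `e` where it is clear.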
    #[inline]
    pub fn blend(self, t: Self, e: Self) -> Self {
        (t & self) | (e & !self)
    }

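    // Reinterprets the 16 lanes as two NEON `uint16x8_t` registers: (low half, high half).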
    #[inline]
    #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
    pub fn split(self) -> (uint16x8_t, uint16x8_t) {
        let pair: [uint16x8_t; 2] = cast(self.0);
        (pair[0], pair[1])
    }
}

impl core::ops::Add<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vaddq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, add, rhs)
            }
        }
    }
}

impl core::ops::Sub<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vsubq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, sub, rhs)
            }
        }
    }
}

impl core::ops::Mul<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmulq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, mul, rhs)
            }
        }
    }
}

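// NEON has no 16-bit integer division, so `Div` always takes the scalar per-lane path;
// `Shr` below is scalar-only as well.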
impl core::ops::Div<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn div(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, div, rhs)
    }
}

impl core::ops::BitAnd<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vandq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitand, rhs)
            }
        }
    }
}

impl core::ops::BitOr<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vorrq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitor, rhs)
            }
        }
    }
}

impl core::ops::Not for u16x16 {
    type Output = Self;

    #[inline]
    fn not(self) -> Self::Output {
        u16x16([
            !self.0[0],
            !self.0[1],
            !self.0[2],
            !self.0[3],
            !self.0[4],
            !self.0[5],
            !self.0[6],
            !self.0[7],
            !self.0[8],
            !self.0[9],
            !self.0[10],
            !self.0[11],
            !self.0[12],
            !self.0[13],
            !self.0[14],
            !self.0[15],
        ])
    }
}

impl core::ops::Shr for u16x16 {
    type Output = Self;

    #[inline]
    fn shr(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, shr, rhs)
    }
}
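
// A minimal, illustrative sanity check (not exhaustive): `cmp_le` produces a
// per-lane mask and `blend` selects between two vectors based on it. The test
// name and the concrete values below are arbitrary and only demonstrate the API.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cmp_le_then_blend_selects_per_lane() {
        let a = u16x16::splat(2);
        let mut b = u16x16::splat(3);
        b.0[0] = 1; // make the first lane compare the other way

        let mask = a.cmp_le(&b); // !0 where a <= b, 0 otherwise
        let picked = mask.blend(u16x16::splat(10), u16x16::splat(20));

        assert_eq!(picked.0[0], 20); // a[0] = 2 > b[0] = 1, so take `e`
        assert_eq!(picked.0[1], 10); // a[1] = 2 <= b[1] = 3, so take `t`
    }
}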