// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// No need to use explicit 256-bit AVX2 SIMD here:
// `-C target-cpu=native` will autovectorize this code better than we can by hand.
// It's not even clear why the explicit instructions end up so much slower...
//
// On ARM AArch64, on the other hand, explicit SIMD gives us up to a 2x performance boost.
//
// We also have to inline all the methods. They are pretty large,
// but without the inlining, performance plummets.
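//
// (For reference: the autovectorized build mentioned above is what you get from
// something like `RUSTFLAGS="-C target-cpu=native" cargo build --release`.)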

#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use bytemuck::cast;
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use core::arch::aarch64::uint16x8_t;

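/// A 16-lane vector of `u16` values stored as a plain array.
///
/// On AArch64 with the `simd` feature enabled it is processed as two NEON
/// `uint16x8_t` halves; on every other target the per-lane fallbacks below are used.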
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, PartialEq, Default, Debug)]
pub struct u16x16(pub [u16; 16]);

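// Portable fallback: applies a scalar method (`min`, `add`, `shr`, ...) to each
// of the 16 lanes in turn, relying on the compiler to autovectorize the result.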
macro_rules! impl_u16x16_op {
    ($a:expr, $op:ident, $b:expr) => {
        u16x16([
            $a.0[0].$op($b.0[0]),
            $a.0[1].$op($b.0[1]),
            $a.0[2].$op($b.0[2]),
            $a.0[3].$op($b.0[3]),
            $a.0[4].$op($b.0[4]),
            $a.0[5].$op($b.0[5]),
            $a.0[6].$op($b.0[6]),
            $a.0[7].$op($b.0[7]),
            $a.0[8].$op($b.0[8]),
            $a.0[9].$op($b.0[9]),
            $a.0[10].$op($b.0[10]),
            $a.0[11].$op($b.0[11]),
            $a.0[12].$op($b.0[12]),
            $a.0[13].$op($b.0[13]),
            $a.0[14].$op($b.0[14]),
            $a.0[15].$op($b.0[15]),
        ])
    };
}

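// NEON path: splits both operands into two `uint16x8_t` halves, applies the
// given AArch64 intrinsic to each half and reassembles a `u16x16` from the result.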
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
macro_rules! impl_aarch64_call {
    ($f:ident, $a:expr, $b:expr) => {{
        let a = $a.split();
        let b = $b.split();
        Self(bytemuck::cast([
            unsafe { core::arch::aarch64::$f(a.0, b.0) },
            unsafe { core::arch::aarch64::$f(a.1, b.1) },
        ]))
    }};
}

impl u16x16 {
    #[inline]
    pub fn splat(n: u16) -> Self {
        Self([n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n])
    }

    #[inline]
    pub fn as_slice(&self) -> &[u16; 16] {
        &self.0
    }

    #[inline]
    pub fn min(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vminq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, min, rhs)
            }
        }
    }

    #[inline]
    pub fn max(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmaxq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, max, rhs)
            }
        }
    }

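    /// Lane-wise `self <= rhs`: each lane becomes `!0` (all bits set) when the
    /// comparison holds and `0` otherwise, so the result can serve as a `blend` mask.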
    #[inline]
    pub fn cmp_le(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vcleq_u16, self, rhs)
            } else {
                Self([
                    if self.0[ 0] <= rhs.0[ 0] { !0 } else { 0 },
                    if self.0[ 1] <= rhs.0[ 1] { !0 } else { 0 },
                    if self.0[ 2] <= rhs.0[ 2] { !0 } else { 0 },
                    if self.0[ 3] <= rhs.0[ 3] { !0 } else { 0 },
                    if self.0[ 4] <= rhs.0[ 4] { !0 } else { 0 },
                    if self.0[ 5] <= rhs.0[ 5] { !0 } else { 0 },
                    if self.0[ 6] <= rhs.0[ 6] { !0 } else { 0 },
                    if self.0[ 7] <= rhs.0[ 7] { !0 } else { 0 },
                    if self.0[ 8] <= rhs.0[ 8] { !0 } else { 0 },
                    if self.0[ 9] <= rhs.0[ 9] { !0 } else { 0 },
                    if self.0[10] <= rhs.0[10] { !0 } else { 0 },
                    if self.0[11] <= rhs.0[11] { !0 } else { 0 },
                    if self.0[12] <= rhs.0[12] { !0 } else { 0 },
                    if self.0[13] <= rhs.0[13] { !0 } else { 0 },
                    if self.0[14] <= rhs.0[14] { !0 } else { 0 },
                    if self.0[15] <= rhs.0[15] { !0 } else { 0 },
                ])
            }
        }
    }

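    /// Selects lanes from `t` where the mask (`self`) is all ones and lanes
    /// from `e` where it is all zeros, i.e. a mask of the kind `cmp_le` produces.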
    #[inline]
    pub fn blend(self, t: Self, e: Self) -> Self {
        (t & self) | (e & !self)
    }

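    /// Reinterprets the 16 lanes as two NEON `uint16x8_t` registers
    /// (low half, high half) via a bit-for-bit `bytemuck` cast.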
    #[inline]
    #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
    pub fn split(self) -> (uint16x8_t, uint16x8_t) {
        let pair: [uint16x8_t; 2] = cast(self.0);
        (pair[0], pair[1])
    }
}

impl core::ops::Add<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vaddq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, add, rhs)
            }
        }
    }
}

impl core::ops::Sub<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vsubq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, sub, rhs)
            }
        }
    }
}

impl core::ops::Mul<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmulq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, mul, rhs)
            }
        }
    }
}

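// Division has no NEON path: AArch64 NEON provides no integer vector division
// intrinsic, so this stays scalar on all targets.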
impl core::ops::Div<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn div(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, div, rhs)
    }
}

impl core::ops::BitAnd<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vandq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitand, rhs)
            }
        }
    }
}

impl core::ops::BitOr<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vorrq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitor, rhs)
            }
        }
    }
}

impl core::ops::Not for u16x16 {
    type Output = Self;

    #[inline]
    fn not(self) -> Self::Output {
        u16x16([
            !self.0[0],
            !self.0[1],
            !self.0[2],
            !self.0[3],
            !self.0[4],
            !self.0[5],
            !self.0[6],
            !self.0[7],
            !self.0[8],
            !self.0[9],
            !self.0[10],
            !self.0[11],
            !self.0[12],
            !self.0[13],
            !self.0[14],
            !self.0[15],
        ])
    }
}

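// Shifts each lane of `self` right by the amount in the corresponding lane of `rhs`.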
impl core::ops::Shr for u16x16 {
    type Output = Self;

    #[inline]
    fn shr(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, shr, rhs)
    }
}
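
// A minimal usage sketch (illustrative only): `cmp_le` yields an all-ones /
// all-zeros lane mask, which `blend` then uses to pick lanes.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cmp_le_mask_drives_blend() {
        let a = u16x16::splat(10);
        let b = u16x16::splat(20);
        // Every lane of `a` is <= the matching lane of `b`, so the mask is all
        // ones and `blend` picks `a` in every lane.
        let mask = a.cmp_le(&b);
        assert_eq!(mask.blend(a, b), u16x16::splat(10));
    }
}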