// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// No need to use explicit 256-bit AVX2 SIMD here.
// `-C target-cpu=native` autovectorizes this code better than we can by hand.
// It is not even clear why the explicit instructions end up slower...
//
// On ARM AArch64 we can actually get up to a 2x performance boost by using SIMD.
//
// We also have to inline all the methods. They are pretty large,
// but without inlining the performance plummets.

#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use bytemuck::cast;
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use core::arch::aarch64::uint16x8_t;

#[allow(non_camel_case_types)]
#[derive(Copy, Clone, PartialEq, Default, Debug)]
pub struct u16x16(pub [u16; 16]);

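// Applies `$op` lane-by-lane across all 16 lanes. The lanes are written out
// manually so the backend can autovectorize the code (see the note at the top
// of this file).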
macro_rules! impl_u16x16_op {
    ($a:expr, $op:ident, $b:expr) => {
        u16x16([
            $a.0[0].$op($b.0[0]),
            $a.0[1].$op($b.0[1]),
            $a.0[2].$op($b.0[2]),
            $a.0[3].$op($b.0[3]),
            $a.0[4].$op($b.0[4]),
            $a.0[5].$op($b.0[5]),
            $a.0[6].$op($b.0[6]),
            $a.0[7].$op($b.0[7]),
            $a.0[8].$op($b.0[8]),
            $a.0[9].$op($b.0[9]),
            $a.0[10].$op($b.0[10]),
            $a.0[11].$op($b.0[11]),
            $a.0[12].$op($b.0[12]),
            $a.0[13].$op($b.0[13]),
            $a.0[14].$op($b.0[14]),
            $a.0[15].$op($b.0[15]),
        ])
    };
}

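// Splits both operands into two NEON `uint16x8_t` halves, applies the given
// AArch64 intrinsic to each half and reassembles the result into a `u16x16`.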
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
macro_rules! impl_aarch64_call {
    ($f:ident, $a:expr, $b:expr) => {{
        let a = $a.split();
        let b = $b.split();
        Self(bytemuck::cast([
            unsafe { core::arch::aarch64::$f(a.0, b.0) },
            unsafe { core::arch::aarch64::$f(a.1, b.1) },
        ]))
    }};
}

impl u16x16 {
    #[inline]
    pub fn splat(n: u16) -> Self {
        Self([n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n])
    }

    #[inline]
    pub fn as_slice(&self) -> &[u16; 16] {
        &self.0
    }

    #[inline]
    pub fn min(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vminq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, min, rhs)
            }
        }
    }

    #[inline]
    pub fn max(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmaxq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, max, rhs)
            }
        }
    }

    #[inline]
    pub fn cmp_le(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vcleq_u16, self, rhs)
            } else {
                Self([
                    if self.0[ 0] <= rhs.0[ 0] { !0 } else { 0 },
                    if self.0[ 1] <= rhs.0[ 1] { !0 } else { 0 },
                    if self.0[ 2] <= rhs.0[ 2] { !0 } else { 0 },
                    if self.0[ 3] <= rhs.0[ 3] { !0 } else { 0 },
                    if self.0[ 4] <= rhs.0[ 4] { !0 } else { 0 },
                    if self.0[ 5] <= rhs.0[ 5] { !0 } else { 0 },
                    if self.0[ 6] <= rhs.0[ 6] { !0 } else { 0 },
                    if self.0[ 7] <= rhs.0[ 7] { !0 } else { 0 },
                    if self.0[ 8] <= rhs.0[ 8] { !0 } else { 0 },
                    if self.0[ 9] <= rhs.0[ 9] { !0 } else { 0 },
                    if self.0[10] <= rhs.0[10] { !0 } else { 0 },
                    if self.0[11] <= rhs.0[11] { !0 } else { 0 },
                    if self.0[12] <= rhs.0[12] { !0 } else { 0 },
                    if self.0[13] <= rhs.0[13] { !0 } else { 0 },
                    if self.0[14] <= rhs.0[14] { !0 } else { 0 },
                    if self.0[15] <= rhs.0[15] { !0 } else { 0 },
                ])
            }
        }
    }

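    /// Lane-wise select: for each lane, takes the bits of `t` where the
    /// corresponding bits of `self` are set and the bits of `e` where they are
    /// clear. `self` is expected to be an all-ones/all-zeros mask per lane,
    /// such as the one produced by `cmp_le`.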
    #[inline]
    pub fn blend(self, t: Self, e: Self) -> Self {
        (t & self) | (e & !self)
    }

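    /// Reinterprets the 16 lanes as two NEON `uint16x8_t` registers
    /// (low half, high half) via a bit-exact `bytemuck` cast.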
    #[inline]
    #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
    pub fn split(self) -> (uint16x8_t, uint16x8_t) {
        let pair: [uint16x8_t; 2] = cast(self.0);
        (pair[0], pair[1])
    }
}

impl core::ops::Add<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vaddq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, add, rhs)
            }
        }
    }
}

impl core::ops::Sub<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vsubq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, sub, rhs)
            }
        }
    }
}

impl core::ops::Mul<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmulq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, mul, rhs)
            }
        }
    }
}

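// Division has no NEON fast path: AArch64 NEON provides no integer division
// intrinsic, so the lane-by-lane fallback is used on every platform.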
impl core::ops::Div<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn div(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, div, rhs)
    }
}

impl core::ops::BitAnd<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vandq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitand, rhs)
            }
        }
    }
}

impl core::ops::BitOr<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vorrq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitor, rhs)
            }
        }
    }
}

impl core::ops::Not for u16x16 {
    type Output = Self;

    #[inline]
    fn not(self) -> Self::Output {
        u16x16([
            !self.0[0],
            !self.0[1],
            !self.0[2],
            !self.0[3],
            !self.0[4],
            !self.0[5],
            !self.0[6],
            !self.0[7],
            !self.0[8],
            !self.0[9],
            !self.0[10],
            !self.0[11],
            !self.0[12],
            !self.0[13],
            !self.0[14],
            !self.0[15],
        ])
    }
}

impl core::ops::Shr for u16x16 {
    type Output = Self;

    #[inline]
    fn shr(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, shr, rhs)
    }
}
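
// The tests below are an illustrative sketch, not part of the original file:
// they exercise the lane-wise semantics that should hold on both the scalar
// and the NEON paths (min/max, the all-ones masks from `cmp_le`, and `blend`).
#[cfg(test)]
mod tests {
    use super::u16x16;

    #[test]
    fn min_max_are_lane_wise() {
        let a = u16x16([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let b = u16x16::splat(7);
        assert_eq!(a.min(&b).0[10], 7);
        assert_eq!(a.max(&b).0[10], 10);
        assert_eq!(a.min(&b).0[3], 3);
        assert_eq!(a.max(&b).0[3], 7);
    }

    #[test]
    fn cmp_le_produces_full_lane_masks() {
        let a = u16x16::splat(5);
        let b = u16x16([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let mask = a.cmp_le(&b);
        assert_eq!(mask.0[0], 0); // 5 <= 0 is false -> all zeros
        assert_eq!(mask.0[5], !0u16); // 5 <= 5 is true -> all ones
        assert_eq!(mask.0[15], !0u16); // 5 <= 15 is true -> all ones
    }

    #[test]
    fn blend_selects_by_mask() {
        let b = u16x16([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let mask = u16x16::splat(5).cmp_le(&b);
        let t = u16x16::splat(100);
        let e = u16x16::splat(200);
        let r = mask.blend(t, e);
        assert_eq!(r.0[0], 200); // mask lane is 0 -> take `e`
        assert_eq!(r.0[9], 100); // mask lane is !0 -> take `t`
    }
}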