// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Based on https://github.com/Lokathor/wide (Zlib)

// Pick the backing storage for the current target: SSE2, WASM simd128,
// AArch64 NEON, or a plain array fallback.
cfg_if::cfg_if! {
    if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::*;

        // unused when AVX is available
        #[cfg(not(all(feature = "simd", target_feature = "avx2")))]
        use bytemuck::cast;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct u32x4(__m128i);
    } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
        use core::arch::wasm32::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct u32x4(v128);
    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
        use core::arch::aarch64::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct u32x4(uint32x4_t);
    } else {
        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct u32x4([u32; 4]);
    }
}
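
// Sanity-check sketch: every backing variant above is a 16-byte value, which
// the `bytemuck::cast` calls below rely on, and the `align(16)` repr holds for
// all of them.
const _: () = assert!(core::mem::size_of::<u32x4>() == 16);
const _: () = assert!(core::mem::align_of::<u32x4>() == 16);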

// All backing types are plain 16-byte values with no padding and no invalid
// bit patterns, so the `bytemuck` marker impls are sound.
unsafe impl bytemuck::Zeroable for u32x4 {}
unsafe impl bytemuck::Pod for u32x4 {}

impl Default for u32x4 {
    fn default() -> Self {
        Self::splat(0)
    }
}

impl u32x4 {
    pub fn splat(n: u32) -> Self {
        bytemuck::cast([n, n, n, n])
    }

    /// Lane-wise equality: a matching lane becomes all ones, a non-matching lane zero.
    // unused when AVX is available
    #[cfg(not(all(feature = "simd", target_feature = "avx2")))]
    pub fn cmp_eq(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_cmpeq_epi32(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(u32x4_eq(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vceqq_u32(self.0, rhs.0) })
            } else {
                Self([
                    if self.0[0] == rhs.0[0] { u32::MAX } else { 0 },
                    if self.0[1] == rhs.0[1] { u32::MAX } else { 0 },
                    if self.0[2] == rhs.0[2] { u32::MAX } else { 0 },
                    if self.0[3] == rhs.0[3] { u32::MAX } else { 0 },
                ])
            }
        }
    }

    /// Shifts every lane left by `RHS` bits.
    // unused when AVX is available
    #[cfg(not(all(feature = "simd", target_feature = "avx2")))]
    pub fn shl<const RHS: i32>(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                // `_mm_sll_epi32` takes the shift amount from the low 64 bits
                // of a vector, so the constant is widened into one first.
                let shift = cast([RHS as u64, 0]);
                Self(unsafe { _mm_sll_epi32(self.0, shift) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(u32x4_shl(self.0, RHS as _))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vshlq_n_u32::<RHS>(self.0) })
            } else {
                let u = RHS as u64;
                Self([
                    self.0[0] << u,
                    self.0[1] << u,
                    self.0[2] << u,
                    self.0[3] << u,
                ])
            }
        }
    }

    /// Shifts every lane right by `RHS` bits (logical shift).
    // unused when AVX is available
    #[cfg(not(all(feature = "simd", target_feature = "avx2")))]
    pub fn shr<const RHS: i32>(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                // As in `shl`, the shift amount is passed in the low 64 bits of a vector.
                let shift: __m128i = cast([RHS as u64, 0]);
                Self(unsafe { _mm_srl_epi32(self.0, shift) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(u32x4_shr(self.0, RHS as _))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vshrq_n_u32::<RHS>(self.0) })
            } else {
                let u = RHS as u64;
                Self([
                    self.0[0] >> u,
                    self.0[1] >> u,
                    self.0[2] >> u,
                    self.0[3] >> u,
                ])
            }
        }
    }
}

impl core::ops::Not for u32x4 {
    type Output = Self;

    fn not(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                // SSE2 has no bitwise NOT, so XOR with an all-ones mask instead.
                let all_bits = unsafe { _mm_set1_epi32(-1) };
                Self(unsafe { _mm_xor_si128(self.0, all_bits) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_not(self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vmvnq_u32(self.0) })
            } else {
                Self([
                    !self.0[0],
                    !self.0[1],
                    !self.0[2],
                    !self.0[3],
                ])
            }
        }
    }
}

impl core::ops::Add for u32x4 {
    type Output = Self;

    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_add_epi32(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(u32x4_add(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vaddq_u32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0].wrapping_add(rhs.0[0]),
                    self.0[1].wrapping_add(rhs.0[1]),
                    self.0[2].wrapping_add(rhs.0[2]),
                    self.0[3].wrapping_add(rhs.0[3]),
                ])
            }
        }
    }
}

impl core::ops::BitAnd for u32x4 {
    type Output = Self;

    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_and_si128(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_and(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vandq_u32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] & rhs.0[0],
                    self.0[1] & rhs.0[1],
                    self.0[2] & rhs.0[2],
                    self.0[3] & rhs.0[3],
                ])
            }
        }
    }
}
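
// A minimal, illustrative test sketch: it exercises the public API above
// through `bytemuck` casts (the helper and test names here are illustrative
// only, not established conventions of this crate).
#[cfg(test)]
mod tests {
    use super::u32x4;

    // Reads the four lanes back out as plain integers.
    fn lanes(v: u32x4) -> [u32; 4] {
        bytemuck::cast(v)
    }

    #[test]
    fn splat_add_not_and() {
        let a = u32x4::splat(1);
        let b = u32x4::splat(2);
        assert_eq!(lanes(a + b), [3; 4]);
        assert_eq!(lanes(!u32x4::splat(0)), [u32::MAX; 4]);
        assert_eq!(lanes(u32x4::splat(0xF0) & u32x4::splat(0x3C)), [0x30; 4]);
    }

    // These ops are compiled out when AVX2 is available, so mirror their cfg.
    #[cfg(not(all(feature = "simd", target_feature = "avx2")))]
    #[test]
    fn shifts_and_cmp_eq() {
        let v = u32x4::splat(0b1010);
        assert_eq!(lanes(v.shl::<1>()), [0b10100; 4]);
        assert_eq!(lanes(v.shr::<1>()), [0b101; 4]);
        assert_eq!(lanes(v.cmp_eq(u32x4::splat(0b1010))), [u32::MAX; 4]);
    }
}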