// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Based on https://github.com/Lokathor/wide (Zlib)

use bytemuck::cast;

use super::f32x4;
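
// A 128-bit vector of four `i32` lanes. It is backed by the platform-specific
// SIMD type when the `simd` feature and the matching target feature are
// enabled, and by a plain `[i32; 4]` otherwise.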
cfg_if::cfg_if! {
    if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct i32x4(pub __m128i);
    } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
        use core::arch::wasm32::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct i32x4(pub v128);
    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
        use core::arch::aarch64::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct i32x4(pub int32x4_t);
    } else {
        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct i32x4([i32; 4]);
    }
}

unsafe impl bytemuck::Zeroable for i32x4 {}
unsafe impl bytemuck::Pod for i32x4 {}

impl Default for i32x4 {
    fn default() -> Self {
        Self::splat(0)
    }
}

impl i32x4 {
    pub fn splat(n: i32) -> Self {
        cast([n, n, n, n])
    }
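
    /// Lane select: for each lane, picks the lane from `t` where the
    /// corresponding lane of `self` is all ones and the lane from `f` where it
    /// is all zeros. `self` is expected to be a mask produced by one of the
    /// `cmp_*` methods.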
    pub fn blend(self, t: Self, f: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] {
                Self(unsafe { _mm_blendv_epi8(f.0, t.0, self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_bitselect(t.0, f.0, self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vbslq_s32(cast(self.0), t.0, f.0) })
            } else {
                super::generic_bit_blend(self, t, f)
            }
        }
    }
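
    /// Lane-wise `self == rhs`. Each result lane is all ones (`-1`) when the
    /// comparison holds and `0` otherwise.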
    pub fn cmp_eq(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                cast(Self(cast(unsafe { _mm_cmpeq_epi32(self.0, rhs.0) })))
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(i32x4_eq(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { cast(vceqq_s32(self.0, rhs.0)) })
            } else {
                Self([
                    if self.0[0] == rhs.0[0] { -1 } else { 0 },
                    if self.0[1] == rhs.0[1] { -1 } else { 0 },
                    if self.0[2] == rhs.0[2] { -1 } else { 0 },
                    if self.0[3] == rhs.0[3] { -1 } else { 0 },
                ])
            }
        }
    }
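
    /// Lane-wise `self > rhs`. Each result lane is all ones (`-1`) when the
    /// comparison holds and `0` otherwise.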
    pub fn cmp_gt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                cast(Self(cast(unsafe { _mm_cmpgt_epi32(self.0, rhs.0) })))
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(i32x4_gt(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { cast(vcgtq_s32(self.0, rhs.0)) })
            } else {
                Self([
                    if self.0[0] > rhs.0[0] { -1 } else { 0 },
                    if self.0[1] > rhs.0[1] { -1 } else { 0 },
                    if self.0[2] > rhs.0[2] { -1 } else { 0 },
                    if self.0[3] > rhs.0[3] { -1 } else { 0 },
                ])
            }
        }
    }
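
    /// Lane-wise `self < rhs`. Each result lane is all ones (`-1`) when the
    /// comparison holds and `0` otherwise.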
    pub fn cmp_lt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                cast(Self(cast(unsafe { _mm_cmplt_epi32(self.0, rhs.0) })))
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(i32x4_lt(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { cast(vcltq_s32(self.0, rhs.0)) })
            } else {
                Self([
                    if self.0[0] < rhs.0[0] { -1 } else { 0 },
                    if self.0[1] < rhs.0[1] { -1 } else { 0 },
                    if self.0[2] < rhs.0[2] { -1 } else { 0 },
                    if self.0[3] < rhs.0[3] { -1 } else { 0 },
                ])
            }
        }
    }
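
    /// Converts each lane to `f32` numerically, like `lane as f32`.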
    pub fn to_f32x4(self) -> f32x4 {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                cast(Self(cast(unsafe { _mm_cvtepi32_ps(self.0) })))
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                cast(Self(f32x4_convert_i32x4(self.0)))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                cast(Self(unsafe { cast(vcvtq_f32_s32(self.0)) }))
            } else {
                let arr: [i32; 4] = cast(self);
                cast([
                    arr[0] as f32,
                    arr[1] as f32,
                    arr[2] as f32,
                    arr[3] as f32,
                ])
            }
        }
    }
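
    /// Reinterprets the raw bits as `f32x4` without any numeric conversion.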
    pub fn to_f32x4_bitcast(self) -> f32x4 {
        bytemuck::cast(self)
    }
}

impl From<[i32; 4]> for i32x4 {
    fn from(v: [i32; 4]) -> Self {
        cast(v)
    }
}

impl From<i32x4> for [i32; 4] {
    fn from(v: i32x4) -> Self {
        cast(v)
    }
}
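
// Lane-wise operator impls. The scalar fallbacks use wrapping arithmetic to
// match the wrap-around overflow behavior of the SIMD integer instructions.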
impl core::ops::Add for i32x4 {
    type Output = Self;

    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_add_epi32(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(i32x4_add(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vaddq_s32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0].wrapping_add(rhs.0[0]),
                    self.0[1].wrapping_add(rhs.0[1]),
                    self.0[2].wrapping_add(rhs.0[2]),
                    self.0[3].wrapping_add(rhs.0[3]),
                ])
            }
        }
    }
}

impl core::ops::BitAnd for i32x4 {
    type Output = Self;

    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_and_si128(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_and(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vandq_s32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] & rhs.0[0],
                    self.0[1] & rhs.0[1],
                    self.0[2] & rhs.0[2],
                    self.0[3] & rhs.0[3],
                ])
            }
        }
    }
}
impl core::ops::Mul for i32x4 {
    type Output = Self;

    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] {
                Self(unsafe { _mm_mullo_epi32(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(i32x4_mul(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vmulq_s32(self.0, rhs.0) })
            } else {
                // `_mm_mullo_epi32` requires SSE4.1, so on plain SSE2 we fall back to
                // scalar multiplication, which requires casting to arrays first.
                let a: [i32; 4] = cast(self);
                let b: [i32; 4] = cast(rhs);
                Self(cast([
                    a[0].wrapping_mul(b[0]),
                    a[1].wrapping_mul(b[1]),
                    a[2].wrapping_mul(b[2]),
                    a[3].wrapping_mul(b[3]),
                ]))
            }
        }
    }
}
impl core::ops::BitOr for i32x4 {
    type Output = Self;

    #[inline]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_or_si128(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_or(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vorrq_s32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] | rhs.0[0],
                    self.0[1] | rhs.0[1],
                    self.0[2] | rhs.0[2],
                    self.0[3] | rhs.0[3],
                ])
            }
        }
    }
}

impl core::ops::BitXor for i32x4 {
    type Output = Self;

    #[inline]
    fn bitxor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_xor_si128(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_xor(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { veorq_s32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] ^ rhs.0[0],
                    self.0[1] ^ rhs.0[1],
                    self.0[2] ^ rhs.0[2],
                    self.0[3] ^ rhs.0[3],
                ])
            }
        }
    }
}
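
// Illustrative sketch, not part of the original code: a few sanity checks for
// the lane-wise semantics documented above. The module and test names are made
// up; this assumes the usual `#[cfg(test)]` harness is available for the build.
#[cfg(test)]
mod i32x4_semantics_sketch {
    use super::*;

    #[test]
    fn cmp_masks_and_blend() {
        let a = i32x4::from([1, 2, 3, 4]);
        let b = i32x4::from([4, 2, 0, 4]);
        // Comparisons produce all-ones (-1) lanes for true and zero lanes for false.
        assert_eq!(<[i32; 4]>::from(a.cmp_eq(b)), [0, -1, 0, -1]);
        // `blend` takes lanes from its first argument where the mask is set.
        let picked = a.cmp_gt(b).blend(a, b);
        assert_eq!(<[i32; 4]>::from(picked), [4, 2, 3, 4]);
    }

    #[test]
    fn arithmetic_wraps() {
        let max = i32x4::splat(i32::MAX);
        // Addition and multiplication wrap around on overflow, matching the
        // SIMD instructions and the scalar fallbacks alike.
        assert_eq!(<[i32; 4]>::from(max + i32x4::splat(1)), [i32::MIN; 4]);
        assert_eq!(<[i32; 4]>::from(max * i32x4::splat(2)), [-2; 4]);
    }
}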