// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Based on https://github.com/Lokathor/wide (Zlib)

use bytemuck::cast;

#[cfg(all(not(feature = "std"), feature = "no-std-float"))]
use tiny_skia_path::NoStdFloat;

use super::i32x4;

cfg_if::cfg_if! {
    if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct f32x4(__m128);
    } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
        use core::arch::wasm32::*;

        // repr(transparent) allows for directly passing the v128 on the WASM stack.
        #[derive(Clone, Copy, Debug)]
        #[repr(transparent)]
        pub struct f32x4(v128);
    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
        use core::arch::aarch64::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct f32x4(float32x4_t);
    } else {
        use super::FasterMinMax;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct f32x4([f32; 4]);
    }
}

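// All four backends store the vector as the same 16 bytes of four `f32` lanes,
// which is what makes the `bytemuck` casts used throughout this file (and the
// Pod/Zeroable impls below) sound.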
unsafe impl bytemuck::Zeroable for f32x4 {}
unsafe impl bytemuck::Pod for f32x4 {}

impl Default for f32x4 {
    fn default() -> Self {
        Self::splat(0.0)
    }
}

impl f32x4 {
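    // Broadcasts a single value to all four lanes, e.g. `f32x4::splat(2.0)`
    // behaves like `f32x4::from([2.0, 2.0, 2.0, 2.0])`.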
    pub fn splat(n: f32) -> Self {
        Self::from([n, n, n, n])
    }

    pub fn floor(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_floor(self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vrndmq_f32(self.0) })
            } else {
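                // Portable fallback: truncate toward zero, then subtract 1.0
                // from lanes where the truncated value ended up above the
                // input (i.e. negative, non-integral lanes).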
                let roundtrip: f32x4 = cast(self.trunc_int().to_f32x4());
                roundtrip - roundtrip.cmp_gt(self).blend(f32x4::splat(1.0), f32x4::default())
            }
        }
    }

    pub fn abs(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_abs(self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vabsq_f32(self.0) })
            } else {
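                // Clearing the sign bit is enough: AND every lane with
                // 0x7fff_ffff (the bit pattern of `i32::MAX`).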
                let non_sign_bits = f32x4::splat(f32::from_bits(i32::MAX as u32));
                self & non_sign_bits
            }
        }
    }

    pub fn max(self, rhs: Self) -> Self {
        // These technically don't have the same semantics for NaN and 0, but it
        // doesn't seem to matter as Skia does it the same way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_max_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_pmax(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vmaxq_f32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0].faster_max(rhs.0[0]),
                    self.0[1].faster_max(rhs.0[1]),
                    self.0[2].faster_max(rhs.0[2]),
                    self.0[3].faster_max(rhs.0[3]),
                ])
            }
        }
    }

    pub fn min(self, rhs: Self) -> Self {
        // These technically don't have the same semantics for NaN and 0, but it
        // doesn't seem to matter as Skia does it the same way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_min_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_pmin(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vminq_f32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0].faster_min(rhs.0[0]),
                    self.0[1].faster_min(rhs.0[1]),
                    self.0[2].faster_min(rhs.0[2]),
                    self.0[3].faster_min(rhs.0[3]),
                ])
            }
        }
    }

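    // The `cmp_*` methods below return lane-wise masks: all bits of a lane are
    // set where the comparison holds and cleared where it doesn't. These masks
    // are intended to be fed into `blend`.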
    pub fn cmp_eq(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_cmpeq_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_eq(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vceqq_f32(self.0, rhs.0) }))
            } else {
                Self([
                    if self.0[0] == rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[1] == rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[2] == rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[3] == rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 },
                ])
            }
        }
    }

    pub fn cmp_ne(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_cmpneq_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_ne(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vmvnq_u32(vceqq_f32(self.0, rhs.0)) }))
            } else {
                Self([
                    if self.0[0] != rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[1] != rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[2] != rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[3] != rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 },
                ])
            }
        }
    }

    pub fn cmp_ge(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_cmpge_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_ge(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vcgeq_f32(self.0, rhs.0) }))
            } else {
                Self([
                    if self.0[0] >= rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[1] >= rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[2] >= rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[3] >= rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 },
                ])
            }
        }
    }

    pub fn cmp_gt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_cmpgt_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_gt(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vcgtq_f32(self.0, rhs.0) }))
            } else {
                Self([
                    if self.0[0] > rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[1] > rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[2] > rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[3] > rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 },
                ])
            }
        }
    }

    pub fn cmp_le(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_cmple_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_le(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vcleq_f32(self.0, rhs.0) }))
            } else {
                Self([
                    if self.0[0] <= rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[1] <= rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[2] <= rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[3] <= rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 },
                ])
            }
        }
    }

    pub fn cmp_lt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_cmplt_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_lt(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vcltq_f32(self.0, rhs.0) }))
            } else {
                Self([
                    if self.0[0] < rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[1] < rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[2] < rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 },
                    if self.0[3] < rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 },
                ])
            }
        }
    }

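    // Lane-wise bit select: lanes where `self` has all bits set (a `cmp_*`
    // mask) take their value from `t`, the remaining lanes from `f`. For
    // example, `a.cmp_gt(b).blend(a, b)` keeps the larger of each lane pair
    // (ignoring NaN subtleties).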
    #[inline]
    pub fn blend(self, t: Self, f: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] {
                Self(unsafe { _mm_blendv_ps(f.0, t.0, self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_bitselect(t.0, f.0, self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { cast(vbslq_u32(cast(self.0), cast(t.0), cast(f.0))) })
            } else {
                super::generic_bit_blend(self, t, f)
            }
        }
    }

    pub fn round(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] {
                Self(
                    unsafe { _mm_round_ps(self.0, _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT) },
                )
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_nearest(self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vrndnq_f32(self.0) })
            } else {
                use super::u32x4;

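                // Portable round-to-nearest fallback. Adding and subtracting
                // 2^23 (`1.0 / f32::EPSILON`) makes the FPU do the rounding;
                // the exponent field is then used to leave already-integral
                // values (|x| >= 2^23) untouched and to flush |x| < 0.5 to a
                // signed zero.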
                let to_int = f32x4::splat(1.0 / f32::EPSILON);
                let u: u32x4 = cast(self);
                let e: i32x4 = cast(u.shr::<23>() & u32x4::splat(0xff));
                let mut y: f32x4;

                let no_op_magic = i32x4::splat(0x7f + 23);
                let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
                let no_op_val: f32x4 = self;

                let zero_magic = i32x4::splat(0x7f - 1);
                let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic));
                let zero_val: f32x4 = self * f32x4::splat(0.0);

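                // The rounding itself is performed on |x|; `neg_bit` remembers
                // which lanes were negative so the sign can be restored at the
                // end.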
                let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).cmp_lt(i32x4::default()));
                let x: f32x4 = neg_bit.blend(-self, self);
                y = x + to_int - to_int - x;
                y = y.cmp_gt(f32x4::splat(0.5)).blend(
                    y + x - f32x4::splat(-1.0),
                    y.cmp_lt(f32x4::splat(-0.5)).blend(y + x + f32x4::splat(1.0), y + x),
                );
                y = neg_bit.blend(-y, y);

                no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
            }
        }
    }

    pub fn round_int(self) -> i32x4 {
        // These technically don't have the same semantics for NaN and out of
        // range values, but it doesn't seem to matter as Skia does it the same
        // way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                i32x4(unsafe { _mm_cvtps_epi32(self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                i32x4(i32x4_trunc_sat_f32x4(self.round().0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                i32x4(unsafe { vcvtnq_s32_f32(self.0) })
            } else {
                let rounded: [f32; 4] = cast(self.round());
                cast([
                    rounded[0] as i32,
                    rounded[1] as i32,
                    rounded[2] as i32,
                    rounded[3] as i32,
                ])
            }
        }
    }

    pub fn trunc_int(self) -> i32x4 {
        // These technically don't have the same semantics for NaN and out of
        // range values, but it doesn't seem to matter as Skia does it the same
        // way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                i32x4(unsafe { _mm_cvttps_epi32(self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                i32x4(i32x4_trunc_sat_f32x4(self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                i32x4(unsafe { vcvtq_s32_f32(self.0) })
            } else {
                cast([
                    self.0[0] as i32,
                    self.0[1] as i32,
                    self.0[2] as i32,
                    self.0[3] as i32,
                ])
            }
        }
    }

    pub fn recip_fast(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_rcp_ps(self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_div(f32x4_splat(1.0), self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                unsafe {
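                    // `vrecpeq_f32` only gives a rough estimate of 1/x; one
                    // Newton-Raphson refinement step via `vrecpsq_f32`
                    // improves the precision.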
                    let a = vrecpeq_f32(self.0);
                    let a = vmulq_f32(vrecpsq_f32(self.0, a), a);
                    Self(a)
                }
            } else {
                Self::from([
                    1.0 / self.0[0],
                    1.0 / self.0[1],
                    1.0 / self.0[2],
                    1.0 / self.0[3],
                ])
            }
        }
    }

    pub fn recip_sqrt(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_rsqrt_ps(self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.0)))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                unsafe {
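                    // Same idea as in `recip_fast`: start from the rough
                    // `vrsqrteq_f32` estimate and refine it with one
                    // Newton-Raphson step via `vrsqrtsq_f32`.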
                    let a = vrsqrteq_f32(self.0);
                    let a = vmulq_f32(vrsqrtsq_f32(self.0, vmulq_f32(a, a)), a);
                    Self(a)
                }
            } else {
                Self::from([
                    1.0 / self.0[0].sqrt(),
                    1.0 / self.0[1].sqrt(),
                    1.0 / self.0[2].sqrt(),
                    1.0 / self.0[3].sqrt(),
                ])
            }
        }
    }

    pub fn sqrt(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_sqrt_ps(self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_sqrt(self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vsqrtq_f32(self.0) })
            } else {
                Self::from([
                    self.0[0].sqrt(),
                    self.0[1].sqrt(),
                    self.0[2].sqrt(),
                    self.0[3].sqrt(),
                ])
            }
        }
    }
}

impl From<[f32; 4]> for f32x4 {
    fn from(v: [f32; 4]) -> Self {
        cast(v)
    }
}

impl From<f32x4> for [f32; 4] {
    fn from(v: f32x4) -> Self {
        cast(v)
    }
}

impl core::ops::Add for f32x4 {
    type Output = Self;

    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_add_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_add(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vaddq_f32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] + rhs.0[0],
                    self.0[1] + rhs.0[1],
                    self.0[2] + rhs.0[2],
                    self.0[3] + rhs.0[3],
                ])
            }
        }
    }
}

impl core::ops::AddAssign for f32x4 {
    fn add_assign(&mut self, rhs: f32x4) {
        *self = *self + rhs;
    }
}

impl core::ops::Sub for f32x4 {
    type Output = Self;

    fn sub(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_sub_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_sub(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vsubq_f32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] - rhs.0[0],
                    self.0[1] - rhs.0[1],
                    self.0[2] - rhs.0[2],
                    self.0[3] - rhs.0[3],
                ])
            }
        }
    }
}

impl core::ops::Mul for f32x4 {
    type Output = Self;

    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_mul_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_mul(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vmulq_f32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] * rhs.0[0],
                    self.0[1] * rhs.0[1],
                    self.0[2] * rhs.0[2],
                    self.0[3] * rhs.0[3],
                ])
            }
        }
    }
}

impl core::ops::MulAssign for f32x4 {
    fn mul_assign(&mut self, rhs: f32x4) {
        *self = *self * rhs;
    }
}

impl core::ops::Div for f32x4 {
    type Output = Self;

    fn div(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_div_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(f32x4_div(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(unsafe { vdivq_f32(self.0, rhs.0) })
            } else {
                Self([
                    self.0[0] / rhs.0[0],
                    self.0[1] / rhs.0[1],
                    self.0[2] / rhs.0[2],
                    self.0[3] / rhs.0[3],
                ])
            }
        }
    }
}

impl core::ops::BitAnd for f32x4 {
    type Output = Self;

    #[inline(always)]
    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_and_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_and(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vandq_u32(cast(self.0), cast(rhs.0)) }))
            } else {
                Self([
                    f32::from_bits(self.0[0].to_bits() & rhs.0[0].to_bits()),
                    f32::from_bits(self.0[1].to_bits() & rhs.0[1].to_bits()),
                    f32::from_bits(self.0[2].to_bits() & rhs.0[2].to_bits()),
                    f32::from_bits(self.0[3].to_bits() & rhs.0[3].to_bits()),
                ])
            }
        }
    }
}

impl core::ops::BitOr for f32x4 {
    type Output = Self;

    #[inline(always)]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_or_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_or(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vorrq_u32(cast(self.0), cast(rhs.0)) }))
            } else {
                Self([
                    f32::from_bits(self.0[0].to_bits() | rhs.0[0].to_bits()),
                    f32::from_bits(self.0[1].to_bits() | rhs.0[1].to_bits()),
                    f32::from_bits(self.0[2].to_bits() | rhs.0[2].to_bits()),
                    f32::from_bits(self.0[3].to_bits() | rhs.0[3].to_bits()),
                ])
            }
        }
    }
}

impl core::ops::BitXor for f32x4 {
    type Output = Self;

    #[inline(always)]
    fn bitxor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                Self(unsafe { _mm_xor_ps(self.0, rhs.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_xor(self.0, rhs.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { veorq_u32(cast(self.0), cast(rhs.0)) }))
            } else {
                Self([
                    f32::from_bits(self.0[0].to_bits() ^ rhs.0[0].to_bits()),
                    f32::from_bits(self.0[1].to_bits() ^ rhs.0[1].to_bits()),
                    f32::from_bits(self.0[2].to_bits() ^ rhs.0[2].to_bits()),
                    f32::from_bits(self.0[3].to_bits() ^ rhs.0[3].to_bits()),
                ])
            }
        }
    }
}

impl core::ops::Neg for f32x4 {
    type Output = Self;

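    // Negation is expressed as `0.0 - self`, which works unchanged on every
    // backend; note that this turns a ±0.0 lane into +0.0 rather than flipping
    // its sign bit.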
    fn neg(self) -> Self {
        Self::default() - self
    }
}

impl core::ops::Not for f32x4 {
    type Output = Self;

    fn not(self) -> Self {
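        // Flips every bit of every lane. SSE2 has no bitwise NOT instruction,
        // so it is expressed as XOR with an all-ones mask.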
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                unsafe {
                    let all_bits = _mm_set1_ps(f32::from_bits(u32::MAX));
                    Self(_mm_xor_ps(self.0, all_bits))
                }
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                Self(v128_not(self.0))
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                Self(cast(unsafe { vmvnq_u32(cast(self.0)) }))
            } else {
                self ^ Self::splat(cast(u32::MAX))
            }
        }
    }
}

impl core::cmp::PartialEq for f32x4 {
    fn eq(&self, rhs: &Self) -> bool {
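        // True only when all four lanes compare equal. Since this uses the
        // floating-point `==`, a NaN lane makes the comparison fail even
        // against the vector itself.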
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                unsafe { _mm_movemask_ps(_mm_cmpeq_ps(self.0, rhs.0)) == 0b1111 }
            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                unsafe { vminvq_u32(vceqq_f32(self.0, rhs.0)) != 0 }
            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                u32x4_all_true(f32x4_eq(self.0, rhs.0))
            } else {
                self.0 == rhs.0
            }
        }
    }
}