// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Based on https://github.com/Lokathor/wide (Zlib)

use bytemuck::cast;

use super::{i32x8, u32x8};

cfg_if::cfg_if! {
    if #[cfg(all(feature = "simd", target_feature = "avx"))] {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(32))]
        pub struct f32x8(__m256);
    } else {
        use super::f32x4;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(32))]
        pub struct f32x8(pub f32x4, pub f32x4);
    }
}

unsafe impl bytemuck::Zeroable for f32x8 {}
unsafe impl bytemuck::Pod for f32x8 {}

impl Default for f32x8 {
    fn default() -> Self {
        Self::splat(0.0)
    }
}

impl f32x8 {
    pub fn splat(n: f32) -> Self {
        cast([n, n, n, n, n, n, n, n])
    }

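    /// Returns the per-lane floor.
    ///
    /// The value is truncated toward zero via `trunc_int` and converted back
    /// to `f32`. For negative non-integer lanes the truncated value ends up
    /// greater than the input (e.g. -1.5 truncates to -1.0), so 1.0 is
    /// subtracted from exactly those lanes, giving -2.0. Like `trunc_int`,
    /// lanes outside the `i32` range are not handled specially.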
    pub fn floor(self) -> Self {
        let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8());
        roundtrip
            - roundtrip
                .cmp_gt(self)
                .blend(f32x8::splat(1.0), f32x8::default())
    }

    pub fn fract(self) -> Self {
        self - self.floor()
    }

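    /// Clamps every lane to the `[0.0, 1.0]` range.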
    pub fn normalize(self) -> Self {
        self.max(f32x8::default()).min(f32x8::splat(1.0))
    }

    pub fn to_i32x8_bitcast(self) -> i32x8 {
        bytemuck::cast(self)
    }

    pub fn to_u32x8_bitcast(self) -> u32x8 {
        bytemuck::cast(self)
    }

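    // The cmp_* methods return a per-lane mask: all bits set where the
    // predicate holds, all zeros otherwise, so the result can be fed straight
    // into `blend`. On the AVX path the *_OQ predicates are "ordered, quiet":
    // any comparison involving NaN evaluates to false and no floating-point
    // exception is raised.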
    pub fn cmp_eq(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_EQ_OQ) })
            } else {
                Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1))
            }
        }
    }

    pub fn cmp_ne(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_NEQ_OQ) })
            } else {
                Self(self.0.cmp_ne(rhs.0), self.1.cmp_ne(rhs.1))
            }
        }
    }

    pub fn cmp_ge(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_GE_OQ) })
            } else {
                Self(self.0.cmp_ge(rhs.0), self.1.cmp_ge(rhs.1))
            }
        }
    }

    pub fn cmp_gt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_GT_OQ) })
            } else {
                Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1))
            }
        }
    }

    pub fn cmp_le(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_LE_OQ) })
            } else {
                Self(self.0.cmp_le(rhs.0), self.1.cmp_le(rhs.1))
            }
        }
    }

    pub fn cmp_lt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_LT_OQ) })
            } else {
                Self(self.0.cmp_lt(rhs.0), self.1.cmp_lt(rhs.1))
            }
        }
    }

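    /// Selects lanes from `t` where the corresponding lane of `self` is set,
    /// and from `f` otherwise.
    ///
    /// On the AVX path `_mm256_blendv_ps` looks only at the sign (top) bit of
    /// each mask lane; the all-ones/all-zeros masks produced by the `cmp_*`
    /// methods satisfy both paths.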
    #[inline]
    pub fn blend(self, t: Self, f: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_blendv_ps(f.0, t.0, self.0) })
            } else {
                Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1))
            }
        }
    }

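    /// Returns the per-lane absolute value by clearing the sign bit:
    /// `i32::MAX as u32` is `0x7FFF_FFFF`, so the AND keeps the exponent and
    /// mantissa bits of every lane intact.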
    pub fn abs(self) -> Self {
        let non_sign_bits = f32x8::splat(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
    }

    pub fn max(self, rhs: Self) -> Self {
        // These technically don't have the same semantics for NaN and 0, but it
        // doesn't seem to matter as Skia does it the same way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_max_ps(self.0, rhs.0) })
            } else {
                Self(self.0.max(rhs.0), self.1.max(rhs.1))
            }
        }
    }

    pub fn min(self, rhs: Self) -> Self {
        // These technically don't have the same semantics for NaN and 0, but it
        // doesn't seem to matter as Skia does it the same way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_min_ps(self.0, rhs.0) })
            } else {
                Self(self.0.min(rhs.0), self.1.min(rhs.1))
            }
        }
    }

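    /// Returns an all-ones lane mask for lanes that are finite (neither
    /// infinite nor NaN).
    ///
    /// Shifting the bits left by one drops the sign bit and moves the eight
    /// exponent bits to the top; a lane whose exponent is all ones (infinity
    /// or NaN) then matches `0xFF000000`, and the final negation flips the
    /// mask so that the finite lanes are the ones set.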
    pub fn is_finite(self) -> Self {
        let shifted_exp_mask = u32x8::splat(0xFF000000);
        let u: u32x8 = cast(self);
        let shift_u = u.shl::<1>();
        let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
        cast(out)
    }

    pub fn round(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_round_ps(self.0, _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT) })
            } else {
                Self(self.0.round(), self.1.round())
            }
        }
    }

    pub fn round_int(self) -> i32x8 {
        // These technically don't have the same semantics for NaN and out of
        // range values, but it doesn't seem to matter as Skia does it the same
        // way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                cast(unsafe { _mm256_cvtps_epi32(self.0) })
            } else {
                i32x8(self.0.round_int(), self.1.round_int())
            }
        }
    }

    pub fn trunc_int(self) -> i32x8 {
        // These technically don't have the same semantics for NaN and out of
        // range values, but it doesn't seem to matter as Skia does it the same
        // way.
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                cast(unsafe { _mm256_cvttps_epi32(self.0) })
            } else {
                i32x8(self.0.trunc_int(), self.1.trunc_int())
            }
        }
    }

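    // recip_fast and recip_sqrt are approximations: on the AVX path they map
    // to _mm256_rcp_ps/_mm256_rsqrt_ps, which provide roughly 12 bits of
    // precision. Use `/` or `sqrt` when full precision is required.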
    pub fn recip_fast(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_rcp_ps(self.0) })
            } else {
                Self(self.0.recip_fast(), self.1.recip_fast())
            }
        }
    }

    pub fn recip_sqrt(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_rsqrt_ps(self.0) })
            } else {
                Self(self.0.recip_sqrt(), self.1.recip_sqrt())
            }
        }
    }

    pub fn sqrt(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_sqrt_ps(self.0) })
            } else {
                Self(self.0.sqrt(), self.1.sqrt())
            }
        }
    }
}

impl From<[f32; 8]> for f32x8 {
    fn from(v: [f32; 8]) -> Self {
        cast(v)
    }
}

impl From<f32x8> for [f32; 8] {
    fn from(v: f32x8) -> Self {
        cast(v)
    }
}

impl core::ops::Add for f32x8 {
    type Output = Self;

    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_add_ps(self.0, rhs.0) })
            } else {
                Self(self.0 + rhs.0, self.1 + rhs.1)
            }
        }
    }
}

impl core::ops::AddAssign for f32x8 {
    fn add_assign(&mut self, rhs: f32x8) {
        *self = *self + rhs;
    }
}

impl core::ops::Sub for f32x8 {
    type Output = Self;

    fn sub(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_sub_ps(self.0, rhs.0) })
            } else {
                Self(self.0 - rhs.0, self.1 - rhs.1)
            }
        }
    }
}

impl core::ops::Mul for f32x8 {
    type Output = Self;

    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_mul_ps(self.0, rhs.0) })
            } else {
                Self(self.0 * rhs.0, self.1 * rhs.1)
            }
        }
    }
}

impl core::ops::MulAssign for f32x8 {
    fn mul_assign(&mut self, rhs: f32x8) {
        *self = *self * rhs;
    }
}

impl core::ops::Div for f32x8 {
    type Output = Self;

    fn div(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_div_ps(self.0, rhs.0) })
            } else {
                Self(self.0 / rhs.0, self.1 / rhs.1)
            }
        }
    }
}

impl core::ops::BitAnd for f32x8 {
    type Output = Self;

    #[inline(always)]
    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_and_ps(self.0, rhs.0) })
            } else {
                Self(self.0 & rhs.0, self.1 & rhs.1)
            }
        }
    }
}

impl core::ops::BitOr for f32x8 {
    type Output = Self;

    #[inline(always)]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_or_ps(self.0, rhs.0) })
            } else {
                Self(self.0 | rhs.0, self.1 | rhs.1)
            }
        }
    }
}

impl core::ops::BitXor for f32x8 {
    type Output = Self;

    #[inline(always)]
    fn bitxor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                Self(unsafe { _mm256_xor_ps(self.0, rhs.0) })
            } else {
                Self(self.0 ^ rhs.0, self.1 ^ rhs.1)
            }
        }
    }
}

impl core::ops::Neg for f32x8 {
    type Output = Self;

    fn neg(self) -> Self {
        Self::default() - self
    }
}

impl core::ops::Not for f32x8 {
    type Output = Self;

    fn not(self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                let all_bits = unsafe { _mm256_set1_ps(f32::from_bits(u32::MAX)) };
                Self(unsafe { _mm256_xor_ps(self.0, all_bits) })
            } else {
                Self(!self.0, !self.1)
            }
        }
    }
}

impl core::cmp::PartialEq for f32x8 {
    fn eq(&self, rhs: &Self) -> bool {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                let mask = unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_EQ_OQ) };
                unsafe { _mm256_movemask_ps(mask) == 0b1111_1111 }
            } else {
                self.0 == rhs.0 && self.1 == rhs.1
            }
        }
    }
}
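
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sanity-check sketch covering behavior that is identical on
    // both the AVX and the scalar fallback paths; NaN and out-of-range lanes
    // are deliberately left out (see the comments on min/max and trunc_int).
    #[test]
    fn splat_and_convert() {
        let v = f32x8::splat(1.5);
        assert_eq!(<[f32; 8]>::from(v), [1.5; 8]);
    }

    #[test]
    fn floor_handles_negative_lanes() {
        let v = f32x8::from([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0]);
        let expected = f32x8::from([-2.0, -1.0, -1.0, 0.0, 0.0, 1.0, 1.0, 2.0]);
        assert_eq!(v.floor(), expected);
    }

    #[test]
    fn normalize_clamps_to_unit_range() {
        let v = f32x8::from([-1.0, -0.5, 0.0, 0.25, 0.5, 0.75, 1.0, 2.0]);
        let expected = f32x8::from([0.0, 0.0, 0.0, 0.25, 0.5, 0.75, 1.0, 1.0]);
        assert_eq!(v.normalize(), expected);
    }
}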