// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Based on https://github.com/Lokathor/wide (Zlib)

use bytemuck::cast;

use super::{f32x8, u32x8};

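// With the `simd` feature enabled on AVX2 targets, all eight lanes live in a
// single `__m256i` register; otherwise the type is emulated with two `i32x4` halves.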
cfg_if::cfg_if! {
    if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
        #[cfg(target_arch = "x86")]
        use core::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64::*;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(32))]
        pub struct i32x8(__m256i);
    } else {
        use super::i32x4;

        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(32))]
        pub struct i32x8(pub i32x4, pub i32x4);
    }
}

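// Safety: in both representations `i32x8` is 32 bytes of plain integer data
// with no padding, so it may be zeroed and byte-cast freely.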
unsafe impl bytemuck::Zeroable for i32x8 {}
unsafe impl bytemuck::Pod for i32x8 {}

impl Default for i32x8 {
    fn default() -> Self {
        Self::splat(0)
    }
}

impl i32x8 {
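    /// Returns a vector with all eight lanes set to `n`.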
    pub fn splat(n: i32) -> Self {
        cast([n, n, n, n, n, n, n, n])
    }

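    /// Lane-wise select: where a `self` lane is all ones (as produced by the
    /// `cmp_*` methods) the lane is taken from `t`, otherwise from `f`.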
    pub fn blend(self, t: Self, f: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_blendv_epi8(f.0, t.0, self.0) })
            } else {
                Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1))
            }
        }
    }

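    /// Lane-wise `self == rhs`, yielding all ones for true lanes and zero otherwise.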
    pub fn cmp_eq(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_cmpeq_epi32(self.0, rhs.0) })
            } else {
                Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1))
            }
        }
    }

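    /// Lane-wise signed `self > rhs`, yielding all ones for true lanes and zero otherwise.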
    pub fn cmp_gt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_cmpgt_epi32(self.0, rhs.0) })
            } else {
                Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1))
            }
        }
    }

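    /// Lane-wise signed less-than comparison, yielding a mask per lane.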
    pub fn cmp_lt(self, rhs: Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                // There is no `_mm256_cmplt_epi32`, therefore we have to use
                // `_mm256_cmpgt_epi32` and then invert the result.
                let v = unsafe { _mm256_cmpgt_epi32(self.0, rhs.0) };
                let all_bits = unsafe { _mm256_set1_epi16(-1) };
                Self(unsafe { _mm256_xor_si256(v, all_bits) })
            } else {
                Self(self.0.cmp_lt(rhs.0), self.1.cmp_lt(rhs.1))
            }
        }
    }

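    /// Converts each lane to `f32`.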
    pub fn to_f32x8(self) -> f32x8 {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                cast(unsafe { _mm256_cvtepi32_ps(self.0) })
            } else if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                cast([self.0.to_f32x4(), self.1.to_f32x4()])
            } else {
                f32x8(self.0.to_f32x4(), self.1.to_f32x4())
            }
        }
    }

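    /// Reinterprets the lane bits as `u32` without any numeric conversion.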
    pub fn to_u32x8_bitcast(self) -> u32x8 {
        bytemuck::cast(self)
    }

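    /// Reinterprets the lane bits as `f32` without any numeric conversion.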
    pub fn to_f32x8_bitcast(self) -> f32x8 {
        bytemuck::cast(self)
    }
}

impl From<[i32; 8]> for i32x8 {
    fn from(v: [i32; 8]) -> Self {
        cast(v)
    }
}

impl From<i32x8> for [i32; 8] {
    fn from(v: i32x8) -> Self {
        cast(v)
    }
}

impl core::ops::Add for i32x8 {
    type Output = Self;

    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_add_epi32(self.0, rhs.0) })
            } else {
                Self(self.0 + rhs.0, self.1 + rhs.1)
            }
        }
    }
}

impl core::ops::BitAnd for i32x8 {
    type Output = Self;

    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_and_si256(self.0, rhs.0) })
            } else {
                Self(self.0 & rhs.0, self.1 & rhs.1)
            }
        }
    }
}

impl core::ops::Mul for i32x8 {
    type Output = Self;

    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_mullo_epi32(self.0, rhs.0) })
            } else {
                Self(self.0 * rhs.0, self.1 * rhs.1)
            }
        }
    }
}

impl core::ops::BitOr for i32x8 {
    type Output = Self;

    #[inline]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_or_si256(self.0, rhs.0) })
            } else {
                Self(self.0 | rhs.0, self.1 | rhs.1)
            }
        }
    }
}

impl core::ops::BitXor for i32x8 {
    type Output = Self;

    #[inline]
    fn bitxor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
                Self(unsafe { _mm256_xor_si256(self.0, rhs.0) })
            } else {
                Self(self.0 ^ rhs.0, self.1 ^ rhs.1)
            }
        }
    }
}
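
// A minimal sanity-check sketch exercising the comparison-and-blend pattern
// the mask-producing methods above are built for; it only uses operations
// defined in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cmp_gt_and_blend() {
        let a = i32x8::from([0, 1, 2, 3, 4, 5, 6, 7]);
        let b = i32x8::splat(3);
        // All ones in lanes where `a > b`, zeros elsewhere.
        let mask = a.cmp_gt(b);
        // Pick lanes from `a` where the mask is set, otherwise from `b`.
        let r = mask.blend(a, b);
        assert_eq!(<[i32; 8]>::from(r), [3, 3, 3, 3, 4, 5, 6, 7]);
    }
}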