//! Implements 256-bit and 512-bit vectors in terms of 128-bit ones, for machines without native wide SIMD.
2 | |
3 | use crate::types::*; |
4 | use crate::{vec128_storage, vec256_storage, vec512_storage}; |
5 | use core::marker::PhantomData; |
6 | use core::ops::*; |
7 | use zerocopy::{AsBytes, FromBytes, FromZeroes}; |
8 | |
9 | #[derive (Copy, Clone, Default, FromBytes, AsBytes, FromZeroes)] |
10 | #[repr (transparent)] |
11 | #[allow (non_camel_case_types)] |
12 | pub struct x2<W, G>(pub [W; 2], PhantomData<G>); |
13 | impl<W, G> x2<W, G> { |
14 | #[inline (always)] |
15 | pub fn new(xs: [W; 2]) -> Self { |
16 | x2(xs, PhantomData) |
17 | } |
18 | } |
// Implements the binary operator trait `$trait` (method `$fn`) for `x2` by
// applying the operator to each pair of corresponding lanes.
macro_rules! fwd_binop_x2 {
    ($trait:ident, $fn:ident) => {
        impl<W, G> $trait for x2<W, G>
        where
            W: $trait + Copy,
        {
            type Output = x2<W::Output, G>;

            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                let [a0, a1] = self.0;
                let [b0, b1] = rhs.0;
                x2::new([a0.$fn(b0), a1.$fn(b1)])
            }
        }
    };
}
// Implements the compound-assignment trait `$trait` (method `$fn_assign`) for
// `x2` by updating each lane in place with the matching lane of `rhs`.
macro_rules! fwd_binop_assign_x2 {
    ($trait:ident, $fn_assign:ident) => {
        impl<W, G> $trait for x2<W, G>
        where
            W: $trait + Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                let [r0, r1] = rhs.0;
                let [l0, l1] = &mut self.0;
                l0.$fn_assign(r0);
                l1.$fn_assign(r1);
            }
        }
    };
}
// Expands to a by-value method `$fn` that calls `$fn` on each lane and
// rebuilds the pair from the two results. Intended for use inside an `impl`
// block for `x2`.
macro_rules! fwd_unop_x2 {
    ($fn:ident) => {
        #[inline(always)]
        fn $fn(self) -> Self {
            let [lane0, lane1] = self.0;
            x2::new([lane0.$fn(), lane1.$fn()])
        }
    };
}
49 | impl<W, G> RotateEachWord32 for x2<W, G> |
50 | where |
51 | W: Copy + RotateEachWord32, |
52 | { |
53 | fwd_unop_x2!(rotate_each_word_right7); |
54 | fwd_unop_x2!(rotate_each_word_right8); |
55 | fwd_unop_x2!(rotate_each_word_right11); |
56 | fwd_unop_x2!(rotate_each_word_right12); |
57 | fwd_unop_x2!(rotate_each_word_right16); |
58 | fwd_unop_x2!(rotate_each_word_right20); |
59 | fwd_unop_x2!(rotate_each_word_right24); |
60 | fwd_unop_x2!(rotate_each_word_right25); |
61 | } |
62 | impl<W, G> RotateEachWord64 for x2<W, G> |
63 | where |
64 | W: Copy + RotateEachWord64, |
65 | { |
66 | fwd_unop_x2!(rotate_each_word_right32); |
67 | } |
68 | impl<W, G> RotateEachWord128 for x2<W, G> where W: RotateEachWord128 {} |
69 | impl<W, G> BitOps0 for x2<W, G> |
70 | where |
71 | W: BitOps0, |
72 | G: Copy, |
73 | { |
74 | } |
75 | impl<W, G> BitOps32 for x2<W, G> |
76 | where |
77 | W: BitOps32 + BitOps0, |
78 | G: Copy, |
79 | { |
80 | } |
81 | impl<W, G> BitOps64 for x2<W, G> |
82 | where |
83 | W: BitOps64 + BitOps0, |
84 | G: Copy, |
85 | { |
86 | } |
87 | impl<W, G> BitOps128 for x2<W, G> |
88 | where |
89 | W: BitOps128 + BitOps0, |
90 | G: Copy, |
91 | { |
92 | } |
93 | fwd_binop_x2!(BitAnd, bitand); |
94 | fwd_binop_x2!(BitOr, bitor); |
95 | fwd_binop_x2!(BitXor, bitxor); |
96 | fwd_binop_x2!(AndNot, andnot); |
97 | fwd_binop_assign_x2!(BitAndAssign, bitand_assign); |
98 | fwd_binop_assign_x2!(BitOrAssign, bitor_assign); |
99 | fwd_binop_assign_x2!(BitXorAssign, bitxor_assign); |
100 | impl<W, G> ArithOps for x2<W, G> |
101 | where |
102 | W: ArithOps, |
103 | G: Copy, |
104 | { |
105 | } |
106 | fwd_binop_x2!(Add, add); |
107 | fwd_binop_assign_x2!(AddAssign, add_assign); |
108 | impl<W: Not + Copy, G> Not for x2<W, G> { |
109 | type Output = x2<W::Output, G>; |
110 | #[inline (always)] |
111 | fn not(self) -> Self::Output { |
112 | x2::new([self.0[0].not(), self.0[1].not()]) |
113 | } |
114 | } |
115 | impl<W, G> UnsafeFrom<[W; 2]> for x2<W, G> { |
116 | #[inline (always)] |
117 | unsafe fn unsafe_from(xs: [W; 2]) -> Self { |
118 | x2::new(xs) |
119 | } |
120 | } |
121 | impl<W: Copy, G> Vec2<W> for x2<W, G> { |
122 | #[inline (always)] |
123 | fn extract(self, i: u32) -> W { |
124 | self.0[i as usize] |
125 | } |
126 | #[inline (always)] |
127 | fn insert(mut self, w: W, i: u32) -> Self { |
128 | self.0[i as usize] = w; |
129 | self |
130 | } |
131 | } |
132 | impl<W: Copy + Store<vec128_storage>, G> Store<vec256_storage> for x2<W, G> { |
133 | #[inline (always)] |
134 | unsafe fn unpack(p: vec256_storage) -> Self { |
135 | let p: [vec128_storage; 2] = p.split128(); |
136 | x2::new([W::unpack(p[0]), W::unpack(p[1])]) |
137 | } |
138 | } |
139 | impl<W, G> From<x2<W, G>> for vec256_storage |
140 | where |
141 | W: Copy, |
142 | vec128_storage: From<W>, |
143 | { |
144 | #[inline (always)] |
145 | fn from(x: x2<W, G>) -> Self { |
146 | vec256_storage::new128([x.0[0].into(), x.0[1].into()]) |
147 | } |
148 | } |
149 | impl<W, G> Swap64 for x2<W, G> |
150 | where |
151 | W: Swap64 + Copy, |
152 | { |
153 | fwd_unop_x2!(swap1); |
154 | fwd_unop_x2!(swap2); |
155 | fwd_unop_x2!(swap4); |
156 | fwd_unop_x2!(swap8); |
157 | fwd_unop_x2!(swap16); |
158 | fwd_unop_x2!(swap32); |
159 | fwd_unop_x2!(swap64); |
160 | } |
161 | impl<W: Copy, G> MultiLane<[W; 2]> for x2<W, G> { |
162 | #[inline (always)] |
163 | fn to_lanes(self) -> [W; 2] { |
164 | self.0 |
165 | } |
166 | #[inline (always)] |
167 | fn from_lanes(lanes: [W; 2]) -> Self { |
168 | x2::new(xs:lanes) |
169 | } |
170 | } |
171 | impl<W: BSwap + Copy, G> BSwap for x2<W, G> { |
172 | #[inline (always)] |
173 | fn bswap(self) -> Self { |
174 | x2::new([self.0[0].bswap(), self.0[1].bswap()]) |
175 | } |
176 | } |
177 | impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> { |
178 | #[inline (always)] |
179 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
180 | let input: (&[u8], &[u8]) = input.split_at(mid:input.len() / 2); |
181 | x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)]) |
182 | } |
183 | #[inline (always)] |
184 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
185 | let input: (&[u8], &[u8]) = input.split_at(mid:input.len() / 2); |
186 | x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)]) |
187 | } |
188 | #[inline (always)] |
189 | fn write_le(self, out: &mut [u8]) { |
190 | let out: (&mut [u8], &mut [u8]) = out.split_at_mut(mid:out.len() / 2); |
191 | self.0[0].write_le(out.0); |
192 | self.0[1].write_le(out.1); |
193 | } |
194 | #[inline (always)] |
195 | fn write_be(self, out: &mut [u8]) { |
196 | let out: (&mut [u8], &mut [u8]) = out.split_at_mut(mid:out.len() / 2); |
197 | self.0[0].write_be(out.0); |
198 | self.0[1].write_be(out.1); |
199 | } |
200 | } |
201 | impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> { |
202 | #[inline (always)] |
203 | fn shuffle_lane_words2301(self) -> Self { |
204 | Self::new([ |
205 | self.0[0].shuffle_lane_words2301(), |
206 | self.0[1].shuffle_lane_words2301(), |
207 | ]) |
208 | } |
209 | #[inline (always)] |
210 | fn shuffle_lane_words1230(self) -> Self { |
211 | Self::new([ |
212 | self.0[0].shuffle_lane_words1230(), |
213 | self.0[1].shuffle_lane_words1230(), |
214 | ]) |
215 | } |
216 | #[inline (always)] |
217 | fn shuffle_lane_words3012(self) -> Self { |
218 | Self::new([ |
219 | self.0[0].shuffle_lane_words3012(), |
220 | self.0[1].shuffle_lane_words3012(), |
221 | ]) |
222 | } |
223 | } |
224 | |
225 | #[derive (Copy, Clone, Default, FromBytes, AsBytes, FromZeroes)] |
226 | #[repr (transparent)] |
227 | #[allow (non_camel_case_types)] |
228 | pub struct x4<W>(pub [W; 4]); |
229 | impl<W> x4<W> { |
230 | #[inline (always)] |
231 | pub fn new(xs: [W; 4]) -> Self { |
232 | x4(xs) |
233 | } |
234 | } |
// Implements the binary operator trait `$trait` (method `$fn`) for `x4` by
// applying the operator to each of the four pairs of corresponding lanes.
macro_rules! fwd_binop_x4 {
    ($trait:ident, $fn:ident) => {
        impl<W> $trait for x4<W>
        where
            W: $trait + Copy,
        {
            type Output = x4<W::Output>;

            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                let [a0, a1, a2, a3] = self.0;
                let [b0, b1, b2, b3] = rhs.0;
                x4([a0.$fn(b0), a1.$fn(b1), a2.$fn(b2), a3.$fn(b3)])
            }
        }
    };
}
// Implements the compound-assignment trait `$trait` (method `$fn_assign`) for
// `x4` by updating each lane in place with the matching lane of `rhs`.
macro_rules! fwd_binop_assign_x4 {
    ($trait:ident, $fn_assign:ident) => {
        impl<W> $trait for x4<W>
        where
            W: $trait + Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                let [r0, r1, r2, r3] = rhs.0;
                let [l0, l1, l2, l3] = &mut self.0;
                l0.$fn_assign(r0);
                l1.$fn_assign(r1);
                l2.$fn_assign(r2);
                l3.$fn_assign(r3);
            }
        }
    };
}
// Expands to a by-value method `$fn` that calls `$fn` on each of the four
// lanes and rebuilds the vector from the results. Intended for use inside an
// `impl` block for `x4`.
macro_rules! fwd_unop_x4 {
    ($fn:ident) => {
        #[inline(always)]
        fn $fn(self) -> Self {
            let [lane0, lane1, lane2, lane3] = self.0;
            x4([lane0.$fn(), lane1.$fn(), lane2.$fn(), lane3.$fn()])
        }
    };
}
277 | impl<W> RotateEachWord32 for x4<W> |
278 | where |
279 | W: Copy + RotateEachWord32, |
280 | { |
281 | fwd_unop_x4!(rotate_each_word_right7); |
282 | fwd_unop_x4!(rotate_each_word_right8); |
283 | fwd_unop_x4!(rotate_each_word_right11); |
284 | fwd_unop_x4!(rotate_each_word_right12); |
285 | fwd_unop_x4!(rotate_each_word_right16); |
286 | fwd_unop_x4!(rotate_each_word_right20); |
287 | fwd_unop_x4!(rotate_each_word_right24); |
288 | fwd_unop_x4!(rotate_each_word_right25); |
289 | } |
290 | impl<W> RotateEachWord64 for x4<W> |
291 | where |
292 | W: Copy + RotateEachWord64, |
293 | { |
294 | fwd_unop_x4!(rotate_each_word_right32); |
295 | } |
296 | impl<W> RotateEachWord128 for x4<W> where W: RotateEachWord128 {} |
297 | impl<W> BitOps0 for x4<W> where W: BitOps0 {} |
298 | impl<W> BitOps32 for x4<W> where W: BitOps32 + BitOps0 {} |
299 | impl<W> BitOps64 for x4<W> where W: BitOps64 + BitOps0 {} |
300 | impl<W> BitOps128 for x4<W> where W: BitOps128 + BitOps0 {} |
301 | fwd_binop_x4!(BitAnd, bitand); |
302 | fwd_binop_x4!(BitOr, bitor); |
303 | fwd_binop_x4!(BitXor, bitxor); |
304 | fwd_binop_x4!(AndNot, andnot); |
305 | fwd_binop_assign_x4!(BitAndAssign, bitand_assign); |
306 | fwd_binop_assign_x4!(BitOrAssign, bitor_assign); |
307 | fwd_binop_assign_x4!(BitXorAssign, bitxor_assign); |
308 | impl<W> ArithOps for x4<W> where W: ArithOps {} |
309 | fwd_binop_x4!(Add, add); |
310 | fwd_binop_assign_x4!(AddAssign, add_assign); |
311 | impl<W: Not + Copy> Not for x4<W> { |
312 | type Output = x4<W::Output>; |
313 | #[inline (always)] |
314 | fn not(self) -> Self::Output { |
315 | x4([ |
316 | self.0[0].not(), |
317 | self.0[1].not(), |
318 | self.0[2].not(), |
319 | self.0[3].not(), |
320 | ]) |
321 | } |
322 | } |
323 | impl<W> UnsafeFrom<[W; 4]> for x4<W> { |
324 | #[inline (always)] |
325 | unsafe fn unsafe_from(xs: [W; 4]) -> Self { |
326 | x4(xs) |
327 | } |
328 | } |
329 | impl<W: Copy> Vec4<W> for x4<W> { |
330 | #[inline (always)] |
331 | fn extract(self, i: u32) -> W { |
332 | self.0[i as usize] |
333 | } |
334 | #[inline (always)] |
335 | fn insert(mut self, w: W, i: u32) -> Self { |
336 | self.0[i as usize] = w; |
337 | self |
338 | } |
339 | } |
340 | impl<W: Copy> Vec4Ext<W> for x4<W> { |
341 | #[inline (always)] |
342 | fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) |
343 | where |
344 | Self: Sized, |
345 | { |
346 | ( |
347 | x4([a.0[0], b.0[0], c.0[0], d.0[0]]), |
348 | x4([a.0[1], b.0[1], c.0[1], d.0[1]]), |
349 | x4([a.0[2], b.0[2], c.0[2], d.0[2]]), |
350 | x4([a.0[3], b.0[3], c.0[3], d.0[3]]), |
351 | ) |
352 | } |
353 | } |
354 | impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> { |
355 | #[inline (always)] |
356 | unsafe fn unpack(p: vec512_storage) -> Self { |
357 | let p: [vec128_storage; 4] = p.split128(); |
358 | x4([ |
359 | W::unpack(p[0]), |
360 | W::unpack(p[1]), |
361 | W::unpack(p[2]), |
362 | W::unpack(p[3]), |
363 | ]) |
364 | } |
365 | } |
366 | impl<W> From<x4<W>> for vec512_storage |
367 | where |
368 | W: Copy, |
369 | vec128_storage: From<W>, |
370 | { |
371 | #[inline (always)] |
372 | fn from(x: x4<W>) -> Self { |
373 | vec512_storage::new128([x.0[0].into(), x.0[1].into(), x.0[2].into(), x.0[3].into()]) |
374 | } |
375 | } |
376 | impl<W> Swap64 for x4<W> |
377 | where |
378 | W: Swap64 + Copy, |
379 | { |
380 | fwd_unop_x4!(swap1); |
381 | fwd_unop_x4!(swap2); |
382 | fwd_unop_x4!(swap4); |
383 | fwd_unop_x4!(swap8); |
384 | fwd_unop_x4!(swap16); |
385 | fwd_unop_x4!(swap32); |
386 | fwd_unop_x4!(swap64); |
387 | } |
388 | impl<W: Copy> MultiLane<[W; 4]> for x4<W> { |
389 | #[inline (always)] |
390 | fn to_lanes(self) -> [W; 4] { |
391 | self.0 |
392 | } |
393 | #[inline (always)] |
394 | fn from_lanes(lanes: [W; 4]) -> Self { |
395 | x4(lanes) |
396 | } |
397 | } |
398 | impl<W: BSwap + Copy> BSwap for x4<W> { |
399 | #[inline (always)] |
400 | fn bswap(self) -> Self { |
401 | x4([ |
402 | self.0[0].bswap(), |
403 | self.0[1].bswap(), |
404 | self.0[2].bswap(), |
405 | self.0[3].bswap(), |
406 | ]) |
407 | } |
408 | } |
409 | impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> { |
410 | #[inline (always)] |
411 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
412 | let n = input.len() / 4; |
413 | x4([ |
414 | W::unsafe_read_le(&input[..n]), |
415 | W::unsafe_read_le(&input[n..n * 2]), |
416 | W::unsafe_read_le(&input[n * 2..n * 3]), |
417 | W::unsafe_read_le(&input[n * 3..]), |
418 | ]) |
419 | } |
420 | #[inline (always)] |
421 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
422 | let n = input.len() / 4; |
423 | x4([ |
424 | W::unsafe_read_be(&input[..n]), |
425 | W::unsafe_read_be(&input[n..n * 2]), |
426 | W::unsafe_read_be(&input[n * 2..n * 3]), |
427 | W::unsafe_read_be(&input[n * 3..]), |
428 | ]) |
429 | } |
430 | #[inline (always)] |
431 | fn write_le(self, out: &mut [u8]) { |
432 | let n = out.len() / 4; |
433 | self.0[0].write_le(&mut out[..n]); |
434 | self.0[1].write_le(&mut out[n..n * 2]); |
435 | self.0[2].write_le(&mut out[n * 2..n * 3]); |
436 | self.0[3].write_le(&mut out[n * 3..]); |
437 | } |
438 | #[inline (always)] |
439 | fn write_be(self, out: &mut [u8]) { |
440 | let n = out.len() / 4; |
441 | self.0[0].write_be(&mut out[..n]); |
442 | self.0[1].write_be(&mut out[n..n * 2]); |
443 | self.0[2].write_be(&mut out[n * 2..n * 3]); |
444 | self.0[3].write_be(&mut out[n * 3..]); |
445 | } |
446 | } |
447 | impl<W: Copy + LaneWords4> LaneWords4 for x4<W> { |
448 | #[inline (always)] |
449 | fn shuffle_lane_words2301(self) -> Self { |
450 | x4([ |
451 | self.0[0].shuffle_lane_words2301(), |
452 | self.0[1].shuffle_lane_words2301(), |
453 | self.0[2].shuffle_lane_words2301(), |
454 | self.0[3].shuffle_lane_words2301(), |
455 | ]) |
456 | } |
457 | #[inline (always)] |
458 | fn shuffle_lane_words1230(self) -> Self { |
459 | x4([ |
460 | self.0[0].shuffle_lane_words1230(), |
461 | self.0[1].shuffle_lane_words1230(), |
462 | self.0[2].shuffle_lane_words1230(), |
463 | self.0[3].shuffle_lane_words1230(), |
464 | ]) |
465 | } |
466 | #[inline (always)] |
467 | fn shuffle_lane_words3012(self) -> Self { |
468 | x4([ |
469 | self.0[0].shuffle_lane_words3012(), |
470 | self.0[1].shuffle_lane_words3012(), |
471 | self.0[2].shuffle_lane_words3012(), |
472 | self.0[3].shuffle_lane_words3012(), |
473 | ]) |
474 | } |
475 | } |
476 | |