1 | /* |
2 | * Copyright (c) 2023. |
3 | * |
4 | * This software is free software; |
5 | * |
6 | * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license |
7 | */ |
8 | |
9 | #![cfg (all(feature = "x86" , any(target_arch = "x86" , target_arch = "x86_64" )))] |
//! AVX2-backed SIMD helpers: a thin [`YmmRegister`] wrapper over `__m256i`
//! with arithmetic operator overloads, plus an 8x8 `i32` transpose.
//! Most operations here are `unsafe` or rely on AVX2 being available.
11 | #![allow (clippy::wildcard_imports)] |
12 | |
13 | #[cfg (target_arch = "x86" )] |
14 | use core::arch::x86::*; |
15 | #[cfg (target_arch = "x86_64" )] |
16 | use core::arch::x86_64::*; |
17 | use core::ops::{Add, AddAssign, Mul, MulAssign, Sub}; |
18 | |
/// Stable replacement for `_MM_SHUFFLE()`, which requires
/// a nightly compiler.
///
/// Packs four 2-bit lane selectors (`z`, `y`, `x`, `w`) into a single
/// shuffle-control immediate, `z` in the highest bit pair and `w` in
/// the lowest.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    let mut control = w;
    control |= x << 2;
    control |= y << 4;
    control |= z << 6;
    control
}
25 | |
/// An abstraction of an AVX ymm register (`__m256i`, eight packed
/// `i32` lanes) that allows some things to not look ugly.
///
/// Wrapping the raw intrinsic type lets register arithmetic use the
/// normal `+`, `-`, `*` operators (see the `Add`/`Sub`/`Mul` impls
/// below) instead of spelling out intrinsic calls at every use site.
#[derive(Clone, Copy)]
pub struct YmmRegister {
    /// The underlying 256-bit AVX register value.
    pub(crate) mm256: __m256i
}
33 | |
34 | impl Add for YmmRegister { |
35 | type Output = YmmRegister; |
36 | |
37 | #[inline ] |
38 | fn add(self, rhs: Self) -> Self::Output { |
39 | unsafe { |
40 | return YmmRegister { |
41 | mm256: _mm256_add_epi32(self.mm256, b:rhs.mm256) |
42 | }; |
43 | } |
44 | } |
45 | } |
46 | |
47 | impl Add<i32> for YmmRegister { |
48 | type Output = YmmRegister; |
49 | |
50 | #[inline ] |
51 | fn add(self, rhs: i32) -> Self::Output { |
52 | unsafe { |
53 | let tmp: __m256i = _mm256_set1_epi32(rhs); |
54 | |
55 | return YmmRegister { |
56 | mm256: _mm256_add_epi32(self.mm256, b:tmp) |
57 | }; |
58 | } |
59 | } |
60 | } |
61 | |
62 | impl Sub for YmmRegister { |
63 | type Output = YmmRegister; |
64 | |
65 | #[inline ] |
66 | fn sub(self, rhs: Self) -> Self::Output { |
67 | unsafe { |
68 | return YmmRegister { |
69 | mm256: _mm256_sub_epi32(self.mm256, b:rhs.mm256) |
70 | }; |
71 | } |
72 | } |
73 | } |
74 | |
75 | impl AddAssign for YmmRegister { |
76 | #[inline ] |
77 | fn add_assign(&mut self, rhs: Self) { |
78 | unsafe { |
79 | self.mm256 = _mm256_add_epi32(self.mm256, b:rhs.mm256); |
80 | } |
81 | } |
82 | } |
83 | |
84 | impl AddAssign<i32> for YmmRegister { |
85 | #[inline ] |
86 | fn add_assign(&mut self, rhs: i32) { |
87 | unsafe { |
88 | let tmp: __m256i = _mm256_set1_epi32(rhs); |
89 | |
90 | self.mm256 = _mm256_add_epi32(self.mm256, b:tmp); |
91 | } |
92 | } |
93 | } |
94 | |
95 | impl Mul for YmmRegister { |
96 | type Output = YmmRegister; |
97 | |
98 | #[inline ] |
99 | fn mul(self, rhs: Self) -> Self::Output { |
100 | unsafe { |
101 | YmmRegister { |
102 | mm256: _mm256_mullo_epi32(self.mm256, b:rhs.mm256) |
103 | } |
104 | } |
105 | } |
106 | } |
107 | |
108 | impl Mul<i32> for YmmRegister { |
109 | type Output = YmmRegister; |
110 | |
111 | #[inline ] |
112 | fn mul(self, rhs: i32) -> Self::Output { |
113 | unsafe { |
114 | let tmp: __m256i = _mm256_set1_epi32(rhs); |
115 | |
116 | YmmRegister { |
117 | mm256: _mm256_mullo_epi32(self.mm256, b:tmp) |
118 | } |
119 | } |
120 | } |
121 | } |
122 | |
123 | impl MulAssign for YmmRegister { |
124 | #[inline ] |
125 | fn mul_assign(&mut self, rhs: Self) { |
126 | unsafe { |
127 | self.mm256 = _mm256_mullo_epi32(self.mm256, b:rhs.mm256); |
128 | } |
129 | } |
130 | } |
131 | |
132 | impl MulAssign<i32> for YmmRegister { |
133 | #[inline ] |
134 | fn mul_assign(&mut self, rhs: i32) { |
135 | unsafe { |
136 | let tmp: __m256i = _mm256_set1_epi32(rhs); |
137 | |
138 | self.mm256 = _mm256_mullo_epi32(self.mm256, b:tmp); |
139 | } |
140 | } |
141 | } |
142 | |
143 | impl MulAssign<__m256i> for YmmRegister { |
144 | #[inline ] |
145 | fn mul_assign(&mut self, rhs: __m256i) { |
146 | unsafe { |
147 | self.mm256 = _mm256_mullo_epi32(self.mm256, b:rhs); |
148 | } |
149 | } |
150 | } |
151 | |
/// Shorthand for [`YmmRegister`], used to keep `transpose`'s signature readable.
type Reg = YmmRegister;
153 | |
/// Transpose an array of 8 by 8 i32's using avx intrinsics
///
/// The eight registers are treated as the eight rows of an 8x8 `i32`
/// matrix; on return each register holds the corresponding column.
/// The transpose proceeds in three interleave stages: 32-bit pairs,
/// then 64-bit pairs, then 128-bit halves.
///
/// This was translated from [here](https://newbedev.com/transpose-an-8x8-float-using-avx-avx2)
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2
/// (`#[target_feature(enable = "avx2")]`).
#[allow(unused_parens, clippy::too_many_arguments)]
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn transpose(
    v0: &mut Reg, v1: &mut Reg, v2: &mut Reg, v3: &mut Reg, v4: &mut Reg, v5: &mut Reg,
    v6: &mut Reg, v7: &mut Reg
) {
    // Stage-1 helper: pre-permute each input's 64-bit quarters, then
    // interleave the 32-bit lanes of the two inputs into $v2 (low
    // halves) and $v3 (high halves).
    macro_rules! merge_epi32 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));

            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));

            $v2 = _mm256_unpacklo_epi32(va, vb);

            $v3 = _mm256_unpackhi_epi32(va, vb);
        };
    }

    // Stage-2 helper: same idea at 64-bit granularity — interleave the
    // 64-bit lanes of the two (already stage-1-merged) inputs.
    macro_rules! merge_epi64 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));

            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));

            $v2 = _mm256_unpacklo_epi64(va, vb);

            $v3 = _mm256_unpackhi_epi64(va, vb);
        };
    }

    // Stage-3 helper: combine 128-bit halves of two registers — $v2
    // takes the two low halves, $v3 the two high halves.
    macro_rules! merge_si128 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            $v2 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 2, 0, 0));

            $v3 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 3, 0, 1));
        };
    }

    // Stage 1: interleave 32-bit lanes of adjacent row pairs.
    let (w0, w1, w2, w3, w4, w5, w6, w7);

    merge_epi32!((v0.mm256), (v1.mm256), w0, w1);

    merge_epi32!((v2.mm256), (v3.mm256), w2, w3);

    merge_epi32!((v4.mm256), (v5.mm256), w4, w5);

    merge_epi32!((v6.mm256), (v7.mm256), w6, w7);

    // Stage 2: interleave 64-bit lanes of the stage-1 results.
    let (x0, x1, x2, x3, x4, x5, x6, x7);

    merge_epi64!(w0, w2, x0, x1);

    merge_epi64!(w1, w3, x2, x3);

    merge_epi64!(w4, w6, x4, x5);

    merge_epi64!(w5, w7, x6, x7);

    // Stage 3: swap 128-bit halves and write the columns back into
    // the caller's registers.
    merge_si128!(x0, x4, (v0.mm256), (v1.mm256));

    merge_si128!(x1, x5, (v2.mm256), (v3.mm256));

    merge_si128!(x2, x6, (v4.mm256), (v5.mm256));

    merge_si128!(x3, x7, (v6.mm256), (v7.mm256));
}
224 | |