/*
 * Copyright (c) 2023.
 *
 * This software is free software;
 *
 * You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */

#![cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
//! This module provides unsafe AVX2 SIMD helpers for `x86` and `x86_64` targets
#![allow(clippy::wildcard_imports)]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::ops::{Add, AddAssign, Mul, MulAssign, Sub};

/// A copy of `_MM_SHUFFLE()` that doesn't require
/// a nightly compiler
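///
/// For example, `shuffle(3, 1, 2, 0)` packs its arguments two bits each,
/// from high to low, giving `0b11_01_10_00` (`0xD8`), the same value
/// `_MM_SHUFFLE(3, 1, 2, 0)` would produce.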
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    (z << 6) | (y << 4) | (x << 2) | w
}

/// An abstraction of an AVX ymm register that
/// allows some things to not look ugly
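///
/// The operator impls below wrap the raw intrinsics so that lane-wise
/// arithmetic reads naturally. A minimal, illustrative sketch (not compiled
/// as a doc test, since `mm256` is `pub(crate)` and AVX2 must be available
/// at runtime):
///
/// ```ignore
/// let a = YmmRegister { mm256: unsafe { _mm256_set1_epi32(1) } };
/// let b = a + 2;  // broadcasts 2, then `_mm256_add_epi32`
/// let c = a * b;  // lane-wise 32-bit multiply via `_mm256_mullo_epi32`
/// ```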
#[derive(Clone, Copy)]
pub struct YmmRegister {
    /// An AVX register
    pub(crate) mm256: __m256i
}

impl Add for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_add_epi32(self.mm256, rhs.mm256)
            }
        }
    }
}

impl Add<i32> for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn add(self, rhs: i32) -> Self::Output {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            YmmRegister {
                mm256: _mm256_add_epi32(self.mm256, tmp)
            }
        }
    }
}

impl Sub for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_sub_epi32(self.mm256, rhs.mm256)
            }
        }
    }
}

impl AddAssign for YmmRegister {
    #[inline]
    fn add_assign(&mut self, rhs: Self) {
        unsafe {
            self.mm256 = _mm256_add_epi32(self.mm256, rhs.mm256);
        }
    }
}

impl AddAssign<i32> for YmmRegister {
    #[inline]
    fn add_assign(&mut self, rhs: i32) {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            self.mm256 = _mm256_add_epi32(self.mm256, tmp);
        }
    }
}

impl Mul for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        unsafe {
            YmmRegister {
                mm256: _mm256_mullo_epi32(self.mm256, rhs.mm256)
            }
        }
    }
}

impl Mul<i32> for YmmRegister {
    type Output = YmmRegister;

    #[inline]
    fn mul(self, rhs: i32) -> Self::Output {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            YmmRegister {
                mm256: _mm256_mullo_epi32(self.mm256, tmp)
            }
        }
    }
}

impl MulAssign for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: Self) {
        unsafe {
            self.mm256 = _mm256_mullo_epi32(self.mm256, rhs.mm256);
        }
    }
}

impl MulAssign<i32> for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: i32) {
        unsafe {
            let tmp = _mm256_set1_epi32(rhs);

            self.mm256 = _mm256_mullo_epi32(self.mm256, tmp);
        }
    }
}

impl MulAssign<__m256i> for YmmRegister {
    #[inline]
    fn mul_assign(&mut self, rhs: __m256i) {
        unsafe {
            self.mm256 = _mm256_mullo_epi32(self.mm256, rhs);
        }
    }
}

type Reg = YmmRegister;

/// Transpose an 8 by 8 array of `i32`s using AVX2 intrinsics
///
/// This was translated from [here](https://newbedev.com/transpose-an-8x8-float-using-avx-avx2)
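///
/// The transpose proceeds in three rounds, following the usual AVX2 8x8
/// pattern: interleave 32-bit elements of row pairs, then 64-bit pairs,
/// then exchange 128-bit lanes, so the element at row `i`, column `j`
/// ends up at row `j`, column `i`.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2 before calling this
/// function.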
#[allow(unused_parens, clippy::too_many_arguments)]
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn transpose(
    v0: &mut Reg, v1: &mut Reg, v2: &mut Reg, v3: &mut Reg, v4: &mut Reg, v5: &mut Reg,
    v6: &mut Reg, v7: &mut Reg
) {
    // Interleaves the 32-bit elements of two rows. The 64-bit permute with
    // selector (3, 1, 2, 0) reorders quadrants so the in-lane unpacks below
    // yield a full-width interleave rather than a per-lane one.
    macro_rules! merge_epi32 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));
            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));

            $v2 = _mm256_unpacklo_epi32(va, vb);
            $v3 = _mm256_unpackhi_epi32(va, vb);
        };
    }

    // Same idea as `merge_epi32`, but interleaving 64-bit pairs.
    macro_rules! merge_epi64 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));
            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));

            $v2 = _mm256_unpacklo_epi64(va, vb);
            $v3 = _mm256_unpackhi_epi64(va, vb);
        };
    }

    // Combines 128-bit halves: `$v2` receives the low halves of both inputs,
    // `$v3` the high halves.
    macro_rules! merge_si128 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            $v2 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 2, 0, 0));
            $v3 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 3, 0, 1));
        };
    }

    let (w0, w1, w2, w3, w4, w5, w6, w7);

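    // Round 1: interleave 32-bit elements of adjacent row pairs.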
    merge_epi32!((v0.mm256), (v1.mm256), w0, w1);
    merge_epi32!((v2.mm256), (v3.mm256), w2, w3);
    merge_epi32!((v4.mm256), (v5.mm256), w4, w5);
    merge_epi32!((v6.mm256), (v7.mm256), w6, w7);

    let (x0, x1, x2, x3, x4, x5, x6, x7);

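    // Round 2: interleave 64-bit pairs of the round-1 results.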
    merge_epi64!(w0, w2, x0, x1);
    merge_epi64!(w1, w3, x2, x3);
    merge_epi64!(w4, w6, x4, x5);
    merge_epi64!(w5, w7, x6, x7);

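    // Round 3: exchange 128-bit halves to finish the transpose, writing the
    // results back into the input registers.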
    merge_si128!(x0, x4, (v0.mm256), (v1.mm256));
    merge_si128!(x1, x5, (v2.mm256), (v3.mm256));
    merge_si128!(x2, x6, (v4.mm256), (v5.mm256));
    merge_si128!(x3, x7, (v6.mm256), (v7.mm256));
}
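
// What follows is an illustrative test sketch, not part of the original
// module: it assumes the crate links `std` in test builds (for runtime
// feature detection) and simply checks that `transpose` moves the element
// at (row, col) to (col, row).
#[cfg(test)]
mod tests {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    use super::*;

    #[test]
    fn transpose_8x8_i32() {
        // Skip quietly on CPUs without AVX2 support.
        if !is_x86_feature_detected!("avx2") {
            return;
        }

        // Fill an 8x8 matrix with m[r][c] = r * 8 + c.
        let mut rows = [[0i32; 8]; 8];
        for (r, row) in rows.iter_mut().enumerate() {
            for (c, cell) in row.iter_mut().enumerate() {
                *cell = (r * 8 + c) as i32;
            }
        }

        unsafe {
            // Load each row into a YmmRegister.
            let mut v0 = YmmRegister { mm256: _mm256_loadu_si256(rows[0].as_ptr().cast()) };
            let mut v1 = YmmRegister { mm256: _mm256_loadu_si256(rows[1].as_ptr().cast()) };
            let mut v2 = YmmRegister { mm256: _mm256_loadu_si256(rows[2].as_ptr().cast()) };
            let mut v3 = YmmRegister { mm256: _mm256_loadu_si256(rows[3].as_ptr().cast()) };
            let mut v4 = YmmRegister { mm256: _mm256_loadu_si256(rows[4].as_ptr().cast()) };
            let mut v5 = YmmRegister { mm256: _mm256_loadu_si256(rows[5].as_ptr().cast()) };
            let mut v6 = YmmRegister { mm256: _mm256_loadu_si256(rows[6].as_ptr().cast()) };
            let mut v7 = YmmRegister { mm256: _mm256_loadu_si256(rows[7].as_ptr().cast()) };

            transpose(
                &mut v0, &mut v1, &mut v2, &mut v3, &mut v4, &mut v5, &mut v6, &mut v7
            );

            // Store the registers back and check that every element landed at
            // the transposed position.
            let out_regs = [v0, v1, v2, v3, v4, v5, v6, v7];
            for (r, reg) in out_regs.iter().enumerate() {
                let mut out = [0i32; 8];
                _mm256_storeu_si256(out.as_mut_ptr().cast(), reg.mm256);
                for (c, value) in out.iter().enumerate() {
                    assert_eq!(*value, (c * 8 + r) as i32);
                }
            }
        }
    }
}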