| 1 | #[cfg (test)] |
| 2 | use stdarch_test::assert_instr; |
| 3 | |
| 4 | /// Load tile configuration from a 64-byte memory location specified by mem_addr. |
| 5 | /// The tile configuration format is specified below, and includes the tile type pallette, |
| 6 | /// the number of bytes per row, and the number of rows. If the specified pallette_id is zero, |
| 7 | /// that signifies the init state for both the tile config and the tile data, and the tiles are zeroed. |
| 8 | /// Any invalid configurations will result in #GP fault. |
| 9 | /// |
| 10 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_loadconfig&ig_expand=6875) |
| 11 | #[inline ] |
| 12 | #[target_feature (enable = "amx-tile" )] |
| 13 | #[cfg_attr (test, assert_instr(ldtilecfg))] |
| 14 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 15 | pub unsafe fn _tile_loadconfig(mem_addr: *const u8) { |
| 16 | ldtilecfg(mem_addr); |
| 17 | } |
| 18 | |
| 19 | /// Stores the current tile configuration to a 64-byte memory location specified by mem_addr. |
| 20 | /// The tile configuration format is specified below, and includes the tile type pallette, |
| 21 | /// the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory. |
| 22 | /// |
| 23 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_storeconfig&ig_expand=6879) |
| 24 | #[inline ] |
| 25 | #[target_feature (enable = "amx-tile" )] |
| 26 | #[cfg_attr (test, assert_instr(sttilecfg))] |
| 27 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 28 | pub unsafe fn _tile_storeconfig(mem_addr: *mut u8) { |
| 29 | sttilecfg(mem_addr); |
| 30 | } |
| 31 | |
| 32 | /// Load tile rows from memory specifieid by base address and stride into destination tile dst using the tile configuration previously configured via _tile_loadconfig. |
| 33 | /// |
| 34 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_loadd&ig_expand=6877) |
| 35 | #[inline ] |
| 36 | #[rustc_legacy_const_generics (0)] |
| 37 | #[target_feature (enable = "amx-tile" )] |
| 38 | #[cfg_attr (test, assert_instr(tileloadd, DST = 0))] |
| 39 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 40 | pub unsafe fn _tile_loadd<const DST: i32>(base: *const u8, stride: usize) { |
| 41 | static_assert_uimm_bits!(DST, 3); |
| 42 | tileloadd64(DST as i8, base, stride); |
| 43 | } |
| 44 | |
| 45 | /// Release the tile configuration to return to the init state, which releases all storage it currently holds. |
| 46 | /// |
| 47 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_release&ig_expand=6878) |
| 48 | #[inline ] |
| 49 | #[target_feature (enable = "amx-tile" )] |
| 50 | #[cfg_attr (test, assert_instr(tilerelease))] |
| 51 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 52 | pub unsafe fn _tile_release() { |
| 53 | tilerelease(); |
| 54 | } |
| 55 | |
| 56 | /// Store the tile specified by src to memory specifieid by base address and stride using the tile configuration previously configured via _tile_loadconfig. |
| 57 | /// |
| 58 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_stored&ig_expand=6881) |
| 59 | #[inline ] |
| 60 | #[rustc_legacy_const_generics (0)] |
| 61 | #[target_feature (enable = "amx-tile" )] |
| 62 | #[cfg_attr (test, assert_instr(tilestored, DST = 0))] |
| 63 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 64 | pub unsafe fn _tile_stored<const DST: i32>(base: *mut u8, stride: usize) { |
| 65 | static_assert_uimm_bits!(DST, 3); |
| 66 | tilestored64(DST as i8, base, stride); |
| 67 | } |
| 68 | |
| 69 | /// Load tile rows from memory specifieid by base address and stride into destination tile dst using the tile configuration |
| 70 | /// previously configured via _tile_loadconfig. This intrinsic provides a hint to the implementation that the data will |
| 71 | /// likely not be reused in the near future and the data caching can be optimized accordingly. |
| 72 | /// |
| 73 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_stream_loadd&ig_expand=6883) |
| 74 | #[inline ] |
| 75 | #[rustc_legacy_const_generics (0)] |
| 76 | #[target_feature (enable = "amx-tile" )] |
| 77 | #[cfg_attr (test, assert_instr(tileloaddt1, DST = 0))] |
| 78 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 79 | pub unsafe fn _tile_stream_loadd<const DST: i32>(base: *const u8, stride: usize) { |
| 80 | static_assert_uimm_bits!(DST, 3); |
| 81 | tileloaddt164(DST as i8, base, stride); |
| 82 | } |
| 83 | |
| 84 | /// Zero the tile specified by tdest. |
| 85 | /// |
| 86 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_zero&ig_expand=6885) |
| 87 | #[inline ] |
| 88 | #[rustc_legacy_const_generics (0)] |
| 89 | #[target_feature (enable = "amx-tile" )] |
| 90 | #[cfg_attr (test, assert_instr(tilezero, DST = 0))] |
| 91 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 92 | pub unsafe fn _tile_zero<const DST: i32>() { |
| 93 | static_assert_uimm_bits!(DST, 3); |
| 94 | tilezero(DST as i8); |
| 95 | } |
| 96 | |
| 97 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles a and b, |
| 98 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
| 99 | /// with elements in dst, and store the 32-bit result back to tile dst. |
| 100 | /// |
| 101 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbf16ps&ig_expand=6864) |
| 102 | #[inline ] |
| 103 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 104 | #[target_feature (enable = "amx-bf16" )] |
| 105 | #[cfg_attr (test, assert_instr(tdpbf16ps, DST = 0, A = 1, B = 2))] |
| 106 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 107 | pub unsafe fn _tile_dpbf16ps<const DST: i32, const A: i32, const B: i32>() { |
| 108 | static_assert_uimm_bits!(DST, 3); |
| 109 | static_assert_uimm_bits!(A, 3); |
| 110 | static_assert_uimm_bits!(B, 3); |
| 111 | tdpbf16ps(DST as i8, A as i8, B as i8); |
| 112 | } |
| 113 | |
| 114 | /// Compute dot-product of bytes in tiles with a source/destination accumulator. |
| 115 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding |
| 116 | /// signed 8-bit integers in b, producing 4 intermediate 32-bit results. |
| 117 | /// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. |
| 118 | /// |
| 119 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbssd&ig_expand=6866) |
| 120 | #[inline ] |
| 121 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 122 | #[target_feature (enable = "amx-int8" )] |
| 123 | #[cfg_attr (test, assert_instr(tdpbssd, DST = 0, A = 1, B = 2))] |
| 124 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 125 | pub unsafe fn _tile_dpbssd<const DST: i32, const A: i32, const B: i32>() { |
| 126 | static_assert_uimm_bits!(DST, 3); |
| 127 | static_assert_uimm_bits!(A, 3); |
| 128 | static_assert_uimm_bits!(B, 3); |
| 129 | tdpbssd(DST as i8, A as i8, B as i8); |
| 130 | } |
| 131 | |
| 132 | /// Compute dot-product of bytes in tiles with a source/destination accumulator. |
| 133 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding |
| 134 | /// unsigned 8-bit integers in b, producing 4 intermediate 32-bit results. |
| 135 | /// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. |
| 136 | /// |
| 137 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbsud&ig_expand=6868) |
| 138 | #[inline ] |
| 139 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 140 | #[target_feature (enable = "amx-int8" )] |
| 141 | #[cfg_attr (test, assert_instr(tdpbsud, DST = 0, A = 1, B = 2))] |
| 142 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 143 | pub unsafe fn _tile_dpbsud<const DST: i32, const A: i32, const B: i32>() { |
| 144 | static_assert_uimm_bits!(DST, 3); |
| 145 | static_assert_uimm_bits!(A, 3); |
| 146 | static_assert_uimm_bits!(B, 3); |
| 147 | tdpbsud(DST as i8, A as i8, B as i8); |
| 148 | } |
| 149 | |
| 150 | /// Compute dot-product of bytes in tiles with a source/destination accumulator. |
| 151 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding |
| 152 | /// signed 8-bit integers in b, producing 4 intermediate 32-bit results. |
| 153 | /// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. |
| 154 | /// |
| 155 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbusd&ig_expand=6870) |
| 156 | #[inline ] |
| 157 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 158 | #[target_feature (enable = "amx-int8" )] |
| 159 | #[cfg_attr (test, assert_instr(tdpbusd, DST = 0, A = 1, B = 2))] |
| 160 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 161 | pub unsafe fn _tile_dpbusd<const DST: i32, const A: i32, const B: i32>() { |
| 162 | static_assert_uimm_bits!(DST, 3); |
| 163 | static_assert_uimm_bits!(A, 3); |
| 164 | static_assert_uimm_bits!(B, 3); |
| 165 | tdpbusd(DST as i8, A as i8, B as i8); |
| 166 | } |
| 167 | |
| 168 | /// Compute dot-product of bytes in tiles with a source/destination accumulator. |
| 169 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding |
| 170 | /// unsigned 8-bit integers in b, producing 4 intermediate 32-bit results. |
| 171 | /// Sum these 4 results with the corresponding 32-bit integer in dst, and store the 32-bit result back to tile dst. |
| 172 | /// |
| 173 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbuud&ig_expand=6872) |
| 174 | #[inline ] |
| 175 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 176 | #[target_feature (enable = "amx-int8" )] |
| 177 | #[cfg_attr (test, assert_instr(tdpbuud, DST = 0, A = 1, B = 2))] |
| 178 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 179 | pub unsafe fn _tile_dpbuud<const DST: i32, const A: i32, const B: i32>() { |
| 180 | static_assert_uimm_bits!(DST, 3); |
| 181 | static_assert_uimm_bits!(A, 3); |
| 182 | static_assert_uimm_bits!(B, 3); |
| 183 | tdpbuud(DST as i8, A as i8, B as i8); |
| 184 | } |
| 185 | |
| 186 | /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles a and b, |
| 187 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
| 188 | /// with elements in dst, and store the 32-bit result back to tile dst. |
| 189 | /// |
| 190 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpfp16ps&ig_expand=6874) |
| 191 | #[inline ] |
| 192 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 193 | #[target_feature (enable = "amx-fp16" )] |
| 194 | #[cfg_attr (test, assert_instr(tdpfp16ps, DST = 0, A = 1, B = 2))] |
| 195 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 196 | pub unsafe fn _tile_dpfp16ps<const DST: i32, const A: i32, const B: i32>() { |
| 197 | static_assert_uimm_bits!(DST, 3); |
| 198 | static_assert_uimm_bits!(A, 3); |
| 199 | static_assert_uimm_bits!(B, 3); |
| 200 | tdpfp16ps(DST as i8, A as i8, B as i8); |
| 201 | } |
| 202 | |
| 203 | /// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. |
| 204 | /// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part. |
| 205 | /// Calculates the imaginary part of the result. For each possible combination of (row of a, column of b), |
| 206 | /// it performs a set of multiplication and accumulations on all corresponding complex numbers (one from a and one from b). |
| 207 | /// The imaginary part of the a element is multiplied with the real part of the corresponding b element, and the real part of |
| 208 | /// the a element is multiplied with the imaginary part of the corresponding b elements. The two accumulated results are added, |
| 209 | /// and then accumulated into the corresponding row and column of dst. |
| 210 | /// |
| 211 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_cmmimfp16ps&ig_expand=6860) |
| 212 | #[inline ] |
| 213 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 214 | #[target_feature (enable = "amx-complex" )] |
| 215 | #[cfg_attr (test, assert_instr(tcmmimfp16ps, DST = 0, A = 1, B = 2))] |
| 216 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 217 | pub unsafe fn _tile_cmmimfp16ps<const DST: i32, const A: i32, const B: i32>() { |
| 218 | static_assert_uimm_bits!(DST, 3); |
| 219 | static_assert_uimm_bits!(A, 3); |
| 220 | static_assert_uimm_bits!(B, 3); |
| 221 | tcmmimfp16ps(DST as i8, A as i8, B as i8); |
| 222 | } |
| 223 | |
| 224 | /// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. |
| 225 | /// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part. |
| 226 | /// Calculates the real part of the result. For each possible combination of (row of a, column of b), |
| 227 | /// it performs a set of multiplication and accumulations on all corresponding complex numbers (one from a and one from b). |
| 228 | /// The real part of the a element is multiplied with the real part of the corresponding b element, and the negated imaginary part of |
| 229 | /// the a element is multiplied with the imaginary part of the corresponding b elements. |
| 230 | /// The two accumulated results are added, and then accumulated into the corresponding row and column of dst. |
| 231 | /// |
| 232 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_cmmrlfp16ps&ig_expand=6862) |
| 233 | #[inline ] |
| 234 | #[rustc_legacy_const_generics (0, 1, 2)] |
| 235 | #[target_feature (enable = "amx-complex" )] |
| 236 | #[cfg_attr (test, assert_instr(tcmmrlfp16ps, DST = 0, A = 1, B = 2))] |
| 237 | #[unstable (feature = "x86_amx_intrinsics" , issue = "126622" )] |
| 238 | pub unsafe fn _tile_cmmrlfp16ps<const DST: i32, const A: i32, const B: i32>() { |
| 239 | static_assert_uimm_bits!(DST, 3); |
| 240 | static_assert_uimm_bits!(A, 3); |
| 241 | static_assert_uimm_bits!(B, 3); |
| 242 | tcmmrlfp16ps(DST as i8, A as i8, B as i8); |
| 243 | } |
| 244 | |
| 245 | #[allow (improper_ctypes)] |
| 246 | unsafe extern "C" { |
| 247 | #[link_name = "llvm.x86.ldtilecfg" ] |
| 248 | unsafefn ldtilecfg(mem_addr: *const u8); |
| 249 | #[link_name = "llvm.x86.sttilecfg" ] |
| 250 | unsafefn sttilecfg(mem_addr: *mut u8); |
| 251 | #[link_name = "llvm.x86.tileloadd64" ] |
| 252 | unsafefn tileloadd64(dst: i8, base: *const u8, stride: usize); |
| 253 | #[link_name = "llvm.x86.tileloaddt164" ] |
| 254 | unsafefn tileloaddt164(dst: i8, base: *const u8, stride: usize); |
| 255 | #[link_name = "llvm.x86.tilerelease" ] |
| 256 | unsafefn tilerelease(); |
| 257 | #[link_name = "llvm.x86.tilestored64" ] |
| 258 | unsafefn tilestored64(dst: i8, base: *mut u8, stride: usize); |
| 259 | #[link_name = "llvm.x86.tilezero" ] |
| 260 | unsafefn tilezero(dst: i8); |
| 261 | #[link_name = "llvm.x86.tdpbf16ps" ] |
| 262 | unsafefn tdpbf16ps(dst: i8, a: i8, b: i8); |
| 263 | #[link_name = "llvm.x86.tdpbuud" ] |
| 264 | unsafefn tdpbuud(dst: i8, a: i8, b: i8); |
| 265 | #[link_name = "llvm.x86.tdpbusd" ] |
| 266 | unsafefn tdpbusd(dst: i8, a: i8, b: i8); |
| 267 | #[link_name = "llvm.x86.tdpbsud" ] |
| 268 | unsafefn tdpbsud(dst: i8, a: i8, b: i8); |
| 269 | #[link_name = "llvm.x86.tdpbssd" ] |
| 270 | unsafefn tdpbssd(dst: i8, a: i8, b: i8); |
| 271 | #[link_name = "llvm.x86.tdpfp16ps" ] |
| 272 | unsafefn tdpfp16ps(dst: i8, a: i8, b: i8); |
| 273 | #[link_name = "llvm.x86.tcmmimfp16ps" ] |
| 274 | unsafefn tcmmimfp16ps(dst: i8, a: i8, b: i8); |
| 275 | #[link_name = "llvm.x86.tcmmrlfp16ps" ] |
| 276 | unsafefn tcmmrlfp16ps(dst: i8, a: i8, b: i8); |
| 277 | } |
| 278 | |
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::_mm_cvtness_sbh;
    use crate::core_arch::x86_64::*;
    use core::mem::transmute;
    use stdarch_test::simd_test;
    #[cfg(target_os = "linux")]
    use syscalls::{Sysno, syscall};

    /// Test-local image of the 64-byte tile configuration block that is
    /// passed to `_tile_loadconfig` and filled in by `_tile_storeconfig`.
    ///
    /// The fields add up to exactly 64 bytes (1 + 1 + 14 + 16 + 16 + 8 + 8)
    /// and the struct is packed so that no padding is inserted.
    #[allow(non_camel_case_types)]
    #[repr(packed)]
    #[derive(Copy, Clone, Default, Debug, PartialEq)]
    struct __tilecfg {
        /// 0 `or` 1
        palette: u8,
        start_row: u8,
        /// reserved, must be zero
        reserved_a0: [u8; 14],
        /// number of bytes of one row in each tile
        colsb: [u16; 8],
        /// reserved, must be zero
        reserved_b0: [u16; 8],
        /// number of rows in each tile
        rows: [u8; 8],
        /// reserved, must be zero
        reserved_c0: [u8; 8],
    }

    impl __tilecfg {
        /// Builds a configuration from the caller-chosen fields, zeroing
        /// every reserved area.
        fn new(palette: u8, start_row: u8, colsb: [u16; 8], rows: [u8; 8]) -> Self {
            Self {
                palette,
                start_row,
                reserved_a0: [0u8; 14],
                colsb,
                reserved_b0: [0u16; 8],
                rows,
                reserved_c0: [0u8; 8],
            }
        }

        /// Builds a palette-1 configuration in which the first `count` tiles
        /// have 16 rows of 64 bytes each; all remaining fields are zero.
        fn full_tiles(count: usize) -> Self {
            let mut colsb = [0u16; 8];
            let mut rows = [0u8; 8];
            colsb[..count].fill(64);
            rows[..count].fill(16);
            Self::new(1, 0, colsb, rows)
        }

        /// Views the configuration as a read-only byte pointer.
        const fn as_ptr(&self) -> *const u8 {
            (self as *const Self).cast::<u8>()
        }

        /// Views the configuration as a writable byte pointer.
        fn as_mut_ptr(&mut self) -> *mut u8 {
            (self as *mut Self).cast::<u8>()
        }
    }

    /// No AMX permission request is performed on non-Linux targets.
    #[cfg(not(target_os = "linux"))]
    #[target_feature(enable = "amx-tile")]
    fn _init_amx() {}

    /// Queries the process' XCOMP permissions through `arch_prctl` and, when
    /// needed, requests permission before any tile data is touched.
    ///
    /// Panics if a syscall fails or reports a non-zero status, or if AMX is
    /// not available.
    #[cfg(target_os = "linux")]
    #[target_feature(enable = "amx-tile")]
    #[inline]
    unsafe fn _init_amx() {
        // `arch_prctl` sub-commands, named after the messages used below.
        const ARCH_GET_XCOMP_PERM: usize = 0x1022;
        const ARCH_REQ_XCOMP_PERM: usize = 0x1023;
        // NOTE(review): presumably the XTILEDATA xfeature number — confirm
        // against the Linux xstate documentation.
        const XFEATURE_XTILEDATA: usize = 18;

        let mut xfeatures: usize = 0;
        let status = syscall!(
            Sysno::arch_prctl,
            ARCH_GET_XCOMP_PERM,
            &mut xfeatures as *mut usize
        )
        .expect("arch_prctl ARCH_GET_XCOMP_PERM syscall failed");
        if status != 0 {
            panic!("Failed to get XFEATURES");
        }

        // Inspect bits 17 and 18 of the permission mask.
        match (xfeatures >> 17) & 0b11 {
            0 => panic!("AMX is not available"),
            1 => {
                let status = syscall!(Sysno::arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)
                    .expect("arch_prctl ARCH_REQ_XCOMP_PERM syscall failed");
                if status != 0 {
                    panic!("Failed to enable AMX");
                }
            }
            // Both bits already set: nothing to request.
            3 => {}
            _ => unreachable!(),
        }
    }

    /// Loading an all-zero (palette 0) configuration must not fault.
    #[simd_test(enable = "amx-tile")]
    unsafe fn test_tile_loadconfig() {
        let cfg = __tilecfg::default();
        _tile_loadconfig(cfg.as_ptr());
        _tile_release();
    }

    /// A configuration that was loaded must be stored back unchanged.
    #[simd_test(enable = "amx-tile")]
    unsafe fn test_tile_storeconfig() {
        let loaded = __tilecfg::new(1, 0, [32; 8], [8; 8]);
        _tile_loadconfig(loaded.as_ptr());
        let mut stored = __tilecfg::default();
        _tile_storeconfig(stored.as_mut_ptr());
        _tile_release();
        assert_eq!(loaded, stored);
    }

    /// Zeroing tile 0 and storing it must overwrite the buffer with zeroes.
    #[simd_test(enable = "amx-tile")]
    unsafe fn test_tile_zero() {
        _init_amx();
        let cfg = __tilecfg::full_tiles(1);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        let mut stored = [[1_i8; 64]; 16];
        _tile_stored::<0>(stored.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(stored, [[0; 64]; 16]);
    }

    /// Storing a zeroed tile 0 must overwrite the buffer with zeroes.
    #[simd_test(enable = "amx-tile")]
    unsafe fn test_tile_stored() {
        _init_amx();
        let cfg = __tilecfg::full_tiles(1);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        let mut stored = [[1_i8; 64]; 16];
        _tile_stored::<0>(stored.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(stored, [[0; 64]; 16]);
    }

    /// Data loaded into tile 0 must come back unchanged when stored.
    #[simd_test(enable = "amx-tile")]
    unsafe fn test_tile_loadd() {
        _init_amx();
        let cfg = __tilecfg::full_tiles(1);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        let src = [1_i8; 1024];
        _tile_loadd::<0>(src.as_ptr().cast::<u8>(), 64);
        let mut stored = [[0_i8; 64]; 16];
        _tile_stored::<0>(stored.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(stored, [[1; 64]; 16]);
    }

    /// Same round trip as `test_tile_loadd`, using the streaming load.
    #[simd_test(enable = "amx-tile")]
    unsafe fn test_tile_stream_loadd() {
        _init_amx();
        let cfg = __tilecfg::full_tiles(1);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        let src = [1_i8; 1024];
        _tile_stream_loadd::<0>(src.as_ptr().cast::<u8>(), 64);
        let mut stored = [[0_i8; 64]; 16];
        _tile_stored::<0>(stored.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(stored, [[1; 64]; 16]);
    }

    /// Releasing without a prior configuration must not fault.
    #[simd_test(enable = "amx-tile")]
    unsafe fn test_tile_release() {
        _tile_release();
    }

    /// 32 BF16 pairs per output element, each contributing 1.0 * 2.0.
    #[simd_test(enable = "amx-bf16,avx512f")]
    unsafe fn test_tile_dpbf16ps() {
        _init_amx();
        let bf16_1: u16 = _mm_cvtness_sbh(1.0).to_bits();
        let bf16_2: u16 = _mm_cvtness_sbh(2.0).to_bits();
        let a: [u8; 1024] = transmute([bf16_1; 512]);
        let b: [u8; 1024] = transmute([bf16_2; 512]);
        let mut acc = [[0f32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr(), 64);
        _tile_loadd::<2>(b.as_ptr(), 64);
        _tile_dpbf16ps::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[64f32; 16]; 16]);
    }

    /// 64 byte pairs per output element, each contributing (-1) * (-2).
    #[simd_test(enable = "amx-int8")]
    unsafe fn test_tile_dpbssd() {
        _init_amx();
        let a = [-1_i8; 1024];
        let b = [-2_i8; 1024];
        let mut acc = [[0_i32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr().cast::<u8>(), 64);
        _tile_loadd::<2>(b.as_ptr().cast::<u8>(), 64);
        _tile_dpbssd::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[128_i32; 16]; 16]);
    }

    /// 64 byte pairs per output element, each contributing (-1) * 2.
    #[simd_test(enable = "amx-int8")]
    unsafe fn test_tile_dpbsud() {
        _init_amx();
        let a = [-1_i8; 1024];
        let b = [2_u8; 1024];
        let mut acc = [[0_i32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr().cast::<u8>(), 64);
        _tile_loadd::<2>(b.as_ptr(), 64);
        _tile_dpbsud::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[-128_i32; 16]; 16]);
    }

    /// 64 byte pairs per output element, each contributing 1 * (-2).
    #[simd_test(enable = "amx-int8")]
    unsafe fn test_tile_dpbusd() {
        _init_amx();
        let a = [1_u8; 1024];
        let b = [-2_i8; 1024];
        let mut acc = [[0_i32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr(), 64);
        _tile_loadd::<2>(b.as_ptr().cast::<u8>(), 64);
        _tile_dpbusd::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[-128_i32; 16]; 16]);
    }

    /// 64 byte pairs per output element, each contributing 1 * 2.
    #[simd_test(enable = "amx-int8")]
    unsafe fn test_tile_dpbuud() {
        _init_amx();
        let a = [1_u8; 1024];
        let b = [2_u8; 1024];
        let mut acc = [[0_i32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr(), 64);
        _tile_loadd::<2>(b.as_ptr(), 64);
        _tile_dpbuud::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[128_i32; 16]; 16]);
    }

    /// 32 FP16 pairs per output element, each contributing 1.0 * 2.0.
    #[simd_test(enable = "amx-fp16")]
    unsafe fn test_tile_dpfp16ps() {
        _init_amx();
        let a = [1f16; 512];
        let b = [2f16; 512];
        let mut acc = [[0f32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr().cast::<u8>(), 64);
        _tile_loadd::<2>(b.as_ptr().cast::<u8>(), 64);
        _tile_dpfp16ps::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[64f32; 16]; 16]);
    }

    /// Every element is (1 + 1i) in `a` and (2 + 2i) in `b`; the imaginary
    /// part of each product is 1*2 + 1*2 = 4, summed over 16 elements.
    #[simd_test(enable = "amx-complex")]
    unsafe fn test_tile_cmmimfp16ps() {
        _init_amx();
        let a = [1f16; 512];
        let b = [2f16; 512];
        let mut acc = [[0f32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr().cast::<u8>(), 64);
        _tile_loadd::<2>(b.as_ptr().cast::<u8>(), 64);
        _tile_cmmimfp16ps::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[64f32; 16]; 16]);
    }

    /// Every element is (1 + 1i) in `a` and (2 + 2i) in `b`; the real part
    /// of each product is 1*2 - 1*2 = 0, so the accumulator stays zero.
    #[simd_test(enable = "amx-complex")]
    unsafe fn test_tile_cmmrlfp16ps() {
        _init_amx();
        let a = [1f16; 512];
        let b = [2f16; 512];
        let mut acc = [[0f32; 16]; 16];
        let cfg = __tilecfg::full_tiles(3);
        _tile_loadconfig(cfg.as_ptr());
        _tile_zero::<0>();
        _tile_loadd::<1>(a.as_ptr().cast::<u8>(), 64);
        _tile_loadd::<2>(b.as_ptr().cast::<u8>(), 64);
        _tile_cmmrlfp16ps::<0, 1, 2>();
        _tile_stored::<0>(acc.as_mut_ptr().cast::<u8>(), 64);
        _tile_release();
        assert_eq!(acc, [[0f32; 16]; 16]);
    }
}
| 623 | |