| 1 | #include <cstdint> |
| 2 | |
/// One 256-bit value stored as four consecutive 64-bit lanes (a, b, c, d).
///
/// alignas(32) gives the type 32-byte alignment, so every element of an array
/// of ymm_t starts on a 32-byte boundary; the aligned `vmovaps` loads issued
/// in main() depend on that.
struct alignas(32) ymm_t {
  uint64_t a;
  uint64_t b;
  uint64_t c;
  uint64_t d;
};
| 6 | |
| 7 | int main() { |
| 8 | constexpr ymm_t ymm[] = { |
| 9 | { .a: 0x0706050403020100, .b: 0x0F0E0D0C0B0A0908, |
| 10 | .c: 0x1716151413121110, .d: 0x1F1E1D1C1B1A1918, }, |
| 11 | { .a: 0x0807060504030201, .b: 0x100F0E0D0C0B0A09, |
| 12 | .c: 0x1817161514131211, .d: 0x201F1E1D1C1B1A19, }, |
| 13 | { .a: 0x0908070605040302, .b: 0x11100F0E0D0C0B0A, |
| 14 | .c: 0x1918171615141312, .d: 0x21201F1E1D1C1B1A, }, |
| 15 | { .a: 0x0A09080706050403, .b: 0x1211100F0E0D0C0B, |
| 16 | .c: 0x1A19181716151413, .d: 0x2221201F1E1D1C1B, }, |
| 17 | { .a: 0x0B0A090807060504, .b: 0x131211100F0E0D0C, |
| 18 | .c: 0x1B1A191817161514, .d: 0x232221201F1E1D1C, }, |
| 19 | { .a: 0x0C0B0A0908070605, .b: 0x14131211100F0E0D, |
| 20 | .c: 0x1C1B1A1918171615, .d: 0x24232221201F1E1D, }, |
| 21 | { .a: 0x0D0C0B0A09080706, .b: 0x1514131211100F0E, |
| 22 | .c: 0x1D1C1B1A19181716, .d: 0x2524232221201F1E, }, |
| 23 | { .a: 0x0E0D0C0B0A090807, .b: 0x161514131211100F, |
| 24 | .c: 0x1E1D1C1B1A191817, .d: 0x262524232221201F, }, |
| 25 | #if defined(__x86_64__) || defined(_M_X64) |
| 26 | { .a: 0x0F0E0D0C0B0A0908, .b: 0x1716151413121110, |
| 27 | .c: 0x1F1E1D1C1B1A1918, .d: 0x2726252423222120, }, |
| 28 | { .a: 0x100F0E0D0C0B0A09, .b: 0x1817161514131211, |
| 29 | .c: 0x201F1E1D1C1B1A19, .d: 0x2827262524232221, }, |
| 30 | { .a: 0x11100F0E0D0C0B0A, .b: 0x1918171615141312, |
| 31 | .c: 0x21201F1E1D1C1B1A, .d: 0x2928272625242322, }, |
| 32 | { .a: 0x1211100F0E0D0C0B, .b: 0x1A19181716151413, |
| 33 | .c: 0x2221201F1E1D1C1B, .d: 0x2A29282726252423, }, |
| 34 | { .a: 0x131211100F0E0D0C, .b: 0x1B1A191817161514, |
| 35 | .c: 0x232221201F1E1D1C, .d: 0x2B2A292827262524, }, |
| 36 | { .a: 0x14131211100F0E0D, .b: 0x1C1B1A1918171615, |
| 37 | .c: 0x24232221201F1E1D, .d: 0x2C2B2A2928272625, }, |
| 38 | { .a: 0x1514131211100F0E, .b: 0x1D1C1B1A19181716, |
| 39 | .c: 0x2524232221201F1E, .d: 0x2D2C2B2A29282726, }, |
| 40 | { .a: 0x161514131211100F, .b: 0x1E1D1C1B1A191817, |
| 41 | .c: 0x262524232221201F, .d: 0x2E2D2C2B2A292827, }, |
| 42 | #endif |
| 43 | }; |
| 44 | |
| 45 | asm volatile( |
| 46 | "vmovaps 0x000(%0), %%ymm0\n\t" |
| 47 | "vmovaps 0x020(%0), %%ymm1\n\t" |
| 48 | "vmovaps 0x040(%0), %%ymm2\n\t" |
| 49 | "vmovaps 0x060(%0), %%ymm3\n\t" |
| 50 | "vmovaps 0x080(%0), %%ymm4\n\t" |
| 51 | "vmovaps 0x0A0(%0), %%ymm5\n\t" |
| 52 | "vmovaps 0x0C0(%0), %%ymm6\n\t" |
| 53 | "vmovaps 0x0E0(%0), %%ymm7\n\t" |
| 54 | #if defined(__x86_64__) || defined(_M_X64) |
| 55 | "vmovaps 0x100(%0), %%ymm8\n\t" |
| 56 | "vmovaps 0x120(%0), %%ymm9\n\t" |
| 57 | "vmovaps 0x140(%0), %%ymm10\n\t" |
| 58 | "vmovaps 0x160(%0), %%ymm11\n\t" |
| 59 | "vmovaps 0x180(%0), %%ymm12\n\t" |
| 60 | "vmovaps 0x1A0(%0), %%ymm13\n\t" |
| 61 | "vmovaps 0x1C0(%0), %%ymm14\n\t" |
| 62 | "vmovaps 0x1E0(%0), %%ymm15\n\t" |
| 63 | #endif |
| 64 | "\n\t" |
| 65 | "int3\n\t" |
| 66 | : |
| 67 | : "b" (ymm) |
| 68 | : "%ymm0" , "%ymm1" , "%ymm2" , "%ymm3" , "%ymm4" , "%ymm5" , "%ymm6" , "%ymm7" |
| 69 | #if defined(__x86_64__) || defined(_M_X64) |
| 70 | , "%ymm8" , "%ymm9" , "%ymm10" , "%ymm11" , "%ymm12" , "%ymm13" , "%ymm14" , |
| 71 | "%ymm15" |
| 72 | #endif |
| 73 | ); |
| 74 | |
| 75 | return 0; |
| 76 | } |
| 77 | |