| 1 | /// BUILT with |
| 2 | /// xcrun -sdk macosx.internal clang -mcpu=apple-m4 -g sme.c -o sme |
| 3 | |
| 4 | #include <stdint.h> |
| 5 | #include <stdio.h> |
| 6 | #include <stdlib.h> |
| 7 | |
| 8 | void write_sve_regs() { |
| 9 | asm volatile("ptrue p0.b\n\t" ); |
| 10 | asm volatile("ptrue p1.h\n\t" ); |
| 11 | asm volatile("ptrue p2.s\n\t" ); |
| 12 | asm volatile("ptrue p3.d\n\t" ); |
| 13 | asm volatile("pfalse p4.b\n\t" ); |
| 14 | asm volatile("ptrue p5.b\n\t" ); |
| 15 | asm volatile("ptrue p6.h\n\t" ); |
| 16 | asm volatile("ptrue p7.s\n\t" ); |
| 17 | asm volatile("ptrue p8.d\n\t" ); |
| 18 | asm volatile("pfalse p9.b\n\t" ); |
| 19 | asm volatile("ptrue p10.b\n\t" ); |
| 20 | asm volatile("ptrue p11.h\n\t" ); |
| 21 | asm volatile("ptrue p12.s\n\t" ); |
| 22 | asm volatile("ptrue p13.d\n\t" ); |
| 23 | asm volatile("pfalse p14.b\n\t" ); |
| 24 | asm volatile("ptrue p15.b\n\t" ); |
| 25 | |
| 26 | asm volatile("cpy z0.b, p0/z, #1\n\t" ); |
| 27 | asm volatile("cpy z1.b, p5/z, #2\n\t" ); |
| 28 | asm volatile("cpy z2.b, p10/z, #3\n\t" ); |
| 29 | asm volatile("cpy z3.b, p15/z, #4\n\t" ); |
| 30 | asm volatile("cpy z4.b, p0/z, #5\n\t" ); |
| 31 | asm volatile("cpy z5.b, p5/z, #6\n\t" ); |
| 32 | asm volatile("cpy z6.b, p10/z, #7\n\t" ); |
| 33 | asm volatile("cpy z7.b, p15/z, #8\n\t" ); |
| 34 | asm volatile("cpy z8.b, p0/z, #9\n\t" ); |
| 35 | asm volatile("cpy z9.b, p5/z, #10\n\t" ); |
| 36 | asm volatile("cpy z10.b, p10/z, #11\n\t" ); |
| 37 | asm volatile("cpy z11.b, p15/z, #12\n\t" ); |
| 38 | asm volatile("cpy z12.b, p0/z, #13\n\t" ); |
| 39 | asm volatile("cpy z13.b, p5/z, #14\n\t" ); |
| 40 | asm volatile("cpy z14.b, p10/z, #15\n\t" ); |
| 41 | asm volatile("cpy z15.b, p15/z, #16\n\t" ); |
| 42 | asm volatile("cpy z16.b, p0/z, #17\n\t" ); |
| 43 | asm volatile("cpy z17.b, p5/z, #18\n\t" ); |
| 44 | asm volatile("cpy z18.b, p10/z, #19\n\t" ); |
| 45 | asm volatile("cpy z19.b, p15/z, #20\n\t" ); |
| 46 | asm volatile("cpy z20.b, p0/z, #21\n\t" ); |
| 47 | asm volatile("cpy z21.b, p5/z, #22\n\t" ); |
| 48 | asm volatile("cpy z22.b, p10/z, #23\n\t" ); |
| 49 | asm volatile("cpy z23.b, p15/z, #24\n\t" ); |
| 50 | asm volatile("cpy z24.b, p0/z, #25\n\t" ); |
| 51 | asm volatile("cpy z25.b, p5/z, #26\n\t" ); |
| 52 | asm volatile("cpy z26.b, p10/z, #27\n\t" ); |
| 53 | asm volatile("cpy z27.b, p15/z, #28\n\t" ); |
| 54 | asm volatile("cpy z28.b, p0/z, #29\n\t" ); |
| 55 | asm volatile("cpy z29.b, p5/z, #30\n\t" ); |
| 56 | asm volatile("cpy z30.b, p10/z, #31\n\t" ); |
| 57 | asm volatile("cpy z31.b, p15/z, #32\n\t" ); |
| 58 | } |
| 59 | |
| 60 | #define MAX_VL_BYTES 256 |
| 61 | void set_za_register(int svl, int value_offset) { |
| 62 | uint8_t data[MAX_VL_BYTES]; |
| 63 | |
| 64 | // ldr za will actually wrap the selected vector row, by the number of rows |
| 65 | // you have. So setting one that didn't exist would actually set one that did. |
| 66 | // That's why we need the streaming vector length here. |
| 67 | for (int i = 0; i < svl; ++i) { |
| 68 | // This may involve instructions that require the smefa64 extension. |
| 69 | for (int j = 0; j < MAX_VL_BYTES; j++) |
| 70 | data[j] = i + value_offset; |
| 71 | // Each one of these loads a VL sized row of ZA. |
| 72 | asm volatile("mov w12, %w0\n\t" |
| 73 | "ldr za[w12, 0], [%1]\n\t" ::"r" (i), |
| 74 | "r" (&data) |
| 75 | : "w12" ); |
| 76 | } |
| 77 | } |
| 78 | |
| 79 | static uint16_t arm_sme_svl_b(void) { |
| 80 | uint64_t ret = 0; |
| 81 | asm volatile("rdsvl %[ret], #1" : [ret] "=r" (ret)); |
| 82 | return (uint16_t)ret; |
| 83 | } |
| 84 | |
| 85 | void arm_sme2_set_zt0() { |
| 86 | #define ZTO_LEN (512 / 8) |
| 87 | uint8_t data[ZTO_LEN]; |
| 88 | for (unsigned i = 0; i < ZTO_LEN; ++i) |
| 89 | data[i] = i + 0; |
| 90 | |
| 91 | asm volatile("ldr zt0, [%0]" ::"r" (&data)); |
| 92 | #undef ZT0_LEN |
| 93 | } |
| 94 | |
| 95 | int main() { |
| 96 | printf(format: "Enable SME mode\n" ); // break before sme |
| 97 | |
| 98 | asm volatile("smstart" ); |
| 99 | |
| 100 | write_sve_regs(); |
| 101 | |
| 102 | set_za_register(svl: arm_sme_svl_b(), value_offset: 4); |
| 103 | |
| 104 | arm_sme2_set_zt0(); |
| 105 | |
| 106 | int c = 10; // break while sme |
| 107 | c += 5; |
| 108 | c += 5; |
| 109 | |
| 110 | asm volatile("smstop" ); |
| 111 | |
| 112 | printf(format: "SME mode disabled\n" ); // break after sme |
| 113 | } |
| 114 | |