| 1 | #include <gpuintrin.h> |
|---|---|
| 2 | #include <stdint.h> |
| 3 | |
| 4 | [[clang::loader_uninitialized]] |
| 5 | __gpu_local uint32_t shared_mem[64]; |
| 6 | |
| 7 | __gpu_kernel void localmem_static(uint32_t *out) { |
| 8 | shared_mem[__gpu_thread_id(0)] = 2; |
| 9 | |
| 10 | __gpu_sync_threads(); |
| 11 | |
| 12 | if (__gpu_thread_id(dim: 0) == 0) { |
| 13 | out[__gpu_block_id(dim: 0)] = 0; |
| 14 | for (uint32_t i = 0; i < __gpu_num_threads(0); i++) |
| 15 | out[__gpu_block_id(0)] += shared_mem[i]; |
| 16 | } |
| 17 | } |
| 18 |
