| 1 | #include <gpuintrin.h> |
|---|---|
| 2 | #include <stdint.h> |
| 3 | |
| 4 | extern __gpu_local uint32_t shared_mem[]; |
| 5 | |
| 6 | __gpu_kernel void localmem_reduction(uint32_t *out) { |
| 7 | shared_mem[__gpu_thread_id(0)] = 2; |
| 8 | |
| 9 | __gpu_sync_threads(); |
| 10 | |
| 11 | if (__gpu_thread_id(dim: 0) == 0) { |
| 12 | out[__gpu_block_id(dim: 0)] = 0; |
| 13 | for (uint32_t i = 0; i < __gpu_num_threads(0); i++) |
| 14 | out[__gpu_block_id(0)] += shared_mem[i]; |
| 15 | } |
| 16 | } |
| 17 |
