| 1 | //===-- Test for the parallel scan and reduction operations on the GPU ----===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "src/__support/CPP/bit.h" |
| 10 | #include "src/__support/GPU/utils.h" |
| 11 | #include "test/IntegrationTest/test.h" |
| 12 | |
| 13 | using namespace LIBC_NAMESPACE; |
| 14 | |
| 15 | static uint32_t sum(uint32_t n) { return n * (n + 1) / 2; } |
| 16 | |
| 17 | // Tests a reduction within a convergant warp or wavefront using some known |
| 18 | // values. For example, if every element in the lane is one, then the sum should |
| 19 | // be the size of the warp or wavefront, i.e. 1 + 1 + 1 ... + 1. |
| 20 | static void test_reduce() { |
| 21 | uint64_t mask = gpu::get_lane_mask(); |
| 22 | uint32_t x = gpu::reduce(mask, 1); |
| 23 | EXPECT_EQ(x, gpu::get_lane_size()); |
| 24 | |
| 25 | uint32_t y = gpu::reduce(mask, gpu::get_lane_id()); |
| 26 | EXPECT_EQ(y, sum(gpu::get_lane_size() - 1)); |
| 27 | |
| 28 | uint32_t z = 0; |
| 29 | if (gpu::get_lane_id() % 2) |
| 30 | z = gpu::reduce(gpu::get_lane_mask(), 1); |
| 31 | gpu::sync_lane(mask); |
| 32 | |
| 33 | EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_size() / 2 : 0); |
| 34 | } |
| 35 | |
| 36 | // Tests an accumulation scan within a convergent warp or wavefront using some |
| 37 | // known values. For example, if every element in the lane is one, then the scan |
| 38 | // should have each element be equivalent to its ID, i.e. 1, 1 + 1, ... |
| 39 | static void test_scan() { |
| 40 | uint64_t mask = gpu::get_lane_mask(); |
| 41 | |
| 42 | uint32_t x = gpu::scan(mask, 1); |
| 43 | EXPECT_EQ(x, gpu::get_lane_id() + 1); |
| 44 | |
| 45 | uint32_t y = gpu::scan(mask, gpu::get_lane_id()); |
| 46 | EXPECT_EQ(y, sum(gpu::get_lane_id())); |
| 47 | |
| 48 | uint32_t z = 0; |
| 49 | if (gpu::get_lane_id() % 2) |
| 50 | z = gpu::scan(gpu::get_lane_mask(), 1); |
| 51 | gpu::sync_lane(mask); |
| 52 | |
| 53 | EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0); |
| 54 | } |
| 55 | |
| 56 | static uint32_t random(uint64_t *rand_next) { |
| 57 | uint64_t x = *rand_next; |
| 58 | x ^= x >> 12; |
| 59 | x ^= x << 25; |
| 60 | x ^= x >> 27; |
| 61 | *rand_next = x; |
| 62 | return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32); |
| 63 | } |
| 64 | |
| 65 | // Scan operations can break down under thread divergence, make sure that the |
| 66 | // function works under some random divergence. We do this by trivially |
| 67 | // implementing a scan with shared scratch memory and then comparing the |
| 68 | // results. |
| 69 | static void test_scan_divergent() { |
| 70 | static uint32_t input[64] = {0}; |
| 71 | static uint32_t result[64] = {0}; |
| 72 | uint64_t state = gpu::processor_clock() + __gpu_lane_id(); |
| 73 | |
| 74 | for (int i = 0; i < 64; ++i) { |
| 75 | uint64_t lanemask = gpu::get_lane_mask(); |
| 76 | if (random(&state) & (1ull << gpu::get_lane_id())) { |
| 77 | uint64_t divergent = gpu::get_lane_mask(); |
| 78 | uint32_t value = random(&state) % 256; |
| 79 | input[gpu::get_lane_id()] = value; |
| 80 | |
| 81 | if (gpu::is_first_lane(divergent)) { |
| 82 | uint32_t accumulator = 0; |
| 83 | for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) { |
| 84 | uint32_t tmp = input[lane]; |
| 85 | result[lane] = tmp + accumulator; |
| 86 | accumulator += tmp; |
| 87 | } |
| 88 | } |
| 89 | gpu::sync_lane(divergent); |
| 90 | |
| 91 | uint32_t scan = gpu::scan(divergent, value); |
| 92 | EXPECT_EQ(scan, result[gpu::get_lane_id()]); |
| 93 | } |
| 94 | if (gpu::is_first_lane(lanemask)) |
| 95 | __builtin_memset(input, 0, sizeof(input)); |
| 96 | gpu::sync_lane(lanemask); |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | TEST_MAIN(int argc, char **argv, char **envp) { |
| 101 | if (gpu::get_thread_id() >= gpu::get_lane_size()) |
| 102 | return 0; |
| 103 | |
| 104 | test_reduce(); |
| 105 | |
| 106 | test_scan(); |
| 107 | |
| 108 | test_scan_divergent(); |
| 109 | |
| 110 | return 0; |
| 111 | } |
| 112 | |