1 | // RUN: %libomptarget-compile-run-and-check-generic |
2 | // RUN: %libomptarget-compileopt-run-and-check-generic |
3 | |
// TODO: This requires malloc support for the thread states.
5 | // FIXME: Flaky on all GPU targets. |
6 | // UNSUPPORTED: amdgcn-amd-amdhsa |
7 | // UNSUPPORTED: nvptx64-nvidia-cuda |
8 | // UNSUPPORTED: nvptx64-nvidia-cuda-LTO |
9 | |
10 | #include <omp.h> |
11 | #include <stdio.h> |
12 | #define N 10 |
13 | |
// Base definition of isCPU(): reports 1. The file also provides a variant
// restricted to device kind(gpu) that reports 0 instead.
int isCPU() {
  return 1;
}
15 | |
#pragma omp begin declare variant match(device = {kind(gpu)})
// Variant of isCPU() selected by the match clause above (device kind "gpu"):
// reports 0.
int isCPU() {
  return 0;
}
#pragma omp end declare variant
19 | |
20 | int main(void) { |
21 | long int aa = 0; |
22 | int res = 0; |
23 | |
24 | int ng = 12; |
25 | int cmom = 14; |
26 | int nxyz; |
27 | |
28 | #pragma omp target map(from : nxyz, ng, cmom) |
29 | { |
30 | nxyz = isCPU() ? 2 : 5000; |
31 | ng = isCPU() ? 2 : 12; |
32 | cmom = isCPU() ? 2 : 14; |
33 | } |
34 | |
35 | #pragma omp target teams distribute num_teams(nxyz) \ |
36 | thread_limit(ng *(cmom - 1)) map(tofrom : aa) |
37 | for (int gid = 0; gid < nxyz; gid++) { |
38 | #pragma omp parallel for collapse(2) |
39 | for (unsigned int g = 0; g < ng; g++) { |
40 | for (unsigned int l = 0; l < cmom - 1; l++) { |
41 | int a = 0; |
42 | #pragma omp parallel for reduction(+ : a) |
43 | for (int i = 0; i < N; i++) { |
44 | a += i; |
45 | } |
46 | #pragma omp atomic |
47 | aa += a; |
48 | } |
49 | } |
50 | } |
51 | long exp = (long)ng * (cmom - 1) * nxyz * (N * (N - 1) / 2); |
52 | printf(format: "The result is = %ld exp:%ld!\n" , aa, exp); |
53 | if (aa != exp) { |
54 | printf(format: "Failed %ld\n" , aa); |
55 | return 1; |
56 | } |
57 | // CHECK: Success |
58 | printf(format: "Success\n" ); |
59 | return 0; |
60 | } |
61 | |