// RUN: %libomptarget-compile-run-and-check-generic
// RUN: %libomptarget-compileopt-run-and-check-generic
| 3 | |
// TODO: This requires malloc support for the thread states.
// FIXME: Flaky on all GPU targets.
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: nvptx64-nvidia-cuda
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
| 9 | |
| 10 | #include <omp.h> |
| 11 | #include <stdio.h> |
// Trip count of the innermost reduction loop; each reduction sums 0..N-1.
#define N 10
| 13 | |
// Base version of isCPU(): returns 1. A variant defined under
// `declare variant match(device = {kind(gpu)})` returns 0 instead.
int isCPU(void) { return 1; }
| 15 | |
// GPU variant of isCPU(): selected by OpenMP when the device kind is `gpu`,
// and returns 0 there.
#pragma omp begin declare variant match(device = {kind(gpu)})
int isCPU(void) { return 0; }
#pragma omp end declare variant
| 19 | |
| 20 | int main(void) { |
| 21 | long int aa = 0; |
| 22 | int res = 0; |
| 23 | |
| 24 | int ng = 12; |
| 25 | int cmom = 14; |
| 26 | int nxyz; |
| 27 | |
| 28 | #pragma omp target map(from : nxyz, ng, cmom) |
| 29 | { |
| 30 | nxyz = isCPU() ? 2 : 5000; |
| 31 | ng = isCPU() ? 2 : 12; |
| 32 | cmom = isCPU() ? 2 : 14; |
| 33 | } |
| 34 | |
| 35 | #pragma omp target teams distribute num_teams(nxyz) \ |
| 36 | thread_limit(ng *(cmom - 1)) map(tofrom : aa) |
| 37 | for (int gid = 0; gid < nxyz; gid++) { |
| 38 | #pragma omp parallel for collapse(2) |
| 39 | for (unsigned int g = 0; g < ng; g++) { |
| 40 | for (unsigned int l = 0; l < cmom - 1; l++) { |
| 41 | int a = 0; |
| 42 | #pragma omp parallel for reduction(+ : a) |
| 43 | for (int i = 0; i < N; i++) { |
| 44 | a += i; |
| 45 | } |
| 46 | #pragma omp atomic |
| 47 | aa += a; |
| 48 | } |
| 49 | } |
| 50 | } |
| 51 | long exp = (long)ng * (cmom - 1) * nxyz * (N * (N - 1) / 2); |
| 52 | printf(format: "The result is = %ld exp:%ld!\n" , aa, exp); |
| 53 | if (aa != exp) { |
| 54 | printf(format: "Failed %ld\n" , aa); |
| 55 | return 1; |
| 56 | } |
| 57 | // CHECK: Success |
| 58 | printf(format: "Success\n" ); |
| 59 | return 0; |
| 60 | } |
| 61 | |