// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cassert>
#include <omp.h>

// Total number of loop iterations, should be multiple of T for this test
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif
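// FLG is forwarded into each reduction item's 'flags' field below; the two
// RUN lines above exercise both eager (FLG=0, the default) and lazy (FLG=1)
// allocation of the thread-specific reduction objects.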

/*
// Initial user code corresponding to the pseudo-code of the test:
#pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
{
  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
    {
      i += l;
      if( l%2 )
        x *= 1.0 / (l + 1);
      else
        x *= (l + 1);
    }
  }

  #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
  {
    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:j,y) \
          in_reduction(*:x) in_reduction(-:k)
      {
        j += l;
        k -= l;
        y += (double)l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
      #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
      {
        i -= l;
        k -= l;
        y += (double)l;
      }
      #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
      {
        j += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }
  } // inner reduction

  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:j)
      j += l;
  }
} // outer reduction
*/
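
// Below, the pseudo-code above is hand-expanded into the runtime calls a
// compiler supporting task reductions would emit: each taskgroup registers
// its reduction items via __kmpc_task_reduction_init, and each task fetches
// its thread-private copies via __kmpc_task_reduction_get_th_data.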

//------------------------------------------------
// OpenMP runtime library routines
#ifdef __cplusplus
extern "C" {
#endif
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif
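// __kmpc_global_thread_num returns the global thread id (gtid) of the caller;
// __kmpc_task_reduction_init registers 'num' reduction items described by
// 'data' with the innermost taskgroup and returns an opaque taskgroup handle;
// __kmpc_task_reduction_get_th_data maps a shared item (or another thread's
// private copy) to the calling thread's private reduction object.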

//------------------------------------------------
// Compiler-generated code

typedef struct _task_red_item {
  void *shar;   // shared reduction item
  size_t size;  // size of data item
  void *f_init; // data initialization routine
  void *f_fini; // data finalization routine
  void *f_comb; // data combiner routine
  unsigned flags;
} _task_red_item_t;
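// flags carries FLG (see above): nonzero requests lazy allocation of the
// thread-specific reduction objects, zero requests eager allocation.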

// int:+  no init/fini callbacks needed, combiner also valid for subtraction
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{ *(int*)lhs += *(int*)rhs; }

// long long:+  no init/fini callbacks needed, combiner also valid for
// subtraction
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{ *(long long*)lhs += *(long long*)rhs; }

// double:*  no fini callback needed
void __red_dbl_mul_init(void *data) // initializer
{ *(double*)data = 1.0; }
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs *= *(double*)rhs; }

// double:+  no init/fini callbacks needed
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs += *(double*)rhs; }
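// A '-' reduction can reuse the '+' combiner because the private copies of a
// subtraction reduction are initialized to 0 and each task applies the minus
// sign itself (e.g. *p_kp -= l below); the partial results are then summed.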

// ==============================

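// Serial reference computation: performs the same updates as the tasked loops
// in the pseudo-code above, in order, so its results can be compared against
// the parallel run in main().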
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
{
  for( int l = 0; l < N; ++l ) {
    *pi += l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for( int l = 0; l < N; ++l ) {
    *pj += l;
    *pk -= l;
    *py += (double)l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);

    *pi -= l;
    *pk -= l;
    *py += (double)l;

    *pj += l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for( int l = 0; l < N; ++l ) {
    *pj += l;
  }
}

//------------------------------------------------
// Test case
int main()
{
  int nthreads = omp_get_max_threads();
  int err = 0;
  void** ptrs = (void**)malloc(nthreads*sizeof(void*));
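  // ptrs[i] caches the first thread-private copy of xp observed by thread i;
  // it is used later to look up reduction data through another thread's
  // private address.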

  // user's code ======================================
  // variables for serial calculations:
  int is = 3;
  long long js = -9999999;
  double xs = 99999.0;
  long long ks = 99999999;
  double ys = -99999999.0;
  // variables for parallel calculations:
  int ip = 3;
  long long jp = -9999999;
  double xp = 99999.0;
  long long kp = 99999999;
  double yp = -99999999.0;

  calc_serial(&is, &js, &xs, &ks, &ys);
  // ==================================================
  for (int i = 0; i < nthreads; ++i)
    ptrs[i] = NULL;
  #pragma omp parallel
  {
    #pragma omp single nowait
    {
      // outer taskgroup reduces (i,j,x)
      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
      {
        _task_red_item_t red_data[3];
        red_data[0].shar = &ip;
        red_data[0].size = sizeof(ip);
        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
        red_data[0].f_fini = NULL; // no destructors needed
        red_data[0].f_comb = (void*)&__red_int_add_comb;
        red_data[0].flags = FLG;
        red_data[1].shar = &jp;
        red_data[1].size = sizeof(jp);
        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
        red_data[1].f_fini = NULL; // no destructors needed
        red_data[1].f_comb = (void*)&__red_llong_add_comb;
        red_data[1].flags = FLG;
        red_data[2].shar = &xp;
        red_data[2].size = sizeof(xp);
        red_data[2].f_init = (void*)&__red_dbl_mul_init;
        red_data[2].f_fini = NULL; // no destructors needed
        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
        red_data[2].flags = FLG;
        int gtid = __kmpc_global_thread_num(NULL);
        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
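        // tg1 is the opaque handle of the outer taskgroup; tasks pass it to
        // __kmpc_task_reduction_get_th_data to obtain their private copies
        // of ip, jp and xp.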

        for( int l = 0; l < N; l += 2 ) {
          // 2 iterations per task to get correct x value; actually any even
          // number of iters per task will work, otherwise x loses precision
          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                gtid, tg1, &ip);
            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                gtid, tg1, &xp);
            if (!ptrs[gtid]) ptrs[gtid] = p_xp;

            // user's pseudo-code ==============================
            *p_ip += l;
            *p_xp *= (l + 1);

            *p_ip += l + 1;
            *p_xp *= 1.0 / (l + 2);
            // ==================================================
          }
        }
        // inner taskgroup reduces (i,k,y); i is the same object as in the
        // outer one
        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
        {
          _task_red_item_t red_data[3];
          red_data[0].shar = &ip;
          red_data[0].size = sizeof(ip);
          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
          red_data[0].f_fini = NULL; // no destructors needed
          red_data[0].f_comb = (void*)&__red_int_add_comb;
          red_data[0].flags = FLG;
          red_data[1].shar = &kp;
          red_data[1].size = sizeof(kp);
          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
          red_data[1].f_fini = NULL; // no destructors needed
          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
          red_data[1].flags = FLG;
          red_data[2].shar = &yp;
          red_data[2].size = sizeof(yp);
          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
          red_data[2].f_fini = NULL; // no destructors needed
          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
          red_data[2].flags = FLG;
          int gtid = __kmpc_global_thread_num(NULL);
          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
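          // tg2 handles the inner taskgroup's items (ip, kp, yp). Tasks below
          // mix lookups through tg2 with lookups of jp and xp through the
          // enclosing tg1.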

          for( int l = 0; l < N; l += 2 ) {
            #pragma omp task firstprivate(l)
            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &jp);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &kp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &xp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &yp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_kp -= l;
              *p_yp += (double)l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              *p_xp *= 1.0 / (l + 2);
              // =================================================
              {
                // the following code just checks
                // __kmpc_task_reduction_get_th_data:
                int tid = omp_get_thread_num();
                void *addr1;
                void *addr2;
                addr1 = __kmpc_task_reduction_get_th_data(
                    gtid, tg1, &xp); // from shared
                addr2 = __kmpc_task_reduction_get_th_data(
                    gtid, tg1, addr1); // from private
                if (addr1 != addr2) {
                  #pragma omp atomic
                  ++err;
                  printf("Wrong thread-specific addresses %d s:%p p:%p\n",
                         tid, addr1, addr2);
                }
                // from neighbour w/o taskgroup (should start lookup from
                // the current tg2)
                if (tid > 0) {
                  if (ptrs[tid-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(
                        gtid, NULL, ptrs[tid-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                      ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                } else {
                  if (ptrs[nthreads-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(
                        gtid, NULL, ptrs[nthreads-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                      ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                }
                // ----------------------------------------------
              }
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:y) in_reduction(-:i,k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &ip);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &kp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &yp);

              // user's pseudo-code ==============================
              *p_ip -= l;
              *p_kp -= l;
              *p_yp += (double)l;

              *p_ip -= l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              // =================================================
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:j) in_reduction(*:x)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &jp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &xp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_xp *= 1.0 / (l + 2);
              // =================================================
            }
          }
        } // inner reduction

        for( int l = 0; l < N; l += 2 ) {
          #pragma omp task firstprivate(l) // in_reduction(+:j)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                gtid, tg1, &jp);
            // user's pseudo-code ==============================
            *p_jp += l;
            *p_jp += l + 1;
            // =================================================
          }
        }
      } // outer reduction
    } // end single
  } // end parallel
  // check results
#if _DEBUG
  printf("reduction flags = %u\n", FLG);
#endif
  if (err == 0 && ip == is && jp == js && ks == kp &&
      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
    printf("passed\n");
  else
    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
           is, js, xs, ks, ys,
           ip, jp, xp, kp, yp);
  free(ptrs);
  return 0;
}