// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4
#include <cstdio>
#include <cmath>
#include <cassert>
#include <cstdlib>
#include <omp.h>

// Total number of loop iterations; must be even because the tasks below
// process two iterations each
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif

/*
// Original user code (pseudo-code) that the hand-expanded test below corresponds to
#pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
{
  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
    {
      i += l;
      if( l%2 )
        x *= 1.0 / (l + 1);
      else
        x *= (l + 1);
    }
  }

  #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
  {
    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:j,y) \
                       in_reduction(*:x) in_reduction(-:k)
      {
        j += l;
        k -= l;
        y += (double)l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
      #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
      {
        i -= l;
        k -= l;
        y += (double)l;
      }
      #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
      {
        j += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }
  } // inner reduction

  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:j)
      j += l;
  }
} // outer reduction
*/
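
// In the code below, the pragmas above are expanded by hand the way a compiler
// would lower them: each "taskgroup task_reduction(...)" becomes an array of
// _task_red_item_t descriptors registered with __kmpc_task_reduction_init(),
// each in_reduction access becomes a __kmpc_task_reduction_get_th_data() call
// that returns the calling thread's private copy of the item, and the runtime
// applies the combiner callbacks when the corresponding taskgroup ends.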

//------------------------------------------------
// OpenMP runtime library routines
#ifdef __cplusplus
extern "C" {
#endif
// Returns the calling thread's private copy of the reduction item identified
// by 'item' (its shared address, or the address of another thread's private
// copy) within taskgroup 'tg'; tg == NULL starts the lookup from the
// innermost taskgroup.
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
// Registers 'num' reduction items described by the array 'data' for the
// current taskgroup and returns a taskgroup pointer for later lookups.
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
// Returns the global thread number (gtid) of the calling thread.
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif

//------------------------------------------------
// Compiler-generated code

typedef struct _task_red_item {
  void *shar;     // shared reduction item
  size_t size;    // size of data item
  void *f_init;   // data initialization routine
  void *f_fini;   // data finalization routine
  void *f_comb;   // data combiner routine
  unsigned flags; // request lazy (1) or eager (0) allocation of private copies
} _task_red_item_t;

// int:+ no init/fini callbacks needed; also valid for subtraction
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{ *(int*)lhs += *(int*)rhs; }

// long long:+ no init/fini callbacks needed; also valid for subtraction
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{ *(long long*)lhs += *(long long*)rhs; }

// double:* no fini callback needed
void __red_dbl_mul_init(void *data) // initializer
{ *(double*)data = 1.0; }
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs *= *(double*)rhs; }

// double:+ no init/fini callbacks needed
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs += *(double*)rhs; }
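
// Rough sketch (for illustration only, not actual runtime code) of how the
// runtime is expected to use these callbacks for the double:* item 'xp':
//   double priv;                    // thread-specific copy allocated by the RTL
//   __red_dbl_mul_init(&priv);      // priv = 1.0 (items with f_init == NULL
//                                   // are simply zero-filled instead)
//   /* tasks update priv through __kmpc_task_reduction_get_th_data() */
//   __red_dbl_mul_comb(&xp, &priv); // at taskgroup end: xp *= priv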

// ==============================

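// Serial reference computation: replays the same sequence of updates the tasks
// below perform, producing the values the parallel results are compared against.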
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
{
  for( int l = 0; l < N; ++l ) {
    *pi += l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for( int l = 0; l < N; ++l ) {
    *pj += l;
    *pk -= l;
    *py += (double)l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);

    *pi -= l;
    *pk -= l;
    *py += (double)l;

    *pj += l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for( int l = 0; l < N; ++l ) {
    *pj += l;
  }
}

//------------------------------------------------
// Test case
int main()
{
  int nthreads = omp_get_max_threads();
  int err = 0;
  void** ptrs = (void**)malloc(nthreads*sizeof(void*));

  // user's code ======================================
  // variables for serial calculations:
  int is = 3;
  long long js = -9999999;
  double xs = 99999.0;
  long long ks = 99999999;
  double ys = -99999999.0;
  // variables for parallel calculations:
  int ip = 3;
  long long jp = -9999999;
  double xp = 99999.0;
  long long kp = 99999999;
  double yp = -99999999.0;

  calc_serial(&is, &js, &xs, &ks, &ys);
  // ==================================================
  for (int i = 0; i < nthreads; ++i)
    ptrs[i] = NULL;
  #pragma omp parallel
  {
    #pragma omp single nowait
    {
      // outer taskgroup reduces (i,j,x)
      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
      {
        _task_red_item_t red_data[3];
        red_data[0].shar = &ip;
        red_data[0].size = sizeof(ip);
        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
        red_data[0].f_fini = NULL; // no destructors needed
        red_data[0].f_comb = (void*)&__red_int_add_comb;
        red_data[0].flags = FLG;
        red_data[1].shar = &jp;
        red_data[1].size = sizeof(jp);
        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
        red_data[1].f_fini = NULL; // no destructors needed
        red_data[1].f_comb = (void*)&__red_llong_add_comb;
        red_data[1].flags = FLG;
        red_data[2].shar = &xp;
        red_data[2].size = sizeof(xp);
        red_data[2].f_init = (void*)&__red_dbl_mul_init;
        red_data[2].f_fini = NULL; // no destructors needed
        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
        red_data[2].flags = FLG;
        int gtid = __kmpc_global_thread_num(NULL);
        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);

        for( int l = 0; l < N; l += 2 ) {
          // 2 iterations per task to get the correct x value; actually any even
          // number of iterations per task will work, otherwise x loses precision
          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                gtid, tg1, &xp);
            if (!ptrs[gtid]) ptrs[gtid] = p_xp;

            // user's pseudo-code ==============================
            *p_ip += l;
            *p_xp *= (l + 1);

            *p_ip += l + 1;
            *p_xp *= 1.0 / (l + 2);
            // ==================================================
          }
        }
        // inner taskgroup reduces (i,k,y); i is the same object as in the outer one
        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
        {
          _task_red_item_t red_data[3];
          red_data[0].shar = &ip;
          red_data[0].size = sizeof(ip);
          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
          red_data[0].f_fini = NULL; // no destructors needed
          red_data[0].f_comb = (void*)&__red_int_add_comb;
          red_data[0].flags = FLG;
          red_data[1].shar = &kp;
          red_data[1].size = sizeof(kp);
          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
          red_data[1].f_fini = NULL; // no destructors needed
          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
          red_data[1].flags = FLG;
          red_data[2].shar = &yp;
          red_data[2].size = sizeof(yp);
          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
          red_data[2].f_fini = NULL; // no destructors needed
          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
          red_data[2].flags = FLG;
          int gtid = __kmpc_global_thread_num(NULL);
          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);

          for( int l = 0; l < N; l += 2 ) {
            #pragma omp task firstprivate(l)
            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &jp);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &kp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &xp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &yp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_kp -= l;
              *p_yp += (double)l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              *p_xp *= 1.0 / (l + 2);
              // =================================================
              {
                // the following code is here just to check __kmpc_task_reduction_get_th_data:
                int tid = omp_get_thread_num();
                void *addr1;
                void *addr2;
                addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
                addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
                if (addr1 != addr2) {
                  #pragma omp atomic
                  ++err;
                  printf("Wrong thread-specific addresses %d s:%p p:%p\n",
                         tid, addr1, addr2);
                }
                // from neighbour w/o taskgroup (should start lookup from current tg2)
                if (tid > 0) {
                  if (ptrs[tid-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                      ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                } else {
                  if (ptrs[nthreads-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                      ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                }
                // ----------------------------------------------
              }
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:y) in_reduction(-:i,k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &ip);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &kp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &yp);

              // user's pseudo-code ==============================
              *p_ip -= l;
              *p_kp -= l;
              *p_yp += (double)l;

              *p_ip -= l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              // =================================================
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:j) in_reduction(*:x)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &jp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &xp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_xp *= 1.0 / (l + 2);
              // =================================================
            }
          }
        } // inner reduction

        for( int l = 0; l < N; l += 2 ) {
          #pragma omp task firstprivate(l) // in_reduction(+:j)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                gtid, tg1, &jp);
            // user's pseudo-code ==============================
            *p_jp += l;
            *p_jp += l + 1;
            // =================================================
          }
        }
      } // outer reduction
    } // end single
  } // end parallel
  // check results
#if _DEBUG
  printf("reduction flags = %u\n", FLG);
#endif
  if (ip == is && jp == js && ks == kp &&
      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
    printf("passed\n");
  else
    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
           is, js, xs, ks, ys,
           ip, jp, xp, kp, yp);
  return 0;
}