// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4
#include <cstdio>
#include <cmath>
#include <cassert>
#include <cstdlib>
#include <omp.h>

// Total number of loop iterations; must be even because the tasks below
// process two iterations each
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif

/*
// Original user code (pseudo-code) that the hand-expanded test below corresponds to
#pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
{
  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
    {
      i += l;
      if( l%2 )
        x *= 1.0 / (l + 1);
      else
        x *= (l + 1);
    }
  }

  #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
  {
    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:j,y) \
                       in_reduction(*:x) in_reduction(-:k)
      {
        j += l;
        k -= l;
        y += (double)l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
      #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
      {
        i -= l;
        k -= l;
        y += (double)l;
      }
      #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
      {
        j += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }
  } // inner reduction

  for( int l = 0; l < N; ++l ) {
    #pragma omp task firstprivate(l) in_reduction(+:j)
      j += l;
  }
} // outer reduction
*/
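
// In the code below, the pragmas above are expanded by hand the way a compiler
// would lower them: each "taskgroup task_reduction(...)" becomes an array of
// _task_red_item_t descriptors registered with __kmpc_task_reduction_init(),
// each in_reduction access becomes a __kmpc_task_reduction_get_th_data() call
// that returns the calling thread's private copy of the item, and the runtime
// applies the combiner callbacks when the corresponding taskgroup ends.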

//------------------------------------------------
// OpenMP runtime library routines
#ifdef __cplusplus
extern "C" {
#endif
// Returns the calling thread's private copy of the reduction item identified
// by 'item' (its shared address, or the address of another thread's private
// copy) within taskgroup 'tg'; tg == NULL starts the lookup from the
// innermost taskgroup.
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
// Registers 'num' reduction items described by the array 'data' for the
// current taskgroup and returns a taskgroup pointer for later lookups.
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
// Returns the global thread number (gtid) of the calling thread.
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif

//------------------------------------------------
// Compiler-generated code

typedef struct _task_red_item {
  void *shar;     // shared reduction item
  size_t size;    // size of data item
  void *f_init;   // data initialization routine
  void *f_fini;   // data finalization routine
  void *f_comb;   // data combiner routine
  unsigned flags; // request lazy (1) or eager (0) allocation of private copies
} _task_red_item_t;

// int:+ no init/fini callbacks needed; also valid for subtraction
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{ *(int*)lhs += *(int*)rhs; }

// long long:+ no init/fini callbacks needed; also valid for subtraction
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{ *(long long*)lhs += *(long long*)rhs; }

// double:* no fini callback needed
void __red_dbl_mul_init(void *data) // initializer
{ *(double*)data = 1.0; }
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs *= *(double*)rhs; }

// double:+ no init/fini callbacks needed
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs += *(double*)rhs; }
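
// Rough sketch (for illustration only, not actual runtime code) of how the
// runtime is expected to use these callbacks for the double:* item 'xp':
//   double priv;                    // thread-specific copy allocated by the RTL
//   __red_dbl_mul_init(&priv);      // priv = 1.0 (items with f_init == NULL
//                                   // are simply zero-filled instead)
//   /* tasks update priv through __kmpc_task_reduction_get_th_data() */
//   __red_dbl_mul_comb(&xp, &priv); // at taskgroup end: xp *= priv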

// ==============================

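// Serial reference computation: replays the same sequence of updates the tasks
// below perform, producing the values the parallel results are compared against.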
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
{
  for( int l = 0; l < N; ++l ) {
    *pi += l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for( int l = 0; l < N; ++l ) {
    *pj += l;
    *pk -= l;
    *py += (double)l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);

    *pi -= l;
    *pk -= l;
    *py += (double)l;

    *pj += l;
    if( l%2 )
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for( int l = 0; l < N; ++l ) {
    *pj += l;
  }
}

//------------------------------------------------
// Test case
int main()
{
  int nthreads = omp_get_max_threads();
  int err = 0;
  void** ptrs = (void**)malloc(nthreads*sizeof(void*));

  // user's code ======================================
  // variables for serial calculations:
  int is = 3;
  long long js = -9999999;
  double xs = 99999.0;
  long long ks = 99999999;
  double ys = -99999999.0;
  // variables for parallel calculations:
  int ip = 3;
  long long jp = -9999999;
  double xp = 99999.0;
  long long kp = 99999999;
  double yp = -99999999.0;

  calc_serial(&is, &js, &xs, &ks, &ys);
  // ==================================================
  for (int i = 0; i < nthreads; ++i)
    ptrs[i] = NULL;
  #pragma omp parallel
  {
    #pragma omp single nowait
    {
      // outer taskgroup reduces (i,j,x)
      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
      {
        _task_red_item_t red_data[3];
        red_data[0].shar = &ip;
        red_data[0].size = sizeof(ip);
        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
        red_data[0].f_fini = NULL; // no destructors needed
        red_data[0].f_comb = (void*)&__red_int_add_comb;
        red_data[0].flags = FLG;
        red_data[1].shar = &jp;
        red_data[1].size = sizeof(jp);
        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
        red_data[1].f_fini = NULL; // no destructors needed
        red_data[1].f_comb = (void*)&__red_llong_add_comb;
        red_data[1].flags = FLG;
        red_data[2].shar = &xp;
        red_data[2].size = sizeof(xp);
        red_data[2].f_init = (void*)&__red_dbl_mul_init;
        red_data[2].f_fini = NULL; // no destructors needed
        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
        red_data[2].flags = FLG;
        int gtid = __kmpc_global_thread_num(NULL);
        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);

        for( int l = 0; l < N; l += 2 ) {
          // 2 iterations per task to get the correct x value; actually any even
          // number of iterations per task will work, otherwise x loses precision
          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                gtid, tg1, &xp);
            if (!ptrs[gtid]) ptrs[gtid] = p_xp;

            // user's pseudo-code ==============================
            *p_ip += l;
            *p_xp *= (l + 1);

            *p_ip += l + 1;
            *p_xp *= 1.0 / (l + 2);
            // ==================================================
          }
        }
        // inner taskgroup reduces (i,k,y); i is the same object as in the outer one
        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
        {
          _task_red_item_t red_data[3];
          red_data[0].shar = &ip;
          red_data[0].size = sizeof(ip);
          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
          red_data[0].f_fini = NULL; // no destructors needed
          red_data[0].f_comb = (void*)&__red_int_add_comb;
          red_data[0].flags = FLG;
          red_data[1].shar = &kp;
          red_data[1].size = sizeof(kp);
          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
          red_data[1].f_fini = NULL; // no destructors needed
          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
          red_data[1].flags = FLG;
          red_data[2].shar = &yp;
          red_data[2].size = sizeof(yp);
          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
          red_data[2].f_fini = NULL; // no destructors needed
          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
          red_data[2].flags = FLG;
          int gtid = __kmpc_global_thread_num(NULL);
          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);

          for( int l = 0; l < N; l += 2 ) {
            #pragma omp task firstprivate(l)
            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &jp);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &kp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &xp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &yp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_kp -= l;
              *p_yp += (double)l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              *p_xp *= 1.0 / (l + 2);
              // =================================================
              {
                // the following code is here just to check __kmpc_task_reduction_get_th_data:
                int tid = omp_get_thread_num();
                void *addr1;
                void *addr2;
                addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
                addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
                if (addr1 != addr2) {
                  #pragma omp atomic
                  ++err;
                  printf("Wrong thread-specific addresses %d s:%p p:%p\n",
                         tid, addr1, addr2);
                }
                // from neighbour w/o taskgroup (should start lookup from current tg2)
                if (tid > 0) {
                  if (ptrs[tid-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                      ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                } else {
                  if (ptrs[nthreads-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                      ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                }
                // ----------------------------------------------
              }
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:y) in_reduction(-:i,k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &ip);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &kp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg2, &yp);

              // user's pseudo-code ==============================
              *p_ip -= l;
              *p_kp -= l;
              *p_yp += (double)l;

              *p_ip -= l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              // =================================================
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:j) in_reduction(*:x)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &jp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                  gtid, tg1, &xp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_xp *= 1.0 / (l + 2);
              // =================================================
            }
          }
        } // inner reduction

        for( int l = 0; l < N; l += 2 ) {
          #pragma omp task firstprivate(l) // in_reduction(+:j)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                gtid, tg1, &jp);
            // user's pseudo-code ==============================
            *p_jp += l;
            *p_jp += l + 1;
            // =================================================
          }
        }
      } // outer reduction
    } // end single
  } // end parallel
  // check results
#if _DEBUG
  printf("reduction flags = %u\n", FLG);
#endif
  if (ip == is && jp == js && ks == kp &&
      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
    printf("passed\n");
  else
    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
           is, js, xs, ks, ys,
           ip, jp, xp, kp, yp);
  return 0;
}