1// RUN: %libomp-compile && env LIBOMP_USE_HIDDEN_HELPER_TASK=0 LIBOMP_NUM_HIDDEN_HELPER_THREADS=0 %libomp-run
/*
 Test for the 'schedule(simd:guided)' clause.
 For this clause the compiler is expected to generate dynamic dispatching and
 pass the schedule value 46 to the OpenMP RTL. This test calls the RTL
 dispatch entry points directly with that schedule value, using numerous loop
 parameter combinations.
*/
7#include <stdio.h>
8#include <stdlib.h>
9#include <omp.h>
10
// delay(): pause the calling thread for a short time while polling.
// The macro expands to a single expression WITHOUT a trailing semicolon, so
// that `delay();` is exactly one statement and stays safe inside an unbraced
// if/else.
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1)
#else
#include <unistd.h>
#define delay() usleep(10)
#endif
18
// uncomment for debug diagnostics (must expand to a non-zero value because
// the code tests it with `#if DEBUG`):
//#define DEBUG 1
21
// Chunk sizes exercised by run_32()/run_64() are multiples of SIMD_LEN
// (1x, 2x and 3x).
#define SIMD_LEN 4

// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL

// Schedule kinds understood by __kmpc_dispatch_init_*(). Only
// kmp_sch_guided_simd is passed to the RTL by this test.
enum sched {
  kmp_sch_static_balanced_chunked = 45,
  kmp_sch_guided_simd = 46,
  kmp_sch_runtime_simd = 47,
};

typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;

// Location descriptor whose address is passed as the first argument of every
// RTL entry point declared below.
typedef struct {
  int reserved_1;
  int flags;
  int reserved_2;
  int reserved_3;
  const char *psource; // Points at a string literal, hence const
} id;

// Parameter names follow the way the entry points are called in this file.
extern int __kmpc_global_thread_num(id *loc);
extern void __kmpc_barrier(id *loc, int gtid);
extern void __kmpc_dispatch_init_4(id *loc, int gtid, enum sched schedule,
                                   int lb, int ub, int st, int chunk);
extern void __kmpc_dispatch_init_8(id *loc, int gtid, enum sched schedule,
                                   i64 lb, i64 ub, i64 st, i64 chunk);
extern int __kmpc_dispatch_next_4(id *loc, int gtid, void *p_last, void *p_lb,
                                  void *p_ub, void *p_st);
extern int __kmpc_dispatch_next_8(id *loc, int gtid, void *p_last, void *p_lb,
                                  void *p_ub, void *p_st);
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------

// The single location descriptor shared by all RTL calls in this test.
static id loc = {.reserved_1 = 0,
                 .flags = 2,
                 .reserved_2 = 0,
                 .reserved_3 = 0,
                 .psource = ";file;func;0;0;;"};

// This variable is defined in OpenMP RTL but we can't have it exposed so we
// need to redefine it here. main() overwrites it from the environment
// variable LIBOMP_NUM_HIDDEN_HELPER_THREADS when that variable is set.
static int __kmp_hidden_helper_threads_num = 0;
54
55// ---------------------------------------------------------------------------
56int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) {
57 int err = 0;
58 static int volatile loop_sync = 0;
59 i64 lb; // Chunk lower bound
60 i64 ub; // Chunk upper bound
61 i64 st; // Chunk stride
62 int rc;
63 int tid = omp_get_thread_num();
64 int gtid = tid;
65 if (gtid) {
66 gtid += __kmp_hidden_helper_threads_num;
67 }
68 int last;
69#if DEBUG
70 printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n",
71 (int)sizeof(i64), gtid, tid,
72 (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
73#endif
74 // Don't test degenerate cases that should have been discovered by codegen
75 if (loop_st == 0)
76 return 0;
77 if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
78 return 0;
79
80 __kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd,
81 loop_lb, loop_ub, loop_st, loop_chunk);
82 if (tid == 0) {
83 // Let the master thread handle the chunks alone
84 int chunk; // No of current chunk
85 i64 next_lb; // Lower bound of the next chunk
86 i64 last_ub; // Upper bound of the last processed chunk
87 u64 cur; // Number of interations in current chunk
88 u64 max; // Max allowed iterations for current chunk
89 int undersized = 0;
90
91 chunk = 0;
92 next_lb = loop_lb;
93 max = (loop_ub - loop_lb) / loop_st + 1;
94 // The first chunk can consume all iterations
95 while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) {
96 ++ chunk;
97#if DEBUG
98 printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
99#endif
100 // Check if previous chunk (it is not the final chunk) is undersized
101 if (undersized) {
102 printf(format: "Error with chunk %d\n", chunk);
103 err++;
104 }
105 // Check lower and upper bounds
106 if (lb != next_lb) {
107 printf(format: "Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
108 err++;
109 }
110 if (loop_st > 0) {
111 if (!(ub <= loop_ub)) {
112 printf(format: "Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
113 err++;
114 }
115 if (!(lb <= ub)) {
116 printf(format: "Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
117 err++;
118 }
119 } else {
120 if (!(ub >= loop_ub)) {
121 printf(format: "Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
122 err++;
123 }
124 if (!(lb >= ub)) {
125 printf(format: "Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
126 err++;
127 }
128 }; // if
129 // Stride should not change
130 if (!(st == loop_st)) {
131 printf(format: "Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
132 err++;
133 }
134 cur = (ub - lb) / loop_st + 1;
135 // Guided scheduling uses FP computations, so current chunk may
136 // be a bit bigger (+1) than allowed maximum
137 if (!(cur <= max + 1)) {
138 printf(format: "Error with iter %llu, %llu\n", cur, max);
139 err++;
140 }
141 // Update maximum for the next chunk
142 if (cur < max)
143 max = cur;
144 next_lb = ub + loop_st;
145 last_ub = ub;
146 undersized = (cur < loop_chunk);
147 }; // while
148 // Must have at least one chunk
149 if (!(chunk > 0)) {
150 printf(format: "Error with chunk %d\n", chunk);
151 err++;
152 }
153 // Must have the right last iteration index
154 if (loop_st > 0) {
155 if (!(last_ub <= loop_ub)) {
156 printf(format: "Error with last1 %d, %d, ch %d\n",
157 (int)last_ub, (int)loop_ub, chunk);
158 err++;
159 }
160 if (!(last_ub + loop_st > loop_ub)) {
161 printf(format: "Error with last2 %d, %d, %d, ch %d\n",
162 (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
163 err++;
164 }
165 } else {
166 if (!(last_ub >= loop_ub)) {
167 printf(format: "Error with last1 %d, %d, ch %d\n",
168 (int)last_ub, (int)loop_ub, chunk);
169 err++;
170 }
171 if (!(last_ub + loop_st < loop_ub)) {
172 printf(format: "Error with last2 %d, %d, %d, ch %d\n",
173 (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
174 err++;
175 }
176 }; // if
177 // Let non-master threads go
178 loop_sync = 1;
179 } else {
180 int i;
181 // Workers wait for master thread to finish, then call __kmpc_dispatch_next
182 for (i = 0; i < 1000000; ++ i) {
183 if (loop_sync != 0) {
184 break;
185 }; // if
186 }; // for i
187 while (loop_sync == 0) {
188 delay();
189 }; // while
190 // At this moment we do not have any more chunks -- all the chunks already
191 // processed by master thread
192 rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st);
193 if (rc) {
194 printf(format: "Error return value\n");
195 err++;
196 }
197 }; // if
198
199 __kmpc_barrier(&loc, gtid);
200 if (tid == 0) {
201 loop_sync = 0; // Restore original state
202#if DEBUG
203 printf("run_loop_64(): at the end\n");
204#endif
205 }; // if
206 __kmpc_barrier(&loc, gtid);
207 return err;
208} // run_loop
209
210// ---------------------------------------------------------------------------
211int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) {
212 int err = 0;
213 static int volatile loop_sync = 0;
214 int lb; // Chunk lower bound
215 int ub; // Chunk upper bound
216 int st; // Chunk stride
217 int rc;
218 int tid = omp_get_thread_num();
219 int gtid = tid;
220 if (gtid) {
221 gtid += __kmp_hidden_helper_threads_num;
222 }
223 int last;
224#if DEBUG
225 printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n",
226 (int)sizeof(int), gtid, tid,
227 (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
228#endif
229 // Don't test degenerate cases that should have been discovered by codegen
230 if (loop_st == 0)
231 return 0;
232 if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
233 return 0;
234
235 __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd,
236 loop_lb, loop_ub, loop_st, loop_chunk);
237 if (tid == 0) {
238 // Let the master thread handle the chunks alone
239 int chunk; // No of current chunk
240 int next_lb; // Lower bound of the next chunk
241 int last_ub; // Upper bound of the last processed chunk
242 u64 cur; // Number of interations in current chunk
243 u64 max; // Max allowed iterations for current chunk
244 int undersized = 0;
245
246 chunk = 0;
247 next_lb = loop_lb;
248 max = (loop_ub - loop_lb) / loop_st + 1;
249 // The first chunk can consume all iterations
250 while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
251 ++ chunk;
252#if DEBUG
253 printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
254#endif
255 // Check if previous chunk (it is not the final chunk) is undersized
256 if (undersized) {
257 printf(format: "Error with chunk %d\n", chunk);
258 err++;
259 }
260 // Check lower and upper bounds
261 if (lb != next_lb) {
262 printf(format: "Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
263 err++;
264 }
265 if (loop_st > 0) {
266 if (!(ub <= loop_ub)) {
267 printf(format: "Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
268 err++;
269 }
270 if (!(lb <= ub)) {
271 printf(format: "Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
272 err++;
273 }
274 } else {
275 if (!(ub >= loop_ub)) {
276 printf(format: "Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
277 err++;
278 }
279 if (!(lb >= ub)) {
280 printf(format: "Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
281 err++;
282 }
283 }; // if
284 // Stride should not change
285 if (!(st == loop_st)) {
286 printf(format: "Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
287 err++;
288 }
289 cur = (ub - lb) / loop_st + 1;
290 // Guided scheduling uses FP computations, so current chunk may
291 // be a bit bigger (+1) than allowed maximum
292 if (!(cur <= max + 1)) {
293 printf(format: "Error with iter %llu, %llu\n", cur, max);
294 err++;
295 }
296 // Update maximum for the next chunk
297 if (cur < max)
298 max = cur;
299 next_lb = ub + loop_st;
300 last_ub = ub;
301 undersized = (cur < loop_chunk);
302 }; // while
303 // Must have at least one chunk
304 if (!(chunk > 0)) {
305 printf(format: "Error with chunk %d\n", chunk);
306 err++;
307 }
308 // Must have the right last iteration index
309 if (loop_st > 0) {
310 if (!(last_ub <= loop_ub)) {
311 printf(format: "Error with last1 %d, %d, ch %d\n",
312 (int)last_ub, (int)loop_ub, chunk);
313 err++;
314 }
315 if (!(last_ub + loop_st > loop_ub)) {
316 printf(format: "Error with last2 %d, %d, %d, ch %d\n",
317 (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
318 err++;
319 }
320 } else {
321 if (!(last_ub >= loop_ub)) {
322 printf(format: "Error with last1 %d, %d, ch %d\n",
323 (int)last_ub, (int)loop_ub, chunk);
324 err++;
325 }
326 if (!(last_ub + loop_st < loop_ub)) {
327 printf(format: "Error with last2 %d, %d, %d, ch %d\n",
328 (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
329 err++;
330 }
331 }; // if
332 // Let non-master threads go
333 loop_sync = 1;
334 } else {
335 int i;
336 // Workers wait for master thread to finish, then call __kmpc_dispatch_next
337 for (i = 0; i < 1000000; ++ i) {
338 if (loop_sync != 0) {
339 break;
340 }; // if
341 }; // for i
342 while (loop_sync == 0) {
343 delay();
344 }; // while
345 // At this moment we do not have any more chunks -- all the chunks already
346 // processed by the master thread
347 rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st);
348 if (rc) {
349 printf(format: "Error return value\n");
350 err++;
351 }
352 }; // if
353
354 __kmpc_barrier(&loc, gtid);
355 if (tid == 0) {
356 loop_sync = 0; // Restore original state
357#if DEBUG
358 printf("run_loop<>(): at the end\n");
359#endif
360 }; // if
361 __kmpc_barrier(&loc, gtid);
362 return err;
363} // run_loop
364
365// ---------------------------------------------------------------------------
366int run_64(int num_th)
367{
368 int err = 0;
369#pragma omp parallel num_threads(num_th)
370 {
371 int chunk;
372 i64 st, lb, ub;
373 for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
374 for (st = 1; st <= 3; ++ st) {
375 for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
376 for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
377 err += run_loop_64(loop_lb: lb, loop_ub: ub, loop_st: st, loop_chunk: chunk);
378 err += run_loop_64(loop_lb: ub, loop_ub: lb, loop_st: -st, loop_chunk: chunk);
379 }; // for ub
380 }; // for lb
381 }; // for st
382 }; // for chunk
383 }
384 return err;
385} // run_all
386
387int run_32(int num_th)
388{
389 int err = 0;
390#pragma omp parallel num_threads(num_th)
391 {
392 int chunk, st, lb, ub;
393 for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
394 for (st = 1; st <= 3; ++ st) {
395 for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
396 for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
397 err += run_loop_32(loop_lb: lb, loop_ub: ub, loop_st: st, loop_chunk: chunk);
398 err += run_loop_32(loop_lb: ub, loop_ub: lb, loop_st: -st, loop_chunk: chunk);
399 }; // for ub
400 }; // for lb
401 }; // for st
402 }; // for chunk
403 }
404 return err;
405} // run_all
406
407// ---------------------------------------------------------------------------
408int main()
409{
410 {
411 const char *env = getenv(name: "LIBOMP_NUM_HIDDEN_HELPER_THREADS");
412 if (env) {
413 __kmp_hidden_helper_threads_num = atoi(nptr: env);
414 }
415 }
416
417 int n, err = 0;
418 for (n = 1; n <= 4; ++ n) {
419 err += run_32(num_th: n);
420 err += run_64(num_th: n);
421 }; // for n
422 if (err)
423 printf(format: "failed with %d errors\n", err);
424 else
425 printf(format: "passed\n");
426 return err;
427}
428

// Source: openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c