| 1 | // RUN: %libomp-compile-and-run |
| 2 | |
| 3 | // The test checks schedule(simd:runtime) |
| 4 | // in combination with omp_set_schedule() |
| 5 | #include <stdio.h> |
| 6 | #include <stdlib.h> |
| 7 | #include <omp.h> |
| 8 | |
// Platform shims: a short pause and an environment-variable setter.
// Both macros expand to a single expression with no trailing semicolon, so
// `delay();` is exactly one statement and stays safe inside an unbraced
// if/else.
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1)
// The third argument is accepted for signature parity and is not forwarded.
#define seten(a,b,c) _putenv_s((a),(b))
#else
#include <unistd.h>
#define delay() usleep(10)
#define seten(a,b,c) setenv((a),(b),(c))
#endif
| 18 | |
// SIMD width used by the test: passed as the last argument of the dispatch
// init call and used as the multiplier when computing the expected chunk size.
#define SIMD_LEN 4

// Number of failed checks; every error message pre-increments it.
// NOTE(review): it is incremented from inside parallel regions without any
// synchronization, so the final value may be inexact (still non-zero on error).
int err = 0;
| 21 | |
| 22 | // --------------------------------------------------------------------------- |
| 23 | // Various definitions copied from OpenMP RTL. |
// Schedule identifiers accepted by the __kmpc_dispatch_init_* entry points.
// The numeric values are copied from the OpenMP runtime and must not change.
enum sched {
  kmp_sch_static_balanced_chunked = 45,
  kmp_sch_guided_simd             = 46,
  kmp_sch_runtime_simd            = 47  // the only one this test passes
};
// Short integer aliases used by the runtime prototypes and chunk counters.
typedef unsigned int           u32;
typedef long long int          i64;
typedef unsigned long long int u64;
// Source-location descriptor handed to every __kmpc_* call in this test.
// Field order and types are copied from the OpenMP runtime; keep them as is.
typedef struct {
  int   reserved_1;
  int   flags;       // NOTE(review): meaning of the bits is defined by the runtime
  int   reserved_2;
  int   reserved_3;
  char *psource;     // Semicolon-separated location string.
} id;
| 39 | |
| 40 | #ifdef __cplusplus |
| 41 | extern "C" { |
| 42 | #endif |
| 43 | int __kmpc_global_thread_num(id*); |
| 44 | void __kmpc_barrier(id*, int gtid); |
| 45 | void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); |
| 46 | void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); |
| 47 | int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); |
| 48 | int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); |
| 49 | #ifdef __cplusplus |
| 50 | } // extern "C" |
| 51 | #endif |
| 52 | // End of definitions copied from OpenMP RTL. |
| 53 | // --------------------------------------------------------------------------- |
| 54 | static id loc = {.reserved_1: 0, .flags: 2, .reserved_2: 0, .reserved_3: 0, .psource: ";file;func;0;0;;" }; |
| 55 | |
| 56 | // --------------------------------------------------------------------------- |
| 57 | void |
| 58 | run_loop( |
| 59 | int loop_lb, // Loop lower bound. |
| 60 | int loop_ub, // Loop upper bound. |
| 61 | int loop_st, // Loop stride. |
| 62 | int lchunk |
| 63 | ) { |
| 64 | static int volatile loop_sync = 0; |
| 65 | int lb; // Chunk lower bound. |
| 66 | int ub; // Chunk upper bound. |
| 67 | int st; // Chunk stride. |
| 68 | int rc; |
| 69 | int nthreads = omp_get_num_threads(); |
| 70 | int tid = omp_get_thread_num(); |
| 71 | int gtid = __kmpc_global_thread_num(&loc); |
| 72 | int last; |
| 73 | int tc = (loop_ub - loop_lb) / loop_st + 1; |
| 74 | int ch; |
| 75 | int no_chunk = 0; |
| 76 | if (lchunk == 0) { |
| 77 | no_chunk = 1; |
| 78 | lchunk = 1; |
| 79 | } |
| 80 | ch = lchunk * SIMD_LEN; |
| 81 | #if _DEBUG > 1 |
| 82 | printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n" , |
| 83 | gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk); |
| 84 | #endif |
| 85 | // Don't test degenerate cases that should have been discovered by codegen. |
| 86 | if (loop_st == 0) |
| 87 | return; |
| 88 | if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub) |
| 89 | return; |
| 90 | __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd, |
| 91 | loop_lb, loop_ub, loop_st, SIMD_LEN); |
| 92 | { |
| 93 | // Let the master thread handle the chunks alone. |
| 94 | int chunk; // No of current chunk. |
| 95 | int last_ub; // Upper bound of the last processed chunk. |
| 96 | u64 cur; // Number of interations in current chunk. |
| 97 | u64 max; // Max allowed iterations for current chunk. |
| 98 | int undersized = 0; |
| 99 | last_ub = loop_ub; |
| 100 | chunk = 0; |
| 101 | max = (loop_ub - loop_lb) / loop_st + 1; |
| 102 | // The first chunk can consume all iterations. |
| 103 | while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { |
| 104 | ++ chunk; |
| 105 | #if _DEBUG |
| 106 | printf(format: "th %d: chunk=%d, lb=%d, ub=%d ch %d\n" , |
| 107 | tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1)); |
| 108 | #endif |
| 109 | // Check if previous chunk (it is not the final chunk) is undersized. |
| 110 | if (undersized) |
| 111 | printf(format: "Error with chunk %d, th %d, err %d\n" , chunk, tid, ++err); |
| 112 | if (loop_st > 0) { |
| 113 | if (!(ub <= loop_ub)) |
| 114 | printf(format: "Error with ub %d, %d, ch %d, err %d\n" , |
| 115 | (int)ub, (int)loop_ub, chunk, ++err); |
| 116 | if (!(lb <= ub)) |
| 117 | printf(format: "Error with bounds %d, %d, %d, err %d\n" , |
| 118 | (int)lb, (int)ub, chunk, ++err); |
| 119 | } else { |
| 120 | if (!(ub >= loop_ub)) |
| 121 | printf(format: "Error with ub %d, %d, %d, err %d\n" , |
| 122 | (int)ub, (int)loop_ub, chunk, ++err); |
| 123 | if (!(lb >= ub)) |
| 124 | printf(format: "Error with bounds %d, %d, %d, err %d\n" , |
| 125 | (int)lb, (int)ub, chunk, ++err); |
| 126 | }; // if |
| 127 | // Stride should not change. |
| 128 | if (!(st == loop_st)) |
| 129 | printf(format: "Error with st %d, %d, ch %d, err %d\n" , |
| 130 | (int)st, (int)loop_st, chunk, ++err); |
| 131 | cur = ( ub - lb ) / loop_st + 1; |
| 132 | // Guided scheduling uses FP computations, so current chunk may |
| 133 | // be a bit bigger (+1) than allowed maximum. |
| 134 | if (!( cur <= max + 1)) |
| 135 | printf(format: "Error with iter %llu, %llu, err %d\n" , cur, max, ++err); |
| 136 | // Update maximum for the next chunk. |
| 137 | if (last) { |
| 138 | if (!no_chunk && cur > ch && nthreads > 1) |
| 139 | printf(format: "Error: too big last chunk %d (%d), tid %d, err %d\n" , |
| 140 | (int)cur, ch, tid, ++err); |
| 141 | } else { |
| 142 | if (cur % ch) |
| 143 | printf(format: "Error with chunk %d, %d, ch %d, tid %d, err %d\n" , |
| 144 | chunk, (int)cur, ch, tid, ++err); |
| 145 | } |
| 146 | if (cur < max) |
| 147 | max = cur; |
| 148 | last_ub = ub; |
| 149 | undersized = (cur < ch); |
| 150 | #if _DEBUG > 1 |
| 151 | if (last) |
| 152 | printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n" , |
| 153 | undersized,cur,ch,tid,ub,lb,loop_st); |
| 154 | #endif |
| 155 | } // while |
| 156 | // Must have the right last iteration index. |
| 157 | if (loop_st > 0) { |
| 158 | if (!(last_ub <= loop_ub)) |
| 159 | printf(format: "Error with last1 %d, %d, ch %d, err %d\n" , |
| 160 | (int)last_ub, (int)loop_ub, chunk, ++err); |
| 161 | if (last && !(last_ub + loop_st > loop_ub)) |
| 162 | printf(format: "Error with last2 %d, %d, %d, ch %d, err %d\n" , |
| 163 | (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); |
| 164 | } else { |
| 165 | if (!(last_ub >= loop_ub)) |
| 166 | printf(format: "Error with last1 %d, %d, ch %d, err %d\n" , |
| 167 | (int)last_ub, (int)loop_ub, chunk, ++err); |
| 168 | if (last && !(last_ub + loop_st < loop_ub)) |
| 169 | printf(format: "Error with last2 %d, %d, %d, ch %d, err %d\n" , |
| 170 | (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); |
| 171 | } // if |
| 172 | } |
| 173 | __kmpc_barrier(&loc, gtid); |
| 174 | } // run_loop |
| 175 | |
| 176 | int main(int argc, char *argv[]) |
| 177 | { |
| 178 | int chunk = 0; |
| 179 | // static (no chunk) |
| 180 | omp_set_schedule(omp_sched_static,0); |
| 181 | #pragma omp parallel// num_threads(num_th) |
| 182 | run_loop(loop_lb: 0, loop_ub: 26, loop_st: 1, lchunk: chunk); |
| 183 | |
| 184 | // auto (chunk should be ignorted) |
| 185 | omp_set_schedule(omp_sched_auto,0); |
| 186 | #pragma omp parallel// num_threads(num_th) |
| 187 | run_loop(loop_lb: 0, loop_ub: 26, loop_st: 1, lchunk: chunk); |
| 188 | |
| 189 | // static,1 |
| 190 | chunk = 1; |
| 191 | omp_set_schedule(omp_sched_static,1); |
| 192 | #pragma omp parallel// num_threads(num_th) |
| 193 | run_loop(loop_lb: 0, loop_ub: 26, loop_st: 1, lchunk: chunk); |
| 194 | |
| 195 | // dynamic,1 |
| 196 | omp_set_schedule(omp_sched_dynamic,1); |
| 197 | #pragma omp parallel// num_threads(num_th) |
| 198 | run_loop(loop_lb: 0, loop_ub: 26, loop_st: 1, lchunk: chunk); |
| 199 | |
| 200 | // guided,1 |
| 201 | omp_set_schedule(omp_sched_guided,1); |
| 202 | #pragma omp parallel// num_threads(num_th) |
| 203 | run_loop(loop_lb: 0, loop_ub: 26, loop_st: 1, lchunk: chunk); |
| 204 | |
| 205 | // dynamic,0 - use default chunk size 1 |
| 206 | omp_set_schedule(omp_sched_dynamic,0); |
| 207 | #pragma omp parallel// num_threads(num_th) |
| 208 | run_loop(loop_lb: 0, loop_ub: 26, loop_st: 1, lchunk: chunk); |
| 209 | |
| 210 | // guided,0 - use default chunk size 1 |
| 211 | omp_set_schedule(omp_sched_guided,0); |
| 212 | #pragma omp parallel// num_threads(num_th) |
| 213 | run_loop(loop_lb: 0, loop_ub: 26, loop_st: 1, lchunk: chunk); |
| 214 | |
| 215 | if (err) { |
| 216 | printf(format: "failed, err = %d\n" , err); |
| 217 | return 1; |
| 218 | } else { |
| 219 | printf(format: "passed\n" ); |
| 220 | return 0; |
| 221 | } |
| 222 | } |
| 223 | |