1 | #ifndef KMP_STATS_H |
2 | #define KMP_STATS_H |
3 | |
4 | /** @file kmp_stats.h |
5 | * Functions for collecting statistics. |
6 | */ |
7 | |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
11 | // See https://llvm.org/LICENSE.txt for license information. |
12 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "kmp_config.h" |
17 | #include "kmp_debug.h" |
18 | |
19 | #if KMP_STATS_ENABLED |
20 | /* Statistics accumulator. |
21 | Accumulates number of samples and computes min, max, mean, standard deviation |
22 | on the fly. |
23 | |
24 | Online variance calculation algorithm from |
25 | http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm |
26 | */ |
27 | |
28 | #include "kmp_stats_timing.h" |
29 | #include <limits> |
30 | #include <math.h> |
31 | #include <new> // placement new |
32 | #include <stdint.h> |
33 | #include <string> |
34 | #include <vector> |
35 | |
36 | /* Enable developer statistics here if you want them. They are more detailed |
37 | than is useful for application characterisation and are intended for the |
38 | runtime library developer. */ |
39 | #define KMP_DEVELOPER_STATS 0 |
40 | |
41 | /* Enable/Disable histogram output */ |
42 | #define KMP_STATS_HIST 0 |
43 | |
44 | /*! |
45 | * @ingroup STATS_GATHERING |
46 | * \brief flags to describe the statistic (timer or counter) |
47 | * |
48 | */ |
// Bit flags describing a statistic; every flag owns a distinct bit so several
// flags can be OR-ed into one value.
enum stats_flags_e {
  noTotal = 0x01,      //!< do not show a TOTAL_aggregation for this statistic
  onlyInMaster = 0x02, //!< statistic is valid only for primary thread
  noUnits = 0x04,      //!< statistic doesn't need units printed next to it
  notInMaster = 0x08,  //!< statistic is valid only for non-primary threads
  logEvent = 0x10      //!< statistic can be logged on the event timeline when
                       //!  KMP_STATS_EVENTS is on (valid only for timers)
};
57 | |
58 | /*! |
59 | * @ingroup STATS_GATHERING |
60 | * \brief the states which a thread can be in |
61 | * |
62 | */ |
// Thread states; the numeric values are spelled out and match the implicit
// numbering (consecutive from zero).
enum stats_state_e {
  IDLE = 0,
  SERIAL_REGION = 1,
  FORK_JOIN_BARRIER = 2,
  PLAIN_BARRIER = 3,
  TASKWAIT = 4,
  TASKYIELD = 5,
  TASKGROUP = 6,
  IMPLICIT_TASK = 7,
  EXPLICIT_TASK = 8,
  TEAMS_REGION = 9
};
75 | |
76 | /*! |
77 | * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h |
78 | * |
79 | * @param macro a user defined macro that takes three arguments - |
80 | * macro(COUNTER_NAME, flags, arg) |
81 | * @param arg a user defined argument to send to the user defined macro |
82 | * |
83 | * \details A counter counts the occurrence of some event. Each thread |
84 | * accumulates its own count, at the end of execution the counts are aggregated |
85 | * treating each thread as a separate measurement. (Unless onlyInMaster is set, |
86 | * in which case there's only a single measurement). The min,mean,max are |
87 | * therefore the values for the threads. Adding the counter here and then |
88 | * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you |
89 | * need to do. All of the tables and printing is generated from this macro. |
90 | * Format is "macro(name, flags, arg)" |
91 | * |
92 | * @ingroup STATS_GATHERING |
93 | */ |
94 | // clang-format off |
// Counter table: invokes macro(name, flags, arg) once for every counter.
#define KMP_FOREACH_COUNTER(macro, arg) \
  macro(OMP_PARALLEL, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg) \
  macro(OMP_NESTED_PARALLEL,   0, arg) \
  macro(OMP_LOOP_STATIC,       0, arg) \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
  macro(OMP_LOOP_DYNAMIC,      0, arg) \
  macro(OMP_DISTRIBUTE,        0, arg) \
  macro(OMP_BARRIER,           0, arg) \
  macro(OMP_CRITICAL,          0, arg) \
  macro(OMP_SINGLE,            0, arg) \
  macro(OMP_SECTIONS,          0, arg) \
  macro(OMP_MASTER,            0, arg) \
  macro(OMP_MASKED,            0, arg) \
  macro(OMP_TEAMS,             0, arg) \
  macro(OMP_set_lock,          0, arg) \
  macro(OMP_test_lock,         0, arg) \
  macro(REDUCE_wait,           0, arg) \
  macro(REDUCE_nowait,         0, arg) \
  macro(OMP_TASKYIELD,         0, arg) \
  macro(OMP_TASKLOOP,          0, arg) \
  macro(TASK_executed,         0, arg) \
  macro(TASK_cancelled,        0, arg) \
  macro(TASK_stolen,           0, arg)
118 | // clang-format on |
119 | |
120 | /*! |
121 | * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h |
122 | * |
123 | * @param macro a user defined macro that takes three arguments - |
124 | * macro(TIMER_NAME, flags, arg) |
125 | * @param arg a user defined argument to send to the user defined macro |
126 | * |
127 | * \details A timer collects multiple samples of some count in each thread and |
128 | * then finally aggregates all of the samples from all of the threads. For most |
129 | * timers the printing code also provides an aggregation over the thread totals. |
130 | * These are printed as TOTAL_foo. The count is normally a time (in ticks), |
131 | * hence the name "timer". (But can be any value, so we use this for "number of |
132 | * arguments passed to fork" as well). For timers the threads are not |
133 | * significant, it's the individual observations that count, so the statistics |
134 | * are at that level. Format is "macro(name, flags, arg)" |
135 | * |
136 | * @ingroup STATS_GATHERING2 |
137 | */ |
138 | // clang-format off |
// Timer table: invokes macro(name, flags, arg) once for every timer, then
// hands over to KMP_FOREACH_DEVELOPER_TIMER for the developer-only entries.
#define KMP_FOREACH_TIMER(macro, arg) \
  macro(OMP_worker_thread_life,      stats_flags_e::logEvent, arg) \
  macro(OMP_parallel,                stats_flags_e::logEvent, arg) \
  macro(OMP_parallel_overhead,       stats_flags_e::logEvent, arg) \
  macro(OMP_teams,                   stats_flags_e::logEvent, arg) \
  macro(OMP_teams_overhead,          stats_flags_e::logEvent, arg) \
  macro(OMP_loop_static,             0, arg) \
  macro(OMP_loop_static_scheduling,  0, arg) \
  macro(OMP_loop_dynamic,            0, arg) \
  macro(OMP_loop_dynamic_scheduling, 0, arg) \
  macro(OMP_distribute,              0, arg) \
  macro(OMP_distribute_scheduling,   0, arg) \
  macro(OMP_critical,                0, arg) \
  macro(OMP_critical_wait,           0, arg) \
  macro(OMP_single,                  0, arg) \
  macro(OMP_sections,                0, arg) \
  macro(OMP_sections_overhead,       0, arg) \
  macro(OMP_master,                  0, arg) \
  macro(OMP_masked,                  0, arg) \
  macro(OMP_task_immediate,          0, arg) \
  macro(OMP_task_taskwait,           0, arg) \
  macro(OMP_task_taskyield,          0, arg) \
  macro(OMP_task_taskgroup,          0, arg) \
  macro(OMP_task_join_bar,           0, arg) \
  macro(OMP_task_plain_bar,          0, arg) \
  macro(OMP_taskloop_scheduling,     0, arg) \
  macro(OMP_plain_barrier,           stats_flags_e::logEvent, arg) \
  macro(OMP_idle,                    stats_flags_e::logEvent, arg) \
  macro(OMP_fork_barrier,            stats_flags_e::logEvent, arg) \
  macro(OMP_join_barrier,            stats_flags_e::logEvent, arg) \
  macro(OMP_serial,                  stats_flags_e::logEvent, arg) \
  macro(OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_loop_static_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_loop_static_total_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_loop_dynamic_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_loop_dynamic_total_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_distribute_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
185 | // clang-format on |
186 | |
187 | // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either |
188 | // initializing OpenMP or being created by a primary |
189 | // thread) until the thread is destroyed |
190 | // OMP_parallel -- Time thread spends executing work directly |
191 | // within a #pragma omp parallel |
192 | // OMP_parallel_overhead -- Time thread spends setting up a parallel region |
193 | // OMP_loop_static -- Time thread spends executing loop iterations from |
194 | // a statically scheduled loop |
195 | // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations |
196 | // from a statically scheduled loop |
197 | // OMP_loop_dynamic -- Time thread spends executing loop iterations from |
198 | // a dynamically scheduled loop |
199 | // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations |
200 | // from a dynamically scheduled loop |
201 | // OMP_critical -- Time thread spends executing critical section |
202 | // OMP_critical_wait -- Time thread spends waiting to enter |
203 | // a critical section |
204 | // OMP_single -- Time spent executing a "single" region |
205 | // OMP_master -- Time spent executing a "master" region |
206 | // OMP_masked -- Time spent executing a "masked" region |
207 | // OMP_task_immediate -- Time spent executing non-deferred tasks |
208 | // OMP_task_taskwait -- Time spent executing tasks inside a taskwait |
209 | // construct |
210 | // OMP_task_taskyield -- Time spent executing tasks inside a taskyield |
211 | // construct |
212 | // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup |
213 | // construct |
214 | // OMP_task_join_bar -- Time spent executing tasks inside a join barrier |
215 | // OMP_task_plain_bar -- Time spent executing tasks inside a barrier |
216 | // construct |
217 | // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop |
218 | // construct |
219 | // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or |
220 | // inside implicit barrier at end of worksharing |
221 | // construct |
222 | // OMP_idle -- Time worker threads spend waiting for next |
223 | // parallel region |
224 | // OMP_fork_barrier -- Time spent in the fork barrier surrounding a |
225 | // parallel region |
226 | // OMP_join_barrier -- Time spent in the join barrier surrounding a |
227 | // parallel region |
228 | // OMP_serial -- Time thread zero spends executing serial code |
229 | // OMP_set_numthreads -- Values passed to omp_set_num_threads |
230 | // OMP_PARALLEL_args -- Number of arguments passed to a parallel region |
231 | // OMP_loop_static_iterations -- Number of iterations thread is assigned for |
232 | // statically scheduled loops |
233 | // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for |
234 | // dynamically scheduled loops |
235 | |
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled in addition to the other stats.

// KMP_fork_barrier       -- time in __kmp_fork_barrier
// KMP_join_barrier       -- time in __kmp_join_barrier
// KMP_barrier            -- time in __kmp_barrier
// KMP_end_split_barrier  -- time in __kmp_end_split_barrier
// KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
// KMP_icv_copy           -- start/stop timer for any ICV copying
// KMP_linear_gather      -- time in __kmp_linear_barrier_gather
// KMP_linear_release     -- time in __kmp_linear_barrier_release
// KMP_tree_gather        -- time in __kmp_tree_barrier_gather
// KMP_tree_release       -- time in __kmp_tree_barrier_release
// KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release      -- time in __kmp_hyper_barrier_release
// KMP_dist_gather        -- time in __kmp_dist_barrier_gather
// KMP_dist_release       -- time in __kmp_dist_barrier_release
//
// NOTE(review): the list above is not in sync with the table below (for
// example KMP_fork_barrier, KMP_join_barrier, KMP_barrier and KMP_icv_copy are
// described but not defined, while KMP_fork_call, KMP_join_call and the
// USER_* / FOR_static_steal_* entries are defined but not described).
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
  macro(KMP_fork_call,         0, arg) \
  macro(KMP_join_call,         0, arg) \
  macro(KMP_end_split_barrier, 0, arg) \
  macro(KMP_hier_gather,       0, arg) \
  macro(KMP_hier_release,      0, arg) \
  macro(KMP_hyper_gather,      0, arg) \
  macro(KMP_hyper_release,     0, arg) \
  macro(KMP_dist_gather,       0, arg) \
  macro(KMP_dist_release,      0, arg) \
  macro(KMP_linear_gather,     0, arg) \
  macro(KMP_linear_release,    0, arg) \
  macro(KMP_tree_gather,       0, arg) \
  macro(KMP_tree_release,      0, arg) \
  macro(USER_resume,           0, arg) \
  macro(USER_suspend,          0, arg) \
  macro(USER_mwait,            0, arg) \
  macro(KMP_allocate_team,     0, arg) \
  macro(KMP_setup_icv_copy,    0, arg) \
  macro(USER_icv_copy,         0, arg) \
  macro(FOR_static_steal_stolen, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(FOR_static_steal_chunks, stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
// Developer statistics disabled: the table contributes no timers.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
282 | // clang-format on |
283 | |
284 | /*! |
285 | * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. |
286 | * |
287 | * @param macro a user defined macro that takes three arguments - |
288 | * macro(TIMER_NAME, flags, arg) |
289 | * @param arg a user defined argument to send to the user defined macro |
290 | * |
291 | * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE |
292 | * BAD THINGS WILL HAPPEN! |
293 | * |
294 | * \details Explicit timers are ones where we need to allocate a timer itself |
295 | * (as well as the accumulated timing statistics). We allocate these on a |
296 | * per-thread basis, and explicitly start and stop them. Block timers just |
297 | * allocate the timer itself on the stack, and use the destructor to notice |
298 | * block exit; they don't need to be defined here. The name here should be the |
299 | * same as that of a timer above. |
300 | * |
301 | * @ingroup STATS_GATHERING |
302 | */ |
303 | #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg) |
304 | |
305 | #define ENUMERATE(name, ignore, prefix) prefix##name, |
306 | enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; |
307 | |
308 | enum explicit_timer_e { |
309 | KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST |
310 | }; |
311 | |
312 | enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; |
313 | #undef ENUMERATE |
314 | |
315 | /* |
316 | * A logarithmic histogram. It accumulates the number of values in each power of |
317 | * ten bin. So 1<=x<10, 10<=x<100, ... |
318 | * Mostly useful where we have some big outliers and want to see information |
319 | * about them. |
320 | */ |
321 | class logHistogram { |
322 | enum { |
323 | numBins = 31, /* Number of powers of 10. If this changes you need to change |
324 | * the initializer for binMax */ |
325 | |
326 | /* |
327 | * If you want to use this to analyse values that may be less than 1, (for |
328 | * instance times in s), then the logOffset gives you negative powers. |
329 | * In our case here, we're just looking at times in ticks, or counts, so we |
330 | * can never see values with magnitude < 1 (other than zero), so we can set |
331 | * it to 0. As above change the initializer if you change this. |
332 | */ |
333 | logOffset = 0 |
334 | }; |
335 | uint32_t KMP_ALIGN_CACHE zeroCount; |
336 | struct { |
337 | uint32_t count; |
338 | double total; |
339 | } bins[numBins]; |
340 | |
341 | static double binMax[numBins]; |
342 | |
343 | #ifdef KMP_DEBUG |
344 | uint64_t _total; |
345 | |
346 | void check() const { |
347 | uint64_t t = zeroCount; |
348 | for (int i = 0; i < numBins; i++) |
349 | t += bins[i].count; |
350 | KMP_DEBUG_ASSERT(t == _total); |
351 | } |
352 | #else |
353 | void check() const {} |
354 | #endif |
355 | |
356 | public: |
357 | logHistogram() { reset(); } |
358 | |
359 | logHistogram(logHistogram const &o) { |
360 | for (int i = 0; i < numBins; i++) |
361 | bins[i] = o.bins[i]; |
362 | #ifdef KMP_DEBUG |
363 | _total = o._total; |
364 | #endif |
365 | } |
366 | |
367 | void reset() { |
368 | zeroCount = 0; |
369 | for (int i = 0; i < numBins; i++) { |
370 | bins[i].count = 0; |
371 | bins[i].total = 0; |
372 | } |
373 | |
374 | #ifdef KMP_DEBUG |
375 | _total = 0; |
376 | #endif |
377 | } |
378 | uint32_t count(int b) const { return bins[b + logOffset].count; } |
379 | double total(int b) const { return bins[b + logOffset].total; } |
380 | static uint32_t findBin(double sample); |
381 | |
382 | logHistogram &operator+=(logHistogram const &o) { |
383 | zeroCount += o.zeroCount; |
384 | for (int i = 0; i < numBins; i++) { |
385 | bins[i].count += o.bins[i].count; |
386 | bins[i].total += o.bins[i].total; |
387 | } |
388 | #ifdef KMP_DEBUG |
389 | _total += o._total; |
390 | check(); |
391 | #endif |
392 | |
393 | return *this; |
394 | } |
395 | |
396 | void addSample(double sample); |
397 | int minBin() const; |
398 | int maxBin() const; |
399 | |
400 | std::string format(char) const; |
401 | }; |
402 | |
403 | class statistic { |
404 | double KMP_ALIGN_CACHE minVal; |
405 | double maxVal; |
406 | double meanVal; |
407 | double m2; |
408 | uint64_t sampleCount; |
409 | double offset; |
410 | bool collectingHist; |
411 | logHistogram hist; |
412 | |
413 | public: |
414 | statistic(bool doHist = bool(KMP_STATS_HIST)) { |
415 | reset(); |
416 | collectingHist = doHist; |
417 | } |
418 | statistic(statistic const &o) |
419 | : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), |
420 | sampleCount(o.sampleCount), offset(o.offset), |
421 | collectingHist(o.collectingHist), hist(o.hist) {} |
422 | statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) |
423 | : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), |
424 | sampleCount(sc), offset(0.0), collectingHist(false) {} |
425 | bool haveHist() const { return collectingHist; } |
426 | double getMin() const { return minVal; } |
427 | double getMean() const { return meanVal; } |
428 | double getMax() const { return maxVal; } |
429 | uint64_t getCount() const { return sampleCount; } |
430 | double getSD() const { return sqrt(m2 / sampleCount); } |
431 | double getTotal() const { return sampleCount * meanVal; } |
432 | logHistogram const *getHist() const { return &hist; } |
433 | void setOffset(double d) { offset = d; } |
434 | |
435 | void reset() { |
436 | minVal = (std::numeric_limits<double>::max)(); |
437 | maxVal = -minVal; |
438 | meanVal = 0.0; |
439 | m2 = 0.0; |
440 | sampleCount = 0; |
441 | offset = 0.0; |
442 | hist.reset(); |
443 | } |
444 | void addSample(double sample); |
445 | void scale(double factor); |
446 | void scaleDown(double f) { scale(1. / f); } |
447 | void forceCount(uint64_t count) { sampleCount = count; } |
448 | statistic &operator+=(statistic const &other); |
449 | |
450 | std::string format(char unit, bool total = false) const; |
451 | std::string formatHist(char unit) const { return hist.format(unit); } |
452 | }; |
453 | |
// Static description of one statistic: its printable name and its flag bits.
struct statInfo {
  const char *name; // text returned by the name() accessors
  uint32_t flags;   // tested against stats_flags_e values
};
458 | |
459 | class timeStat : public statistic { |
460 | static statInfo timerInfo[]; |
461 | |
462 | public: |
463 | timeStat() : statistic() {} |
464 | static const char *name(timer_e e) { return timerInfo[e].name; } |
465 | static bool noTotal(timer_e e) { |
466 | return timerInfo[e].flags & stats_flags_e::noTotal; |
467 | } |
468 | static bool masterOnly(timer_e e) { |
469 | return timerInfo[e].flags & stats_flags_e::onlyInMaster; |
470 | } |
471 | static bool workerOnly(timer_e e) { |
472 | return timerInfo[e].flags & stats_flags_e::notInMaster; |
473 | } |
474 | static bool noUnits(timer_e e) { |
475 | return timerInfo[e].flags & stats_flags_e::noUnits; |
476 | } |
477 | static bool logEvent(timer_e e) { |
478 | return timerInfo[e].flags & stats_flags_e::logEvent; |
479 | } |
480 | static void clearEventFlags() { |
481 | for (int i = 0; i < TIMER_LAST; i++) { |
482 | timerInfo[i].flags &= (~(stats_flags_e::logEvent)); |
483 | } |
484 | } |
485 | }; |
486 | |
487 | // Where we need explicitly to start and end the timer, this version can be used |
488 | // Since these timers normally aren't nicely scoped, so don't have a good place |
489 | // to live on the stack of the thread, they're more work to use. |
490 | class explicitTimer { |
491 | timeStat *stat; |
492 | timer_e timerEnumValue; |
493 | tsc_tick_count startTime; |
494 | tsc_tick_count pauseStartTime; |
495 | tsc_tick_count::tsc_interval_t totalPauseTime; |
496 | |
497 | public: |
498 | explicitTimer(timeStat *s, timer_e te) |
499 | : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), |
500 | totalPauseTime() {} |
501 | |
502 | // void setStat(timeStat *s) { stat = s; } |
503 | void start(tsc_tick_count tick); |
504 | void pause(tsc_tick_count tick) { pauseStartTime = tick; } |
505 | void resume(tsc_tick_count tick) { |
506 | totalPauseTime += (tick - pauseStartTime); |
507 | } |
508 | void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); |
509 | void reset() { |
510 | startTime = 0; |
511 | pauseStartTime = 0; |
512 | totalPauseTime = 0; |
513 | } |
514 | timer_e get_type() const { return timerEnumValue; } |
515 | }; |
516 | |
517 | // Where you need to partition a threads clock ticks into separate states |
518 | // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and |
519 | // DOING_NOTHING would render these conditions: |
520 | // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive |
521 | // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice |
522 | // versa |
523 | class partitionedTimers { |
524 | private: |
525 | std::vector<explicitTimer> timer_stack; |
526 | |
527 | public: |
528 | partitionedTimers(); |
529 | void init(explicitTimer timer); |
530 | void exchange(explicitTimer timer); |
531 | void push(explicitTimer timer); |
532 | void pop(); |
533 | void windup(); |
534 | }; |
535 | |
536 | // Special wrapper around the partitioned timers to aid timing code blocks |
537 | // It avoids the need to have an explicit end, leaving the scope suffices. |
538 | class blockPartitionedTimer { |
539 | partitionedTimers *part_timers; |
540 | |
541 | public: |
542 | blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) |
543 | : part_timers(pt) { |
544 | part_timers->push(timer); |
545 | } |
546 | ~blockPartitionedTimer() { part_timers->pop(); } |
547 | }; |
548 | |
549 | // Special wrapper around the thread state to aid in keeping state in code |
550 | // blocks. It avoids the need to have an explicit end; leaving the scope |
551 | // suffices. |
552 | class blockThreadState { |
553 | stats_state_e *state_pointer; |
554 | stats_state_e old_state; |
555 | |
556 | public: |
557 | blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state) |
558 | : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { |
559 | *state_pointer = new_state; |
560 | } |
561 | ~blockThreadState() { *state_pointer = old_state; } |
562 | }; |
563 | |
564 | // If all you want is a count, then you can use this... |
565 | // The individual per-thread counts will be aggregated into a statistic at |
566 | // program exit. |
567 | class counter { |
568 | uint64_t value; |
569 | static const statInfo counterInfo[]; |
570 | |
571 | public: |
572 | counter() : value(0) {} |
573 | void increment() { value++; } |
574 | uint64_t getValue() const { return value; } |
575 | void reset() { value = 0; } |
576 | static const char *name(counter_e e) { return counterInfo[e].name; } |
577 | static bool masterOnly(counter_e e) { |
578 | return counterInfo[e].flags & stats_flags_e::onlyInMaster; |
579 | } |
580 | }; |
581 | |
582 | /* **************************************************************** |
583 | Class to implement an event |
584 | |
585 | There are four components to an event: start time, stop time |
586 | nest_level, and timer_name. |
587 | The start and stop time should be obvious (recorded in clock ticks). |
588 | The nest_level relates to the bar width in the timeline graph. |
589 | The timer_name is used to determine which timer event triggered this event. |
590 | |
591 | the interface to this class is through four read-only operations: |
592 | 1) getStart() -- returns the start time as 64 bit integer |
593 | 2) getStop() -- returns the stop time as 64 bit integer |
594 | 3) getNestLevel() -- returns the nest level of the event |
595 | 4) getTimerName() -- returns the timer name that triggered event |
596 | |
597 | *MORE ON NEST_LEVEL* |
598 | The nest level is used in the bar graph that represents the timeline. |
599 | Its main purpose is for showing how events are nested inside each other. |
600 | For example, say events, A, B, and C are recorded. If the timeline |
601 | looks like this: |
602 | |
603 | Begin -------------------------------------------------------------> Time |
604 | | | | | | | |
605 | A B C C B A |
606 | start start start end end end |
607 | |
608 | Then A, B, C will have a nest level of 1, 2, 3 respectively. |
609 | These values are then used to calculate the barwidth so you can |
610 | see that inside A, B has occurred, and inside B, C has occurred. |
611 | Currently, this is shown with A's bar width being larger than B's |
612 | bar width, and B's bar width being larger than C's bar width. |
613 | |
614 | **************************************************************** */ |
615 | class kmp_stats_event { |
616 | uint64_t start; |
617 | uint64_t stop; |
618 | int nest_level; |
619 | timer_e timer_name; |
620 | |
621 | public: |
622 | kmp_stats_event() |
623 | : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} |
624 | kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) |
625 | : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} |
626 | inline uint64_t getStart() const { return start; } |
627 | inline uint64_t getStop() const { return stop; } |
628 | inline int getNestLevel() const { return nest_level; } |
629 | inline timer_e getTimerName() const { return timer_name; } |
630 | }; |
631 | |
632 | /* **************************************************************** |
633 | Class to implement a dynamically expandable array of events |
634 | |
635 | --------------------------------------------------------- |
636 | | event 1 | event 2 | event 3 | event 4 | ... | event N | |
637 | --------------------------------------------------------- |
638 | |
639 | An event is pushed onto the back of this array at every |
640 | explicitTimer->stop() call. The event records the thread #, |
641 | start time, stop time, and nest level related to the bar width. |
642 | |
643 | The event vector starts at size INIT_SIZE and grows (doubles in size) |
644 | if needed. An implication of this behavior is that log(N) |
645 | reallocations are needed (where N is number of events). If you want |
646 | to avoid reallocations, then set INIT_SIZE to a large value. |
647 | |
648 | the interface to this class is through six operations: |
649 | 1) reset() -- sets the internal_size back to 0 but does not deallocate any |
650 | memory |
651 | 2) size() -- returns the number of valid elements in the vector |
652 | 3) push_back(start, stop, nest, timer_name) -- pushes an event onto |
653 | the back of the array |
654 | 4) deallocate() -- frees all memory associated with the vector |
655 | 5) sort() -- sorts the vector by start time |
656 | 6) operator[index] or at(index) -- returns event reference at that index |
657 | **************************************************************** */ |
658 | class kmp_stats_event_vector { |
659 | kmp_stats_event *events; |
660 | int internal_size; |
661 | int allocated_size; |
662 | static const int INIT_SIZE = 1024; |
663 | |
664 | public: |
665 | kmp_stats_event_vector() { |
666 | events = |
667 | (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); |
668 | internal_size = 0; |
669 | allocated_size = INIT_SIZE; |
670 | } |
671 | ~kmp_stats_event_vector() {} |
672 | inline void reset() { internal_size = 0; } |
673 | inline int size() const { return internal_size; } |
674 | void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, |
675 | timer_e name) { |
676 | int i; |
677 | if (internal_size == allocated_size) { |
678 | kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( |
679 | sizeof(kmp_stats_event) * allocated_size * 2); |
680 | for (i = 0; i < internal_size; i++) |
681 | tmp[i] = events[i]; |
682 | __kmp_free(events); |
683 | events = tmp; |
684 | allocated_size *= 2; |
685 | } |
686 | events[internal_size] = |
687 | kmp_stats_event(start_time, stop_time, nest_level, name); |
688 | internal_size++; |
689 | return; |
690 | } |
691 | void deallocate(); |
692 | void sort(); |
693 | const kmp_stats_event &operator[](int index) const { return events[index]; } |
694 | kmp_stats_event &operator[](int index) { return events[index]; } |
695 | const kmp_stats_event &at(int index) const { return events[index]; } |
696 | kmp_stats_event &at(int index) { return events[index]; } |
697 | }; |
698 | |
699 | /* **************************************************************** |
700 | Class to implement a doubly-linked, circular, statistics list |
701 | |
702 | |---| ---> |---| ---> |---| ---> |---| ---> ... next |
703 | | | | | | | | | |
704 | |---| <--- |---| <--- |---| <--- |---| <--- ... prev |
705 | Sentinel first second third |
706 | Node node node node |
707 | |
708 | The Sentinel Node is the user handle on the list. |
709 | The first node corresponds to thread 0's statistics. |
710 | The second node corresponds to thread 1's statistics and so on... |
711 | |
712 | Each node has a _timers, _counters, and _explicitTimers array to hold that |
713 | thread's statistics. The _explicitTimers point to the correct _timer and |
714 | update its statistics at every stop() call. The explicitTimers' pointers are |
715 | set up in the constructor. Each node also has an event vector to hold that |
716 | thread's timing events. The event vector expands as necessary and records |
717 | the start-stop times for each timer. |
718 | |
719 | The nestLevel variable is for plotting events and is related |
720 | to the bar width in the timeline graph. |
721 | |
722 | Every thread will have a thread local pointer to its node in |
723 | the list. The sentinel node is used by the primary thread to |
724 | store "dummy" statistics before __kmp_create_worker() is called. |
725 | **************************************************************** */ |
726 | class kmp_stats_list { |
727 | int gtid; |
728 | timeStat _timers[TIMER_LAST + 1]; |
729 | counter _counters[COUNTER_LAST + 1]; |
730 | explicitTimer thread_life_timer; |
731 | partitionedTimers _partitionedTimers; |
732 | int _nestLevel; // one per thread |
733 | kmp_stats_event_vector _event_vector; |
734 | kmp_stats_list *next; |
735 | kmp_stats_list *prev; |
736 | stats_state_e state; |
737 | int thread_is_idle_flag; |
738 | |
739 | public: |
740 | kmp_stats_list() |
741 | : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], |
742 | TIMER_OMP_worker_thread_life), |
743 | _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), |
744 | thread_is_idle_flag(0) {} |
745 | ~kmp_stats_list() {} |
746 | inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } |
747 | inline counter *getCounter(counter_e idx) { return &_counters[idx]; } |
748 | inline partitionedTimers *getPartitionedTimers() { |
749 | return &_partitionedTimers; |
750 | } |
751 | inline timeStat *getTimers() { return _timers; } |
752 | inline counter *getCounters() { return _counters; } |
753 | inline kmp_stats_event_vector &getEventVector() { return _event_vector; } |
754 | inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } |
755 | inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } |
756 | inline void resetEventVector() { _event_vector.reset(); } |
757 | inline void incrementNestValue() { _nestLevel++; } |
758 | inline int getNestValue() { return _nestLevel; } |
759 | inline void decrementNestValue() { _nestLevel--; } |
760 | inline int getGtid() const { return gtid; } |
761 | inline void setGtid(int newgtid) { gtid = newgtid; } |
762 | inline void setState(stats_state_e newstate) { state = newstate; } |
763 | inline stats_state_e getState() const { return state; } |
764 | inline stats_state_e *getStatePointer() { return &state; } |
765 | inline bool isIdle() { return thread_is_idle_flag == 1; } |
766 | inline void setIdleFlag() { thread_is_idle_flag = 1; } |
767 | inline void resetIdleFlag() { thread_is_idle_flag = 0; } |
768 | kmp_stats_list *push_back(int gtid); // returns newly created list node |
769 | inline void push_event(uint64_t start_time, uint64_t stop_time, |
770 | int nest_level, timer_e name) { |
771 | _event_vector.push_back(start_time, stop_time, nest_level, name); |
772 | } |
773 | void deallocate(); |
774 | class iterator; |
775 | kmp_stats_list::iterator begin(); |
776 | kmp_stats_list::iterator end(); |
777 | int size(); |
778 | class iterator { |
779 | kmp_stats_list *ptr; |
780 | friend kmp_stats_list::iterator kmp_stats_list::begin(); |
781 | friend kmp_stats_list::iterator kmp_stats_list::end(); |
782 | |
783 | public: |
784 | iterator(); |
785 | ~iterator(); |
786 | iterator operator++(); |
787 | iterator operator++(int dummy); |
788 | iterator operator--(); |
789 | iterator operator--(int dummy); |
790 | bool operator!=(const iterator &rhs); |
791 | bool operator==(const iterator &rhs); |
792 | kmp_stats_list *operator*() const; // dereference operator |
793 | }; |
794 | }; |
795 | |
796 | /* **************************************************************** |
797 | Class to encapsulate all output functions and the environment variables |
798 | |
799 | This module holds filenames for various outputs (normal stats, events, plot |
800 | file), as well as coloring information for the plot file. |
801 | |
802 | The filenames and flags variables are read from environment variables. |
803 | These are read once by the constructor of the global variable |
804 | __kmp_stats_output which calls init(). |
805 | |
806 | During this init() call, event flags for the timeStat::timerInfo[] global |
807 | array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). |
808 | |
809 | The only interface function that is public is outputStats(heading). This |
810 | function should print out everything it needs to, either to files or stderr, |
811 | depending on the environment variables described below |
812 | |
813 | ENVIRONMENT VARIABLES: |
814 | KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this |
815 | file, otherwise, print to stderr |
816 | KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to |
817 | either KMP_STATS_FILE or stderr |
818 | KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, |
819 | otherwise, the plot file is sent to "events.plt" |
820 | KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log |
821 | events |
822 | KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, |
823 | otherwise, output is sent to "events.dat" |
824 | **************************************************************** */ |
825 | class kmp_stats_output_module { |
826 | |
827 | public: |
828 | struct rgb_color { |
829 | float r; |
830 | float g; |
831 | float b; |
832 | }; |
833 | |
834 | private: |
835 | std::string outputFileName; |
836 | static const char *eventsFileName; |
837 | static const char *plotFileName; |
838 | static int printPerThreadFlag; |
839 | static int printPerThreadEventsFlag; |
840 | static const rgb_color globalColorArray[]; |
841 | static rgb_color timerColorInfo[]; |
842 | |
843 | void init(); |
844 | static void setupEventColors(); |
845 | static void printPloticusFile(); |
846 | static void printHeaderInfo(FILE *statsOut); |
847 | static void printTimerStats(FILE *statsOut, statistic const *theStats, |
848 | statistic const *totalStats); |
849 | static void printCounterStats(FILE *statsOut, statistic const *theStats); |
850 | static void printCounters(FILE *statsOut, counter const *theCounters); |
851 | static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, |
852 | int gtid); |
853 | static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } |
854 | static void windupExplicitTimers(); |
855 | bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } |
856 | |
857 | public: |
858 | kmp_stats_output_module() { init(); } |
859 | void outputStats(const char *heading); |
860 | }; |
861 | |
862 | #ifdef __cplusplus |
863 | extern "C" { |
864 | #endif |
865 | void __kmp_stats_init(); |
866 | void __kmp_stats_fini(); |
867 | void __kmp_reset_stats(); |
868 | void __kmp_output_stats(const char *); |
869 | void __kmp_accumulate_stats_at_exit(void); |
870 | // thread local pointer to stats node within list |
871 | extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; |
872 | // head to stats list. |
873 | extern kmp_stats_list *__kmp_stats_list; |
874 | // lock for __kmp_stats_list |
875 | extern kmp_tas_lock_t __kmp_stats_lock; |
876 | // reference start time |
877 | extern tsc_tick_count __kmp_stats_start_time; |
878 | // interface to output |
879 | extern kmp_stats_output_module __kmp_stats_output; |
880 | |
881 | #ifdef __cplusplus |
882 | } |
883 | #endif |
884 | |
885 | // Simple, standard interfaces that drop out completely if stats aren't enabled |
886 | |
/*!
 * \brief Adds value to specified timer (name).
 *
 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
 * @param value sample value to add to the statistics for the timer; the whole
 * expression is evaluated first and the result is then converted to double
 *
 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
 * a timer's statistics. The argument is parenthesized before the cast so that
 * an expression such as `a / b` is not turned into `(double)a / b`.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)(value))
900 | |
/*!
 * \brief Increments specified counter (name).
 *
 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
 *
 * \details Use KMP_COUNT_BLOCK(name) macro to increment a statistics
 * counter for the executing thread. The macro takes the counter name only;
 * it calls increment() on the counter reached through the calling thread's
 * __kmp_stats_thread_ptr.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_COUNT_BLOCK(name)                                                  \
  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
913 | |
/*!
 * \brief Outputs the current thread statistics and resets them.
 *
 * @param heading_string heading put above the final stats output
 *
 * \details Explicitly stops all timers and outputs all stats. The environment
 * variable `KMP_STATS_FILE=filename` can be used to output the stats to a
 * file instead of stderr. The environment variable `KMP_STATS_THREADS` can be
 * set to "on" to also print thread specific stats; if it is not set, thread
 * specific stats won't be printed. See the environment variable list in the
 * description of kmp_stats_output_module for the full set of controls. It
 * should be noted that all statistics are reset when this macro is called.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
932 | |
/*!
 * \brief Initializes the partitioned timers to begin with name.
 *
 * @param name timer which you want this thread to begin with
 *
 * \details Calls init() on the calling thread's partitionedTimers with an
 * explicitTimer built from the thread's TIMER_name statistic.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

/*!
 * \brief Declares a blockPartitionedTimer for timer name in the current scope.
 *
 * @param name timer to associate with the enclosing block
 *
 * \details Expands to the declaration of a local object with the fixed name
 * __PBLOCKTIME__, so it can appear at most once per scope and must be used
 * where a declaration is allowed. NOTE(review): the timer is presumably
 * pushed on construction and popped when the object goes out of scope --
 * confirm against blockPartitionedTimer.
 */
#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

/*!
 * \brief Calls push() on the calling thread's partitionedTimers with an
 * explicitTimer built from the thread's TIMER_name statistic.
 *
 * @param name timer to push
 */
#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

/*!
 * \brief Calls pop() on the calling thread's partitionedTimers. Takes no
 * arguments.
 */
#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()

/*!
 * \brief Calls exchange() on the calling thread's partitionedTimers with an
 * explicitTimer built from the thread's TIMER_name statistic.
 *
 * @param name timer handed to exchange()
 */
#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
960 | |
/*! \brief Sets the state stored in the calling thread's stats node. */
#define KMP_SET_THREAD_STATE(state_name)                                       \
  __kmp_stats_thread_ptr->setState(state_name)

/*! \brief Evaluates to the state stored in the calling thread's stats node. */
#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()

/*!
 * \brief Declares a blockThreadState object for state_name in the current
 * scope.
 *
 * \details The object is named __BTHREADSTATE__ and receives a pointer to the
 * calling thread's state field, so the macro can appear at most once per
 * scope. NOTE(review): presumably the previous state is restored when the
 * object is destroyed -- confirm against blockThreadState.
 */
#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
                                    state_name)
969 | |
/*!
 * \brief Resets all stats (counters to 0, timers to 0 elapsed ticks).
 *
 * \details Resets all stats for all threads by calling __kmp_reset_stats().
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_RESET_STATS() __kmp_reset_stats()
978 | |
// Developer variants of the statistics macros. They forward to the regular
// macros when KMP_DEVELOPER_STATS is non-zero and expand to no-ops otherwise.
#if (KMP_DEVELOPER_STATS)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
// KMP_POP_PARTITIONED_TIMER() takes no arguments; n is accepted only to keep
// the developer macro's signature and must not be forwarded.
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
  KMP_EXCHANGE_PARTITIONED_TIMER(n)
#else
// Null definitions
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#endif
996 | |
997 | #else // KMP_STATS_ENABLED |
998 | |
// Null definitions: with statistics disabled every statistics macro expands
// to a no-op expression and its arguments are never evaluated.
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// Counterpart of the definition in the stats-enabled branch, so that callers
// still build when statistics are disabled.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1019 | #endif // KMP_STATS_ENABLED |
1020 | |
1021 | #endif // KMP_STATS_H |
1022 | |