// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "benchmark_runner.h"

#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "internal_macros.h"

#ifndef BENCHMARK_OS_WINDOWS
#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
#include <unistd.h>
#endif

#include <algorithm>
#include <atomic>
#include <climits>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <thread>
#include <utility>

#include "check.h"
#include "colorprint.h"
#include "commandlineflags.h"
#include "complexity.h"
#include "counter.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"

namespace benchmark {

namespace internal {

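// Hook for the user-installed memory profiler (see
// benchmark::RegisterMemoryManager()); stays null when memory profiling is
// disabled.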
MemoryManager* memory_manager = nullptr;

namespace {

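// Hard upper bound on the iteration count of a single repetition;
// PredictNumItersNeeded() clamps its estimates to this value.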
static constexpr IterationCount kMaxIterations = 1000000000000;
const double kDefaultMinTime =
    std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);

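// Assembles a BenchmarkReporter::Run from the per-thread results accumulated
// by the ThreadManager, copying timing, complexity, counter and (optionally)
// memory-profiling data for one finished repetition.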
BenchmarkReporter::Run CreateRunReport(
    const benchmark::internal::BenchmarkInstance& b,
    const internal::ThreadManager::Result& results,
    IterationCount memory_iterations,
    const MemoryManager::Result* memory_result, double seconds,
    int64_t repetition_index, int64_t repeats) {
  // Create report about this benchmark run.
  BenchmarkReporter::Run report;

  report.run_name = b.name();
  report.family_index = b.family_index();
  report.per_family_instance_index = b.per_family_instance_index();
  report.skipped = results.skipped_;
  report.skip_message = results.skip_message_;
  report.report_label = results.report_label_;
  // This is the total iterations across all threads.
  report.iterations = results.iterations;
  report.time_unit = b.time_unit();
  report.threads = b.threads();
  report.repetition_index = repetition_index;
  report.repetitions = repeats;

  if (!report.skipped) {
    if (b.use_manual_time()) {
      report.real_accumulated_time = results.manual_time_used;
    } else {
      report.real_accumulated_time = results.real_time_used;
    }
    report.use_real_time_for_initial_big_o = b.use_manual_time();
    report.cpu_accumulated_time = results.cpu_time_used;
    report.complexity_n = results.complexity_n;
    report.complexity = b.complexity();
    report.complexity_lambda = b.complexity_lambda();
    report.statistics = &b.statistics();
    report.counters = results.counters;

    if (memory_iterations > 0) {
      assert(memory_result != nullptr);
      report.memory_result = memory_result;
      report.allocs_per_iter =
          memory_iterations ? static_cast<double>(memory_result->num_allocs) /
                                  static_cast<double>(memory_iterations)
                            : 0;
    }

    internal::Finish(&report.counters, results.iterations, seconds,
                     b.threads());
  }
  return report;
}

// Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
                 int thread_id, ThreadManager* manager,
                 PerfCountersMeasurement* perf_counters_measurement) {
  internal::ThreadTimer timer(
      b->measure_process_cpu_time()
          ? internal::ThreadTimer::CreateProcessCpuTime()
          : internal::ThreadTimer::Create());

  State st =
      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
  BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
      << "Benchmark returned before State::KeepRunning() returned false!";
  {
    MutexLock l(manager->GetBenchmarkMutex());
    internal::ThreadManager::Result& results = manager->results;
    results.iterations += st.iterations();
    results.cpu_time_used += timer.cpu_time_used();
    results.real_time_used += timer.real_time_used();
    results.manual_time_used += timer.manual_time_used();
    results.complexity_n += st.complexity_length_n();
    internal::Increment(&results.counters, st.counters);
  }
  manager->NotifyThreadComplete();
}

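// Resolves the effective minimum running time: a per-benchmark MinTime()
// setting wins; otherwise the value comes from the --benchmark_min_time flag
// (or the default, when that flag specified an iteration count instead).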
double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
                      const BenchTimeType& iters_or_time) {
  if (!IsZero(b.min_time())) return b.min_time();
  // If the flag was used to specify the number of iters, then return the
  // default min_time.
  if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime;

  return iters_or_time.time;
}

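// Resolves the explicit iteration count: a per-benchmark Iterations() setting
// wins; otherwise the count comes from a --benchmark_min_time=<integer>x flag
// value.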
IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
                            const BenchTimeType& iters_or_time) {
  if (b.iterations() != 0) return b.iterations();

  // We've already concluded that this flag is currently used to pass
  // iters, but double-check here anyway.
  BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS);
  return iters_or_time.iters;
}

}  // end namespace

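// Parses the --benchmark_min_time flag value. Two forms are accepted: an
// iteration count with an `x` suffix (e.g. `--benchmark_min_time=100x`), or a
// duration in seconds with an `s` suffix (e.g. `--benchmark_min_time=2.5s`).
// A bare number is treated as seconds for backwards compatibility, with a
// warning.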
BenchTimeType ParseBenchMinTime(const std::string& value) {
  BenchTimeType ret;

  if (value.empty()) {
    ret.tag = BenchTimeType::TIME;
    ret.time = 0.0;
    return ret;
  }

  if (value.back() == 'x') {
    char* p_end;
    // Reset errno before it's changed by strtol.
    errno = 0;
    IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);

    // After a valid parse, p_end should have been set to
    // point to the 'x' suffix.
    BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x')
        << "Malformed iters value passed to --benchmark_min_time: `" << value
        << "`. Expected --benchmark_min_time=<integer>x.";

    ret.tag = BenchTimeType::ITERS;
    ret.iters = num_iters;
    return ret;
  }

  bool has_suffix = value.back() == 's';
  if (!has_suffix) {
    BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. "
                  "E.g., `30s` for 30 seconds.";
  }

  char* p_end;
  // Reset errno before it's changed by strtod.
  errno = 0;
  double min_time = std::strtod(value.c_str(), &p_end);

  // After a successful parse, p_end should point to the suffix 's',
  // or the end of the string if the suffix was omitted.
  BM_CHECK(errno == 0 && p_end != nullptr &&
           ((has_suffix && *p_end == 's') || *p_end == '\0'))
      << "Malformed seconds value passed to --benchmark_min_time: `" << value
      << "`. Expected --benchmark_min_time=<float>s.";

  ret.tag = BenchTimeType::TIME;
  ret.time = min_time;

  return ret;
}

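// Resolves all run-time settings for one benchmark instance. Per-benchmark
// settings (MinTime(), Repetitions(), Iterations()) take precedence over the
// corresponding command-line flags; MinWarmUpTime() is honored only if
// MinTime() was also set on the benchmark itself.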
BenchmarkRunner::BenchmarkRunner(
    const benchmark::internal::BenchmarkInstance& b_,
    PerfCountersMeasurement* pcm_,
    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
    : b(b_),
      reports_for_family(reports_for_family_),
      parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
      min_time(ComputeMinTime(b_, parsed_benchtime_flag)),
      min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
                          ? b.min_warmup_time()
                          : FLAGS_benchmark_min_warmup_time),
      warmup_done(!(min_warmup_time > 0.0)),
      repeats(b.repetitions() != 0 ? b.repetitions()
                                   : FLAGS_benchmark_repetitions),
      has_explicit_iteration_count(b.iterations() != 0 ||
                                   parsed_benchtime_flag.tag ==
                                       BenchTimeType::ITERS),
      pool(b.threads() - 1),
      iters(has_explicit_iteration_count
                ? ComputeIters(b_, parsed_benchtime_flag)
                : 1),
      perf_counters_measurement_ptr(pcm_) {
  run_results.display_report_aggregates_only =
      (FLAGS_benchmark_report_aggregates_only ||
       FLAGS_benchmark_display_aggregates_only);
  run_results.file_report_aggregates_only =
      FLAGS_benchmark_report_aggregates_only;
  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
    run_results.display_report_aggregates_only =
        (b.aggregation_report_mode() &
         internal::ARM_DisplayReportAggregatesOnly);
    run_results.file_report_aggregates_only =
        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
             (perf_counters_measurement_ptr->num_counters() == 0))
        << "Perf counters were requested but could not be set up.";
  }
}

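// Runs the benchmark for the current `iters` iteration count, spreading the
// work across b.threads() threads, and returns the merged per-run
// measurements.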
BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
  BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";

  std::unique_ptr<internal::ThreadManager> manager;
  manager.reset(new internal::ThreadManager(b.threads()));

  // Run all but one thread in separate threads
  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
                           manager.get(), perf_counters_measurement_ptr);
  }
  // And run one thread here directly.
  // (If we were asked to run just one thread, we don't create new threads.)
  // Yes, we need to do this here *after* we start the separate threads.
  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);

  // The main thread has finished. Now let's wait for the other threads.
  manager->WaitForAllThreads();
  for (std::thread& thread : pool) thread.join();

  IterationResults i;
  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
  {
    MutexLock l(manager->GetBenchmarkMutex());
    i.results = manager->results;
  }

  // And get rid of the manager.
  manager.reset();

  // Adjust real/manual time stats since they were reported per thread.
  i.results.real_time_used /= b.threads();
  i.results.manual_time_used /= b.threads();
  // If we were measuring whole-process CPU usage, adjust the CPU time too.
  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();

  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
             << i.results.real_time_used << "\n";

  // By using KeepRunningBatch a benchmark can iterate more times than
  // requested, so take the iteration count from i.results.
  i.iters = i.results.iterations / b.threads();

  // Base decisions off of real time if requested by this benchmark.
  i.seconds = i.results.cpu_time_used;
  if (b.use_manual_time()) {
    i.seconds = i.results.manual_time_used;
  } else if (b.use_real_time()) {
    i.seconds = i.results.real_time_used;
  }

  return i;
}

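// Estimates the iteration count the next run needs in order to hit the
// minimum time. The estimate deliberately overshoots by 40% (the 1.4 factor
// below) so the follow-up run is likely to satisfy the time requirement on
// its first try.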
IterationCount BenchmarkRunner::PredictNumItersNeeded(
    const IterationResults& i) const {
  // Determine how much the iteration count should be increased by.
  // Note: Avoid division by zero with max(seconds, 1ns).
  double multiplier = GetMinTimeToApply() * 1.4 / std::max(i.seconds, 1e-9);
  // If our last run was at least 10% of FLAGS_benchmark_min_time then we
  // use the multiplier directly.
  // Otherwise we use at most 10 times expansion.
  // NOTE: When the last run was at least 10% of the min time the max
  // expansion should be 14x.
  const bool is_significant = (i.seconds / GetMinTimeToApply()) > 0.1;
  multiplier = is_significant ? multiplier : 10.0;

  // So what seems to be the sufficiently-large iteration count? Round up.
  const IterationCount max_next_iters = static_cast<IterationCount>(
      std::llround(std::max(multiplier * static_cast<double>(i.iters),
                            static_cast<double>(i.iters) + 1.0)));
  // But we do have *some* limits, though.
  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);

  BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
  return next_iters;
}

bool BenchmarkRunner::ShouldReportIterationResults(
    const IterationResults& i) const {
  // Determine if this run should be reported:
  // either it has run for a sufficient amount of time,
  // or an error was reported.
  return i.results.skipped_ ||
         i.iters >= kMaxIterations ||  // Too many iterations already.
         i.seconds >=
             GetMinTimeToApply() ||  // The elapsed time is large enough.
         // CPU time is specified but the elapsed real time greatly exceeds
         // the minimum time.
         // Note that user-provided timers are exempt from this test.
         ((i.results.real_time_used >= 5 * GetMinTimeToApply()) &&
          !b.use_manual_time());
}

double BenchmarkRunner::GetMinTimeToApply() const {
  // In order to reuse the same machinery to run and measure both the warmup
  // phase and the benchmark itself, we need a way of telling whether to apply
  // min_time or min_warmup_time. This function figures out whether we are
  // still in the warmup phase (and therefore need to apply min_warmup_time)
  // or are already in the benchmarking phase (where min_time applies).
  return warmup_done ? min_time : min_warmup_time;
}

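// Marks the warmup phase as finished and sets the iteration count with which
// the first measured run should start.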
void BenchmarkRunner::FinishWarmUp(const IterationCount& i) {
  warmup_done = true;
  iters = i;
}

void BenchmarkRunner::RunWarmUp() {
  // Use the same mechanisms for warming up the benchmark as used for actually
  // running and measuring the benchmark.
  IterationResults i_warmup;
  // Don't use the iterations determined in the warmup phase for the actual
  // measured benchmark phase. While this may be a good starting point for the
  // benchmark (and would remove the need to figure out how many iterations
  // are needed if min_time is set), it may also be a completely wrong guess,
  // since the warmup loops might be considerably slower (e.g. because of
  // caching effects).
  const IterationCount i_backup = iters;

  for (;;) {
    b.Setup();
    i_warmup = DoNIterations();
    b.Teardown();

    const bool finish = ShouldReportIterationResults(i_warmup);

    if (finish) {
      FinishWarmUp(i_backup);
      break;
    }

    // Although this is "only" a warmup phase, where running enough iterations
    // at once without measuring time isn't as important as it is for the
    // benchmarking phase, we still grow the iteration count the same way;
    // otherwise it would be very confusing for the user to choose a proper
    // value for min_warmup_time.
    iters = PredictNumItersNeeded(i_warmup);
    assert(iters > i_warmup.iters &&
           "if we did more iterations than we want to do the next time, "
           "then we should have accepted the current iteration run.");
  }
}

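// Runs one repetition of the benchmark: performs the warmup phase if it is
// still pending, grows the iteration count until the results are significant
// (on the first repetition only), optionally collects memory measurements,
// and records the resulting report.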
void BenchmarkRunner::DoOneRepetition() {
  assert(HasRepeatsRemaining() && "Already done all repetitions?");

  const bool is_the_first_repetition = num_repetitions_done == 0;

  // In case a warmup phase is requested by the benchmark, run it now.
  // After running the warmup phase the BenchmarkRunner should be in a state
  // as if this warmup never happened, except that warmup_done is set. Every
  // other manipulation of the BenchmarkRunner instance would be a bug! Please
  // fix it.
  if (!warmup_done) RunWarmUp();

  IterationResults i;
  // We *may* be gradually increasing the length (iteration count)
  // of the benchmark until we decide the results are significant.
  // And once we do, we report those last results and exit.
  // Please do note that if there are repetitions, the iteration count
  // is *only* calculated for the *first* repetition, and other repetitions
  // simply use that precomputed iteration count.
  for (;;) {
    b.Setup();
    i = DoNIterations();
    b.Teardown();

    // Do we consider the results to be significant?
    // If we are doing repetitions, and the first repetition was already done,
    // it has calculated the correct iteration time, so we have run that very
    // iteration count just now. No need to calculate anything. Just report.
    // Else, the normal rules apply.
    const bool results_are_significant = !is_the_first_repetition ||
                                         has_explicit_iteration_count ||
                                         ShouldReportIterationResults(i);

    if (results_are_significant) break;  // Good, let's report them!

    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
    // iteration count, and run the benchmark again...

    iters = PredictNumItersNeeded(i);
    assert(iters > i.iters &&
           "if we did more iterations than we want to do the next time, "
           "then we should have accepted the current iteration run.");
  }

  // Oh, one last thing, we need to also produce the 'memory measurements'..
  MemoryManager::Result* memory_result = nullptr;
  IterationCount memory_iterations = 0;
  if (memory_manager != nullptr) {
    // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
    // optional so we don't have to own the Result here.
    // Can't do it now due to cxx03.
    memory_results.push_back(MemoryManager::Result());
    memory_result = &memory_results.back();
    // Only run a few iterations to reduce the impact of one-time
    // allocations in benchmarks that are not properly managed.
    memory_iterations = std::min<IterationCount>(16, iters);
    memory_manager->Start();
    std::unique_ptr<internal::ThreadManager> manager;
    manager.reset(new internal::ThreadManager(1));
    b.Setup();
    RunInThread(&b, memory_iterations, 0, manager.get(),
                perf_counters_measurement_ptr);
    manager->WaitForAllThreads();
    manager.reset();
    b.Teardown();
    memory_manager->Stop(*memory_result);
  }

  // Ok, now actually report.
  BenchmarkReporter::Run report =
      CreateRunReport(b, i.results, memory_iterations, memory_result,
                      i.seconds, num_repetitions_done, repeats);

  if (reports_for_family) {
    ++reports_for_family->num_runs_done;
    if (!report.skipped) reports_for_family->Runs.push_back(report);
  }

  run_results.non_aggregates.push_back(report);

  ++num_repetitions_done;
}

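// Finalizes this benchmark instance: computes the aggregate statistics
// (e.g. mean, median, stddev) over all recorded repetitions and moves the
// accumulated results out to the caller.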
RunResults&& BenchmarkRunner::GetResults() {
  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");

  // Calculate additional statistics over the repetitions of this instance.
  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);

  return std::move(run_results);
}

}  // end namespace internal

}  // end namespace benchmark