#include <random>
#include <thread>

#include "../src/perf_counters.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

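// Older versions of googletest predate GTEST_SKIP. This fallback keeps the
// tests building there: the macro streams the skip message to stdout,
// discards the resulting ostream via MsgHandler::operator=, and returns from
// the test body early, approximating a real skip.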
#ifndef GTEST_SKIP
struct MsgHandler {
  void operator=(std::ostream&) {}
};
#define GTEST_SKIP() return MsgHandler() = std::cout
#endif

using benchmark::internal::PerfCounters;
using benchmark::internal::PerfCountersMeasurement;
using benchmark::internal::PerfCounterValues;
using ::testing::AllOf;
using ::testing::Gt;
using ::testing::Lt;

namespace {
const char kGenericPerfEvent1[] = "CYCLES";
const char kGenericPerfEvent2[] = "INSTRUCTIONS";

TEST(PerfCountersTest, Init) {
  EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
}

TEST(PerfCountersTest, OneCounter) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Performance counters not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
}

TEST(PerfCountersTest, NegativeTest) {
  if (!PerfCounters::kSupported) {
    EXPECT_FALSE(PerfCounters::Initialize());
    return;
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  // Safety checks
  // Create() always returns a valid object, even when passed no arguments
  // or invalid ones; the behavior is to warn about and drop unsupported
  // counters.
  EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
  EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
  EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
  {
    // Try sneaking in a bad egg to see if it is filtered out. The
    // number of counters has to be two, not zero
    auto counter =
        PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
    EXPECT_EQ(counter.num_counters(), 2);
    EXPECT_EQ(counter.names(), std::vector<std::string>(
                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
  }
  {
    // Try sneaking in an outrageous counter, like a fat finger mistake
    auto counter = PerfCounters::Create(
        {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1});
    EXPECT_EQ(counter.num_counters(), 2);
    EXPECT_EQ(counter.names(), std::vector<std::string>(
                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
  }
  {
    // Finally try a golden input - it should accept both of them
    EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2})
                  .num_counters(),
              2);
  }
  {
    // Add a bad apple at the end of the chain to check the edges
    auto counter = PerfCounters::Create(
        {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"});
    EXPECT_EQ(counter.num_counters(), 2);
    EXPECT_EQ(counter.names(), std::vector<std::string>(
                                   {kGenericPerfEvent1, kGenericPerfEvent2}));
  }
}

TEST(PerfCountersTest, Read1Counter) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  auto counters = PerfCounters::Create({kGenericPerfEvent1});
  EXPECT_EQ(counters.num_counters(), 1);
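  // A PerfCounterValues is constructed with the number of values it holds;
  // Snapshot() then fills in one reading per configured counter.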
  PerfCounterValues values1(1);
  EXPECT_TRUE(counters.Snapshot(&values1));
  EXPECT_GT(values1[0], 0);
  PerfCounterValues values2(1);
  EXPECT_TRUE(counters.Snapshot(&values2));
  EXPECT_GT(values2[0], 0);
  EXPECT_GT(values2[0], values1[0]);
}

TEST(PerfCountersTest, Read2Counters) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  auto counters =
      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
  EXPECT_EQ(counters.num_counters(), 2);
  PerfCounterValues values1(2);
  EXPECT_TRUE(counters.Snapshot(&values1));
  EXPECT_GT(values1[0], 0);
  EXPECT_GT(values1[1], 0);
  PerfCounterValues values2(2);
  EXPECT_TRUE(counters.Snapshot(&values2));
  EXPECT_GT(values2[0], 0);
  EXPECT_GT(values2[1], 0);
}

TEST(PerfCountersTest, ReopenExistingCounters) {
  // This test works on recent and old Intel hardware, Pixel 3, and Pixel 6.
  // However, we cannot assume more than 2 HW counters due to Pixel 6.
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  std::vector<std::string> kMetrics({kGenericPerfEvent1});
  std::vector<PerfCounters> counters(2);
  for (auto& counter : counters) {
    counter = PerfCounters::Create(kMetrics);
  }
  PerfCounterValues values(1);
  EXPECT_TRUE(counters[0].Snapshot(&values));
  EXPECT_TRUE(counters[1].Snapshot(&values));
}

TEST(PerfCountersTest, CreateExistingMeasurements) {
  // Like ReopenExistingCounters above, this test relies on current hardware
  // capabilities (a small number of HW counters, around 2) to make reads
  // fail once too many measurements are open at the same time.
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  // This means we will try 10 counters, but at this time we can only
  // guarantee that at least 2 will work. Perhaps in the future we could use
  // libpfm to query the hardware limits on this particular platform.
  const int kMaxCounters = 10;
  const int kMinValidCounters = 2;

  // Let's use a ubiquitous counter that is guaranteed to work
  // on all platforms
  const std::vector<std::string> kMetrics{"cycles"};

  // Cannot create a vector of actual objects because the copy constructor
  // of PerfCounters is deleted, and so the one on PerfCountersMeasurement
  // is implicitly deleted too.
  std::vector<std::unique_ptr<PerfCountersMeasurement>>
      perf_counter_measurements;

  perf_counter_measurements.reserve(kMaxCounters);
  for (int j = 0; j < kMaxCounters; ++j) {
    perf_counter_measurements.emplace_back(
        new PerfCountersMeasurement(kMetrics));
  }

  std::vector<std::pair<std::string, double>> measurements;

  // Start all counters together to see how many can be open at once
  size_t max_counters = kMaxCounters;
  for (size_t i = 0; i < kMaxCounters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    EXPECT_EQ(counter.num_counters(), 1);
    if (!counter.Start()) {
      max_counters = i;
      break;
    }
  }

  ASSERT_GE(max_counters, kMinValidCounters);

  // Stop all of them together; only the first kMinValidCounters are
  // required to succeed.
  for (size_t i = 0; i < max_counters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
  }

  // Start/stop individually
  for (size_t i = 0; i < max_counters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    measurements.clear();
    counter.Start();
    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
  }
}

// We try to do some meaningful work here, but the compiler
// insists on optimizing away our loop, so we had to add a
// no-optimize macro. In case it fails, we added some entropy
// to this pool as well.

BENCHMARK_DONT_OPTIMIZE size_t do_work() {
  static std::mt19937 rd{std::random_device{}()};
  static std::uniform_int_distribution<size_t> mrand(0, 10);
  const size_t kNumLoops = 1000000;
  size_t sum = 0;
  for (size_t j = 0; j < kNumLoops; ++j) {
    sum += mrand(rd);
  }
  benchmark::DoNotOptimize(sum);
  return sum;
}

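// Runs do_work() on `threadcount` threads, taking a counter snapshot right
// after the threads are launched and another after they have all joined, so
// the deltas cover the threads' combined work.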
void measure(size_t threadcount, PerfCounterValues* before,
             PerfCounterValues* after) {
  BM_CHECK_NE(before, nullptr);
  BM_CHECK_NE(after, nullptr);
  std::vector<std::thread> threads(threadcount);
  auto work = [&]() { BM_CHECK(do_work() > 1000); };

  // We need to first set up the counters, then start the threads, so that
  // the threads inherit the counters. But later, we need to first destroy
  // the thread pool (so all the work finishes), then measure the counters.
  // So the scopes overlap, and we need to explicitly control the scope of
  // the threadpool.
  auto counters =
      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
  for (auto& t : threads) t = std::thread(work);
  counters.Snapshot(before);
  for (auto& t : threads) t.join();
  counters.Snapshot(after);
}

TEST(PerfCountersTest, MultiThreaded) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  PerfCounterValues before(2);
  PerfCounterValues after(2);

  // Notice that this test will work even if we taskset it to a single CPU;
  // in that case the threads will simply run sequentially.
  // Start two threads and measure the combined number of cycles and
  // instructions.
  measure(2, &before, &after);
  std::vector<double> Elapsed2Threads{
      static_cast<double>(after[0] - before[0]),
      static_cast<double>(after[1] - before[1])};

  // Start four threads and measure the combined number of cycles and
  // instructions.
  measure(4, &before, &after);
  std::vector<double> Elapsed4Threads{
      static_cast<double>(after[0] - before[0]),
      static_cast<double>(after[1] - before[1])};

  // The naive expectation (that 4 threads cost measurably more than 2) fails
  // at least on a beefy workstation with lots of cpus; it seems that in some
  // circumstances the runtime with 4 threads can even be better than with 2.
  // So instead of expecting 4 threads to be slower, let's just make sure the
  // two runs do not differ too much in general (one is not more than 10x
  // the other).
  EXPECT_THAT(Elapsed4Threads[0] / Elapsed2Threads[0], AllOf(Gt(0.1), Lt(10)));
  EXPECT_THAT(Elapsed4Threads[1] / Elapsed2Threads[1], AllOf(Gt(0.1), Lt(10)));
}

TEST(PerfCountersTest, HardwareLimits) {
  // Like ReopenExistingCounters above, this test relies on current hardware
  // capabilities (a small number of HW counters, around 3-4) to make reads
  // fail once the limit is exceeded.
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());


  // Taken from `perf list`, but focuses only on those HW events that were
  // actually reported when running `sudo perf stat -a sleep 10`, intersected
  // over several platforms. All HW events listed by the first command but not
  // reported by the second seem not to work. This is sad, as we don't really
  // get to test the grouping here (groups can contain up to 6 members)...
  std::vector<std::string> counter_names{
      "cycles",         // leader
      "instructions",   //
      "branch-misses",  //
  };

  // In the off chance that some of these counters are not supported,
  // we filter them out so the test will complete without failure,
  // although it might not actually test the grouping on that platform.
  std::vector<std::string> valid_names;
  for (const std::string& name : counter_names) {
    if (PerfCounters::IsCounterSupported(name)) {
      valid_names.push_back(name);
    }
  }
  PerfCountersMeasurement counter(valid_names);

  std::vector<std::pair<std::string, double>> measurements;

  counter.Start();
  EXPECT_TRUE(counter.Stop(measurements));
}

}  // namespace