| 1 | #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| 2 | #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| 3 | |
| 4 | #include "benchmarks/gpu/BenchmarkLogger.h" |
| 5 | #include "benchmarks/gpu/timing/timing.h" |
| 6 | #include "src/__support/CPP/array.h" |
| 7 | #include "src/__support/CPP/functional.h" |
| 8 | #include "src/__support/CPP/limits.h" |
| 9 | #include "src/__support/CPP/string_view.h" |
| 10 | #include "src/__support/CPP/type_traits.h" |
| 11 | #include "src/__support/FPUtil/FPBits.h" |
| 12 | #include "src/__support/macros/config.h" |
| 13 | #include "src/stdlib/rand.h" |
| 14 | #include "src/time/clock.h" |
| 15 | |
| 16 | #include <stdint.h> |
| 17 | |
| 18 | namespace LIBC_NAMESPACE_DECL { |
| 19 | |
| 20 | namespace benchmarks { |
| 21 | |
// Knobs that bound a benchmark run. Every field carries a default, so a
// default-constructed instance is directly usable.
struct BenchmarkOptions {
  // Iteration-count limits.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10000000};

  // Sample-count limits.
  uint32_t min_samples{4};
  uint32_t max_samples{1000};

  // Duration limits, expressed in nanoseconds.
  int64_t min_duration{500 * 1000};         // 500 * 1000 ns = 500 us
  int64_t max_duration{1000 * 1000 * 1000}; // 1e9 ns = 1 second

  // NOTE(review): presumably the convergence threshold and the iteration
  // growth factor used by benchmark(); confirm against its definition.
  double epsilon{0.0001};
  double scaling_factor{1.4};
};
| 33 | |
// One timing sample: the number of iterations that were executed and the
// number of cycles they took in total.
struct Measurement {
  uint32_t iterations = 0;
  uint64_t elapsed_cycles = 0;
};

// Running average of the per-iteration cost, refined by every measurement
// folded into it.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept in 64 bits: summing many 32-bit iteration counts can exceed
  // UINT32_MAX, and a wrapped total would corrupt (or zero) the divisor.
  uint64_t total_iterations = 0;

public:
  // Adds M to the running totals and returns the average number of cycles per
  // iteration (integer division). Returns 0 while no iteration has been
  // recorded, rather than dividing by zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};

// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation rre;

public:
  // Most recent per-iteration estimate; 0 until the first measurement.
  uint64_t current_estimation = 0;

  // Folds M into the estimate and returns the relative change
  // |previous / new - 1| between the previous and the new estimate.
  //
  // The very first non-zero estimate therefore reports 1.0. When the new
  // estimate is 0 the quotient is undefined, so the result is pinned instead:
  // 0.0 if the previous estimate was also 0 (nothing changed), otherwise 1.0
  // (mirroring the zero -> non-zero case). This keeps NaN/inf out of the
  // result, which callers compare against a threshold.
  double compute_improvement(const Measurement &M) {
    const uint64_t new_estimation = rre.update(M);
    const uint64_t previous_estimation = current_estimation;
    current_estimation = new_estimation;

    if (new_estimation == 0)
      return previous_estimation == 0 ? 0.0 : 1.0;

    const double ratio = static_cast<double>(previous_estimation) /
                             static_cast<double>(new_estimation) -
                         1.0;

    // Get absolute value
    return ratio < 0 ? -ratio : ratio;
  }
};
| 71 | |
// Aggregated outcome of one benchmark run. A default-constructed result holds
// neutral values: `min` starts at the largest representable value and `max`
// at zero, so either can be tightened by direct comparison.
struct BenchmarkResult {
  uint64_t cycles{0};
  double standard_deviation{0.0};
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
  uint32_t samples{0};
  uint32_t total_iterations{0};
  clock_t total_time{0};
};
| 81 | |
| 82 | BenchmarkResult benchmark(const BenchmarkOptions &options, |
| 83 | cpp::function<uint64_t(void)> wrapper_func); |
| 84 | |
| 85 | class Benchmark { |
| 86 | const cpp::function<uint64_t(void)> func; |
| 87 | const cpp::string_view suite_name; |
| 88 | const cpp::string_view test_name; |
| 89 | const uint32_t num_threads; |
| 90 | |
| 91 | public: |
| 92 | Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name, |
| 93 | char const *test_name, uint32_t num_threads) |
| 94 | : func(func), suite_name(suite_name), test_name(test_name), |
| 95 | num_threads(num_threads) { |
| 96 | add_benchmark(benchmark: this); |
| 97 | } |
| 98 | |
| 99 | static void run_benchmarks(); |
| 100 | const cpp::string_view get_suite_name() const { return suite_name; } |
| 101 | const cpp::string_view get_test_name() const { return test_name; } |
| 102 | |
| 103 | protected: |
| 104 | static void add_benchmark(Benchmark *benchmark); |
| 105 | |
| 106 | private: |
| 107 | BenchmarkResult run() { |
| 108 | BenchmarkOptions options; |
| 109 | return benchmark(options, func); |
| 110 | } |
| 111 | }; |
| 112 | |
| 113 | // We want our random values to be approximately |
| 114 | // Output: a random number with the exponent field between min_exp and max_exp, |
| 115 | // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), |
| 116 | // Caveats: |
| 117 | // -EXP_BIAS corresponding to denormal values, |
| 118 | // EXP_BIAS + 1 corresponding to inf or nan. |
| 119 | template <typename T> |
| 120 | static T |
| 121 | get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS, |
| 122 | int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) { |
| 123 | using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>; |
| 124 | |
| 125 | // Required to correctly instantiate FPBits for floats and doubles. |
| 126 | using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>), |
| 127 | uint64_t, uint32_t>; |
| 128 | RandType bits; |
| 129 | if constexpr (cpp::is_same_v<T, uint64_t>) |
| 130 | bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) | |
| 131 | static_cast<uint64_t>(LIBC_NAMESPACE::rand()); |
| 132 | else |
| 133 | bits = LIBC_NAMESPACE::rand(); |
| 134 | double scale = |
| 135 | static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); |
| 136 | FPBits fp(bits); |
| 137 | fp.set_biased_exponent( |
| 138 | static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp)); |
| 139 | return fp.get_val(); |
| 140 | } |
| 141 | |
| 142 | template <typename T> class MathPerf { |
| 143 | using FPBits = fputil::FPBits<T>; |
| 144 | using StorageType = typename FPBits::StorageType; |
| 145 | static constexpr StorageType UIntMax = |
| 146 | cpp::numeric_limits<StorageType>::max(); |
| 147 | |
| 148 | public: |
| 149 | template <size_t N = 1> |
| 150 | static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) { |
| 151 | cpp::array<T, N> inputs; |
| 152 | for (size_t i = 0; i < N; ++i) |
| 153 | inputs[i] = get_rand_input<T>(min_exp, max_exp); |
| 154 | |
| 155 | uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); |
| 156 | |
| 157 | return total_time / N; |
| 158 | } |
| 159 | |
| 160 | // Throughput benchmarking for functions that take 2 inputs. |
| 161 | template <size_t N = 1> |
| 162 | static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, |
| 163 | int arg1_max_exp, int arg2_min_exp, |
| 164 | int arg2_max_exp) { |
| 165 | cpp::array<T, N> inputs1; |
| 166 | cpp::array<T, N> inputs2; |
| 167 | for (size_t i = 0; i < N; ++i) { |
| 168 | inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp); |
| 169 | inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp); |
| 170 | } |
| 171 | |
| 172 | uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); |
| 173 | |
| 174 | return total_time / N; |
| 175 | } |
| 176 | }; |
| 177 | |
| 178 | } // namespace benchmarks |
| 179 | } // namespace LIBC_NAMESPACE_DECL |
| 180 | |
// Declares a Benchmark object named <Suite>_<Test>_Instance that times Fn.
// The thread count is passed as -1, which indicates the benchmark should be
// run with as many threads as allocated by the user in the benchmark's CMake.
#define BENCHMARK(Suite, Test, Fn)                                             \
  LIBC_NAMESPACE::benchmarks::Benchmark Suite##_##Test##_Instance(             \
      Fn, #Suite, #Test, -1)

// Same as BENCHMARK, but with an explicit thread count.
#define BENCHMARK_N_THREADS(Suite, Test, Fn, Threads)                          \
  LIBC_NAMESPACE::benchmarks::Benchmark Suite##_##Test##_Instance(             \
      Fn, #Suite, #Test, Threads)

// Declares a benchmark with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(Suite, Test, Fn)                             \
  BENCHMARK_N_THREADS(Suite, Test, Fn, 1)

// Declares a benchmark whose thread count is the value returned by
// LIBC_NAMESPACE::gpu::get_lane_size().
#define SINGLE_WAVE_BENCHMARK(Suite, Test, Fn)                                 \
  BENCHMARK_N_THREADS(Suite, Test, Fn,                                         \
                      LIBC_NAMESPACE::gpu::get_lane_size())
| 197 | #endif |
| 198 | |