1 | #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
2 | #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
3 | |
4 | #include "benchmarks/gpu/BenchmarkLogger.h" |
5 | #include "benchmarks/gpu/timing/timing.h" |
6 | #include "src/__support/CPP/array.h" |
7 | #include "src/__support/CPP/functional.h" |
8 | #include "src/__support/CPP/limits.h" |
9 | #include "src/__support/CPP/string_view.h" |
10 | #include "src/__support/CPP/type_traits.h" |
11 | #include "src/__support/FPUtil/FPBits.h" |
12 | #include "src/__support/macros/config.h" |
13 | #include "src/stdlib/rand.h" |
14 | #include "src/time/clock.h" |
15 | |
16 | #include <stdint.h> |
17 | |
18 | namespace LIBC_NAMESPACE_DECL { |
19 | |
20 | namespace benchmarks { |
21 | |
// Tunable limits passed to benchmark(). Only the default values live here;
// how each limit is applied is up to benchmark(), which this header merely
// declares.
struct BenchmarkOptions {
  // Iteration-count settings.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10'000'000};

  // Sample-count settings.
  uint32_t min_samples{4};
  uint32_t max_samples{1'000};

  // Durations, expressed in nanoseconds.
  int64_t min_duration{500'000};       // 500 us
  int64_t max_duration{1'000'000'000}; // 1 second

  double epsilon{1e-4};
  double scaling_factor{1.4};
};
33 | |
// One timing sample. As consumed by RefinableRuntimeEstimation::update(),
// elapsed_cycles is the cycle count for all `iterations` together.
struct Measurement {
  uint32_t iterations = 0;
  uint64_t elapsed_cycles = 0;
};

// Running cycles-per-iteration estimate that is refined by every measurement
// folded into it.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept 64 bits wide: the default BenchmarkOptions allow up to 1000 samples
  // of up to 10,000,000 iterations each, and that sum does not fit in 32 bits.
  uint64_t total_iterations = 0;

public:
  // Adds M to the running totals and returns total cycles divided by total
  // iterations (integer division). Returns 0 while no iterations have been
  // recorded, instead of dividing by zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};
50 | |
51 | // Tracks the progression of the runtime estimation |
52 | class RuntimeEstimationProgression { |
53 | RefinableRuntimeEstimation rre; |
54 | |
55 | public: |
56 | uint64_t current_estimation = 0; |
57 | |
58 | double compute_improvement(const Measurement &M) { |
59 | const uint64_t new_estimation = rre.update(M); |
60 | double ratio = |
61 | (static_cast<double>(current_estimation) / new_estimation) - 1.0; |
62 | |
63 | // Get absolute value |
64 | if (ratio < 0) |
65 | ratio *= -1; |
66 | |
67 | current_estimation = new_estimation; |
68 | return ratio; |
69 | } |
70 | }; |
71 | |
// Aggregate statistics returned by benchmark(). Every field starts at zero
// except `min`, which starts at the largest uint64_t value.
struct BenchmarkResult {
  uint64_t cycles{0};
  double standard_deviation{0.0};
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
  uint32_t samples{0};
  uint32_t total_iterations{0};
  clock_t total_time{0};
};
81 | |
82 | BenchmarkResult benchmark(const BenchmarkOptions &options, |
83 | cpp::function<uint64_t(void)> wrapper_func); |
84 | |
85 | class Benchmark { |
86 | const cpp::function<uint64_t(void)> func; |
87 | const cpp::string_view suite_name; |
88 | const cpp::string_view test_name; |
89 | const uint32_t num_threads; |
90 | |
91 | public: |
92 | Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name, |
93 | char const *test_name, uint32_t num_threads) |
94 | : func(func), suite_name(suite_name), test_name(test_name), |
95 | num_threads(num_threads) { |
96 | add_benchmark(benchmark: this); |
97 | } |
98 | |
99 | static void run_benchmarks(); |
100 | const cpp::string_view get_suite_name() const { return suite_name; } |
101 | const cpp::string_view get_test_name() const { return test_name; } |
102 | |
103 | protected: |
104 | static void add_benchmark(Benchmark *benchmark); |
105 | |
106 | private: |
107 | BenchmarkResult run() { |
108 | BenchmarkOptions options; |
109 | return benchmark(options, func); |
110 | } |
111 | }; |
112 | |
113 | // We want our random values to be approximately |
114 | // Output: a random number with the exponent field between min_exp and max_exp, |
115 | // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), |
116 | // Caveats: |
117 | // -EXP_BIAS corresponding to denormal values, |
118 | // EXP_BIAS + 1 corresponding to inf or nan. |
119 | template <typename T> |
120 | static T |
121 | get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS, |
122 | int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) { |
123 | using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>; |
124 | |
125 | // Required to correctly instantiate FPBits for floats and doubles. |
126 | using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>), |
127 | uint64_t, uint32_t>; |
128 | RandType bits; |
129 | if constexpr (cpp::is_same_v<T, uint64_t>) |
130 | bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) | |
131 | static_cast<uint64_t>(LIBC_NAMESPACE::rand()); |
132 | else |
133 | bits = LIBC_NAMESPACE::rand(); |
134 | double scale = |
135 | static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); |
136 | FPBits fp(bits); |
137 | fp.set_biased_exponent( |
138 | static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp)); |
139 | return fp.get_val(); |
140 | } |
141 | |
142 | template <typename T> class MathPerf { |
143 | using FPBits = fputil::FPBits<T>; |
144 | using StorageType = typename FPBits::StorageType; |
145 | static constexpr StorageType UIntMax = |
146 | cpp::numeric_limits<StorageType>::max(); |
147 | |
148 | public: |
149 | template <size_t N = 1> |
150 | static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) { |
151 | cpp::array<T, N> inputs; |
152 | for (size_t i = 0; i < N; ++i) |
153 | inputs[i] = get_rand_input<T>(min_exp, max_exp); |
154 | |
155 | uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); |
156 | |
157 | return total_time / N; |
158 | } |
159 | |
160 | // Throughput benchmarking for functions that take 2 inputs. |
161 | template <size_t N = 1> |
162 | static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, |
163 | int arg1_max_exp, int arg2_min_exp, |
164 | int arg2_max_exp) { |
165 | cpp::array<T, N> inputs1; |
166 | cpp::array<T, N> inputs2; |
167 | for (size_t i = 0; i < N; ++i) { |
168 | inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp); |
169 | inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp); |
170 | } |
171 | |
172 | uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); |
173 | |
174 | return total_time / N; |
175 | } |
176 | }; |
177 | |
178 | } // namespace benchmarks |
179 | } // namespace LIBC_NAMESPACE_DECL |
180 | |
// Declares a Benchmark instance named <SuiteName>_<TestName>_Instance.
// The thread count is -1 converted to uint32_t, which indicates the benchmark
// should be run with as many threads as allocated by the user in the
// benchmark's CMake.
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, static_cast<uint32_t>(-1))

// Same as BENCHMARK, but with an explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, NumThreads)

// Benchmark with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// Benchmark with a thread count of LIBC_NAMESPACE::gpu::get_lane_size().
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())
197 | #endif |
198 | |