LibcGpuBenchmark.h source code [libc/benchmarks/gpu/LibcGpuBenchmark.h]

1	#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
2	#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
3
4	#include "benchmarks/gpu/BenchmarkLogger.h"
5	#include "benchmarks/gpu/timing/timing.h"
6	#include "src/__support/CPP/array.h"
7	#include "src/__support/CPP/functional.h"
8	#include "src/__support/CPP/limits.h"
9	#include "src/__support/CPP/string_view.h"
10	#include "src/__support/CPP/type_traits.h"
11	#include "src/__support/FPUtil/FPBits.h"
12	#include "src/__support/macros/config.h"
13	#include "src/stdlib/rand.h"
14	#include "src/time/clock.h"
15
16	#include <stdint.h>
17
18	namespace LIBC_NAMESPACE_DECL {
19
20	namespace benchmarks {
21
// Tunable limits for a benchmark run. Every field has a default, so a
// default-constructed instance is a complete configuration.
// NOTE(review): how each limit is applied is decided by benchmark(), which is
// only declared in this header -- confirm against its definition.
struct BenchmarkOptions {
  // Iteration-count limits.
  uint32_t initial_iterations = 1;
  uint32_t min_iterations = 1;
  uint32_t max_iterations = 10000000;
  // Sample-count limits.
  uint32_t min_samples = 4;
  uint32_t max_samples = 1000;
  // Duration limits, expressed in nanoseconds.
  int64_t min_duration = 500 * 1000;         // 500 * 1000 nanoseconds = 500 us
  int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second
  // NOTE(review): presumably the convergence threshold compared against
  // RuntimeEstimationProgression::compute_improvement() -- confirm.
  double epsilon = 0.0001;
  // NOTE(review): presumably the growth factor applied to the iteration count
  // between samples -- confirm.
  double scaling_factor = 1.4;
};
33
// One timing sample: how many iterations were executed and how many cycles
// they took in total.
struct Measurement {
  uint32_t iterations = 0;
  uint64_t elapsed_cycles = 0;
};

// Running average of cycles per iteration over every Measurement seen so far.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept 64-bit wide: each Measurement contributes up to UINT32_MAX
  // iterations, so a 32-bit accumulator could wrap after a few large samples.
  uint64_t total_iterations = 0;

public:
  // Folds M into the running totals and returns the refined estimate
  // (total cycles / total iterations, truncated). Returns 0 while no
  // iterations have been recorded, instead of dividing by zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};

// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation rre;

public:
  // Most recent estimate produced by compute_improvement(); 0 before the
  // first call.
  uint64_t current_estimation = 0;

  // Feeds M into the running estimate and returns the relative change
  // |previous / new - 1| between the previous and the new estimate. The new
  // estimate is stored in current_estimation.
  double compute_improvement(const Measurement &M) {
    const uint64_t previous_estimation = current_estimation;
    const uint64_t new_estimation = rre.update(M);
    current_estimation = new_estimation;

    // Identical estimates mean no change. This also covers the 0 / 0 case,
    // which would otherwise yield NaN and make every later comparison against
    // a threshold false.
    if (new_estimation == previous_estimation)
      return 0.0;

    // The estimate dropped to zero: previous / new is undefined, so report
    // the change relative to the previous estimate instead (a full 100%).
    if (new_estimation == 0)
      return 1.0;

    const double ratio = static_cast<double>(previous_estimation) /
                             static_cast<double>(new_estimation) -
                         1.0;

    // Get absolute value
    return ratio < 0 ? -ratio : ratio;
  }
};
71
// Summary returned by benchmark(). All counters start at zero except `min`,
// which starts at UINT64_MAX.
// NOTE(review): presumably min/max are seeded this way so the first sample
// replaces both, and `cycles` is a per-iteration figure -- confirm against
// the definition of benchmark().
struct BenchmarkResult {
  uint64_t cycles = 0;
  double standard_deviation = 0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;
  uint32_t samples = 0;
  uint32_t total_iterations = 0;
  clock_t total_time = 0;
};
81
82	BenchmarkResult benchmark(const BenchmarkOptions &options,
83	cpp::function<uint64_t(void)> wrapper_func);
84
85	class Benchmark {
86	const cpp::function<uint64_t(void)> func;
87	const cpp::string_view suite_name;
88	const cpp::string_view test_name;
89	const uint32_t num_threads;
90
91	public:
92	Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
93	char const *test_name, uint32_t num_threads)
94	: func(func), suite_name(suite_name), test_name(test_name),
95	num_threads(num_threads) {
96	add_benchmark(benchmark: this);
97	}
98
99	static void run_benchmarks();
100	const cpp::string_view get_suite_name() const { return suite_name; }
101	const cpp::string_view get_test_name() const { return test_name; }
102
103	protected:
104	static void add_benchmark(Benchmark *benchmark);
105
106	private:
107	BenchmarkResult run() {
108	BenchmarkOptions options;
109	return benchmark(options, func);
110	}
111	};
112
113	// We want our random values to be approximately
114	// Output: a random number with the exponent field between min_exp and max_exp,
115	// i.e. 2^min_exp <= \|real_value\| < 2^(max_exp + 1),
116	// Caveats:
117	// -EXP_BIAS corresponding to denormal values,
118	// EXP_BIAS + 1 corresponding to inf or nan.
119	template <typename T>
120	static T
121	get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
122	int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
123	using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
124
125	// Required to correctly instantiate FPBits for floats and doubles.
126	using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
127	uint64_t, uint32_t>;
128	RandType bits;
129	if constexpr (cpp::is_same_v<T, uint64_t>)
130	bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << `32`) \|
131	static_cast<uint64_t>(LIBC_NAMESPACE::rand());
132	else
133	bits = LIBC_NAMESPACE::rand();
134	double scale =
135	static_cast<double>(max_exp - min_exp + `1`) / (`2` * FPBits::EXP_BIAS + `1`);
136	FPBits fp(bits);
137	fp.set_biased_exponent(
138	static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
139	return fp.get_val();
140	}
141
142	template <typename T> class MathPerf {
143	using FPBits = fputil::FPBits<T>;
144	using StorageType = typename FPBits::StorageType;
145	static constexpr StorageType UIntMax =
146	cpp::numeric_limits<StorageType>::max();
147
148	public:
149	template <size_t N = `1`>
150	static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
151	cpp::array<T, N> inputs;
152	for (size_t i = `0`; i < N; ++i)
153	inputs[i] = get_rand_input<T>(min_exp, max_exp);
154
155	uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
156
157	return total_time / N;
158	}
159
160	// Throughput benchmarking for functions that take 2 inputs.
161	template <size_t N = `1`>
162	static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
163	int arg1_max_exp, int arg2_min_exp,
164	int arg2_max_exp) {
165	cpp::array<T, N> inputs1;
166	cpp::array<T, N> inputs2;
167	for (size_t i = `0`; i < N; ++i) {
168	inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
169	inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
170	}
171
172	uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
173
174	return total_time / N;
175	}
176	};
177
178	} // namespace benchmarks
179	} // namespace LIBC_NAMESPACE_DECL
180
// Each macro below defines a Benchmark object named
// <SuiteName>_<TestName>_Instance; the Benchmark constructor registers it.

// Passing -1 indicates the benchmark should be run with as many threads as
// allocated by the user in the benchmark's CMake.
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, -1)

// Same as BENCHMARK, but with an explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, NumThreads)

// Requests exactly one thread.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// Requests as many threads as LIBC_NAMESPACE::gpu::get_lane_size() reports.
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())
197	#endif
198

source code of libc/benchmarks/gpu/LibcGpuBenchmark.h