FuzzedDataProvider.h source code [compiler-rt/include/fuzzer/FuzzedDataProvider.h]

1	//===- FuzzedDataProvider.h - Utility header for fuzz targets ---- C++ -* ===//*
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	// A single header library providing a utility class to break up an array of
9	// bytes. Whenever run on the same input, provides the same output, as long as
10	// its methods are called in the same order, with the same arguments.
11	//===----------------------------------------------------------------------===//
12
13	#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
14	#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
15
16	#include <algorithm>
17	#include <array>
18	#include <climits>
19	#include <cstddef>
20	#include <cstdint>
21	#include <cstdlib>
22	#include <cstring>
23	#include <initializer_list>
24	#include <limits>
25	#include <string>
26	#include <type_traits>
27	#include <utility>
28	#include <vector>
29
// In addition to the comments below, the API is also briefly documented at
// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider
//
// FuzzedDataProvider wraps a caller-owned byte buffer and hands it out piece
// by piece (as byte vectors, strings, integers, floating point values, etc.).
// The provider never copies or owns the buffer; it only tracks a read cursor
// (|data_ptr_|) and the number of bytes still available (|remaining_bytes_|).
class FuzzedDataProvider {
public:
  // |data| is an array of length |size| that the FuzzedDataProvider wraps to
  // provide more granular access. |data| must outlive the FuzzedDataProvider.
  FuzzedDataProvider(const uint8_t *data, size_t size)
      : data_ptr_(data), remaining_bytes_(size) {}
  ~FuzzedDataProvider() = default;

  // See the implementation below (after the class definition) for more verbose
  // comments for each of the methods.

  // Methods returning std::vector of bytes. These are the most popular choice
  // when splitting fuzzing input into pieces, as every piece is put into a
  // separate buffer (i.e. ASan would catch any under-/overflow) and the memory
  // will be released automatically.
  template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes);
  template <typename T>
  std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0);
  template <typename T> std::vector<T> ConsumeRemainingBytes();

  // Methods returning strings. Use only when you need a std::string or a null
  // terminated C-string. Otherwise, prefer the methods returning std::vector.
  std::string ConsumeBytesAsString(size_t num_bytes);
  std::string ConsumeRandomLengthString(size_t max_length);
  std::string ConsumeRandomLengthString();
  std::string ConsumeRemainingBytesAsString();

  // Methods returning integer values.
  template <typename T> T ConsumeIntegral();
  template <typename T> T ConsumeIntegralInRange(T min, T max);

  // Methods returning floating point values.
  template <typename T> T ConsumeFloatingPoint();
  template <typename T> T ConsumeFloatingPointInRange(T min, T max);

  // 0 <= return value <= 1.
  template <typename T> T ConsumeProbability();

  bool ConsumeBool();

  // Returns a value chosen from the given enum.
  template <typename T> T ConsumeEnum();

  // Returns a value from the given array.
  template <typename T, size_t size> T PickValueInArray(const T (&array)[size]);
  template <typename T, size_t size>
  T PickValueInArray(const std::array<T, size> &array);
  template <typename T> T PickValueInArray(std::initializer_list<const T> list);

  // Writes data to the given destination and returns number of bytes written.
  size_t ConsumeData(void *destination, size_t num_bytes);

  // Reports the remaining bytes available for fuzzed input. Declared const so
  // that it can also be queried through a const reference to the provider.
  size_t remaining_bytes() const { return remaining_bytes_; }

private:
  // The provider is a cursor over someone else's buffer; copying it would
  // create two cursors consuming the same bytes, so copying is disabled.
  FuzzedDataProvider(const FuzzedDataProvider &) = delete;
  FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete;

  // Copies |num_bytes| from the current read position into |destination| and
  // moves the read position forward by the same amount.
  void CopyAndAdvance(void *destination, size_t num_bytes);

  // Moves the read position forward by |num_bytes|.
  void Advance(size_t num_bytes);

  // Returns a vector of |size| elements whose first |num_bytes| bytes are
  // taken from the input.
  template <typename T>
  std::vector<T> ConsumeBytes(size_t size, size_t num_bytes);

  // Converts an unsigned value to the signed type |TS| of the same size.
  template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value);

  // Next unread byte of the wrapped buffer.
  const uint8_t *data_ptr_;
  // Number of bytes of the wrapped buffer that have not been consumed yet.
  size_t remaining_bytes_;
};
103
104	// Returns a std::vector containing \|num_bytes\| of input data. If fewer than
105	// \|num_bytes\| of data remain, returns a shorter std::vector containing all
106	// of the data that's left. Can be used with any byte sized type, such as
107	// char, unsigned char, uint8_t, etc.
108	template <typename T>
109	std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t num_bytes) {
110	num_bytes = std::min(a: num_bytes, b: remaining_bytes_);
111	return ConsumeBytes<T>(num_bytes, num_bytes);
112	}
113
114	// Similar to \|ConsumeBytes\|, but also appends the terminator value at the end
115	// of the resulting vector. Useful, when a mutable null-terminated C-string is
116	// needed, for example. But that is a rare case. Better avoid it, if possible,
117	// and prefer using \|ConsumeBytes\| or \|ConsumeBytesAsString\| methods.
118	template <typename T>
119	std::vector<T> FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes,
120	T terminator) {
121	num_bytes = std::min(a: num_bytes, b: remaining_bytes_);
122	std::vector<T> result = ConsumeBytes<T>(num_bytes + `1`, num_bytes);
123	result.back() = terminator;
124	return result;
125	}
126
127	// Returns a std::vector containing all remaining bytes of the input data.
128	template <typename T>
129	std::vector<T> FuzzedDataProvider::ConsumeRemainingBytes() {
130	return ConsumeBytes<T>(remaining_bytes_);
131	}
132
133	// Returns a std::string containing \|num_bytes\| of input data. Using this and
134	// \|.c_str()\| on the resulting string is the best way to get an immutable
135	// null-terminated C string. If fewer than \|num_bytes\| of data remain, returns
136	// a shorter std::string containing all of the data that's left.
137	inline std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) {
138	static_assert(sizeof(std::string::value_type) == sizeof(uint8_t),
139	"ConsumeBytesAsString cannot convert the data to a string.");
140
141	num_bytes = std::min(a: num_bytes, b: remaining_bytes_);
142	std::string result(
143	reinterpret_cast<const std::string::value_type *>(data_ptr_), num_bytes);
144	Advance(num_bytes);
145	return result;
146	}
147
148	// Returns a std::string of length from 0 to \|max_length\|. When it runs out of
149	// input data, returns what remains of the input. Designed to be more stable
150	// with respect to a fuzzer inserting characters than just picking a random
151	// length and then consuming that many bytes with \|ConsumeBytes\|.
152	inline std::string
153	FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) {
154	// Reads bytes from the start of \|data_ptr_\|. Maps "\\" to "\", and maps "\"
155	// followed by anything else to the end of the string. As a result of this
156	// logic, a fuzzer can insert characters into the string, and the string
157	// will be lengthened to include those new characters, resulting in a more
158	// stable fuzzer than picking the length of a string independently from
159	// picking its contents.
160	std::string result;
161
162	// Reserve the anticipated capacity to prevent several reallocations.
163	result.reserve(res: std::min(a: max_length, b: remaining_bytes_));
164	for (size_t i = `0`; i < max_length && remaining_bytes_ != `0`; ++i) {
165	char next = ConvertUnsignedToSigned<char>(value: data_ptr_[`0`]);
166	Advance(num_bytes: `1`);
167	if (next == `'\\'` && remaining_bytes_ != `0`) {
168	next = ConvertUnsignedToSigned<char>(value: data_ptr_[`0`]);
169	Advance(num_bytes: `1`);
170	if (next != `'\\'`)
171	break;
172	}
173	result += next;
174	}
175
176	result.shrink_to_fit();
177	return result;
178	}
179
180	// Returns a std::string of length from 0 to \|remaining_bytes_\|.
181	inline std::string FuzzedDataProvider::ConsumeRandomLengthString() {
182	return ConsumeRandomLengthString(max_length: remaining_bytes_);
183	}
184
185	// Returns a std::string containing all remaining bytes of the input data.
186	// Prefer using \|ConsumeRemainingBytes\| unless you actually need a std::string
187	// object.
188	inline std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() {
189	return ConsumeBytesAsString(num_bytes: remaining_bytes_);
190	}
191
192	// Returns a number in the range [Type's min, Type's max]. The value might
193	// not be uniformly distributed in the given range. If there's no input data
194	// left, always returns \|min\|.
195	template <typename T> T FuzzedDataProvider::ConsumeIntegral() {
196	return ConsumeIntegralInRange(std::numeric_limits<T>::min(),
197	std::numeric_limits<T>::max());
198	}
199
200	// Returns a number in the range [min, max] by consuming bytes from the
201	// input data. The value might not be uniformly distributed in the given
202	// range. If there's no input data left, always returns \|min\|. \|min\| must
203	// be less than or equal to \|max\|.
204	template <typename T>
205	T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) {
206	static_assert(std::is_integral_v<T>, "An integral type is required.");
207	static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");
208
209	if (min > max)
210	abort();
211
212	// Use the biggest type possible to hold the range and the result.
213	uint64_t range = static_cast<uint64_t>(max) - static_cast<uint64_t>(min);
214	uint64_t result = `0`;
215	size_t offset = `0`;
216
217	while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > `0` &&
218	remaining_bytes_ != `0`) {
219	// Pull bytes off the end of the seed data. Experimentally, this seems to
220	// allow the fuzzer to more easily explore the input space. This makes
221	// sense, since it works by modifying inputs that caused new code to run,
222	// and this data is often used to encode length of data read by
223	// \|ConsumeBytes\|. Separating out read lengths makes it easier modify the
224	// contents of the data that is actually read.
225	--remaining_bytes_;
226	result = (result << CHAR_BIT) \| data_ptr_[remaining_bytes_];
227	offset += CHAR_BIT;
228	}
229
230	// Avoid division by 0, in case \|range + 1\| results in overflow.
231	if (range != std::numeric_limits<decltype(range)>::max())
232	result = result % (range + `1`);
233
234	return static_cast<T>(static_cast<uint64_t>(min) + result);
235	}
236
237	// Returns a floating point value in the range [Type's lowest, Type's max] by
238	// consuming bytes from the input data. If there's no input data left, always
239	// returns approximately 0.
240	template <typename T> T FuzzedDataProvider::ConsumeFloatingPoint() {
241	return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(),
242	std::numeric_limits<T>::max());
243	}
244
245	// Returns a floating point value in the given range by consuming bytes from
246	// the input data. If there's no input data left, returns \|min\|. Note that
247	// \|min\| must be less than or equal to \|max\|.
248	template <typename T>
249	T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) {
250	if (min > max)
251	abort();
252
253	T range = `.0`;
254	T result = min;
255	constexpr T zero(`.0`);
256	if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) {
257	// The diff \|max - min\| would overflow the given floating point type. Use
258	// the half of the diff as the range and consume a bool to decide whether
259	// the result is in the first of the second part of the diff.
260	range = (max / `2.0`) - (min / `2.0`);
261	if (ConsumeBool()) {
262	result += range;
263	}
264	} else {
265	range = max - min;
266	}
267
268	return result + range * ConsumeProbability<T>();
269	}
270
271	// Returns a floating point number in the range [0.0, 1.0]. If there's no
272	// input data left, always returns 0.
273	template <typename T> T FuzzedDataProvider::ConsumeProbability() {
274	static_assert(std::is_floating_point_v<T>,
275	"A floating point type is required.");
276
277	// Use different integral types for different floating point types in order
278	// to provide better density of the resulting values.
279	using IntegralType =
280	typename std::conditional_t<(sizeof(T) <= sizeof(uint32_t)), uint32_t,
281	uint64_t>;
282
283	T result = static_cast<T>(ConsumeIntegral<IntegralType>());
284	result /= static_cast<T>(std::numeric_limits<IntegralType>::max());
285	return result;
286	}
287
288	// Reads one byte and returns a bool, or false when no data remains.
289	inline bool FuzzedDataProvider::ConsumeBool() {
290	return `1` & ConsumeIntegral<uint8_t>();
291	}
292
293	// Returns an enum value. The enum must start at 0 and be contiguous. It must
294	// also contain \|kMaxValue\| aliased to its largest (inclusive) value. Such as:
295	// enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
296	template <typename T> T FuzzedDataProvider::ConsumeEnum() {
297	static_assert(std::is_enum_v<T>, "\|T\| must be an enum type.");
298	return static_cast<T>(
299	ConsumeIntegralInRange<uint32_t>(min: `0`, max: static_cast<uint32_t>(T::kMaxValue)));
300	}
301
302	// Returns a copy of the value selected from the given fixed-size \|array\|.
303	template <typename T, size_t size>
304	T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) {
305	static_assert(size > `0`, "The array must be non empty.");
306	return array[ConsumeIntegralInRange<size_t>(min: `0`, max: size - `1`)];
307	}
308
309	template <typename T, size_t size>
310	T FuzzedDataProvider::PickValueInArray(const std::array<T, size> &array) {
311	static_assert(size > `0`, "The array must be non empty.");
312	return array[ConsumeIntegralInRange<size_t>(min: `0`, max: size - `1`)];
313	}
314
315	template <typename T>
316	T FuzzedDataProvider::PickValueInArray(std::initializer_list<const T> list) {
317	if (!list.size())
318	abort();
319
320	return *(list.begin() + ConsumeIntegralInRange<size_t>(`0`, list.size() - `1`));
321	}
322
323	// Writes \|num_bytes\| of input data to the given destination pointer. If there
324	// is not enough data left, writes all remaining bytes. Return value is the
325	// number of bytes written.
326	// In general, it's better to avoid using this function, but it may be useful
327	// in cases when it's necessary to fill a certain buffer or object with
328	// fuzzing data.
329	inline size_t FuzzedDataProvider::ConsumeData(void *destination,
330	size_t num_bytes) {
331	num_bytes = std::min(a: num_bytes, b: remaining_bytes_);
332	CopyAndAdvance(destination, num_bytes);
333	return num_bytes;
334	}
335
336	// Private methods.
337	inline void FuzzedDataProvider::CopyAndAdvance(void *destination,
338	size_t num_bytes) {
339	std::memcpy(dest: destination, src: data_ptr_, n: num_bytes);
340	Advance(num_bytes);
341	}
342
343	inline void FuzzedDataProvider::Advance(size_t num_bytes) {
344	if (num_bytes > remaining_bytes_)
345	abort();
346
347	data_ptr_ += num_bytes;
348	remaining_bytes_ -= num_bytes;
349	}
350
351	template <typename T>
352	std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) {
353	static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type.");
354
355	// The point of using the size-based constructor below is to increase the
356	// odds of having a vector object with capacity being equal to the length.
357	// That part is always implementation specific, but at least both libc++ and
358	// libstdc++ allocate the requested number of bytes in that constructor,
359	// which seems to be a natural choice for other implementations as well.
360	// To increase the odds even more, we also call \|shrink_to_fit\| below.
361	std::vector<T> result(size);
362	if (size == `0`) {
363	if (num_bytes != `0`)
364	abort();
365	return result;
366	}
367
368	CopyAndAdvance(destination: result.data(), num_bytes);
369
370	// Even though \|shrink_to_fit\| is also implementation specific, we expect it
371	// to provide an additional assurance in case vector's constructor allocated
372	// a buffer which is larger than the actual amount of data we put inside it.
373	result.shrink_to_fit();
374	return result;
375	}
376
377	template <typename TS, typename TU>
378	TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) {
379	static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types.");
380	static_assert(!std::numeric_limits<TU>::is_signed,
381	"Source type must be unsigned.");
382
383	// TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream.
384	if (std::numeric_limits<TS>::is_modulo)
385	return static_cast<TS>(value);
386
387	// Avoid using implementation-defined unsigned to signed conversions.
388	// To learn more, see https://stackoverflow.com/questions/13150449.
389	if (value <= std::numeric_limits<TS>::max()) {
390	return static_cast<TS>(value);
391	} else {
392	constexpr auto TS_min = std::numeric_limits<TS>::min();
393	return TS_min + static_cast<TS>(value - TS_min);
394	}
395	}
396
397	#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
398

source code of compiler-rt/include/fuzzer/FuzzedDataProvider.h