string_utils.h source code [libc/src/string/string_utils.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	//===-- String utils --------------------------------------------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Standalone string utility functions. Utilities requiring memory allocations
10	// should be placed in allocating_string_utils.h instead.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H
15	#define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
16
17	#include "hdr/limits_macros.h"
18	#include "hdr/types/size_t.h"
19	#include "src/__support/CPP/bitset.h"
20	#include "src/__support/CPP/type_traits.h" // cpp::is_same_v
21	#include "src/__support/macros/config.h"
22	#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
23
24	namespace LIBC_NAMESPACE_DECL {
25	namespace internal {
26
27	template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
28	static_assert(CHAR_BIT == 8, "repeat_byte assumes a byte is 8 bits.");
29	constexpr size_t BITS_IN_BYTE = CHAR_BIT;
30	constexpr size_t BYTE_MASK = 0xff;
31	Word result = 0;
32	byte = byte & BYTE_MASK;
33	for (size_t i = 0; i < sizeof(Word); ++i)
34	result = (result << BITS_IN_BYTE) \| byte;
35	return result;
36	}
37
38	// The goal of this function is to take in a block of arbitrary size and return
39	// if it has any bytes equal to zero without branching. This is done by
40	// transforming the block such that zero bytes become non-zero and non-zero
41	// bytes become zero.
42	// The first transformation relies on the properties of carrying in arithmetic
43	// subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00,
44	// then the result for that byte must be equal to 0xff (or 0xfe if the next byte
45	// needs a carry as well).
46	// The next transformation is a simple mask. All zero bytes will have the high
47	// bit set after the subtraction, so each byte is masked with 0x80. This narrows
48	// the set of bytes that result in a non-zero value to only zero bytes and bytes
49	// with the high bit and any other bit set.
50	// The final transformation masks the result of the previous transformations
51	// with the inverse of the original byte. This means that any byte that had the
52	// high bit set will no longer have it set, narrowing the list of bytes which
53	// result in non-zero values to just the zero byte.
54	template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
55	constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
56	constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
57	Word subtracted = block - LOW_BITS;
58	Word inverted = ~block;
59	return (subtracted & inverted & HIGH_BITS) != 0;
60	}
61
62	template <typename Word>
63	LIBC_INLINE size_t string_length_wide_read(const char *src) {
64	const char *char_ptr = src;
65	// Step 1: read 1 byte at a time to align to block size
66	for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
67	++char_ptr) {
68	if (*char_ptr == '\0')
69	return static_cast<size_t>(char_ptr - src);
70	}
71	// Step 2: read blocks
72	for (const Word block_ptr = reinterpret_cast<const Word >(char_ptr);
73	!has_zeroes<Word>(*block_ptr); ++block_ptr) {
74	char_ptr = reinterpret_cast<const char *>(block_ptr);
75	}
76	// Step 3: find the zero in the block
77	for (; *char_ptr != '\0'; ++char_ptr) {
78	;
79	}
80	return static_cast<size_t>(char_ptr - src);
81	}
82
83	// Returns the length of a string, denoted by the first occurrence
84	// of a null terminator.
85	template <typename T> LIBC_INLINE size_t string_length(const T *src) {
86	#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
87	// Unsigned int is the default size for most processors, and on x86-64 it
88	// performs better than larger sizes when the src pointer can't be assumed to
89	// be aligned to a word boundary, so it's the size we use for reading the
90	// string a block at a time.
91	if constexpr (cpp::is_same_v<T, char>)
92	return string_length_wide_read<unsigned int>(src);
93	#else
94	size_t length;
95	for (length = 0; *src; ++src, ++length)
96	;
97	return length;
98	#endif
99	}
100
101	template <typename Word>
102	LIBC_INLINE void find_first_character_wide_read(const unsigned char src,
103	unsigned char ch, size_t n) {
104	const unsigned char *char_ptr = src;
105	size_t cur = 0;
106
107	// Step 1: read 1 byte at a time to align to block size
108	for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
109	++char_ptr, ++cur) {
110	if (*char_ptr == ch)
111	return const_cast<unsigned char *>(char_ptr);
112	}
113
114	const Word ch_mask = repeat_byte<Word>(ch);
115
116	// Step 2: read blocks
117	for (const Word block_ptr = reinterpret_cast<const Word >(char_ptr);
118	!has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
119	++block_ptr, cur += sizeof(Word)) {
120	char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
121	}
122
123	// Step 3: find the match in the block
124	for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
125	;
126	}
127
128	if (*char_ptr != ch \|\| cur >= n)
129	return static_cast<void *>(nullptr);
130
131	return const_cast<unsigned char *>(char_ptr);
132	}
133
134	LIBC_INLINE void find_first_character_byte_read(const unsigned char src,
135	unsigned char ch, size_t n) {
136	for (; n && *src != ch; --n, ++src)
137	;
138	return n ? const_cast<unsigned char *>(src) : nullptr;
139	}
140
141	// Returns the first occurrence of 'ch' within the first 'n' characters of
142	// 'src'. If 'ch' is not found, returns nullptr.
143	LIBC_INLINE void find_first_character(const unsigned char src,
144	unsigned char ch, size_t max_strlen) {
145	#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
146	// If the maximum size of the string is small, the overhead of aligning to a
147	// word boundary and generating a bitmask of the appropriate size may be
148	// greater than the gains from reading larger chunks. Based on some testing,
149	// the crossover point between when it's faster to just read bytewise and read
150	// blocks is somewhere between 16 and 32, so 4 times the size of the block
151	// should be in that range.
152	// Unsigned int is used for the same reason as in strlen.
153	using BlockType = unsigned int;
154	if (max_strlen > (sizeof(BlockType) * 4)) {
155	return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
156	}
157	#endif
158	return find_first_character_byte_read(src, ch, max_strlen);
159	}
160
161	// Returns the maximum length span that contains only characters not found in
162	// 'segment'. If no characters are found, returns the length of 'src'.
163	LIBC_INLINE size_t complementary_span(const char src, const char segment) {
164	const char *initial = src;
165	cpp::bitset<256> bitset;
166
167	for (; *segment; ++segment)
168	bitset.set(reinterpret_cast<const unsigned char >(segment));
169	for (; src && !bitset.test(reinterpret_cast<const unsigned char *>(src));
170	++src)
171	;
172	return static_cast<size_t>(src - initial);
173	}
174
175	// Given the similarities between strtok and strtok_r, we can implement both
176	// using a utility function. On the first call, 'src' is scanned for the
177	// first character not found in 'delimiter_string'. Once found, it scans until
178	// the first character in the 'delimiter_string' or the null terminator is
179	// found. We define this span as a token. The end of the token is appended with
180	// a null terminator, and the token is returned. The point where the last token
181	// is found is then stored within 'context' for subsequent calls. Subsequent
182	// calls will use 'context' when a nullptr is passed in for 'src'. Once the null
183	// terminating character is reached, returns a nullptr.
184	template <bool SkipDelim = true>
185	LIBC_INLINE char string_token(char __restrict src,
186	const char *__restrict delimiter_string,
187	char **__restrict saveptr) {
188	// Return nullptr immediately if both src AND saveptr are nullptr
189	if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
190	return nullptr;
191
192	static_assert(CHAR_BIT == 8, "bitset of 256 assumes char is 8 bits");
193	cpp::bitset<256> delimiter_set;
194	for (; *delimiter_string != '\0'; ++delimiter_string)
195	delimiter_set.set(static_cast<size_t>(*delimiter_string));
196
197	if constexpr (SkipDelim)
198	for (; src != '\0' && delimiter_set.test(static_cast<size_t>(src)); ++src)
199	;
200	if (*src == '\0') {
201	*saveptr = src;
202	return nullptr;
203	}
204	char *token = src;
205	for (; *src != '\0'; ++src) {
206	if (delimiter_set.test(static_cast<size_t>(*src))) {
207	*src = '\0';
208	++src;
209	break;
210	}
211	}
212	*saveptr = src;
213	return token;
214	}
215
216	LIBC_INLINE size_t strlcpy(char __restrict dst, const char __restrict src,
217	size_t size) {
218	size_t len = internal::string_length(src);
219	if (!size)
220	return len;
221	size_t n = len < size - 1 ? len : size - 1;
222	__builtin_memcpy(dst, src, n);
223	dst[n] = '\0';
224	return len;
225	}
226
227	template <bool ReturnNull = true>
228	LIBC_INLINE constexpr static char strchr_implementation(const char src,
229	int c) {
230	char ch = static_cast<char>(c);
231	for (; src && src != ch; ++src)
232	;
233	char ret = ReturnNull ? nullptr : const_cast<char >(src);
234	return src == ch ? const_cast<char >(src) : ret;
235	}
236
237	LIBC_INLINE constexpr static char strrchr_implementation(const char src,
238	int c) {
239	char ch = static_cast<char>(c);
240	char *last_occurrence = nullptr;
241	while (true) {
242	if (*src == ch)
243	last_occurrence = const_cast<char *>(src);
244	if (!*src)
245	return last_occurrence;
246	++src;
247	}
248	}
249
250	} // namespace internal
251	} // namespace LIBC_NAMESPACE_DECL
252
253	#endif // LLVM_LIBC_SRC_STRING_STRING_UTILS_H
254

Warning: This file is not a C or C++ file. It does not have highlighting.

Provided by KDAB

Update your C++ knowledge – Modern C++11/14/17 Training

Find out more

source code of libc/src/string/string_utils.h