Warning: This file is not a C or C++ file. It does not have highlighting.
1 | //===-- String utils --------------------------------------------*- C++ -*-===// |
---|---|
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Standalone string utility functions. Utilities requiring memory allocations |
10 | // should be placed in allocating_string_utils.h instead. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_LIBC_SRC_STRING_STRING_UTILS_H |
15 | #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H |
16 | |
17 | #include "hdr/limits_macros.h" |
18 | #include "hdr/types/size_t.h" |
19 | #include "src/__support/CPP/bitset.h" |
20 | #include "src/__support/CPP/type_traits.h" // cpp::is_same_v |
21 | #include "src/__support/macros/config.h" |
22 | #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY |
23 | |
24 | namespace LIBC_NAMESPACE_DECL { |
25 | namespace internal { |
26 | |
27 | template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) { |
28 | static_assert(CHAR_BIT == 8, "repeat_byte assumes a byte is 8 bits."); |
29 | constexpr size_t BITS_IN_BYTE = CHAR_BIT; |
30 | constexpr size_t BYTE_MASK = 0xff; |
31 | Word result = 0; |
32 | byte = byte & BYTE_MASK; |
33 | for (size_t i = 0; i < sizeof(Word); ++i) |
34 | result = (result << BITS_IN_BYTE) | byte; |
35 | return result; |
36 | } |
37 | |
38 | // The goal of this function is to take in a block of arbitrary size and return |
39 | // if it has any bytes equal to zero without branching. This is done by |
40 | // transforming the block such that zero bytes become non-zero and non-zero |
41 | // bytes become zero. |
42 | // The first transformation relies on the properties of carrying in arithmetic |
43 | // subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00, |
44 | // then the result for that byte must be equal to 0xff (or 0xfe if the next byte |
45 | // needs a carry as well). |
46 | // The next transformation is a simple mask. All zero bytes will have the high |
47 | // bit set after the subtraction, so each byte is masked with 0x80. This narrows |
48 | // the set of bytes that result in a non-zero value to only zero bytes and bytes |
49 | // with the high bit and any other bit set. |
50 | // The final transformation masks the result of the previous transformations |
51 | // with the inverse of the original byte. This means that any byte that had the |
52 | // high bit set will no longer have it set, narrowing the list of bytes which |
53 | // result in non-zero values to just the zero byte. |
54 | template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) { |
55 | constexpr Word LOW_BITS = repeat_byte<Word>(0x01); |
56 | constexpr Word HIGH_BITS = repeat_byte<Word>(0x80); |
57 | Word subtracted = block - LOW_BITS; |
58 | Word inverted = ~block; |
59 | return (subtracted & inverted & HIGH_BITS) != 0; |
60 | } |
61 | |
62 | template <typename Word> |
63 | LIBC_INLINE size_t string_length_wide_read(const char *src) { |
64 | const char *char_ptr = src; |
65 | // Step 1: read 1 byte at a time to align to block size |
66 | for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0; |
67 | ++char_ptr) { |
68 | if (*char_ptr == '\0') |
69 | return static_cast<size_t>(char_ptr - src); |
70 | } |
71 | // Step 2: read blocks |
72 | for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); |
73 | !has_zeroes<Word>(*block_ptr); ++block_ptr) { |
74 | char_ptr = reinterpret_cast<const char *>(block_ptr); |
75 | } |
76 | // Step 3: find the zero in the block |
77 | for (; *char_ptr != '\0'; ++char_ptr) { |
78 | ; |
79 | } |
80 | return static_cast<size_t>(char_ptr - src); |
81 | } |
82 | |
83 | // Returns the length of a string, denoted by the first occurrence |
84 | // of a null terminator. |
85 | template <typename T> LIBC_INLINE size_t string_length(const T *src) { |
86 | #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ |
87 | // Unsigned int is the default size for most processors, and on x86-64 it |
88 | // performs better than larger sizes when the src pointer can't be assumed to |
89 | // be aligned to a word boundary, so it's the size we use for reading the |
90 | // string a block at a time. |
91 | if constexpr (cpp::is_same_v<T, char>) |
92 | return string_length_wide_read<unsigned int>(src); |
93 | #else |
94 | size_t length; |
95 | for (length = 0; *src; ++src, ++length) |
96 | ; |
97 | return length; |
98 | #endif |
99 | } |
100 | |
101 | template <typename Word> |
102 | LIBC_INLINE void *find_first_character_wide_read(const unsigned char *src, |
103 | unsigned char ch, size_t n) { |
104 | const unsigned char *char_ptr = src; |
105 | size_t cur = 0; |
106 | |
107 | // Step 1: read 1 byte at a time to align to block size |
108 | for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n; |
109 | ++char_ptr, ++cur) { |
110 | if (*char_ptr == ch) |
111 | return const_cast<unsigned char *>(char_ptr); |
112 | } |
113 | |
114 | const Word ch_mask = repeat_byte<Word>(ch); |
115 | |
116 | // Step 2: read blocks |
117 | for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); |
118 | !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n; |
119 | ++block_ptr, cur += sizeof(Word)) { |
120 | char_ptr = reinterpret_cast<const unsigned char *>(block_ptr); |
121 | } |
122 | |
123 | // Step 3: find the match in the block |
124 | for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) { |
125 | ; |
126 | } |
127 | |
128 | if (*char_ptr != ch || cur >= n) |
129 | return static_cast<void *>(nullptr); |
130 | |
131 | return const_cast<unsigned char *>(char_ptr); |
132 | } |
133 | |
134 | LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src, |
135 | unsigned char ch, size_t n) { |
136 | for (; n && *src != ch; --n, ++src) |
137 | ; |
138 | return n ? const_cast<unsigned char *>(src) : nullptr; |
139 | } |
140 | |
141 | // Returns the first occurrence of 'ch' within the first 'n' characters of |
142 | // 'src'. If 'ch' is not found, returns nullptr. |
143 | LIBC_INLINE void *find_first_character(const unsigned char *src, |
144 | unsigned char ch, size_t max_strlen) { |
145 | #ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ |
146 | // If the maximum size of the string is small, the overhead of aligning to a |
147 | // word boundary and generating a bitmask of the appropriate size may be |
148 | // greater than the gains from reading larger chunks. Based on some testing, |
149 | // the crossover point between when it's faster to just read bytewise and read |
150 | // blocks is somewhere between 16 and 32, so 4 times the size of the block |
151 | // should be in that range. |
152 | // Unsigned int is used for the same reason as in strlen. |
153 | using BlockType = unsigned int; |
154 | if (max_strlen > (sizeof(BlockType) * 4)) { |
155 | return find_first_character_wide_read<BlockType>(src, ch, max_strlen); |
156 | } |
157 | #endif |
158 | return find_first_character_byte_read(src, ch, max_strlen); |
159 | } |
160 | |
161 | // Returns the maximum length span that contains only characters not found in |
162 | // 'segment'. If no characters are found, returns the length of 'src'. |
163 | LIBC_INLINE size_t complementary_span(const char *src, const char *segment) { |
164 | const char *initial = src; |
165 | cpp::bitset<256> bitset; |
166 | |
167 | for (; *segment; ++segment) |
168 | bitset.set(*reinterpret_cast<const unsigned char *>(segment)); |
169 | for (; *src && !bitset.test(*reinterpret_cast<const unsigned char *>(src)); |
170 | ++src) |
171 | ; |
172 | return static_cast<size_t>(src - initial); |
173 | } |
174 | |
175 | // Given the similarities between strtok and strtok_r, we can implement both |
176 | // using a utility function. On the first call, 'src' is scanned for the |
177 | // first character not found in 'delimiter_string'. Once found, it scans until |
178 | // the first character in the 'delimiter_string' or the null terminator is |
179 | // found. We define this span as a token. The end of the token is appended with |
180 | // a null terminator, and the token is returned. The point where the last token |
181 | // is found is then stored within 'context' for subsequent calls. Subsequent |
182 | // calls will use 'context' when a nullptr is passed in for 'src'. Once the null |
183 | // terminating character is reached, returns a nullptr. |
184 | template <bool SkipDelim = true> |
185 | LIBC_INLINE char *string_token(char *__restrict src, |
186 | const char *__restrict delimiter_string, |
187 | char **__restrict saveptr) { |
188 | // Return nullptr immediately if both src AND saveptr are nullptr |
189 | if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr))) |
190 | return nullptr; |
191 | |
192 | static_assert(CHAR_BIT == 8, "bitset of 256 assumes char is 8 bits"); |
193 | cpp::bitset<256> delimiter_set; |
194 | for (; *delimiter_string != '\0'; ++delimiter_string) |
195 | delimiter_set.set(static_cast<size_t>(*delimiter_string)); |
196 | |
197 | if constexpr (SkipDelim) |
198 | for (; *src != '\0' && delimiter_set.test(static_cast<size_t>(*src)); ++src) |
199 | ; |
200 | if (*src == '\0') { |
201 | *saveptr = src; |
202 | return nullptr; |
203 | } |
204 | char *token = src; |
205 | for (; *src != '\0'; ++src) { |
206 | if (delimiter_set.test(static_cast<size_t>(*src))) { |
207 | *src = '\0'; |
208 | ++src; |
209 | break; |
210 | } |
211 | } |
212 | *saveptr = src; |
213 | return token; |
214 | } |
215 | |
216 | LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src, |
217 | size_t size) { |
218 | size_t len = internal::string_length(src); |
219 | if (!size) |
220 | return len; |
221 | size_t n = len < size - 1 ? len : size - 1; |
222 | __builtin_memcpy(dst, src, n); |
223 | dst[n] = '\0'; |
224 | return len; |
225 | } |
226 | |
227 | template <bool ReturnNull = true> |
228 | LIBC_INLINE constexpr static char *strchr_implementation(const char *src, |
229 | int c) { |
230 | char ch = static_cast<char>(c); |
231 | for (; *src && *src != ch; ++src) |
232 | ; |
233 | char *ret = ReturnNull ? nullptr : const_cast<char *>(src); |
234 | return *src == ch ? const_cast<char *>(src) : ret; |
235 | } |
236 | |
237 | LIBC_INLINE constexpr static char *strrchr_implementation(const char *src, |
238 | int c) { |
239 | char ch = static_cast<char>(c); |
240 | char *last_occurrence = nullptr; |
241 | while (true) { |
242 | if (*src == ch) |
243 | last_occurrence = const_cast<char *>(src); |
244 | if (!*src) |
245 | return last_occurrence; |
246 | ++src; |
247 | } |
248 | } |
249 | |
250 | } // namespace internal |
251 | } // namespace LIBC_NAMESPACE_DECL |
252 | |
253 | #endif // LLVM_LIBC_SRC_STRING_STRING_UTILS_H |
254 |
Warning: This file is not a C or C++ file. It does not have highlighting.