1 | //===-- Format string parser for scanf -------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
10 | #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
11 | |
12 | #include "src/__support/arg_list.h" |
13 | #include "src/__support/ctype_utils.h" |
14 | #include "src/__support/str_to_integer.h" |
15 | #include "src/stdio/scanf_core/core_structs.h" |
16 | #include "src/stdio/scanf_core/scanf_config.h" |
17 | |
18 | #include <stddef.h> |
19 | |
20 | namespace LIBC_NAMESPACE { |
21 | namespace scanf_core { |
22 | |
// GET_ARG_VAL_SIMPLEST reads one argument of the given type from inside a
// Parser member function. When index mode is compiled out the index argument
// is discarded and the next sequential argument is read; otherwise the lookup
// is forwarded to get_arg_value so that an explicit index can be honored.
#ifdef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(type, unused_index) get_next_arg_value<type>()
#else
#define GET_ARG_VAL_SIMPLEST(type, position) get_arg_value<type>(position)
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
28 | |
29 | template <typename ArgProvider> class Parser { |
30 | const char *__restrict str; |
31 | |
32 | size_t cur_pos = 0; |
33 | ArgProvider args_cur; |
34 | |
35 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
36 | // args_start stores the start of the va_args, which is used when a previous |
37 | // argument is needed. In that case, we have to read the arguments from the |
38 | // beginning since they don't support reading backwards. |
39 | ArgProvider args_start; |
40 | size_t args_index = 1; |
41 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
42 | |
43 | public: |
44 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
45 | LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) |
46 | : str(new_str), args_cur(args), args_start(args) {} |
47 | #else |
48 | LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) |
49 | : str(new_str), args_cur(args) {} |
50 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
51 | |
52 | // get_next_section will parse the format string until it has a fully |
53 | // specified format section. This can either be a raw format section with no |
54 | // conversion, or a format section with a conversion that has all of its |
55 | // variables stored in the format section. |
56 | LIBC_INLINE FormatSection get_next_section() { |
57 | FormatSection section; |
58 | size_t starting_pos = cur_pos; |
59 | if (str[cur_pos] == '%') { |
60 | // format section |
61 | section.has_conv = true; |
62 | |
63 | ++cur_pos; |
64 | [[maybe_unused]] size_t conv_index = 0; |
65 | |
66 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
67 | conv_index = parse_index(local_pos: &cur_pos); |
68 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
69 | |
70 | if (str[cur_pos] == '*') { |
71 | ++cur_pos; |
72 | section.flags = FormatFlags::NO_WRITE; |
73 | } |
74 | |
75 | // handle width |
76 | section.max_width = -1; |
77 | if (internal::isdigit(ch: str[cur_pos])) { |
78 | auto result = internal::strtointeger<int>(src: str + cur_pos, base: 10); |
79 | section.max_width = result.value; |
80 | cur_pos = cur_pos + result.parsed_len; |
81 | } |
82 | |
83 | // TODO(michaelrj): add posix allocate flag support. |
84 | // if (str[cur_pos] == 'm') { |
85 | // ++cur_pos; |
86 | // section.flags = FormatFlags::ALLOCATE; |
87 | // } |
88 | |
89 | LengthModifier lm = parse_length_modifier(local_pos: &cur_pos); |
90 | section.length_modifier = lm; |
91 | |
92 | section.conv_name = str[cur_pos]; |
93 | |
94 | // If NO_WRITE is not set, then read the next arg as the output pointer. |
95 | if ((section.flags & FormatFlags::NO_WRITE) == 0) { |
96 | // Since all outputs are pointers, there's no need to distinguish when |
97 | // reading from va_args. They're all the same size and stored the same. |
98 | section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index); |
99 | } |
100 | |
101 | // If the end of the format section is on the '\0'. This means we need to |
102 | // not advance the cur_pos and we should not count this has having a |
103 | // conversion. |
104 | if (str[cur_pos] != '\0') { |
105 | ++cur_pos; |
106 | } else { |
107 | section.has_conv = false; |
108 | } |
109 | |
110 | // If the format is a bracketed one, then we need to parse out the insides |
111 | // of the brackets. |
112 | if (section.conv_name == '[') { |
113 | constexpr char CLOSING_BRACKET = ']'; |
114 | constexpr char INVERT_FLAG = '^'; |
115 | constexpr char RANGE_OPERATOR = '-'; |
116 | |
117 | cpp::bitset<256> scan_set; |
118 | bool invert = false; |
119 | |
120 | // The circumflex in the first position represents the inversion flag, |
121 | // but it's easier to apply that at the end so we just store it for now. |
122 | if (str[cur_pos] == INVERT_FLAG) { |
123 | invert = true; |
124 | ++cur_pos; |
125 | } |
126 | |
127 | // This is used to determine if a hyphen is being used as a literal or |
128 | // as a range operator. |
129 | size_t set_start_pos = cur_pos; |
130 | |
131 | // Normally the right bracket closes the set, but if it's the first |
132 | // character (possibly after the inversion flag) then it's instead |
133 | // included as a character in the set and the second right bracket |
134 | // closes the set. |
135 | if (str[cur_pos] == CLOSING_BRACKET) { |
136 | scan_set.set(CLOSING_BRACKET); |
137 | ++cur_pos; |
138 | } |
139 | |
140 | while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) { |
141 | // If a hyphen is being used as a range operator, since it's neither |
142 | // at the beginning nor end of the set. |
143 | if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos && |
144 | str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') { |
145 | // Technically there is no requirement to correct the ordering of |
146 | // the range, but since the range operator is entirely |
147 | // implementation defined it seems like a good convenience. |
148 | char a = str[cur_pos - 1]; |
149 | char b = str[cur_pos + 1]; |
150 | char start = (a < b ? a : b); |
151 | char end = (a < b ? b : a); |
152 | scan_set.set_range(Start: start, End: end); |
153 | cur_pos += 2; |
154 | } else { |
155 | scan_set.set(str[cur_pos]); |
156 | ++cur_pos; |
157 | } |
158 | } |
159 | if (invert) |
160 | scan_set.flip(); |
161 | |
162 | if (str[cur_pos] == CLOSING_BRACKET) { |
163 | ++cur_pos; |
164 | section.scan_set = scan_set; |
165 | } else { |
166 | // if the end of the string was encountered, this is not a valid set. |
167 | section.has_conv = false; |
168 | } |
169 | } |
170 | } else { |
171 | // raw section |
172 | section.has_conv = false; |
173 | while (str[cur_pos] != '%' && str[cur_pos] != '\0') |
174 | ++cur_pos; |
175 | } |
176 | section.raw_string = {str + starting_pos, cur_pos - starting_pos}; |
177 | return section; |
178 | } |
179 | |
180 | private: |
181 | // parse_length_modifier parses the length modifier inside a format string. It |
182 | // assumes that str[*local_pos] is inside a format specifier. It returns a |
183 | // LengthModifier with the length modifier it found. It will advance local_pos |
184 | // after the format specifier if one is found. |
185 | LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { |
186 | switch (str[*local_pos]) { |
187 | case ('l'): |
188 | if (str[*local_pos + 1] == 'l') { |
189 | *local_pos += 2; |
190 | return LengthModifier::ll; |
191 | } else { |
192 | ++*local_pos; |
193 | return LengthModifier::l; |
194 | } |
195 | case ('h'): |
196 | if (str[*local_pos + 1] == 'h') { |
197 | *local_pos += 2; |
198 | return LengthModifier::hh; |
199 | } else { |
200 | ++*local_pos; |
201 | return LengthModifier::h; |
202 | } |
203 | case ('L'): |
204 | ++*local_pos; |
205 | return LengthModifier::L; |
206 | case ('j'): |
207 | ++*local_pos; |
208 | return LengthModifier::j; |
209 | case ('z'): |
210 | ++*local_pos; |
211 | return LengthModifier::z; |
212 | case ('t'): |
213 | ++*local_pos; |
214 | return LengthModifier::t; |
215 | default: |
216 | return LengthModifier::NONE; |
217 | } |
218 | } |
219 | |
220 | // get_next_arg_value gets the next value from the arg list as type T. |
221 | template <class T> LIBC_INLINE T get_next_arg_value() { |
222 | return args_cur.template next_var<T>(); |
223 | } |
224 | |
225 | //---------------------------------------------------- |
226 | // INDEX MODE ONLY FUNCTIONS AFTER HERE: |
227 | //---------------------------------------------------- |
228 | |
229 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
230 | |
231 | // parse_index parses the index of a value inside a format string. It |
232 | // assumes that str[*local_pos] points to character after a '%' or '*', and |
233 | // returns 0 if there is no closing $, or if it finds no number. If it finds a |
234 | // number, it will move local_pos past the end of the $, else it will not move |
235 | // local_pos. |
236 | LIBC_INLINE size_t parse_index(size_t *local_pos) { |
237 | if (internal::isdigit(ch: str[*local_pos])) { |
238 | auto result = internal::strtointeger<int>(src: str + *local_pos, base: 10); |
239 | size_t index = result.value; |
240 | if (str[*local_pos + result.parsed_len] != '$') |
241 | return 0; |
242 | *local_pos = 1 + result.parsed_len + *local_pos; |
243 | return index; |
244 | } |
245 | return 0; |
246 | } |
247 | |
248 | // get_arg_value gets the value from the arg list at index (starting at 1). |
249 | // This may require parsing the format string. An index of 0 is interpreted as |
250 | // the next value. |
251 | template <class T> LIBC_INLINE T get_arg_value(size_t index) { |
252 | if (!(index == 0 || index == args_index)) |
253 | args_to_index(index); |
254 | |
255 | ++args_index; |
256 | return get_next_arg_value<T>(); |
257 | } |
258 | |
259 | // the ArgList can only return the next item in the list. This function is |
260 | // used in index mode when the item that needs to be read is not the next one. |
261 | // It moves cur_args to the index requested so the appropriate value may |
262 | // be read. This may involve parsing the format string, and is in the worst |
263 | // case an O(n^2) operation. |
264 | LIBC_INLINE void args_to_index(size_t index) { |
265 | if (args_index > index) { |
266 | args_index = 1; |
267 | args_cur = args_start; |
268 | } |
269 | |
270 | while (args_index < index) { |
271 | // Since all arguments must be pointers, we can just read all of them as |
272 | // void * and not worry about type issues. |
273 | args_cur.template next_var<void *>(); |
274 | ++args_index; |
275 | } |
276 | } |
277 | |
278 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
279 | }; |
280 | |
281 | } // namespace scanf_core |
282 | } // namespace LIBC_NAMESPACE |
283 | |
284 | #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
285 | |