1//===-- Format string parser for scanf -------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
10#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
11
12#include "src/__support/arg_list.h"
13#include "src/__support/ctype_utils.h"
14#include "src/__support/str_to_integer.h"
15#include "src/stdio/scanf_core/core_structs.h"
16#include "src/stdio/scanf_core/scanf_config.h"
17
18#include <stddef.h>
19
20namespace LIBC_NAMESPACE {
21namespace scanf_core {
22
// GET_ARG_VAL_SIMPLEST reads the argument for a conversion as arg_type. It is
// meant to be used inside Parser, since it expands to calls to Parser's member
// functions:
//  - with index mode enabled it expands to get_arg_value<arg_type>(index);
//  - with index mode compiled out the second argument is ignored and it expands
//    to get_next_arg_value<arg_type>().
#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
#else
#define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
28
29template <typename ArgProvider> class Parser {
30 const char *__restrict str;
31
32 size_t cur_pos = 0;
33 ArgProvider args_cur;
34
35#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
36 // args_start stores the start of the va_args, which is used when a previous
37 // argument is needed. In that case, we have to read the arguments from the
38 // beginning since they don't support reading backwards.
39 ArgProvider args_start;
40 size_t args_index = 1;
41#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
42
43public:
44#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
45 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
46 : str(new_str), args_cur(args), args_start(args) {}
47#else
48 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
49 : str(new_str), args_cur(args) {}
50#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
51
52 // get_next_section will parse the format string until it has a fully
53 // specified format section. This can either be a raw format section with no
54 // conversion, or a format section with a conversion that has all of its
55 // variables stored in the format section.
56 LIBC_INLINE FormatSection get_next_section() {
57 FormatSection section;
58 size_t starting_pos = cur_pos;
59 if (str[cur_pos] == '%') {
60 // format section
61 section.has_conv = true;
62
63 ++cur_pos;
64 [[maybe_unused]] size_t conv_index = 0;
65
66#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
67 conv_index = parse_index(local_pos: &cur_pos);
68#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
69
70 if (str[cur_pos] == '*') {
71 ++cur_pos;
72 section.flags = FormatFlags::NO_WRITE;
73 }
74
75 // handle width
76 section.max_width = -1;
77 if (internal::isdigit(ch: str[cur_pos])) {
78 auto result = internal::strtointeger<int>(src: str + cur_pos, base: 10);
79 section.max_width = result.value;
80 cur_pos = cur_pos + result.parsed_len;
81 }
82
83 // TODO(michaelrj): add posix allocate flag support.
84 // if (str[cur_pos] == 'm') {
85 // ++cur_pos;
86 // section.flags = FormatFlags::ALLOCATE;
87 // }
88
89 LengthModifier lm = parse_length_modifier(local_pos: &cur_pos);
90 section.length_modifier = lm;
91
92 section.conv_name = str[cur_pos];
93
94 // If NO_WRITE is not set, then read the next arg as the output pointer.
95 if ((section.flags & FormatFlags::NO_WRITE) == 0) {
96 // Since all outputs are pointers, there's no need to distinguish when
97 // reading from va_args. They're all the same size and stored the same.
98 section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
99 }
100
101 // If the end of the format section is on the '\0'. This means we need to
102 // not advance the cur_pos and we should not count this has having a
103 // conversion.
104 if (str[cur_pos] != '\0') {
105 ++cur_pos;
106 } else {
107 section.has_conv = false;
108 }
109
110 // If the format is a bracketed one, then we need to parse out the insides
111 // of the brackets.
112 if (section.conv_name == '[') {
113 constexpr char CLOSING_BRACKET = ']';
114 constexpr char INVERT_FLAG = '^';
115 constexpr char RANGE_OPERATOR = '-';
116
117 cpp::bitset<256> scan_set;
118 bool invert = false;
119
120 // The circumflex in the first position represents the inversion flag,
121 // but it's easier to apply that at the end so we just store it for now.
122 if (str[cur_pos] == INVERT_FLAG) {
123 invert = true;
124 ++cur_pos;
125 }
126
127 // This is used to determine if a hyphen is being used as a literal or
128 // as a range operator.
129 size_t set_start_pos = cur_pos;
130
131 // Normally the right bracket closes the set, but if it's the first
132 // character (possibly after the inversion flag) then it's instead
133 // included as a character in the set and the second right bracket
134 // closes the set.
135 if (str[cur_pos] == CLOSING_BRACKET) {
136 scan_set.set(CLOSING_BRACKET);
137 ++cur_pos;
138 }
139
140 while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
141 // If a hyphen is being used as a range operator, since it's neither
142 // at the beginning nor end of the set.
143 if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
144 str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
145 // Technically there is no requirement to correct the ordering of
146 // the range, but since the range operator is entirely
147 // implementation defined it seems like a good convenience.
148 char a = str[cur_pos - 1];
149 char b = str[cur_pos + 1];
150 char start = (a < b ? a : b);
151 char end = (a < b ? b : a);
152 scan_set.set_range(Start: start, End: end);
153 cur_pos += 2;
154 } else {
155 scan_set.set(str[cur_pos]);
156 ++cur_pos;
157 }
158 }
159 if (invert)
160 scan_set.flip();
161
162 if (str[cur_pos] == CLOSING_BRACKET) {
163 ++cur_pos;
164 section.scan_set = scan_set;
165 } else {
166 // if the end of the string was encountered, this is not a valid set.
167 section.has_conv = false;
168 }
169 }
170 } else {
171 // raw section
172 section.has_conv = false;
173 while (str[cur_pos] != '%' && str[cur_pos] != '\0')
174 ++cur_pos;
175 }
176 section.raw_string = {str + starting_pos, cur_pos - starting_pos};
177 return section;
178 }
179
180private:
181 // parse_length_modifier parses the length modifier inside a format string. It
182 // assumes that str[*local_pos] is inside a format specifier. It returns a
183 // LengthModifier with the length modifier it found. It will advance local_pos
184 // after the format specifier if one is found.
185 LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) {
186 switch (str[*local_pos]) {
187 case ('l'):
188 if (str[*local_pos + 1] == 'l') {
189 *local_pos += 2;
190 return LengthModifier::ll;
191 } else {
192 ++*local_pos;
193 return LengthModifier::l;
194 }
195 case ('h'):
196 if (str[*local_pos + 1] == 'h') {
197 *local_pos += 2;
198 return LengthModifier::hh;
199 } else {
200 ++*local_pos;
201 return LengthModifier::h;
202 }
203 case ('L'):
204 ++*local_pos;
205 return LengthModifier::L;
206 case ('j'):
207 ++*local_pos;
208 return LengthModifier::j;
209 case ('z'):
210 ++*local_pos;
211 return LengthModifier::z;
212 case ('t'):
213 ++*local_pos;
214 return LengthModifier::t;
215 default:
216 return LengthModifier::NONE;
217 }
218 }
219
220 // get_next_arg_value gets the next value from the arg list as type T.
221 template <class T> LIBC_INLINE T get_next_arg_value() {
222 return args_cur.template next_var<T>();
223 }
224
225 //----------------------------------------------------
226 // INDEX MODE ONLY FUNCTIONS AFTER HERE:
227 //----------------------------------------------------
228
229#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
230
231 // parse_index parses the index of a value inside a format string. It
232 // assumes that str[*local_pos] points to character after a '%' or '*', and
233 // returns 0 if there is no closing $, or if it finds no number. If it finds a
234 // number, it will move local_pos past the end of the $, else it will not move
235 // local_pos.
236 LIBC_INLINE size_t parse_index(size_t *local_pos) {
237 if (internal::isdigit(ch: str[*local_pos])) {
238 auto result = internal::strtointeger<int>(src: str + *local_pos, base: 10);
239 size_t index = result.value;
240 if (str[*local_pos + result.parsed_len] != '$')
241 return 0;
242 *local_pos = 1 + result.parsed_len + *local_pos;
243 return index;
244 }
245 return 0;
246 }
247
248 // get_arg_value gets the value from the arg list at index (starting at 1).
249 // This may require parsing the format string. An index of 0 is interpreted as
250 // the next value.
251 template <class T> LIBC_INLINE T get_arg_value(size_t index) {
252 if (!(index == 0 || index == args_index))
253 args_to_index(index);
254
255 ++args_index;
256 return get_next_arg_value<T>();
257 }
258
259 // the ArgList can only return the next item in the list. This function is
260 // used in index mode when the item that needs to be read is not the next one.
261 // It moves cur_args to the index requested so the appropriate value may
262 // be read. This may involve parsing the format string, and is in the worst
263 // case an O(n^2) operation.
264 LIBC_INLINE void args_to_index(size_t index) {
265 if (args_index > index) {
266 args_index = 1;
267 args_cur = args_start;
268 }
269
270 while (args_index < index) {
271 // Since all arguments must be pointers, we can just read all of them as
272 // void * and not worry about type issues.
273 args_cur.template next_var<void *>();
274 ++args_index;
275 }
276 }
277
278#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
279};
280
281} // namespace scanf_core
282} // namespace LIBC_NAMESPACE
283
284#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
285

// Source: libc/src/stdio/scanf_core/parser.h