Warning: This file is not a C or C++ file. It does not have highlighting.

1//===-- Format string parser for scanf -------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
10#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
11
12#include "src/__support/arg_list.h"
13#include "src/__support/ctype_utils.h"
14#include "src/__support/macros/config.h"
15#include "src/__support/str_to_integer.h"
16#include "src/stdio/scanf_core/core_structs.h"
17#include "src/stdio/scanf_core/scanf_config.h"
18
19#include <stddef.h>
20
21namespace LIBC_NAMESPACE_DECL {
22namespace scanf_core {
23
// GET_ARG_VAL_SIMPLEST expands to the Parser member call that fetches one
// argument of type arg_type. With index mode compiled out, the second operand
// is discarded and the next sequential argument is fetched instead.
#ifdef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(arg_type, unused_index)                           \
  get_next_arg_value<arg_type>()
#else
#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
29
30template <typename ArgProvider> class Parser {
31 const char *__restrict str;
32
33 size_t cur_pos = 0;
34 ArgProvider args_cur;
35
36#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
37 // args_start stores the start of the va_args, which is used when a previous
38 // argument is needed. In that case, we have to read the arguments from the
39 // beginning since they don't support reading backwards.
40 ArgProvider args_start;
41 size_t args_index = 1;
42#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
43
44public:
45#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
46 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
47 : str(new_str), args_cur(args), args_start(args) {}
48#else
49 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
50 : str(new_str), args_cur(args) {}
51#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
52
53 // get_next_section will parse the format string until it has a fully
54 // specified format section. This can either be a raw format section with no
55 // conversion, or a format section with a conversion that has all of its
56 // variables stored in the format section.
57 LIBC_INLINE FormatSection get_next_section() {
58 FormatSection section;
59 size_t starting_pos = cur_pos;
60 if (str[cur_pos] == '%') {
61 // format section
62 section.has_conv = true;
63
64 ++cur_pos;
65 [[maybe_unused]] size_t conv_index = 0;
66
67#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
68 conv_index = parse_index(&cur_pos);
69#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
70
71 if (str[cur_pos] == '*') {
72 ++cur_pos;
73 section.flags = FormatFlags::NO_WRITE;
74 }
75
76 // handle width
77 section.max_width = -1;
78 if (internal::isdigit(str[cur_pos])) {
79 auto result = internal::strtointeger<int>(str + cur_pos, 10);
80 section.max_width = result.value;
81 cur_pos = cur_pos + static_cast<size_t>(result.parsed_len);
82 }
83
84 // TODO(michaelrj): add posix allocate flag support.
85 // if (str[cur_pos] == 'm') {
86 // ++cur_pos;
87 // section.flags = FormatFlags::ALLOCATE;
88 // }
89
90 LengthModifier lm = parse_length_modifier(&cur_pos);
91 section.length_modifier = lm;
92
93 section.conv_name = str[cur_pos];
94
95 // If NO_WRITE is not set, then read the next arg as the output pointer.
96 if ((section.flags & FormatFlags::NO_WRITE) == 0) {
97 // Since all outputs are pointers, there's no need to distinguish when
98 // reading from va_args. They're all the same size and stored the same.
99 section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
100 }
101
102 // If the end of the format section is on the '\0'. This means we need to
103 // not advance the cur_pos and we should not count this has having a
104 // conversion.
105 if (str[cur_pos] != '\0') {
106 ++cur_pos;
107 } else {
108 section.has_conv = false;
109 }
110
111 // If the format is a bracketed one, then we need to parse out the insides
112 // of the brackets.
113 if (section.conv_name == '[') {
114 constexpr char CLOSING_BRACKET = ']';
115 constexpr char INVERT_FLAG = '^';
116 constexpr char RANGE_OPERATOR = '-';
117
118 cpp::bitset<256> scan_set;
119 bool invert = false;
120
121 // The circumflex in the first position represents the inversion flag,
122 // but it's easier to apply that at the end so we just store it for now.
123 if (str[cur_pos] == INVERT_FLAG) {
124 invert = true;
125 ++cur_pos;
126 }
127
128 // This is used to determine if a hyphen is being used as a literal or
129 // as a range operator.
130 size_t set_start_pos = cur_pos;
131
132 // Normally the right bracket closes the set, but if it's the first
133 // character (possibly after the inversion flag) then it's instead
134 // included as a character in the set and the second right bracket
135 // closes the set.
136 if (str[cur_pos] == CLOSING_BRACKET) {
137 scan_set.set(CLOSING_BRACKET);
138 ++cur_pos;
139 }
140
141 while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
142 // If a hyphen is being used as a range operator, since it's neither
143 // at the beginning nor end of the set.
144 if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
145 str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
146 // Technically there is no requirement to correct the ordering of
147 // the range, but since the range operator is entirely
148 // implementation defined it seems like a good convenience.
149 char a = str[cur_pos - 1];
150 char b = str[cur_pos + 1];
151 char start = (a < b ? a : b);
152 char end = (a < b ? b : a);
153 scan_set.set_range(static_cast<size_t>(start),
154 static_cast<size_t>(end));
155 cur_pos += 2;
156 } else {
157 scan_set.set(static_cast<size_t>(str[cur_pos]));
158 ++cur_pos;
159 }
160 }
161 if (invert)
162 scan_set.flip();
163
164 if (str[cur_pos] == CLOSING_BRACKET) {
165 ++cur_pos;
166 section.scan_set = scan_set;
167 } else {
168 // if the end of the string was encountered, this is not a valid set.
169 section.has_conv = false;
170 }
171 }
172 } else {
173 // raw section
174 section.has_conv = false;
175 while (str[cur_pos] != '%' && str[cur_pos] != '\0')
176 ++cur_pos;
177 }
178 section.raw_string = {str + starting_pos, cur_pos - starting_pos};
179 return section;
180 }
181
182private:
183 // parse_length_modifier parses the length modifier inside a format string. It
184 // assumes that str[*local_pos] is inside a format specifier. It returns a
185 // LengthModifier with the length modifier it found. It will advance local_pos
186 // after the format specifier if one is found.
187 LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) {
188 switch (str[*local_pos]) {
189 case ('l'):
190 if (str[*local_pos + 1] == 'l') {
191 *local_pos += 2;
192 return LengthModifier::ll;
193 } else {
194 ++*local_pos;
195 return LengthModifier::l;
196 }
197 case ('h'):
198 if (str[*local_pos + 1] == 'h') {
199 *local_pos += 2;
200 return LengthModifier::hh;
201 } else {
202 ++*local_pos;
203 return LengthModifier::h;
204 }
205 case ('L'):
206 ++*local_pos;
207 return LengthModifier::L;
208 case ('j'):
209 ++*local_pos;
210 return LengthModifier::j;
211 case ('z'):
212 ++*local_pos;
213 return LengthModifier::z;
214 case ('t'):
215 ++*local_pos;
216 return LengthModifier::t;
217 default:
218 return LengthModifier::NONE;
219 }
220 }
221
222 // get_next_arg_value gets the next value from the arg list as type T.
223 template <class T> LIBC_INLINE T get_next_arg_value() {
224 return args_cur.template next_var<T>();
225 }
226
227 //----------------------------------------------------
228 // INDEX MODE ONLY FUNCTIONS AFTER HERE:
229 //----------------------------------------------------
230
231#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
232
233 // parse_index parses the index of a value inside a format string. It
234 // assumes that str[*local_pos] points to character after a '%' or '*', and
235 // returns 0 if there is no closing $, or if it finds no number. If it finds a
236 // number, it will move local_pos past the end of the $, else it will not move
237 // local_pos.
238 LIBC_INLINE size_t parse_index(size_t *local_pos) {
239 if (internal::isdigit(str[*local_pos])) {
240 auto result = internal::strtointeger<int>(str + *local_pos, 10);
241 size_t index = static_cast<size_t>(result.value);
242 if (str[*local_pos + static_cast<size_t>(result.parsed_len)] != '$')
243 return 0;
244 *local_pos = static_cast<size_t>(1 + result.parsed_len) + *local_pos;
245 return index;
246 }
247 return 0;
248 }
249
250 // get_arg_value gets the value from the arg list at index (starting at 1).
251 // This may require parsing the format string. An index of 0 is interpreted as
252 // the next value.
253 template <class T> LIBC_INLINE T get_arg_value(size_t index) {
254 if (!(index == 0 || index == args_index))
255 args_to_index(index);
256
257 ++args_index;
258 return get_next_arg_value<T>();
259 }
260
261 // the ArgList can only return the next item in the list. This function is
262 // used in index mode when the item that needs to be read is not the next one.
263 // It moves cur_args to the index requested so the appropriate value may
264 // be read. This may involve parsing the format string, and is in the worst
265 // case an O(n^2) operation.
266 LIBC_INLINE void args_to_index(size_t index) {
267 if (args_index > index) {
268 args_index = 1;
269 args_cur = args_start;
270 }
271
272 while (args_index < index) {
273 // Since all arguments must be pointers, we can just read all of them as
274 // void * and not worry about type issues.
275 args_cur.template next_var<void *>();
276 ++args_index;
277 }
278 }
279
280#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
281};
282
283} // namespace scanf_core
284} // namespace LIBC_NAMESPACE_DECL
285
286#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
287

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of libc/src/stdio/scanf_core/parser.h