Warning: This file is not a C or C++ file. It does not have highlighting.
| 1 | //===-- Format string parser for scanf -------------------------*- C++ -*-===// |
|---|---|
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
| 10 | #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
| 11 | |
| 12 | #include "src/__support/arg_list.h" |
| 13 | #include "src/__support/ctype_utils.h" |
| 14 | #include "src/__support/macros/config.h" |
| 15 | #include "src/__support/str_to_integer.h" |
| 16 | #include "src/stdio/scanf_core/core_structs.h" |
| 17 | #include "src/stdio/scanf_core/scanf_config.h" |
| 18 | |
| 19 | #include <stddef.h> |
| 20 | |
| 21 | namespace LIBC_NAMESPACE_DECL { |
| 22 | namespace scanf_core { |
| 23 | |
// GET_ARG_VAL_SIMPLEST(arg_type, index) expands to the Parser member call that
// fetches one argument of type arg_type. When index mode is compiled out the
// index operand is discarded and the next sequential argument is read;
// otherwise the request is routed through get_arg_value so that an explicit
// position can be honored.
#ifdef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
#else
#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
| 29 | |
| 30 | template <typename ArgProvider> class Parser { |
| 31 | const char *__restrict str; |
| 32 | |
| 33 | size_t cur_pos = 0; |
| 34 | ArgProvider args_cur; |
| 35 | |
| 36 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 37 | // args_start stores the start of the va_args, which is used when a previous |
| 38 | // argument is needed. In that case, we have to read the arguments from the |
| 39 | // beginning since they don't support reading backwards. |
| 40 | ArgProvider args_start; |
| 41 | size_t args_index = 1; |
| 42 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 43 | |
| 44 | public: |
| 45 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 46 | LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) |
| 47 | : str(new_str), args_cur(args), args_start(args) {} |
| 48 | #else |
| 49 | LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) |
| 50 | : str(new_str), args_cur(args) {} |
| 51 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 52 | |
| 53 | // get_next_section will parse the format string until it has a fully |
| 54 | // specified format section. This can either be a raw format section with no |
| 55 | // conversion, or a format section with a conversion that has all of its |
| 56 | // variables stored in the format section. |
| 57 | LIBC_INLINE FormatSection get_next_section() { |
| 58 | FormatSection section; |
| 59 | size_t starting_pos = cur_pos; |
| 60 | if (str[cur_pos] == '%') { |
| 61 | // format section |
| 62 | section.has_conv = true; |
| 63 | |
| 64 | ++cur_pos; |
| 65 | [[maybe_unused]] size_t conv_index = 0; |
| 66 | |
| 67 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 68 | conv_index = parse_index(&cur_pos); |
| 69 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 70 | |
| 71 | if (str[cur_pos] == '*') { |
| 72 | ++cur_pos; |
| 73 | section.flags = FormatFlags::NO_WRITE; |
| 74 | } |
| 75 | |
| 76 | // handle width |
| 77 | section.max_width = -1; |
| 78 | if (internal::isdigit(str[cur_pos])) { |
| 79 | auto result = internal::strtointeger<int>(str + cur_pos, 10); |
| 80 | section.max_width = result.value; |
| 81 | cur_pos = cur_pos + static_cast<size_t>(result.parsed_len); |
| 82 | } |
| 83 | |
| 84 | // TODO(michaelrj): add posix allocate flag support. |
| 85 | // if (str[cur_pos] == 'm') { |
| 86 | // ++cur_pos; |
| 87 | // section.flags = FormatFlags::ALLOCATE; |
| 88 | // } |
| 89 | |
| 90 | LengthModifier lm = parse_length_modifier(&cur_pos); |
| 91 | section.length_modifier = lm; |
| 92 | |
| 93 | section.conv_name = str[cur_pos]; |
| 94 | |
| 95 | // If NO_WRITE is not set, then read the next arg as the output pointer. |
| 96 | if ((section.flags & FormatFlags::NO_WRITE) == 0) { |
| 97 | // Since all outputs are pointers, there's no need to distinguish when |
| 98 | // reading from va_args. They're all the same size and stored the same. |
| 99 | section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index); |
| 100 | } |
| 101 | |
| 102 | // If the end of the format section is on the '\0'. This means we need to |
| 103 | // not advance the cur_pos and we should not count this has having a |
| 104 | // conversion. |
| 105 | if (str[cur_pos] != '\0') { |
| 106 | ++cur_pos; |
| 107 | } else { |
| 108 | section.has_conv = false; |
| 109 | } |
| 110 | |
| 111 | // If the format is a bracketed one, then we need to parse out the insides |
| 112 | // of the brackets. |
| 113 | if (section.conv_name == '[') { |
| 114 | constexpr char CLOSING_BRACKET = ']'; |
| 115 | constexpr char INVERT_FLAG = '^'; |
| 116 | constexpr char RANGE_OPERATOR = '-'; |
| 117 | |
| 118 | cpp::bitset<256> scan_set; |
| 119 | bool invert = false; |
| 120 | |
| 121 | // The circumflex in the first position represents the inversion flag, |
| 122 | // but it's easier to apply that at the end so we just store it for now. |
| 123 | if (str[cur_pos] == INVERT_FLAG) { |
| 124 | invert = true; |
| 125 | ++cur_pos; |
| 126 | } |
| 127 | |
| 128 | // This is used to determine if a hyphen is being used as a literal or |
| 129 | // as a range operator. |
| 130 | size_t set_start_pos = cur_pos; |
| 131 | |
| 132 | // Normally the right bracket closes the set, but if it's the first |
| 133 | // character (possibly after the inversion flag) then it's instead |
| 134 | // included as a character in the set and the second right bracket |
| 135 | // closes the set. |
| 136 | if (str[cur_pos] == CLOSING_BRACKET) { |
| 137 | scan_set.set(CLOSING_BRACKET); |
| 138 | ++cur_pos; |
| 139 | } |
| 140 | |
| 141 | while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) { |
| 142 | // If a hyphen is being used as a range operator, since it's neither |
| 143 | // at the beginning nor end of the set. |
| 144 | if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos && |
| 145 | str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') { |
| 146 | // Technically there is no requirement to correct the ordering of |
| 147 | // the range, but since the range operator is entirely |
| 148 | // implementation defined it seems like a good convenience. |
| 149 | char a = str[cur_pos - 1]; |
| 150 | char b = str[cur_pos + 1]; |
| 151 | char start = (a < b ? a : b); |
| 152 | char end = (a < b ? b : a); |
| 153 | scan_set.set_range(static_cast<size_t>(start), |
| 154 | static_cast<size_t>(end)); |
| 155 | cur_pos += 2; |
| 156 | } else { |
| 157 | scan_set.set(static_cast<size_t>(str[cur_pos])); |
| 158 | ++cur_pos; |
| 159 | } |
| 160 | } |
| 161 | if (invert) |
| 162 | scan_set.flip(); |
| 163 | |
| 164 | if (str[cur_pos] == CLOSING_BRACKET) { |
| 165 | ++cur_pos; |
| 166 | section.scan_set = scan_set; |
| 167 | } else { |
| 168 | // if the end of the string was encountered, this is not a valid set. |
| 169 | section.has_conv = false; |
| 170 | } |
| 171 | } |
| 172 | } else { |
| 173 | // raw section |
| 174 | section.has_conv = false; |
| 175 | while (str[cur_pos] != '%' && str[cur_pos] != '\0') |
| 176 | ++cur_pos; |
| 177 | } |
| 178 | section.raw_string = {str + starting_pos, cur_pos - starting_pos}; |
| 179 | return section; |
| 180 | } |
| 181 | |
| 182 | private: |
| 183 | // parse_length_modifier parses the length modifier inside a format string. It |
| 184 | // assumes that str[*local_pos] is inside a format specifier. It returns a |
| 185 | // LengthModifier with the length modifier it found. It will advance local_pos |
| 186 | // after the format specifier if one is found. |
| 187 | LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { |
| 188 | switch (str[*local_pos]) { |
| 189 | case ('l'): |
| 190 | if (str[*local_pos + 1] == 'l') { |
| 191 | *local_pos += 2; |
| 192 | return LengthModifier::ll; |
| 193 | } else { |
| 194 | ++*local_pos; |
| 195 | return LengthModifier::l; |
| 196 | } |
| 197 | case ('h'): |
| 198 | if (str[*local_pos + 1] == 'h') { |
| 199 | *local_pos += 2; |
| 200 | return LengthModifier::hh; |
| 201 | } else { |
| 202 | ++*local_pos; |
| 203 | return LengthModifier::h; |
| 204 | } |
| 205 | case ('L'): |
| 206 | ++*local_pos; |
| 207 | return LengthModifier::L; |
| 208 | case ('j'): |
| 209 | ++*local_pos; |
| 210 | return LengthModifier::j; |
| 211 | case ('z'): |
| 212 | ++*local_pos; |
| 213 | return LengthModifier::z; |
| 214 | case ('t'): |
| 215 | ++*local_pos; |
| 216 | return LengthModifier::t; |
| 217 | default: |
| 218 | return LengthModifier::NONE; |
| 219 | } |
| 220 | } |
| 221 | |
| 222 | // get_next_arg_value gets the next value from the arg list as type T. |
| 223 | template <class T> LIBC_INLINE T get_next_arg_value() { |
| 224 | return args_cur.template next_var<T>(); |
| 225 | } |
| 226 | |
| 227 | //---------------------------------------------------- |
| 228 | // INDEX MODE ONLY FUNCTIONS AFTER HERE: |
| 229 | //---------------------------------------------------- |
| 230 | |
| 231 | #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 232 | |
| 233 | // parse_index parses the index of a value inside a format string. It |
| 234 | // assumes that str[*local_pos] points to character after a '%' or '*', and |
| 235 | // returns 0 if there is no closing $, or if it finds no number. If it finds a |
| 236 | // number, it will move local_pos past the end of the $, else it will not move |
| 237 | // local_pos. |
| 238 | LIBC_INLINE size_t parse_index(size_t *local_pos) { |
| 239 | if (internal::isdigit(str[*local_pos])) { |
| 240 | auto result = internal::strtointeger<int>(str + *local_pos, 10); |
| 241 | size_t index = static_cast<size_t>(result.value); |
| 242 | if (str[*local_pos + static_cast<size_t>(result.parsed_len)] != '$') |
| 243 | return 0; |
| 244 | *local_pos = static_cast<size_t>(1 + result.parsed_len) + *local_pos; |
| 245 | return index; |
| 246 | } |
| 247 | return 0; |
| 248 | } |
| 249 | |
| 250 | // get_arg_value gets the value from the arg list at index (starting at 1). |
| 251 | // This may require parsing the format string. An index of 0 is interpreted as |
| 252 | // the next value. |
| 253 | template <class T> LIBC_INLINE T get_arg_value(size_t index) { |
| 254 | if (!(index == 0 || index == args_index)) |
| 255 | args_to_index(index); |
| 256 | |
| 257 | ++args_index; |
| 258 | return get_next_arg_value<T>(); |
| 259 | } |
| 260 | |
| 261 | // the ArgList can only return the next item in the list. This function is |
| 262 | // used in index mode when the item that needs to be read is not the next one. |
| 263 | // It moves cur_args to the index requested so the appropriate value may |
| 264 | // be read. This may involve parsing the format string, and is in the worst |
| 265 | // case an O(n^2) operation. |
| 266 | LIBC_INLINE void args_to_index(size_t index) { |
| 267 | if (args_index > index) { |
| 268 | args_index = 1; |
| 269 | args_cur = args_start; |
| 270 | } |
| 271 | |
| 272 | while (args_index < index) { |
| 273 | // Since all arguments must be pointers, we can just read all of them as |
| 274 | // void * and not worry about type issues. |
| 275 | args_cur.template next_var<void *>(); |
| 276 | ++args_index; |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE |
| 281 | }; |
| 282 | |
| 283 | } // namespace scanf_core |
| 284 | } // namespace LIBC_NAMESPACE_DECL |
| 285 | |
| 286 | #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H |
| 287 |
Warning: This file is not a C or C++ file. It does not have highlighting.
