Warning: This file is not a C or C++ file. It does not have highlighting.
| 1 | //===-- Int type specifier converter for scanf ------------------*- C++ -*-===// |
|---|---|
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H |
| 10 | #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H |
| 11 | |
| 12 | #include "src/__support/CPP/limits.h" |
| 13 | #include "src/__support/ctype_utils.h" |
| 14 | #include "src/__support/macros/config.h" |
| 15 | #include "src/stdio/scanf_core/converter_utils.h" |
| 16 | #include "src/stdio/scanf_core/core_structs.h" |
| 17 | #include "src/stdio/scanf_core/reader.h" |
| 18 | |
| 19 | #include <stddef.h> |
| 20 | |
| 21 | namespace LIBC_NAMESPACE_DECL { |
| 22 | namespace scanf_core { |
| 23 | |
| 24 | // This code is very similar to the code in __support/str_to_integer.h but is |
| 25 | // not quite the same. Here is the list of differences and why they exist: |
| 26 | // 1) This takes a reader and a format section instead of a char* and the base. |
| 27 | // This should be fairly self explanatory. While the char* could be adapted |
| 28 | // to a reader and the base could be calculated ahead of time, the |
| 29 | // semantics are slightly different, specifically a char* can be indexed |
| 30 | // freely (I can read str[2] and then str[0]) whereas a File (which the |
| 31 | // reader may contain) cannot. |
| 32 | // 2) Because this uses a Reader, this function can only unget once. |
| 33 | // This is relevant because scanf specifies it reads the "longest sequence |
| 34 | // of input characters which does not exceed any specified field width and |
| 35 | // which is, or is a prefix of, a matching input sequence." Whereas the |
| 36 | // strtol function accepts "the longest initial subsequence of the input |
| 37 | // string (...) that is of the expected form." This is demonstrated by the |
| 38 | // differences in how they deal with the string "0xZZZ" when parsing as |
| 39 | // hexadecimal. Scanf consumes the "0x" as a valid prefix, reads the first |
| 40 | // 'Z', sees that it's not a valid hex digit, and can put back only that one |
| 41 | // character; a bare "0x" is not a number, so this is a matching failure. |
| 42 | // The strtol function on the other hand only accepts the "0" since that's |
| 43 | // the longest valid hexadecimal sequence. It sees the 'Z' after the "0x" |
| 44 | // and determines that this is not the prefix to a valid hex string. |
| 45 | // 3) This conversion may have a maximum width. |
| 46 | // If a maximum width is specified, this conversion is only allowed to |
| 47 | // accept a certain number of characters. Strtol doesn't have any such |
| 48 | // limitation. |
| 49 | template <typename T> |
| 50 | int convert_int(Reader<T> *reader, const FormatSection &to_conv) { |
| 51 | // %d "Matches an optionally signed decimal integer [...] with the value 10 |
| 52 | // for the base argument. The corresponding argument shall be a pointer to |
| 53 | // signed integer." |
| 54 | |
| 55 | // %i "Matches an optionally signed integer [...] with the value 0 for the |
| 56 | // base argument. The corresponding argument shall be a pointer to signed |
| 57 | // integer." |
| 58 | |
| 59 | // %u "Matches an optionally signed decimal integer [...] with the value 10 |
| 60 | // for the base argument. The corresponding argument shall be a pointer to |
| 61 | // unsigned integer" |
| 62 | |
| 63 | // %o "Matches an optionally signed octal integer [...] with the value 8 for |
| 64 | // the base argument. The corresponding argument shall be a pointer to |
| 65 | // unsigned integer" |
| 66 | |
| 67 | // %x/X "Matches an optionally signed hexadecimal integer [...] with the value |
| 68 | // 16 for the base argument. The corresponding argument shall be a pointer to |
| 69 | // unsigned integer" |
| 70 | |
| 71 | size_t max_width = cpp::numeric_limits<size_t>::max(); |
| 72 | if (to_conv.max_width > 0) { |
| 73 | max_width = to_conv.max_width; |
| 74 | } |
| 75 | |
| 76 | uintmax_t result = 0; |
| 77 | bool is_number = false; |
| 78 | bool is_signed = false; |
| 79 | int base = 0; |
| 80 | if (to_conv.conv_name == 'i') { |
| 81 | base = 0; |
| 82 | is_signed = true; |
| 83 | } else if (to_conv.conv_name == 'o') { |
| 84 | base = 8; |
| 85 | } else if (internal::tolower(to_conv.conv_name) == 'x' || |
| 86 | to_conv.conv_name == 'p') { |
| 87 | base = 16; |
| 88 | } else if (to_conv.conv_name == 'd') { |
| 89 | base = 10; |
| 90 | is_signed = true; |
| 91 | } else { // conv_name must be 'u' |
| 92 | base = 10; |
| 93 | } |
| 94 | |
| 95 | char cur_char = reader->getc(); |
| 96 | |
| 97 | char result_sign = '+'; |
| 98 | if (cur_char == '+' || cur_char == '-') { |
| 99 | result_sign = cur_char; |
| 100 | if (max_width > 1) { |
| 101 | --max_width; |
| 102 | cur_char = reader->getc(); |
| 103 | } else { |
| 104 | // If the max width has been hit already, then the return value must be 0 |
| 105 | // since no actual digits of the number have been parsed yet. |
| 106 | write_int_with_length(0, to_conv); |
| 107 | return MATCHING_FAILURE; |
| 108 | } |
| 109 | } |
| 110 | const bool is_negative = result_sign == '-'; |
| 111 | |
| 112 | // Base of 0 means automatically determine the base. Base of 16 may have a |
| 113 | // prefix of "0x" |
| 114 | if (base == 0 || base == 16) { |
| 115 | // If the first character is 0, then it could be octal or hex. |
| 116 | if (cur_char == '0') { |
| 117 | is_number = true; |
| 118 | |
| 119 | // Read the next character to check. |
| 120 | if (max_width > 1) { |
| 121 | --max_width; |
| 122 | cur_char = reader->getc(); |
| 123 | } else { |
| 124 | write_int_with_length(0, to_conv); |
| 125 | return READ_OK; |
| 126 | } |
| 127 | |
| 128 | if (internal::tolower(cur_char) == 'x') { |
| 129 | // This is a valid hex prefix. |
| 130 | |
| 131 | is_number = false; |
| 132 | // A valid hex prefix is not necessarily a valid number. For the |
| 133 | // conversion to be valid it needs to use all of the characters it |
| 134 | // consumes. From the standard: |
| 135 | // 7.23.6.2 paragraph 9: "An input item is defined as the longest |
| 136 | // sequence of input characters which does not exceed any specified |
| 137 | // field width and which is, or is a prefix of, a matching input |
| 138 | // sequence." |
| 139 | // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence, |
| 140 | // the execution of the directive fails: this condition is a matching |
| 141 | // failure" |
| 142 | base = 16; |
| 143 | if (max_width > 1) { |
| 144 | --max_width; |
| 145 | cur_char = reader->getc(); |
| 146 | } else { |
| 147 | return MATCHING_FAILURE; |
| 148 | } |
| 149 | |
| 150 | } else { |
| 151 | if (base == 0) { |
| 152 | base = 8; |
| 153 | } |
| 154 | } |
| 155 | } else if (base == 0) { |
| 156 | if (internal::isdigit(cur_char)) { |
| 157 | // If the first character is a different number, then it's 10. |
| 158 | base = 10; |
| 159 | } else { |
| 160 | // If the first character isn't a valid digit, then there are no valid |
| 161 | // digits at all. The number is 0. |
| 162 | reader->ungetc(cur_char); |
| 163 | write_int_with_length(0, to_conv); |
| 164 | return MATCHING_FAILURE; |
| 165 | } |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max(); |
| 170 | constexpr uintmax_t SIGNED_MAX = |
| 171 | static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()); |
| 172 | constexpr uintmax_t NEGATIVE_SIGNED_MAX = |
| 173 | static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1; |
| 174 | |
| 175 | const uintmax_t MAX = |
| 176 | (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX) |
| 177 | : UNSIGNED_MAX); |
| 178 | |
| 179 | const uintmax_t max_div_by_base = MAX / base; |
| 180 | |
| 181 | if (internal::isalnum(cur_char) && |
| 182 | internal::b36_char_to_int(cur_char) < base) { |
| 183 | is_number = true; |
| 184 | } |
| 185 | |
| 186 | bool has_overflow = false; |
| 187 | size_t i = 0; |
| 188 | for (; i < max_width && internal::isalnum(cur_char) && |
| 189 | internal::b36_char_to_int(cur_char) < base; |
| 190 | ++i, cur_char = reader->getc()) { |
| 191 | |
| 192 | uintmax_t cur_digit = internal::b36_char_to_int(cur_char); |
| 193 | |
| 194 | if (result == MAX) { |
| 195 | has_overflow = true; |
| 196 | continue; |
| 197 | } else if (result > max_div_by_base) { |
| 198 | result = MAX; |
| 199 | has_overflow = true; |
| 200 | } else { |
| 201 | result = result * base; |
| 202 | } |
| 203 | |
| 204 | if (result > MAX - cur_digit) { |
| 205 | result = MAX; |
| 206 | has_overflow = true; |
| 207 | } else { |
| 208 | result = result + cur_digit; |
| 209 | } |
| 210 | } |
| 211 | |
| 212 | // We always read one more character than will be used, so we have to put the |
| 213 | // last one back. |
| 214 | reader->ungetc(cur_char); |
| 215 | |
| 216 | if (!is_number) |
| 217 | return MATCHING_FAILURE; |
| 218 | |
| 219 | if (has_overflow) { |
| 220 | write_int_with_length(MAX, to_conv); |
| 221 | } else { |
| 222 | if (is_negative) |
| 223 | result = -result; |
| 224 | |
| 225 | write_int_with_length(result, to_conv); |
| 226 | } |
| 227 | |
| 228 | return READ_OK; |
| 229 | } |
| 230 | |
| 231 | } // namespace scanf_core |
| 232 | } // namespace LIBC_NAMESPACE_DECL |
| 233 | |
| 234 | #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H |
| 235 |
Warning: This file is not a C or C++ file. It does not have highlighting.
