1 | //===-- Int type specifier converters for scanf -----------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "src/stdio/scanf_core/int_converter.h" |
10 | |
11 | #include "src/__support/CPP/limits.h" |
12 | #include "src/__support/ctype_utils.h" |
13 | #include "src/stdio/scanf_core/converter_utils.h" |
14 | #include "src/stdio/scanf_core/core_structs.h" |
15 | #include "src/stdio/scanf_core/reader.h" |
16 | |
17 | #include <stddef.h> |
18 | |
19 | namespace LIBC_NAMESPACE { |
20 | namespace scanf_core { |
21 | |
22 | // This code is very similar to the code in __support/str_to_integer.h but is |
23 | // not quite the same. Here is the list of differences and why they exist: |
24 | // 1) This takes a reader and a format section instead of a char* and the base. |
25 | // This should be fairly self explanatory. While the char* could be adapted |
26 | // to a reader and the base could be calculated ahead of time, the |
27 | // semantics are slightly different, specifically a char* can be indexed |
28 | // freely (I can read str[2] and then str[0]) whereas a File (which the |
29 | // reader may contain) cannot. |
30 | // 2) Because this uses a Reader, this function can only unget once. |
31 | // This is relevant because scanf specifies it reads the "longest sequence |
32 | // of input characters which does not exceed any specified field width and |
33 | // which is, or is a prefix of, a matching input sequence." Whereas the |
34 | // strtol function accepts "the longest initial subsequence of the input |
35 | // string (...) that is of the expected form." This is demonstrated by the |
36 | // differences in how they deal with the string "0xZZZ" when parsing as |
37 | // hexadecimal. Scanf will read the "0x" as a valid prefix and return 0, |
38 | // since it reads the first 'Z', sees that it's not a valid hex digit, and |
39 | // reverses one character. The strtol function on the other hand only |
40 | // accepts the "0" since that's the longest valid hexadecimal sequence. It |
41 | // sees the 'Z' after the "0x" and determines that this is not the prefix |
42 | // to a valid hex string. |
43 | // 3) This conversion may have a maximum width. |
44 | // If a maximum width is specified, this conversion is only allowed to |
45 | // accept a certain number of characters. Strtol doesn't have any such |
46 | // limitation. |
47 | int convert_int(Reader *reader, const FormatSection &to_conv) { |
48 | // %d "Matches an optionally signed decimal integer [...] with the value 10 |
49 | // for the base argument. The corresponding argument shall be a pointer to |
50 | // signed integer." |
51 | |
52 | // %i "Matches an optionally signed integer [...] with the value 0 for the |
53 | // base argument. The corresponding argument shall be a pointer to signed |
54 | // integer." |
55 | |
56 | // %u "Matches an optionally signed decimal integer [...] with the value 10 |
57 | // for the base argument. The corresponding argument shall be a pointer to |
58 | // unsigned integer" |
59 | |
60 | // %o "Matches an optionally signed octal integer [...] with the value 8 for |
61 | // the base argument. The corresponding argument shall be a pointer to |
62 | // unsigned integer" |
63 | |
64 | // %x/X "Matches an optionally signed hexadecimal integer [...] with the value |
65 | // 16 for the base argument. The corresponding argument shall be a pointer to |
66 | // unsigned integer" |
67 | |
68 | size_t max_width = cpp::numeric_limits<size_t>::max(); |
69 | if (to_conv.max_width > 0) { |
70 | max_width = to_conv.max_width; |
71 | } |
72 | |
73 | uintmax_t result = 0; |
74 | bool is_number = false; |
75 | bool is_signed = false; |
76 | int base = 0; |
77 | if (to_conv.conv_name == 'i') { |
78 | base = 0; |
79 | is_signed = true; |
80 | } else if (to_conv.conv_name == 'o') { |
81 | base = 8; |
82 | } else if (to_lower(a: to_conv.conv_name) == 'x' || to_conv.conv_name == 'p') { |
83 | base = 16; |
84 | } else if (to_conv.conv_name == 'd') { |
85 | base = 10; |
86 | is_signed = true; |
87 | } else { // conv_name must be 'u' |
88 | base = 10; |
89 | } |
90 | |
91 | char cur_char = reader->getc(); |
92 | |
93 | char result_sign = '+'; |
94 | if (cur_char == '+' || cur_char == '-') { |
95 | result_sign = cur_char; |
96 | if (max_width > 1) { |
97 | --max_width; |
98 | cur_char = reader->getc(); |
99 | } else { |
100 | // If the max width has been hit already, then the return value must be 0 |
101 | // since no actual digits of the number have been parsed yet. |
102 | write_int_with_length(output_val: 0, to_conv); |
103 | return MATCHING_FAILURE; |
104 | } |
105 | } |
106 | const bool is_negative = result_sign == '-'; |
107 | |
108 | // Base of 0 means automatically determine the base. Base of 16 may have a |
109 | // prefix of "0x" |
110 | if (base == 0 || base == 16) { |
111 | // If the first character is 0, then it could be octal or hex. |
112 | if (cur_char == '0') { |
113 | is_number = true; |
114 | |
115 | // Read the next character to check. |
116 | if (max_width > 1) { |
117 | --max_width; |
118 | cur_char = reader->getc(); |
119 | } else { |
120 | write_int_with_length(output_val: 0, to_conv); |
121 | return READ_OK; |
122 | } |
123 | |
124 | if (to_lower(a: cur_char) == 'x') { |
125 | // This is a valid hex prefix. |
126 | base = 16; |
127 | if (max_width > 1) { |
128 | --max_width; |
129 | cur_char = reader->getc(); |
130 | } else { |
131 | write_int_with_length(output_val: 0, to_conv); |
132 | return READ_OK; |
133 | } |
134 | |
135 | } else { |
136 | if (base == 0) { |
137 | base = 8; |
138 | } |
139 | } |
140 | } else if (base == 0) { |
141 | if (internal::isdigit(ch: cur_char)) { |
142 | // If the first character is a different number, then it's 10. |
143 | base = 10; |
144 | } else { |
145 | // If the first character isn't a valid digit, then there are no valid |
146 | // digits at all. The number is 0. |
147 | reader->ungetc(c: cur_char); |
148 | write_int_with_length(output_val: 0, to_conv); |
149 | return MATCHING_FAILURE; |
150 | } |
151 | } |
152 | } |
153 | |
154 | constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max(); |
155 | constexpr uintmax_t SIGNED_MAX = |
156 | static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()); |
157 | constexpr uintmax_t NEGATIVE_SIGNED_MAX = |
158 | static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1; |
159 | |
160 | const uintmax_t MAX = |
161 | (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX) |
162 | : UNSIGNED_MAX); |
163 | |
164 | const uintmax_t max_div_by_base = MAX / base; |
165 | |
166 | if (internal::isalnum(ch: cur_char) && b36_char_to_int(input: cur_char) < base) { |
167 | is_number = true; |
168 | } |
169 | |
170 | bool has_overflow = false; |
171 | size_t i = 0; |
172 | for (; i < max_width && internal::isalnum(ch: cur_char) && |
173 | b36_char_to_int(input: cur_char) < base; |
174 | ++i, cur_char = reader->getc()) { |
175 | |
176 | uintmax_t cur_digit = b36_char_to_int(input: cur_char); |
177 | |
178 | if (result == MAX) { |
179 | has_overflow = true; |
180 | continue; |
181 | } else if (result > max_div_by_base) { |
182 | result = MAX; |
183 | has_overflow = true; |
184 | } else { |
185 | result = result * base; |
186 | } |
187 | |
188 | if (result > MAX - cur_digit) { |
189 | result = MAX; |
190 | has_overflow = true; |
191 | } else { |
192 | result = result + cur_digit; |
193 | } |
194 | } |
195 | |
196 | // We always read one more character than will be used, so we have to put the |
197 | // last one back. |
198 | reader->ungetc(c: cur_char); |
199 | |
200 | if (has_overflow) { |
201 | write_int_with_length(output_val: MAX, to_conv); |
202 | } else { |
203 | if (is_negative) |
204 | result = -result; |
205 | |
206 | write_int_with_length(output_val: result, to_conv); |
207 | } |
208 | |
209 | if (!is_number) |
210 | return MATCHING_FAILURE; |
211 | return READ_OK; |
212 | } |
213 | |
214 | } // namespace scanf_core |
215 | } // namespace LIBC_NAMESPACE |
216 | |