1 | //===-- lib/Parser/characters.cpp -----------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang/Parser/characters.h" |
10 | #include "flang/Common/idioms.h" |
11 | #include <algorithm> |
12 | #include <cstddef> |
13 | #include <optional> |
14 | #include <type_traits> |
15 | |
16 | namespace Fortran::parser { |
17 | |
// Global flag, defined here with an initial value of false. Nothing in this
// translation unit reads it.
// NOTE(review): presumably consulted by the character-quoting code declared
// in flang/Parser/characters.h (e.g. EmitQuotedChar) to select hexadecimal
// rather than octal escapes -- confirm against the header.
bool useHexadecimalEscapeSequences{false};
19 | |
// Returns the length in bytes (1 through 6) of the UTF-8 sequence that the
// lead byte at *p announces.  Only the lead byte is inspected.  Any byte
// that matches none of the one- to five-byte lead patterns (including
// continuation bytes of the form 10xxxxxx) is reported as 6.
int UTF_8CharacterBytes(const char *p) {
  struct LeadForm {
    unsigned mask;
    unsigned pattern;
    int length;
  };
  static constexpr LeadForm leadForms[]{
      {0x80, 0x00, 1}, // 0xxxxxxx
      {0xe0, 0xc0, 2}, // 110xxxxx
      {0xf0, 0xe0, 3}, // 1110xxxx
      {0xf8, 0xf0, 4}, // 11110xxx
      {0xfc, 0xf8, 5}, // 111110xx
  };
  const unsigned lead{static_cast<unsigned char>(*p)};
  for (const LeadForm &form : leadForms) {
    if ((lead & form.mask) == form.pattern) {
      return form.length;
    }
  }
  return 6;
}
35 | |
36 | template <typename STRING> |
37 | std::string QuoteCharacterLiteralHelper( |
38 | const STRING &str, bool backslashEscapes, Encoding encoding) { |
39 | std::string result{'"'}; |
40 | const auto emit{[&](char ch) { result += ch; }}; |
41 | for (auto ch : str) { |
42 | using CharT = std::decay_t<decltype(ch)>; |
43 | char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)}; |
44 | if (ch32 == static_cast<unsigned char>('"')) { |
45 | emit('"'); // double the " when it appears in the text |
46 | } |
47 | EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding); |
48 | } |
49 | result += '"'; |
50 | return result; |
51 | } |
52 | |
53 | std::string QuoteCharacterLiteral( |
54 | const std::string &str, bool backslashEscapes, Encoding encoding) { |
55 | return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); |
56 | } |
57 | |
58 | std::string QuoteCharacterLiteral( |
59 | const std::u16string &str, bool backslashEscapes, Encoding encoding) { |
60 | return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); |
61 | } |
62 | |
63 | std::string QuoteCharacterLiteral( |
64 | const std::u32string &str, bool backslashEscapes, Encoding encoding) { |
65 | return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); |
66 | } |
67 | |
68 | template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) { |
69 | CHECK(ucs <= 0xff); |
70 | EncodedCharacter result; |
71 | result.buffer[0] = ucs; |
72 | result.bytes = 1; |
73 | return result; |
74 | } |
75 | |
76 | template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) { |
77 | // N.B. char32_t is unsigned |
78 | EncodedCharacter result; |
79 | if (ucs <= 0x7f) { |
80 | result.buffer[0] = ucs; |
81 | result.bytes = 1; |
82 | } else if (ucs <= 0x7ff) { |
83 | result.buffer[0] = 0xc0 | (ucs >> 6); |
84 | result.buffer[1] = 0x80 | (ucs & 0x3f); |
85 | result.bytes = 2; |
86 | } else if (ucs <= 0xffff) { |
87 | result.buffer[0] = 0xe0 | (ucs >> 12); |
88 | result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f); |
89 | result.buffer[2] = 0x80 | (ucs & 0x3f); |
90 | result.bytes = 3; |
91 | } else if (ucs <= 0x1fffff) { |
92 | // UCS actually only goes up to 0x10ffff, but the |
93 | // UTF-8 encoding can handle 32 bits. |
94 | result.buffer[0] = 0xf0 | (ucs >> 18); |
95 | result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f); |
96 | result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f); |
97 | result.buffer[3] = 0x80 | (ucs & 0x3f); |
98 | result.bytes = 4; |
99 | } else if (ucs <= 0x3ffffff) { |
100 | result.buffer[0] = 0xf8 | (ucs >> 24); |
101 | result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f); |
102 | result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f); |
103 | result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f); |
104 | result.buffer[4] = 0x80 | (ucs & 0x3f); |
105 | result.bytes = 5; |
106 | } else { |
107 | result.buffer[0] = 0xfc | (ucs >> 30); |
108 | result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f); |
109 | result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f); |
110 | result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f); |
111 | result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f); |
112 | result.buffer[5] = 0x80 | (ucs & 0x3f); |
113 | result.bytes = 6; |
114 | } |
115 | return result; |
116 | } |
117 | |
118 | EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) { |
119 | switch (encoding) { |
120 | SWITCH_COVERS_ALL_CASES |
121 | case Encoding::LATIN_1: |
122 | return EncodeCharacter<Encoding::LATIN_1>(ucs); |
123 | case Encoding::UTF_8: |
124 | return EncodeCharacter<Encoding::UTF_8>(ucs); |
125 | } |
126 | } |
127 | |
128 | template <Encoding ENCODING, typename STRING> |
129 | std::string EncodeString(const STRING &str) { |
130 | std::string result; |
131 | for (auto ch : str) { |
132 | char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)}; |
133 | EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)}; |
134 | result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes)); |
135 | } |
136 | return result; |
137 | } |
138 | |
139 | template std::string EncodeString<Encoding::LATIN_1, std::string>( |
140 | const std::string &); |
141 | template std::string EncodeString<Encoding::UTF_8, std::u16string>( |
142 | const std::u16string &); |
143 | template std::string EncodeString<Encoding::UTF_8, std::u32string>( |
144 | const std::u32string &); |
145 | |
146 | template <> |
147 | DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>( |
148 | const char *cp, std::size_t bytes) { |
149 | if (bytes >= 1) { |
150 | return {*reinterpret_cast<const std::uint8_t *>(cp), 1}; |
151 | } else { |
152 | return {}; |
153 | } |
154 | } |
155 | |
156 | template <> |
157 | DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>( |
158 | const char *cp, std::size_t bytes) { |
159 | auto p{reinterpret_cast<const std::uint8_t *>(cp)}; |
160 | char32_t ch{*p}; |
161 | if (ch <= 0x7f) { |
162 | return {ch, 1}; |
163 | } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 && |
164 | ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) { |
165 | ch = ((ch & 7) << 6) | (p[1] & 0x3f); |
166 | ch = (ch << 6) | (p[2] & 0x3f); |
167 | ch = (ch << 6) | (p[3] & 0x3f); |
168 | return {ch, 4}; |
169 | } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 && |
170 | ((p[1] | p[2]) & 0xc0) == 0x80) { |
171 | ch = ((ch & 0xf) << 6) | (p[1] & 0x3f); |
172 | ch = (ch << 6) | (p[2] & 0x3f); |
173 | return {ch, 3}; |
174 | } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 && |
175 | (p[1] & 0xc0) == 0x80) { |
176 | ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f); |
177 | return {ch, 2}; |
178 | } else { |
179 | return {}; // not valid UTF-8 |
180 | } |
181 | } |
182 | |
183 | static DecodedCharacter DecodeEscapedCharacter( |
184 | const char *cp, std::size_t bytes) { |
185 | if (cp[0] == '\\' && bytes >= 2) { |
186 | if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) { |
187 | return {static_cast<unsigned char>(*escChar), 2}; |
188 | } else if (IsOctalDigit(cp[1])) { |
189 | std::size_t maxLen{std::min(a: std::size_t{4}, b: bytes)}; |
190 | char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))}; |
191 | std::size_t len{2}; // so far |
192 | for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) { |
193 | code = 8 * code + DecimalDigitValue(cp[len]); |
194 | } |
195 | return {code, static_cast<int>(len)}; |
196 | } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' && |
197 | IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) { |
198 | return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) + |
199 | HexadecimalDigitValue(cp[3])), |
200 | 4}; |
201 | } else if (IsLetter(cp[1])) { |
202 | // Unknown escape - ignore the '\' (PGI compatibility) |
203 | return {static_cast<unsigned char>(cp[1]), 2}; |
204 | } else { |
205 | // Not an escape character. |
206 | return {'\\', 1}; |
207 | } |
208 | } |
209 | return {static_cast<unsigned char>(cp[0]), 1}; |
210 | } |
211 | |
212 | template <Encoding ENCODING> |
213 | static DecodedCharacter DecodeEscapedCharacters( |
214 | const char *cp, std::size_t bytes) { |
215 | char buffer[EncodedCharacter::maxEncodingBytes]; |
216 | int count[EncodedCharacter::maxEncodingBytes]; |
217 | std::size_t at{0}, len{0}; |
218 | for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) { |
219 | DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)}; |
220 | buffer[len] = code.codepoint; |
221 | at += code.bytes; |
222 | count[len] = at; |
223 | } |
224 | DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)}; |
225 | if (code.bytes > 0) { |
226 | code.bytes = count[code.bytes - 1]; |
227 | } else { |
228 | code.codepoint = buffer[0] & 0xff; |
229 | code.bytes = count[0]; |
230 | } |
231 | return code; |
232 | } |
233 | |
234 | template <Encoding ENCODING> |
235 | DecodedCharacter DecodeCharacter( |
236 | const char *cp, std::size_t bytes, bool backslashEscapes) { |
237 | if (backslashEscapes && bytes >= 2 && *cp == '\\') { |
238 | if (ENCODING == Encoding::UTF_8 && bytes >= 6 && |
239 | ToLowerCaseLetter(cp[1]) == 'u' && IsHexadecimalDigit(cp[2]) && |
240 | IsHexadecimalDigit(cp[3]) && IsHexadecimalDigit(cp[4]) && |
241 | IsHexadecimalDigit(cp[5])) { |
242 | char32_t ch{ |
243 | static_cast<char32_t>(4096 * HexadecimalDigitValue(cp[2]) + |
244 | 256 * HexadecimalDigitValue(cp[3]) + |
245 | 16 * HexadecimalDigitValue(cp[4]) + HexadecimalDigitValue(cp[5])), |
246 | }; |
247 | if (bytes >= 10 && IsHexadecimalDigit(cp[6]) && |
248 | IsHexadecimalDigit(cp[7]) && IsHexadecimalDigit(cp[8]) && |
249 | IsHexadecimalDigit(cp[9])) { |
250 | return {(ch << 16) | |
251 | (4096 * HexadecimalDigitValue(cp[6]) + |
252 | 256 * HexadecimalDigitValue(cp[7]) + |
253 | 16 * HexadecimalDigitValue(cp[8]) + |
254 | HexadecimalDigitValue(cp[9])), |
255 | 10}; |
256 | } else { |
257 | return {ch, 6}; |
258 | } |
259 | } else { |
260 | return DecodeEscapedCharacters<ENCODING>(cp, bytes); |
261 | } |
262 | } else { |
263 | return DecodeRawCharacter<ENCODING>(cp, bytes); |
264 | } |
265 | } |
266 | |
267 | template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>( |
268 | const char *, std::size_t, bool); |
269 | template DecodedCharacter DecodeCharacter<Encoding::UTF_8>( |
270 | const char *, std::size_t, bool); |
271 | |
272 | DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp, |
273 | std::size_t bytes, bool backslashEscapes) { |
274 | switch (encoding) { |
275 | SWITCH_COVERS_ALL_CASES |
276 | case Encoding::LATIN_1: |
277 | return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes); |
278 | case Encoding::UTF_8: |
279 | return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes); |
280 | } |
281 | } |
282 | |
283 | template <typename RESULT, Encoding ENCODING> |
284 | RESULT DecodeString(const std::string &s, bool backslashEscapes) { |
285 | RESULT result; |
286 | const char *p{s.c_str()}; |
287 | for (auto bytes{s.size()}; bytes != 0;) { |
288 | DecodedCharacter decoded{ |
289 | DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)}; |
290 | if (decoded.bytes > 0) { |
291 | if (static_cast<std::size_t>(decoded.bytes) <= bytes) { |
292 | result.append(1, decoded.codepoint); |
293 | bytes -= decoded.bytes; |
294 | p += decoded.bytes; |
295 | continue; |
296 | } |
297 | } |
298 | result.append(1, static_cast<uint8_t>(*p)); |
299 | ++p; |
300 | --bytes; |
301 | } |
302 | return result; |
303 | } |
304 | |
305 | template std::string DecodeString<std::string, Encoding::LATIN_1>( |
306 | const std::string &, bool); |
307 | template std::u16string DecodeString<std::u16string, Encoding::UTF_8>( |
308 | const std::string &, bool); |
309 | template std::u32string DecodeString<std::u32string, Encoding::UTF_8>( |
310 | const std::string &, bool); |
311 | } // namespace Fortran::parser |
312 | |