1//===-- lib/Parser/characters.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "flang/Parser/characters.h"
10#include "flang/Common/idioms.h"
11#include <algorithm>
12#include <cstddef>
13#include <optional>
14#include <type_traits>
15
16namespace Fortran::parser {
17
// Global switch, off by default.
// NOTE(review): nothing in this file reads it; presumably it is consulted by
// the escape-emitting helpers declared in flang/Parser/characters.h to choose
// hexadecimal rather than octal escape sequences -- confirm against the header.
bool useHexadecimalEscapeSequences{false};
19
// Reports the length in bytes of a UTF-8 sequence, judged solely from the bit
// pattern of its first byte *p.  Any byte that matches none of the 1- to
// 5-byte lead patterns (continuation bytes 10xxxxxx included) yields 6.
int UTF_8CharacterBytes(const char *p) {
  const unsigned lead{static_cast<unsigned char>(*p)};
  if (lead < 0x80) {
    return 1; // 0xxxxxxx
  }
  if ((lead & 0xe0) == 0xc0) {
    return 2; // 110xxxxx
  }
  if ((lead & 0xf0) == 0xe0) {
    return 3; // 1110xxxx
  }
  if ((lead & 0xf8) == 0xf0) {
    return 4; // 11110xxx
  }
  if ((lead & 0xfc) == 0xf8) {
    return 5; // 111110xx
  }
  return 6; // everything else
}
35
36template <typename STRING>
37std::string QuoteCharacterLiteralHelper(
38 const STRING &str, bool backslashEscapes, Encoding encoding) {
39 std::string result{'"'};
40 const auto emit{[&](char ch) { result += ch; }};
41 for (auto ch : str) {
42 using CharT = std::decay_t<decltype(ch)>;
43 char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)};
44 if (ch32 == static_cast<unsigned char>('"')) {
45 emit('"'); // double the " when it appears in the text
46 }
47 EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding);
48 }
49 result += '"';
50 return result;
51}
52
53std::string QuoteCharacterLiteral(
54 const std::string &str, bool backslashEscapes, Encoding encoding) {
55 return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
56}
57
58std::string QuoteCharacterLiteral(
59 const std::u16string &str, bool backslashEscapes, Encoding encoding) {
60 return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
61}
62
63std::string QuoteCharacterLiteral(
64 const std::u32string &str, bool backslashEscapes, Encoding encoding) {
65 return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
66}
67
68template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) {
69 CHECK(ucs <= 0xff);
70 EncodedCharacter result;
71 result.buffer[0] = ucs;
72 result.bytes = 1;
73 return result;
74}
75
76template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) {
77 // N.B. char32_t is unsigned
78 EncodedCharacter result;
79 if (ucs <= 0x7f) {
80 result.buffer[0] = ucs;
81 result.bytes = 1;
82 } else if (ucs <= 0x7ff) {
83 result.buffer[0] = 0xc0 | (ucs >> 6);
84 result.buffer[1] = 0x80 | (ucs & 0x3f);
85 result.bytes = 2;
86 } else if (ucs <= 0xffff) {
87 result.buffer[0] = 0xe0 | (ucs >> 12);
88 result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f);
89 result.buffer[2] = 0x80 | (ucs & 0x3f);
90 result.bytes = 3;
91 } else if (ucs <= 0x1fffff) {
92 // UCS actually only goes up to 0x10ffff, but the
93 // UTF-8 encoding can handle 32 bits.
94 result.buffer[0] = 0xf0 | (ucs >> 18);
95 result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f);
96 result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f);
97 result.buffer[3] = 0x80 | (ucs & 0x3f);
98 result.bytes = 4;
99 } else if (ucs <= 0x3ffffff) {
100 result.buffer[0] = 0xf8 | (ucs >> 24);
101 result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f);
102 result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f);
103 result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f);
104 result.buffer[4] = 0x80 | (ucs & 0x3f);
105 result.bytes = 5;
106 } else {
107 result.buffer[0] = 0xfc | (ucs >> 30);
108 result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f);
109 result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f);
110 result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f);
111 result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f);
112 result.buffer[5] = 0x80 | (ucs & 0x3f);
113 result.bytes = 6;
114 }
115 return result;
116}
117
118EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
119 switch (encoding) {
120 SWITCH_COVERS_ALL_CASES
121 case Encoding::LATIN_1:
122 return EncodeCharacter<Encoding::LATIN_1>(ucs);
123 case Encoding::UTF_8:
124 return EncodeCharacter<Encoding::UTF_8>(ucs);
125 }
126}
127
128template <Encoding ENCODING, typename STRING>
129std::string EncodeString(const STRING &str) {
130 std::string result;
131 for (auto ch : str) {
132 char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)};
133 EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)};
134 result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes));
135 }
136 return result;
137}
138
139template std::string EncodeString<Encoding::LATIN_1, std::string>(
140 const std::string &);
141template std::string EncodeString<Encoding::UTF_8, std::u16string>(
142 const std::u16string &);
143template std::string EncodeString<Encoding::UTF_8, std::u32string>(
144 const std::u32string &);
145
146template <>
147DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
148 const char *cp, std::size_t bytes) {
149 if (bytes >= 1) {
150 return {*reinterpret_cast<const std::uint8_t *>(cp), 1};
151 } else {
152 return {};
153 }
154}
155
156template <>
157DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
158 const char *cp, std::size_t bytes) {
159 auto p{reinterpret_cast<const std::uint8_t *>(cp)};
160 char32_t ch{*p};
161 if (ch <= 0x7f) {
162 return {ch, 1};
163 } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
164 ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
165 ch = ((ch & 7) << 6) | (p[1] & 0x3f);
166 ch = (ch << 6) | (p[2] & 0x3f);
167 ch = (ch << 6) | (p[3] & 0x3f);
168 return {ch, 4};
169 } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
170 ((p[1] | p[2]) & 0xc0) == 0x80) {
171 ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
172 ch = (ch << 6) | (p[2] & 0x3f);
173 return {ch, 3};
174 } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
175 (p[1] & 0xc0) == 0x80) {
176 ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
177 return {ch, 2};
178 } else {
179 return {}; // not valid UTF-8
180 }
181}
182
183static DecodedCharacter DecodeEscapedCharacter(
184 const char *cp, std::size_t bytes) {
185 if (cp[0] == '\\' && bytes >= 2) {
186 if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) {
187 return {static_cast<unsigned char>(*escChar), 2};
188 } else if (IsOctalDigit(cp[1])) {
189 std::size_t maxLen{std::min(a: std::size_t{4}, b: bytes)};
190 char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))};
191 std::size_t len{2}; // so far
192 for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) {
193 code = 8 * code + DecimalDigitValue(cp[len]);
194 }
195 return {code, static_cast<int>(len)};
196 } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' &&
197 IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) {
198 return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) +
199 HexadecimalDigitValue(cp[3])),
200 4};
201 } else if (IsLetter(cp[1])) {
202 // Unknown escape - ignore the '\' (PGI compatibility)
203 return {static_cast<unsigned char>(cp[1]), 2};
204 } else {
205 // Not an escape character.
206 return {'\\', 1};
207 }
208 }
209 return {static_cast<unsigned char>(cp[0]), 1};
210}
211
212template <Encoding ENCODING>
213static DecodedCharacter DecodeEscapedCharacters(
214 const char *cp, std::size_t bytes) {
215 char buffer[EncodedCharacter::maxEncodingBytes];
216 int count[EncodedCharacter::maxEncodingBytes];
217 std::size_t at{0}, len{0};
218 for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) {
219 DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
220 buffer[len] = code.codepoint;
221 at += code.bytes;
222 count[len] = at;
223 }
224 DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)};
225 if (code.bytes > 0) {
226 code.bytes = count[code.bytes - 1];
227 } else {
228 code.codepoint = buffer[0] & 0xff;
229 code.bytes = count[0];
230 }
231 return code;
232}
233
234template <Encoding ENCODING>
235DecodedCharacter DecodeCharacter(
236 const char *cp, std::size_t bytes, bool backslashEscapes) {
237 if (backslashEscapes && bytes >= 2 && *cp == '\\') {
238 if (ENCODING == Encoding::UTF_8 && bytes >= 6 &&
239 ToLowerCaseLetter(cp[1]) == 'u' && IsHexadecimalDigit(cp[2]) &&
240 IsHexadecimalDigit(cp[3]) && IsHexadecimalDigit(cp[4]) &&
241 IsHexadecimalDigit(cp[5])) {
242 char32_t ch{
243 static_cast<char32_t>(4096 * HexadecimalDigitValue(cp[2]) +
244 256 * HexadecimalDigitValue(cp[3]) +
245 16 * HexadecimalDigitValue(cp[4]) + HexadecimalDigitValue(cp[5])),
246 };
247 if (bytes >= 10 && IsHexadecimalDigit(cp[6]) &&
248 IsHexadecimalDigit(cp[7]) && IsHexadecimalDigit(cp[8]) &&
249 IsHexadecimalDigit(cp[9])) {
250 return {(ch << 16) |
251 (4096 * HexadecimalDigitValue(cp[6]) +
252 256 * HexadecimalDigitValue(cp[7]) +
253 16 * HexadecimalDigitValue(cp[8]) +
254 HexadecimalDigitValue(cp[9])),
255 10};
256 } else {
257 return {ch, 6};
258 }
259 } else {
260 return DecodeEscapedCharacters<ENCODING>(cp, bytes);
261 }
262 } else {
263 return DecodeRawCharacter<ENCODING>(cp, bytes);
264 }
265}
266
267template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
268 const char *, std::size_t, bool);
269template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
270 const char *, std::size_t, bool);
271
272DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,
273 std::size_t bytes, bool backslashEscapes) {
274 switch (encoding) {
275 SWITCH_COVERS_ALL_CASES
276 case Encoding::LATIN_1:
277 return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes);
278 case Encoding::UTF_8:
279 return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes);
280 }
281}
282
283template <typename RESULT, Encoding ENCODING>
284RESULT DecodeString(const std::string &s, bool backslashEscapes) {
285 RESULT result;
286 const char *p{s.c_str()};
287 for (auto bytes{s.size()}; bytes != 0;) {
288 DecodedCharacter decoded{
289 DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)};
290 if (decoded.bytes > 0) {
291 if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
292 result.append(1, decoded.codepoint);
293 bytes -= decoded.bytes;
294 p += decoded.bytes;
295 continue;
296 }
297 }
298 result.append(1, static_cast<uint8_t>(*p));
299 ++p;
300 --bytes;
301 }
302 return result;
303}
304
305template std::string DecodeString<std::string, Encoding::LATIN_1>(
306 const std::string &, bool);
307template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
308 const std::string &, bool);
309template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
310 const std::string &, bool);
311} // namespace Fortran::parser
312

// Source: flang/lib/Parser/characters.cpp