| 1 | //===-- lib/Parser/characters.cpp -----------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "flang/Parser/characters.h" |
| 10 | #include "flang/Common/idioms.h" |
| 11 | #include <algorithm> |
| 12 | #include <cstddef> |
| 13 | #include <optional> |
| 14 | #include <type_traits> |
| 15 | |
| 16 | namespace Fortran::parser { |
| 17 | |
// Process-wide option flag; it starts out disabled.
// NOTE(review): presumably consulted by the character-quoting code in
// characters.h to choose hexadecimal rather than octal escapes -- confirm.
bool useHexadecimalEscapeSequences = false;
| 19 | |
// Returns the length in bytes (1 through 6) of the UTF-8 sequence that
// the lead byte at *p announces.  Only the byte at *p is examined.
// Any byte that matches none of the 1- to 5-byte lead forms, including
// a continuation byte (10xxxxxx), yields 6.
int UTF_8CharacterBytes(const char *p) {
  struct LeadForm {
    unsigned mask;
    unsigned pattern;
    int length;
  };
  static constexpr LeadForm leadForms[]{
      {0x80, 0x00, 1}, // 0xxxxxxx
      {0xe0, 0xc0, 2}, // 110xxxxx
      {0xf0, 0xe0, 3}, // 1110xxxx
      {0xf8, 0xf0, 4}, // 11110xxx
      {0xfc, 0xf8, 5}, // 111110xx
  };
  const unsigned lead{static_cast<unsigned char>(*p)};
  for (const LeadForm &form : leadForms) {
    if ((lead & form.mask) == form.pattern) {
      return form.length;
    }
  }
  return 6;
}
| 35 | |
| 36 | template <typename STRING> |
| 37 | std::string QuoteCharacterLiteralHelper( |
| 38 | const STRING &str, bool backslashEscapes, Encoding encoding) { |
| 39 | std::string result{'"'}; |
| 40 | const auto emit{[&](char ch) { result += ch; }}; |
| 41 | for (auto ch : str) { |
| 42 | using CharT = std::decay_t<decltype(ch)>; |
| 43 | char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)}; |
| 44 | if (ch32 == static_cast<unsigned char>('"')) { |
| 45 | emit('"'); // double the " when it appears in the text |
| 46 | } |
| 47 | EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding); |
| 48 | } |
| 49 | result += '"'; |
| 50 | return result; |
| 51 | } |
| 52 | |
| 53 | std::string QuoteCharacterLiteral( |
| 54 | const std::string &str, bool backslashEscapes, Encoding encoding) { |
| 55 | return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); |
| 56 | } |
| 57 | |
| 58 | std::string QuoteCharacterLiteral( |
| 59 | const std::u16string &str, bool backslashEscapes, Encoding encoding) { |
| 60 | return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); |
| 61 | } |
| 62 | |
| 63 | std::string QuoteCharacterLiteral( |
| 64 | const std::u32string &str, bool backslashEscapes, Encoding encoding) { |
| 65 | return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); |
| 66 | } |
| 67 | |
| 68 | template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) { |
| 69 | CHECK(ucs <= 0xff); |
| 70 | EncodedCharacter result; |
| 71 | result.buffer[0] = ucs; |
| 72 | result.bytes = 1; |
| 73 | return result; |
| 74 | } |
| 75 | |
| 76 | template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) { |
| 77 | // N.B. char32_t is unsigned |
| 78 | EncodedCharacter result; |
| 79 | if (ucs <= 0x7f) { |
| 80 | result.buffer[0] = ucs; |
| 81 | result.bytes = 1; |
| 82 | } else if (ucs <= 0x7ff) { |
| 83 | result.buffer[0] = 0xc0 | (ucs >> 6); |
| 84 | result.buffer[1] = 0x80 | (ucs & 0x3f); |
| 85 | result.bytes = 2; |
| 86 | } else if (ucs <= 0xffff) { |
| 87 | result.buffer[0] = 0xe0 | (ucs >> 12); |
| 88 | result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f); |
| 89 | result.buffer[2] = 0x80 | (ucs & 0x3f); |
| 90 | result.bytes = 3; |
| 91 | } else if (ucs <= 0x1fffff) { |
| 92 | // UCS actually only goes up to 0x10ffff, but the |
| 93 | // UTF-8 encoding can handle 32 bits. |
| 94 | result.buffer[0] = 0xf0 | (ucs >> 18); |
| 95 | result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f); |
| 96 | result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f); |
| 97 | result.buffer[3] = 0x80 | (ucs & 0x3f); |
| 98 | result.bytes = 4; |
| 99 | } else if (ucs <= 0x3ffffff) { |
| 100 | result.buffer[0] = 0xf8 | (ucs >> 24); |
| 101 | result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f); |
| 102 | result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f); |
| 103 | result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f); |
| 104 | result.buffer[4] = 0x80 | (ucs & 0x3f); |
| 105 | result.bytes = 5; |
| 106 | } else { |
| 107 | result.buffer[0] = 0xfc | (ucs >> 30); |
| 108 | result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f); |
| 109 | result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f); |
| 110 | result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f); |
| 111 | result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f); |
| 112 | result.buffer[5] = 0x80 | (ucs & 0x3f); |
| 113 | result.bytes = 6; |
| 114 | } |
| 115 | return result; |
| 116 | } |
| 117 | |
| 118 | EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) { |
| 119 | switch (encoding) { |
| 120 | SWITCH_COVERS_ALL_CASES |
| 121 | case Encoding::LATIN_1: |
| 122 | return EncodeCharacter<Encoding::LATIN_1>(ucs); |
| 123 | case Encoding::UTF_8: |
| 124 | return EncodeCharacter<Encoding::UTF_8>(ucs); |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | template <Encoding ENCODING, typename STRING> |
| 129 | std::string EncodeString(const STRING &str) { |
| 130 | std::string result; |
| 131 | for (auto ch : str) { |
| 132 | char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)}; |
| 133 | EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)}; |
| 134 | result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes)); |
| 135 | } |
| 136 | return result; |
| 137 | } |
| 138 | |
| 139 | template std::string EncodeString<Encoding::LATIN_1, std::string>( |
| 140 | const std::string &); |
| 141 | template std::string EncodeString<Encoding::UTF_8, std::u16string>( |
| 142 | const std::u16string &); |
| 143 | template std::string EncodeString<Encoding::UTF_8, std::u32string>( |
| 144 | const std::u32string &); |
| 145 | |
| 146 | template <> |
| 147 | DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>( |
| 148 | const char *cp, std::size_t bytes) { |
| 149 | if (bytes >= 1) { |
| 150 | return {*reinterpret_cast<const std::uint8_t *>(cp), 1}; |
| 151 | } else { |
| 152 | return {}; |
| 153 | } |
| 154 | } |
| 155 | |
| 156 | template <> |
| 157 | DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>( |
| 158 | const char *cp, std::size_t bytes) { |
| 159 | auto p{reinterpret_cast<const std::uint8_t *>(cp)}; |
| 160 | char32_t ch{*p}; |
| 161 | if (ch <= 0x7f) { |
| 162 | return {ch, 1}; |
| 163 | } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 && |
| 164 | ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) { |
| 165 | ch = ((ch & 7) << 6) | (p[1] & 0x3f); |
| 166 | ch = (ch << 6) | (p[2] & 0x3f); |
| 167 | ch = (ch << 6) | (p[3] & 0x3f); |
| 168 | return {ch, 4}; |
| 169 | } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 && |
| 170 | ((p[1] | p[2]) & 0xc0) == 0x80) { |
| 171 | ch = ((ch & 0xf) << 6) | (p[1] & 0x3f); |
| 172 | ch = (ch << 6) | (p[2] & 0x3f); |
| 173 | return {ch, 3}; |
| 174 | } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 && |
| 175 | (p[1] & 0xc0) == 0x80) { |
| 176 | ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f); |
| 177 | return {ch, 2}; |
| 178 | } else { |
| 179 | return {}; // not valid UTF-8 |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | static DecodedCharacter DecodeEscapedCharacter( |
| 184 | const char *cp, std::size_t bytes) { |
| 185 | if (cp[0] == '\\' && bytes >= 2) { |
| 186 | if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) { |
| 187 | return {static_cast<unsigned char>(*escChar), 2}; |
| 188 | } else if (IsOctalDigit(cp[1])) { |
| 189 | std::size_t maxLen{std::min(a: std::size_t{4}, b: bytes)}; |
| 190 | char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))}; |
| 191 | std::size_t len{2}; // so far |
| 192 | for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) { |
| 193 | code = 8 * code + DecimalDigitValue(cp[len]); |
| 194 | } |
| 195 | return {code, static_cast<int>(len)}; |
| 196 | } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' && |
| 197 | IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) { |
| 198 | return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) + |
| 199 | HexadecimalDigitValue(cp[3])), |
| 200 | 4}; |
| 201 | } else if (IsLetter(cp[1])) { |
| 202 | // Unknown escape - ignore the '\' (PGI compatibility) |
| 203 | return {static_cast<unsigned char>(cp[1]), 2}; |
| 204 | } else { |
| 205 | // Not an escape character. |
| 206 | return {'\\', 1}; |
| 207 | } |
| 208 | } |
| 209 | return {static_cast<unsigned char>(cp[0]), 1}; |
| 210 | } |
| 211 | |
| 212 | template <Encoding ENCODING> |
| 213 | static DecodedCharacter DecodeEscapedCharacters( |
| 214 | const char *cp, std::size_t bytes) { |
| 215 | char buffer[EncodedCharacter::maxEncodingBytes]; |
| 216 | int count[EncodedCharacter::maxEncodingBytes]; |
| 217 | std::size_t at{0}, len{0}; |
| 218 | for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) { |
| 219 | DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)}; |
| 220 | buffer[len] = code.codepoint; |
| 221 | at += code.bytes; |
| 222 | count[len] = at; |
| 223 | } |
| 224 | DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)}; |
| 225 | if (code.bytes > 0) { |
| 226 | code.bytes = count[code.bytes - 1]; |
| 227 | } else { |
| 228 | code.codepoint = buffer[0] & 0xff; |
| 229 | code.bytes = count[0]; |
| 230 | } |
| 231 | return code; |
| 232 | } |
| 233 | |
| 234 | template <Encoding ENCODING> |
| 235 | DecodedCharacter DecodeCharacter( |
| 236 | const char *cp, std::size_t bytes, bool backslashEscapes) { |
| 237 | if (backslashEscapes && bytes >= 2 && *cp == '\\') { |
| 238 | if (ENCODING == Encoding::UTF_8 && bytes >= 6 && |
| 239 | ToLowerCaseLetter(cp[1]) == 'u' && IsHexadecimalDigit(cp[2]) && |
| 240 | IsHexadecimalDigit(cp[3]) && IsHexadecimalDigit(cp[4]) && |
| 241 | IsHexadecimalDigit(cp[5])) { |
| 242 | char32_t ch{ |
| 243 | static_cast<char32_t>(4096 * HexadecimalDigitValue(cp[2]) + |
| 244 | 256 * HexadecimalDigitValue(cp[3]) + |
| 245 | 16 * HexadecimalDigitValue(cp[4]) + HexadecimalDigitValue(cp[5])), |
| 246 | }; |
| 247 | if (bytes >= 10 && IsHexadecimalDigit(cp[6]) && |
| 248 | IsHexadecimalDigit(cp[7]) && IsHexadecimalDigit(cp[8]) && |
| 249 | IsHexadecimalDigit(cp[9])) { |
| 250 | return {(ch << 16) | |
| 251 | (4096 * HexadecimalDigitValue(cp[6]) + |
| 252 | 256 * HexadecimalDigitValue(cp[7]) + |
| 253 | 16 * HexadecimalDigitValue(cp[8]) + |
| 254 | HexadecimalDigitValue(cp[9])), |
| 255 | 10}; |
| 256 | } else { |
| 257 | return {ch, 6}; |
| 258 | } |
| 259 | } else { |
| 260 | return DecodeEscapedCharacters<ENCODING>(cp, bytes); |
| 261 | } |
| 262 | } else { |
| 263 | return DecodeRawCharacter<ENCODING>(cp, bytes); |
| 264 | } |
| 265 | } |
| 266 | |
| 267 | template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>( |
| 268 | const char *, std::size_t, bool); |
| 269 | template DecodedCharacter DecodeCharacter<Encoding::UTF_8>( |
| 270 | const char *, std::size_t, bool); |
| 271 | |
| 272 | DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp, |
| 273 | std::size_t bytes, bool backslashEscapes) { |
| 274 | switch (encoding) { |
| 275 | SWITCH_COVERS_ALL_CASES |
| 276 | case Encoding::LATIN_1: |
| 277 | return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes); |
| 278 | case Encoding::UTF_8: |
| 279 | return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes); |
| 280 | } |
| 281 | } |
| 282 | |
| 283 | template <typename RESULT, Encoding ENCODING> |
| 284 | RESULT DecodeString(const std::string &s, bool backslashEscapes) { |
| 285 | RESULT result; |
| 286 | const char *p{s.c_str()}; |
| 287 | for (auto bytes{s.size()}; bytes != 0;) { |
| 288 | DecodedCharacter decoded{ |
| 289 | DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)}; |
| 290 | if (decoded.bytes > 0) { |
| 291 | if (static_cast<std::size_t>(decoded.bytes) <= bytes) { |
| 292 | result.append(1, decoded.codepoint); |
| 293 | bytes -= decoded.bytes; |
| 294 | p += decoded.bytes; |
| 295 | continue; |
| 296 | } |
| 297 | } |
| 298 | result.append(1, static_cast<uint8_t>(*p)); |
| 299 | ++p; |
| 300 | --bytes; |
| 301 | } |
| 302 | return result; |
| 303 | } |
| 304 | |
| 305 | template std::string DecodeString<std::string, Encoding::LATIN_1>( |
| 306 | const std::string &, bool); |
| 307 | template std::u16string DecodeString<std::u16string, Encoding::UTF_8>( |
| 308 | const std::string &, bool); |
| 309 | template std::u32string DecodeString<std::u32string, Encoding::UTF_8>( |
| 310 | const std::string &, bool); |
| 311 | } // namespace Fortran::parser |
| 312 | |