| 1 | //===-- lib/runtime/utf.cpp -------------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "flang-rt/runtime/utf.h" |
| 10 | |
| 11 | namespace Fortran::runtime { |
| 12 | |
| 13 | #ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS |
| 14 | // clang-format off |
| 15 | RT_OFFLOAD_VAR_GROUP_BEGIN |
| 16 | const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{ |
| 17 | /* 00 - 7F: 7 bit payload in single byte */ |
| 18 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 19 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 20 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 21 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 22 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 23 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 24 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 25 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 26 | /* 80 - BF: invalid first byte, valid later byte */ |
| 27 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 28 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 29 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 30 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 31 | /* C0 - DF: 11 bit payload */ |
| 32 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 33 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 34 | /* E0 - EF: 16 bit payload */ |
| 35 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 36 | /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4, |
| 37 | /* F8 - FB: 26 bit payload */ 5, 5, 5, 5, |
| 38 | /* FC - FD: 31 bit payload */ 6, 6, |
| 39 | /* FE: 32 bit payload */ 7, |
| 40 | /* FF: invalid */ 0 |
| 41 | }; |
| 42 | RT_OFFLOAD_VAR_GROUP_END |
| 43 | // clang-format on |
| 44 | #endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS |
| 45 | |
| 46 | RT_OFFLOAD_API_GROUP_BEGIN |
| 47 | |
// Measures the encoded character that ends immediately before `end`.
//
// Walks backward from end[-1] over UTF-8 continuation bytes (10xxxxxx) and
// returns the distance, in bytes, from `end` back to and including the first
// byte that is not a continuation byte.  At most `limit` bytes are examined;
// when all of them are continuation bytes (or `limit` is zero), `limit` is
// returned.  The caller must guarantee that the `limit` bytes preceding
// `end` are readable.
std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
  for (std::size_t n{1}; n <= limit; ++n) {
    // Subtract the unsigned distance from the pointer rather than indexing
    // with "-n": negating a std::size_t produces a huge positive offset, and
    // end[-n] would then depend on pointer-arithmetic wraparound.
    if ((*(end - n) & 0xc0) != 0x80) {
      return n;
    }
  }
  return limit;
}
| 57 | |
| 58 | // Non-minimal encodings are accepted. |
| 59 | Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) { |
| 60 | const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)}; |
| 61 | std::size_t bytes{MeasureUTF8Bytes(*p0)}; |
| 62 | if (bytes == 1) { |
| 63 | return char32_t{*p}; |
| 64 | } else if (bytes > 1) { |
| 65 | std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; |
| 66 | for (std::size_t j{1}; j < bytes; ++j) { |
| 67 | std::uint8_t next{p[j]}; |
| 68 | if (next < 0x80 || next > 0xbf) { |
| 69 | return Fortran::common::nullopt; |
| 70 | } |
| 71 | result = (result << 6) | (next & 0x3f); |
| 72 | } |
| 73 | if (result <= 0xffffffff) { |
| 74 | return static_cast<char32_t>(result); |
| 75 | } |
| 76 | } |
| 77 | return Fortran::common::nullopt; |
| 78 | } |
| 79 | |
// Encodes `ucs` at p0 and returns the number of bytes written (1 to 7).
//
// The shortest form that can hold the value is used:
//   up to 0x7f        1 byte
//   up to 0x7ff       2 bytes, lead 110xxxxx
//   up to 0xffff      3 bytes, lead 1110xxxx
//   up to 0x1fffff    4 bytes, lead 11110xxx
//   up to 0x3ffffff   5 bytes, lead 111110xx
//   up to 0x7fffffff  6 bytes, lead 1111110x
//   anything larger   7 bytes, lead 0xfe
// Each trailing byte is 10xxxxxx and carries six payload bits.  No
// terminator is written.  The buffer at p0 must have room for the longest
// form (7 bytes) unless the caller knows the value is smaller.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) {
    // Six-byte form: 31 payload bits behind a 0xfc/0xfd lead byte.  The lead
    // must not be 0xf8, which announces a five-byte sequence to the decoder.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    // Seven-byte form: the lead byte holds no payload; the top two bits of
    // the 32-bit value land in the first trailing byte.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
| 126 | RT_OFFLOAD_API_GROUP_END |
| 127 | |
| 128 | } // namespace Fortran::runtime |
| 129 | |