1 | //===-- lib/runtime/utf.cpp -------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang-rt/runtime/utf.h" |
10 | |
11 | namespace Fortran::runtime { |
12 | |
13 | #ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS |
14 | // clang-format off |
15 | RT_OFFLOAD_VAR_GROUP_BEGIN |
16 | const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{ |
17 | /* 00 - 7F: 7 bit payload in single byte */ |
18 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
19 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
20 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
21 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
22 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
23 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
24 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
25 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
26 | /* 80 - BF: invalid first byte, valid later byte */ |
27 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
28 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
29 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
30 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
31 | /* C0 - DF: 11 bit payload */ |
32 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
33 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
34 | /* E0 - EF: 16 bit payload */ |
35 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
36 | /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4, |
37 | /* F8 - FB: 26 bit payload */ 5, 5, 5, 5, |
38 | /* FC - FD: 31 bit payload */ 6, 6, |
39 | /* FE: 32 bit payload */ 7, |
40 | /* FF: invalid */ 0 |
41 | }; |
42 | RT_OFFLOAD_VAR_GROUP_END |
43 | // clang-format on |
44 | #endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS |
45 | |
46 | RT_OFFLOAD_API_GROUP_BEGIN |
47 | |
// Measures, in bytes, the UTF-8 sequence that ends immediately before `end`.
// Scans backwards over continuation bytes (bit pattern 10xxxxxx) and returns
// the distance from `end` to the first byte that is not a continuation byte.
// At most `limit` bytes before `end` are examined; if every one of them is a
// continuation byte (or `limit` is zero), `limit` itself is returned.
// The caller must guarantee that the `limit` bytes preceding `end` are
// readable.
std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
  for (std::size_t n{1}; n <= limit; ++n) {
    // Step back with pointer subtraction rather than end[-n]: n is unsigned,
    // so -n would wrap around to a huge positive index and only reach the
    // intended byte through pointer overflow.
    if ((*(end - n) & 0xc0) != 0x80) {
      return n;
    }
  }
  return limit;
}
57 | |
58 | // Non-minimal encodings are accepted. |
59 | Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) { |
60 | const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)}; |
61 | std::size_t bytes{MeasureUTF8Bytes(*p0)}; |
62 | if (bytes == 1) { |
63 | return char32_t{*p}; |
64 | } else if (bytes > 1) { |
65 | std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; |
66 | for (std::size_t j{1}; j < bytes; ++j) { |
67 | std::uint8_t next{p[j]}; |
68 | if (next < 0x80 || next > 0xbf) { |
69 | return Fortran::common::nullopt; |
70 | } |
71 | result = (result << 6) | (next & 0x3f); |
72 | } |
73 | if (result <= 0xffffffff) { |
74 | return static_cast<char32_t>(result); |
75 | } |
76 | } |
77 | return Fortran::common::nullopt; |
78 | } |
79 | |
// Encodes the code point `ucs` as UTF-8 into the buffer at `p0` and returns
// the number of bytes written (1 through 7).  No bounds check is made and no
// terminator is written, so the buffer must have room for up to 7 bytes.
// Values above the 4-byte (21 bit) range use the extended forms: 5 bytes for
// up to 26 bits, 6 bytes for up to 31 bits, and a 7-byte form led by 0xFE for
// the remaining 32-bit values.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    // 7 bits: the byte is the code point itself.
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    // 11 bits: 110xxxxx 10xxxxxx
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    // 16 bits: 1110xxxx + 2 continuation bytes
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    // 21 bits: 11110xxx + 3 continuation bytes
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    // 26 bits: 111110xx + 4 continuation bytes
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) {
    // 31 bits: 1111110x + 5 continuation bytes.  The lead byte must carry
    // the 0xFC marker (not the 5-byte 0xF8 marker) so that its value
    // announces a 6-byte sequence to the decoder.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    // Full 32 bits: 0xFE lead byte carries no payload; 6 continuation bytes
    // hold the value (only the low 2 bits of the first one are used).
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
126 | RT_OFFLOAD_API_GROUP_END |
127 | |
128 | } // namespace Fortran::runtime |
129 | |