1 | //===-- runtime/utf.cpp ---------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "utf.h" |
10 | |
11 | namespace Fortran::runtime { |
12 | |
13 | // clang-format off |
14 | RT_OFFLOAD_VAR_GROUP_BEGIN |
15 | const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{ |
16 | /* 00 - 7F: 7 bit payload in single byte */ |
17 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
18 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
19 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
20 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
21 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
22 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
23 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
24 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
25 | /* 80 - BF: invalid first byte, valid later byte */ |
26 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
27 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
28 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
29 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
30 | /* C0 - DF: 11 bit payload */ |
31 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
32 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
33 | /* E0 - EF: 16 bit payload */ |
34 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
35 | /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4, |
36 | /* F8 - FB: 26 bit payload */ 5, 5, 5, 5, |
37 | /* FC - FD: 31 bit payload */ 6, 6, |
38 | /* FE: 32 bit payload */ 7, |
39 | /* FF: invalid */ 0 |
40 | }; |
41 | RT_OFFLOAD_VAR_GROUP_END |
42 | // clang-format on |
43 | |
44 | RT_OFFLOAD_API_GROUP_BEGIN |
45 | // Non-minimal encodings are accepted. |
46 | Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) { |
47 | const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)}; |
48 | std::size_t bytes{MeasureUTF8Bytes(*p0)}; |
49 | if (bytes == 1) { |
50 | return char32_t{*p}; |
51 | } else if (bytes > 1) { |
52 | std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; |
53 | for (std::size_t j{1}; j < bytes; ++j) { |
54 | std::uint8_t next{p[j]}; |
55 | if (next < 0x80 || next > 0xbf) { |
56 | return Fortran::common::nullopt; |
57 | } |
58 | result = (result << 6) | (next & 0x3f); |
59 | } |
60 | if (result <= 0xffffffff) { |
61 | return static_cast<char32_t>(result); |
62 | } |
63 | } |
64 | return Fortran::common::nullopt; |
65 | } |
66 | |
// Encodes the code point `ucs` at p0 using the shortest form that can hold
// it and returns the number of bytes written (1 through 7).  The caller must
// supply room for up to 7 bytes; no terminator is appended.
//
//   bytes  largest value  leading byte
//     1    0x7f           0xxxxxxx
//     2    0x7ff          110xxxxx
//     3    0xffff         1110xxxx
//     4    0x1fffff       11110xxx
//     5    0x3ffffff      111110xx
//     6    0x7fffffff     1111110x
//     7    0xffffffff     11111110
//
// Every byte after the first has the form 10xxxxxx and holds six payload
// bits, most significant group first.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = static_cast<std::uint8_t>(ucs);
    return 1;
  }
  // Select the sequence length and the marker bits of the leading byte.
  std::size_t bytes{7};
  std::uint8_t lead{0xfe};
  if (ucs <= 0x7ff) {
    bytes = 2;
    lead = 0xc0;
  } else if (ucs <= 0xffff) {
    bytes = 3;
    lead = 0xe0;
  } else if (ucs <= 0x1fffff) {
    bytes = 4;
    lead = 0xf0;
  } else if (ucs <= 0x3ffffff) {
    bytes = 5;
    lead = 0xf8;
  } else if (ucs <= 0x7fffffff) {
    // 31-bit payload: the marker is 0xfc, distinct from the 5-byte 0xf8.
    bytes = 6;
    lead = 0xfc;
  }
  // Emit continuation bytes from last to first, consuming six bits each.
  for (std::size_t j{bytes - 1}; j > 0; --j) {
    p[j] = static_cast<std::uint8_t>(0x80 | (ucs & 0x3f));
    ucs >>= 6;
  }
  // Whatever remains (possibly nothing for the 7-byte form) fits beside the
  // marker bits of the leading byte.
  p[0] = static_cast<std::uint8_t>(lead | ucs);
  return bytes;
}
113 | RT_OFFLOAD_API_GROUP_END |
114 | |
115 | } // namespace Fortran::runtime |
116 | |