1 | //===-- runtime/utf.cpp ---------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "utf.h" |
10 | |
11 | namespace Fortran::runtime { |
12 | |
13 | // clang-format off |
// Indexed by the first byte of a UTF-8 sequence.  Each entry is the total
// number of bytes in a sequence that begins with that byte; an entry of 0
// marks a byte that cannot begin a sequence.
const std::uint8_t UTF8FirstByteTable[256]{
  // 00 - 7F: one-byte sequences carrying a 7 bit payload
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
  // 80 - BF: valid only as a later byte, never as a first byte
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80 - 8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90 - 9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0 - AF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0 - BF
  // C0 - DF: two-byte sequences, 11 bit payload
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
  // E0 - EF: three-byte sequences, 16 bit payload
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
  // F0 - F7: four bytes, 21 bit payload
  // F8 - FB: five bytes, 26 bit payload
  // FC - FD: six bytes, 31 bit payload
  // FE:      seven bytes, 32 bit payload
  // FF:      invalid
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 0, // F0 - FF
};
40 | // clang-format on |
41 | |
42 | // Non-minimal encodings are accepted. |
43 | std::optional<char32_t> DecodeUTF8(const char *p0) { |
44 | const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)}; |
45 | std::size_t bytes{MeasureUTF8Bytes(first: *p0)}; |
46 | if (bytes == 1) { |
47 | return char32_t{*p}; |
48 | } else if (bytes > 1) { |
49 | std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; |
50 | for (std::size_t j{1}; j < bytes; ++j) { |
51 | std::uint8_t next{p[j]}; |
52 | if (next < 0x80 || next > 0xbf) { |
53 | return std::nullopt; |
54 | } |
55 | result = (result << 6) | (next & 0x3f); |
56 | } |
57 | if (result <= 0xffffffff) { |
58 | return static_cast<char32_t>(result); |
59 | } |
60 | } |
61 | return std::nullopt; |
62 | } |
63 | |
// Writes the UTF-8 encoding of ucs to the buffer at p0 and returns the number
// of bytes written (1 through 7).  The shortest form able to hold the value
// is used:
//   1 byte  : up to  7 bits   (first byte 0xxxxxxx)
//   2 bytes : up to 11 bits   (first byte 110xxxxx)
//   3 bytes : up to 16 bits   (first byte 1110xxxx)
//   4 bytes : up to 21 bits   (first byte 11110xxx)
//   5 bytes : up to 26 bits   (first byte 111110xx)
//   6 bytes : up to 31 bits   (first byte 1111110x)
//   7 bytes : all 32 bits     (first byte 0xFE, no payload bits)
// Every later byte has the form 10xxxxxx and carries six bits.
// No terminator is written; the caller must supply room for up to 7 bytes.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) {
    // Six bytes hold 1 + 5 * 6 = 31 bits, and the first byte is 0xFC/0xFD
    // (0xF8 would announce a five-byte sequence).
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    // Only values with bit 31 set reach here; the first byte carries no
    // payload, so bits 30-31 go into the second byte.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
110 | |
111 | } // namespace Fortran::runtime |
112 | |