1//===-- runtime/utf.cpp ---------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "utf.h"
10
11namespace Fortran::runtime {
12
13// clang-format off
// Indexed by the first byte of an encoded character; each entry is the total
// number of bytes (1 through 7) in the sequence that the byte introduces.
// An entry of 0 marks a byte that cannot start a sequence: the continuation
// bytes 80-BF and the byte FF.  Each row below covers 16 consecutive byte
// values, labelled with the value of its first entry.
const std::uint8_t UTF8FirstByteTable[256]{
  // 00 - 7F: one byte, 7 bit payload
  /* 00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 80 - BF: invalid as a first byte, valid as a later byte
  /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* A0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // C0 - DF: two bytes, 11 bit payload
  /* C0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* D0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // E0 - EF: three bytes, 16 bit payload
  /* E0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* F0 */ 4, 4, 4, 4, 4, 4, 4, 4, // F0 - F7: four bytes, 21 bit payload
  /* F8 */ 5, 5, 5, 5,             // F8 - FB: five bytes, 26 bit payload
  /* FC */ 6, 6,                   // FC - FD: six bytes, 31 bit payload
  /* FE */ 7,                      // FE: seven bytes, 32 bit payload
  /* FF */ 0,                      // FF: invalid
};
40// clang-format on
41
42// Non-minimal encodings are accepted.
43std::optional<char32_t> DecodeUTF8(const char *p0) {
44 const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
45 std::size_t bytes{MeasureUTF8Bytes(first: *p0)};
46 if (bytes == 1) {
47 return char32_t{*p};
48 } else if (bytes > 1) {
49 std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
50 for (std::size_t j{1}; j < bytes; ++j) {
51 std::uint8_t next{p[j]};
52 if (next < 0x80 || next > 0xbf) {
53 return std::nullopt;
54 }
55 result = (result << 6) | (next & 0x3f);
56 }
57 if (result <= 0xffffffff) {
58 return static_cast<char32_t>(result);
59 }
60 }
61 return std::nullopt;
62}
63
// Encodes the value ucs at p0 and returns the number of bytes written.
//
// The encoding is UTF-8 extended to sequences of up to seven bytes so that
// every 32-bit value can be represented:
//
//   bytes  lead byte  payload bits  largest value
//     1     00 - 7F        7        0x7f
//     2     C0 - DF       11        0x7ff
//     3     E0 - EF       16        0xffff
//     4     F0 - F7       21        0x1fffff
//     5     F8 - FB       26        0x3ffffff
//     6     FC - FD       31        0x7fffffff
//     7     FE            32        0xffffffff
//
// The shortest form is always chosen.  No terminator is appended and no
// buffer length is checked: the caller must provide room for as many as
// seven bytes.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) {
    // Six bytes hold 31 bits (1 in the lead byte + 5 * 6), and their lead
    // byte is FC or FD.  Using the five-byte marker F8 here would make the
    // result undecodable, since the sequence length is taken from the lead
    // byte.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    // The lead byte FE carries no payload; all 32 bits are in the six
    // continuation bytes.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
110
111} // namespace Fortran::runtime
112

// source code of flang/runtime/utf.cpp