1 | //===-- runtime/utf.h -----------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | // UTF-8 is the variant-width standard encoding of Unicode (ISO 10646) |
10 | // code points. |
11 | // |
12 | // 7-bit values in [00 .. 7F] represent themselves as single bytes, so true |
13 | // 7-bit ASCII is also valid UTF-8. |
14 | // |
15 | // Larger values are encoded with a start byte in [C0 .. FE] that carries |
16 | // the length of the encoding and some of the upper bits of the value, followed |
17 | // by one or more bytes in the range [80 .. BF]. |
18 | // |
19 | // Specifically, the first byte holds two or more uppermost set bits, |
20 | // a zero bit, and some payload; the second and later bytes each start with |
21 | // their uppermost bit set, the next bit clear, and six bits of payload. |
22 | // Payload parcels are in big-endian order. All bytes must be present in a |
// valid sequence; i.e., low-order zero bits must be explicit. UTF-8 is
// self-synchronizing on input because no byte value can be both a valid
// first byte and a valid trailing byte.
26 | // |
27 | // 0xxxxxxx - 7 bit ASCII |
28 | // 110xxxxx 10xxxxxx - 11-bit value |
29 | // 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value |
30 | // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value |
31 | // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value |
32 | // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value |
33 | // 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value |
34 | // |
35 | // Canonical UTF-8 sequences should be minimal, and our output is so, but |
36 | // we do not reject non-minimal sequences on input. Unicode only defines |
37 | // code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual |
38 | // standard maximum. However, we support extended forms up to 32 bits so that |
39 | // CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data. |
40 | |
41 | #ifndef FORTRAN_RUNTIME_UTF_H_ |
42 | #define FORTRAN_RUNTIME_UTF_H_ |
43 | |
44 | #include "flang/Common/optional.h" |
45 | #include <cstddef> |
46 | #include <cstdint> |
47 | |
48 | namespace Fortran::runtime { |
49 | |
50 | // Derive the length of a UTF-8 character encoding from its first byte. |
51 | // A zero result signifies an invalid encoding. |
52 | RT_OFFLOAD_VAR_GROUP_BEGIN |
53 | extern const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]; |
54 | static constexpr std::size_t maxUTF8Bytes{7}; |
55 | RT_OFFLOAD_VAR_GROUP_END |
56 | |
57 | static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) { |
58 | return UTF8FirstByteTable[static_cast<std::uint8_t>(first)]; |
59 | } |
60 | |
61 | // Ensure that all bytes are present in sequence in the input buffer |
62 | // before calling; use MeasureUTF8Bytes(first byte) to count them. |
63 | RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *); |
64 | |
65 | // Ensure that at least maxUTF8Bytes remain in the output |
66 | // buffer before calling. |
67 | RT_API_ATTRS std::size_t EncodeUTF8(char *, char32_t); |
68 | |
69 | } // namespace Fortran::runtime |
70 | #endif // FORTRAN_RUNTIME_UTF_H_ |
71 | |