utf.h source code [flang/runtime/utf.h]

1	//===-- runtime/utf.h -----------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	// UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)
10	// code points.
11	//
12	// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
13	// 7-bit ASCII is also valid UTF-8.
14	//
15	// Larger values are encoded with a start byte in [C0 .. FE] that carries
16	// the length of the encoding and some of the upper bits of the value, followed
17	// by one or more bytes in the range [80 .. BF].
18	//
19	// Specifically, the first byte holds two or more uppermost set bits,
20	// a zero bit, and some payload; the second and later bytes each start with
21	// their uppermost bit set, the next bit clear, and six bits of payload.
22	// Payload parcels are in big-endian order. All bytes must be present in a
23	// valid sequence; i.e., low-order zero bits must be explicit. UTF-8 is
24	// self-synchronizing on input, as no byte value can be both a valid
25	// first byte and a valid trailing byte.
26	//
27	// 0xxxxxxx - 7 bit ASCII
28	// 110xxxxx 10xxxxxx - 11-bit value
29	// 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
30	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
31	// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
32	// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
33	// 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
34	//
35	// Canonical UTF-8 sequences should be minimal, and our output is so, but
36	// we do not reject non-minimal sequences on input. Unicode only defines
37	// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
38	// standard maximum. However, we support extended forms up to 32 bits so that
39	// CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.
40
41	#ifndef FORTRAN_RUNTIME_UTF_H_
42	#define FORTRAN_RUNTIME_UTF_H_
43
44	#include "flang/Common/optional.h"
45	#include <cstddef>
46	#include <cstdint>
47
48	namespace Fortran::runtime {
49
50	// Derive the length of a UTF-8 character encoding from its first byte.
51	// A zero result signifies an invalid encoding.
52	RT_OFFLOAD_VAR_GROUP_BEGIN
53	extern const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[`256`];
54	static constexpr std::size_t maxUTF8Bytes{`7`};
55	RT_OFFLOAD_VAR_GROUP_END
56
57	static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) {
58	return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
59	}
60
// Decodes one UTF-8 sequence starting at the given pointer into a
// 32-bit character value.
// Ensure that all bytes are present in sequence in the input buffer
// before calling; use MeasureUTF8Bytes(first byte) to count them.
// NOTE(review): an empty optional presumably reports an invalid
// sequence -- confirm against the definition.
RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);
64
65	// Ensure that at least maxUTF8Bytes remain in the output
66	// buffer before calling.
67	RT_API_ATTRS std::size_t EncodeUTF8(char , char32_t*);
68
69	} // namespace Fortran::runtime
70	#endif // FORTRAN_RUNTIME_UTF_H_
71

source code of flang/runtime/utf.h