//===-- Implementation of a class for conversion --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8
9#include "hdr/errno_macros.h"
10#include "hdr/types/char32_t.h"
11#include "hdr/types/char8_t.h"
12#include "hdr/types/size_t.h"
13#include "src/__support/CPP/bit.h"
14#include "src/__support/common.h"
15#include "src/__support/error_or.h"
16#include "src/__support/math_extras.h"
17#include "src/__support/wchar/mbstate.h"
18
19#include "character_converter.h"
20
21namespace LIBC_NAMESPACE_DECL {
22namespace internal {
23
24// This is for utf-8 bytes other than the first byte
25constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
26// The number of bits per utf-8 byte that actually encode character
27// Information not metadata (# of bits excluding the byte headers)
28constexpr uint32_t MASK_ENCODED_BITS =
29 mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
30// Maximum value for utf-32 for a utf-8 sequence of a given length
31constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
32constexpr int MAX_UTF8_LENGTH = 4;
33
34CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
35
36void CharacterConverter::clear() {
37 state->partial = 0;
38 state->bytes_stored = 0;
39 state->total_bytes = 0;
40}
41
42bool CharacterConverter::isFull() {
43 return state->bytes_stored == state->total_bytes && state->total_bytes != 0;
44}
45
46bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
47
48bool CharacterConverter::isValidState() {
49 if (state->total_bytes > MAX_UTF8_LENGTH)
50 return false;
51
52 const char32_t max_utf32_value =
53 state->total_bytes == 0 ? 0
54 : MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1];
55 return state->bytes_stored <= state->total_bytes &&
56 state->partial <= max_utf32_value;
57}
58
59int CharacterConverter::push(char8_t utf8_byte) {
60 uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
61 // Checking the first byte if first push
62 if (isEmpty()) {
63 // UTF-8 char has 1 byte total
64 if (num_ones == 0) {
65 state->total_bytes = 1;
66 }
67 // UTF-8 char has 2 through 4 bytes total
68 else if (num_ones >= 2 && num_ones <= 4) {
69 /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
70 we will make the base mask with 7 ones and right shift it as necessary. */
71 constexpr size_t SIGNIFICANT_BITS = 7;
72 char8_t base_mask =
73 static_cast<char8_t>(mask_trailing_ones<uint8_t, SIGNIFICANT_BITS>());
74 state->total_bytes = num_ones;
75 utf8_byte &= (base_mask >> num_ones);
76 }
77 // Invalid first byte
78 else {
79 // bytes_stored and total_bytes will always be 0 here
80 state->partial = static_cast<char32_t>(0);
81 return EILSEQ;
82 }
83 state->partial = static_cast<char32_t>(utf8_byte);
84 state->bytes_stored++;
85 return 0;
86 }
87 // Any subsequent push
88 // Adding 6 more bits so need to left shift
89 if (num_ones == 1 && !isFull()) {
90 char32_t byte = utf8_byte & MASK_ENCODED_BITS;
91 state->partial = state->partial << ENCODED_BITS_PER_UTF8;
92 state->partial |= byte;
93 state->bytes_stored++;
94 return 0;
95 }
96
97 // Invalid byte -> reset the state
98 clear();
99 return EILSEQ;
100}
101
102int CharacterConverter::push(char32_t utf32) {
103 // we can't be partially through a conversion when pushing a utf32 value
104 if (!isEmpty())
105 return -1;
106
107 state->partial = utf32;
108
109 // determine number of utf-8 bytes needed to represent this utf32 value
110 for (uint8_t i = 0; i < MAX_UTF8_LENGTH; i++) {
111 if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
112 state->total_bytes = i + 1;
113 state->bytes_stored = i + 1;
114 return 0;
115 }
116 }
117
118 // `utf32` contains a value that is too large to actually represent a valid
119 // unicode character
120 clear();
121 return EILSEQ;
122}
123
124ErrorOr<char32_t> CharacterConverter::pop_utf32() {
125 // If pop is called too early, do not reset the state, use error to determine
126 // whether enough bytes have been pushed
127 if (!isFull())
128 return Error(-1);
129 char32_t utf32 = state->partial;
130 // reset if successful pop
131 clear();
132 return utf32;
133}
134
135size_t CharacterConverter::sizeAsUTF32() {
136 return 1; // a single utf-32 value can fit an entire character
137}
138
139size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }
140
141ErrorOr<char8_t> CharacterConverter::pop_utf8() {
142 if (isEmpty())
143 return Error(-1);
144
145 constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
146 constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
147
148 char32_t output;
149
150 // Shift to get the next 6 bits from the utf32 encoding
151 const size_t shift_amount = (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
152 if (isFull()) {
153 /*
154 Choose the correct set of most significant bits to encode the length
155 of the utf8 sequence. The remaining bits contain the most significant
156 bits of the unicode value of the character.
157 */
158 output = FIRST_BYTE_HEADERS[state->total_bytes - 1] |
159 (state->partial >> shift_amount);
160 } else {
161 // Get the next 6 bits and format it like so: 10xxxxxx
162 output = CONTINUING_BYTE_HEADER |
163 ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
164 }
165
166 state->bytes_stored--;
167 if (state->bytes_stored == 0)
168 clear();
169
170 return static_cast<char8_t>(output);
171}
172
173} // namespace internal
174} // namespace LIBC_NAMESPACE_DECL
175

// Source code of libc/src/__support/wchar/character_converter.cpp