1//===-- Unittests for character_converter utf8->utf32 ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "hdr/errno_macros.h"
10#include "src/__support/error_or.h"
11#include "src/__support/wchar/character_converter.h"
12#include "src/__support/wchar/mbstate.h"
13#include "test/UnitTest/Test.h"
14
15TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
16 LIBC_NAMESPACE::internal::mbstate state;
17 state.bytes_stored = 0;
18 state.total_bytes = 0;
19 char ch = 'A';
20
21 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
22 int err = char_conv.push(static_cast<char8_t>(ch));
23 auto wch = char_conv.pop_utf32();
24
25 ASSERT_EQ(err, 0);
26 ASSERT_TRUE(wch.has_value());
27 ASSERT_EQ(static_cast<int>(wch.value()), 65);
28}
29
30TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
31 LIBC_NAMESPACE::internal::mbstate state;
32 state.bytes_stored = 0;
33 state.total_bytes = 0;
34 const char ch[2] = {static_cast<char>(0xC2),
35 static_cast<char>(0x8E)}; // Ž car symbol
36
37 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
38 char_conv.push(static_cast<char8_t>(ch[0]));
39 char_conv.push(static_cast<char8_t>(ch[1]));
40 auto wch = char_conv.pop_utf32();
41
42 ASSERT_TRUE(wch.has_value());
43 ASSERT_EQ(static_cast<int>(wch.value()), 142);
44}
45
46TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
47 LIBC_NAMESPACE::internal::mbstate state;
48 state.bytes_stored = 0;
49 state.total_bytes = 0;
50 const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
51 static_cast<char>(0x91)}; // ∑ sigma symbol
52
53 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
54 char_conv.push(static_cast<char8_t>(ch[0]));
55 char_conv.push(static_cast<char8_t>(ch[1]));
56 char_conv.push(static_cast<char8_t>(ch[2]));
57 auto wch = char_conv.pop_utf32();
58
59 ASSERT_TRUE(wch.has_value());
60 ASSERT_EQ(static_cast<int>(wch.value()), 8721);
61}
62
63TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
64 LIBC_NAMESPACE::internal::mbstate state;
65 state.bytes_stored = 0;
66 state.total_bytes = 0;
67 const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
68 static_cast<char>(0xA4),
69 static_cast<char>(0xA1)}; // 🤡 clown emoji
70
71 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
72 char_conv.push(static_cast<char8_t>(ch[0]));
73 char_conv.push(static_cast<char8_t>(ch[1]));
74 char_conv.push(static_cast<char8_t>(ch[2]));
75 char_conv.push(static_cast<char8_t>(ch[3]));
76 auto wch = char_conv.pop_utf32();
77
78 ASSERT_TRUE(wch.has_value());
79 ASSERT_EQ(static_cast<int>(wch.value()), 129313);
80}
81
82TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
83 LIBC_NAMESPACE::internal::mbstate state;
84 state.bytes_stored = 0;
85 state.total_bytes = 0;
86 const char ch = static_cast<char>(0x80); // invalid starting bit sequence
87
88 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
89 int err = char_conv.push(static_cast<char8_t>(ch));
90
91 ASSERT_EQ(err, EILSEQ);
92}
93
94TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
95 LIBC_NAMESPACE::internal::mbstate state;
96 state.bytes_stored = 0;
97 state.total_bytes = 0;
98 const char ch[4] = {
99 static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
100 static_cast<char>(0x00)}; // first and third bytes are invalid
101
102 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
103 int err = char_conv.push(static_cast<char8_t>(ch[0]));
104 ASSERT_EQ(err, EILSEQ);
105 err = char_conv.push(static_cast<char8_t>(ch[1]));
106 ASSERT_EQ(err, 0);
107 // Prev byte was single byte so trying to push another should error.
108 err = char_conv.push(static_cast<char8_t>(ch[2]));
109 ASSERT_EQ(err, EILSEQ);
110 err = char_conv.push(static_cast<char8_t>(ch[3]));
111 ASSERT_EQ(err, 0);
112}
113
114TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
115 LIBC_NAMESPACE::internal::mbstate state;
116 state.bytes_stored = 0;
117 state.total_bytes = 0;
118 // Last byte is invalid since it does not have correct starting sequence.
119 // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
120 const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
121 static_cast<char>(0x80), static_cast<char>(0xC0)};
122
123 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
124 int err = char_conv.push(static_cast<char8_t>(ch[0]));
125 ASSERT_EQ(err, 0);
126 err = char_conv.push(static_cast<char8_t>(ch[1]));
127 ASSERT_EQ(err, 0);
128 err = char_conv.push(static_cast<char8_t>(ch[2]));
129 ASSERT_EQ(err, 0);
130 err = char_conv.push(static_cast<char8_t>(ch[3]));
131 ASSERT_EQ(err, EILSEQ);
132}
133
134TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
135 LIBC_NAMESPACE::internal::mbstate state;
136 state.bytes_stored = 0;
137 state.total_bytes = 0;
138 const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
139 static_cast<char>(0x80)};
140
141 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
142 int err = char_conv.push(static_cast<char8_t>(ch[0]));
143 ASSERT_EQ(err, 0);
144 err = char_conv.push(static_cast<char8_t>(ch[1]));
145 ASSERT_EQ(err, 0);
146 // Should produce an error on 3rd byte
147 err = char_conv.push(static_cast<char8_t>(ch[2]));
148 ASSERT_EQ(err, EILSEQ);
149
150 // Should produce an error since mbstate was reset
151 auto wch = char_conv.pop_utf32();
152 ASSERT_FALSE(wch.has_value());
153}
154
155TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
156 LIBC_NAMESPACE::internal::mbstate state;
157 state.bytes_stored = 0;
158 state.total_bytes = 0;
159 const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
160 static_cast<char>(0xC7), static_cast<char>(0x8C)};
161
162 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
163 int err = char_conv.push(static_cast<char8_t>(ch[0]));
164 ASSERT_EQ(err, 0);
165 err = char_conv.push(static_cast<char8_t>(ch[1]));
166 ASSERT_EQ(err, 0);
167 auto wch = char_conv.pop_utf32();
168 ASSERT_TRUE(wch.has_value());
169 ASSERT_EQ(static_cast<int>(wch.value()), 142);
170
171 // Second two byte character
172 err = char_conv.push(static_cast<char8_t>(ch[2]));
173 ASSERT_EQ(err, 0);
174 err = char_conv.push(static_cast<char8_t>(ch[3]));
175 ASSERT_EQ(err, 0);
176 wch = char_conv.pop_utf32();
177 ASSERT_TRUE(wch.has_value());
178 ASSERT_EQ(static_cast<int>(wch.value()), 460);
179}
180
181TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
182 LIBC_NAMESPACE::internal::mbstate state;
183 state.bytes_stored = 0;
184 state.total_bytes = 0;
185 LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
186 const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
187 int err = char_conv.push(static_cast<char8_t>(ch[0]));
188 ASSERT_EQ(err, 0);
189 auto wch = char_conv.pop_utf32();
190 ASSERT_FALSE(
191 wch.has_value()); // Should fail since we have not read enough bytes
192 err = char_conv.push(static_cast<char8_t>(ch[1]));
193 ASSERT_EQ(err, 0);
194 wch = char_conv.pop_utf32();
195 ASSERT_TRUE(wch.has_value());
196 ASSERT_EQ(static_cast<int>(wch.value()), 142);
197}
198

// Source code of libc/test/src/__support/wchar/utf8_to_32_test.cpp