1//
2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3//
4// Distributed under the Boost Software License, Version 1.0.
5// https://www.boost.org/LICENSE_1_0.txt
6
7#include <boost/locale/utf.hpp>
8#include <boost/locale/util/string.hpp>
9#include "boostLocale/test/tools.hpp"
10#include "boostLocale/test/unit_test.hpp"
11#include <boost/detail/workaround.hpp>
12#include <cstring>
13
14using namespace boost::locale::utf;
15
/// Wrap a single 32-bit code unit into a zero-terminated sequence.
///
/// The returned pointer refers to a function-local static buffer,
/// so its content is overwritten by the next call.
const std::uint32_t* u32_seq(std::uint32_t a)
{
    static std::uint32_t units[2];
    std::uint32_t* out = units;
    *out++ = a;
    *out = 0; // terminator
    return units;
}
23
/// Wrap a single 16-bit code unit into a zero-terminated sequence.
///
/// The returned pointer refers to a function-local static buffer,
/// so its content is overwritten by the next call of this overload.
const std::uint16_t* u16_seq(std::uint16_t a)
{
    static std::uint16_t units[2];
    std::uint16_t* out = units;
    *out++ = a;
    *out = 0; // terminator
    return units;
}
31
/// Wrap two 16-bit code units (e.g. a surrogate pair) into a zero-terminated sequence.
///
/// The returned pointer refers to a function-local static buffer,
/// so its content is overwritten by the next call of this overload.
const std::uint16_t* u16_seq(std::uint16_t a, std::uint16_t b)
{
    static std::uint16_t units[3];
    std::uint16_t* out = units;
    *out++ = a;
    *out++ = b;
    *out = 0; // terminator
    return units;
}
40
/// Wrap a single 16-bit value into a zero-terminated `char16_t` sequence.
///
/// Takes the raw value so callers can pass units (such as lone surrogates)
/// that are built at runtime instead of being written as a literal.
/// The returned pointer refers to a function-local static buffer,
/// so its content is overwritten by the next call.
const char16_t* c16_seq(std::uint16_t a)
{
    static char16_t units[2];
    char16_t* out = units;
    *out++ = static_cast<char16_t>(a);
    *out = 0; // terminator
    return units;
}
48
/// Wrap a single 32-bit value into a zero-terminated `char32_t` sequence.
///
/// Takes the raw value so callers can pass units (such as surrogates or
/// out-of-range values) that are built at runtime instead of being written as a literal.
/// The returned pointer refers to a function-local static buffer,
/// so its content is overwritten by the next call.
const char32_t* c32_seq(std::uint32_t a)
{
    static char32_t units[2];
    char32_t* out = units;
    *out++ = static_cast<char32_t>(a);
    *out = 0; // terminator
    return units;
}
56
57template<typename CharType>
58void test_from_utf(const CharType* const s, unsigned codepoint)
59{
60 const CharType* cur = s;
61 const CharType* const end = boost::locale::util::str_end(s);
62
63 typedef utf_traits<CharType> tr;
64
65 static_assert(tr::max_width == 4 / sizeof(CharType), "Wrong max_width");
66
67 TEST_EQ(tr::decode(cur, end), codepoint);
68
69 if(codepoint != illegal)
70 TEST(cur == end);
71
72 if(codepoint == incomplete) {
73 TEST(*s == 0 || tr::trail_length(*s) > 0);
74 TEST_GE(tr::trail_length(*s), end - s);
75 }
76
77 if(codepoint != incomplete && codepoint != illegal) {
78 TEST(tr::is_lead(*s));
79 TEST(!tr::is_trail(*s));
80 cur = s;
81 while(++cur != end) {
82 TEST(tr::is_trail(*cur));
83 TEST(!tr::is_lead(*cur));
84 }
85 TEST_EQ(tr::width(codepoint), end - s);
86 TEST_EQ(tr::trail_length(*s), tr::width(codepoint) - 1);
87 cur = s;
88 TEST_EQ(tr::decode_valid(cur), codepoint);
89 TEST(cur == end);
90 }
91}
92
93template<typename CharType>
94void test_to_utf(const CharType* str, unsigned codepoint)
95{
96 CharType buf[5] = {1, 1, 1, 1, 1};
97 CharType* p = buf;
98 p = utf_traits<CharType>::encode(codepoint, p);
99 const CharType* const end = boost::locale::util::str_end(str);
100 TEST_EQ(end - str, p - buf);
101 TEST(*p);
102 *p = 0;
103 TEST_EQ(memcmp(str, buf, sizeof(CharType) * (end - str)), 0);
104}
105
106template<typename CharType>
107void test_valid_utf(const CharType* str, unsigned codepoint)
108{
109 test_from_utf(str, codepoint);
110 test_to_utf(str, codepoint);
111}
112
113void test_utf8()
114{
115 std::cout << "- Test UTF-8\n";
116
117 std::cout << "-- Correct" << std::endl;
118 test_valid_utf(str: "\x7f", codepoint: 0x7f);
119 test_valid_utf(str: "\xc2\x80", codepoint: 0x80);
120 test_valid_utf(str: "\xdf\xbf", codepoint: 0x7ff);
121 test_valid_utf(str: "\xe0\xa0\x80", codepoint: 0x800);
122 test_valid_utf(str: "\xef\xbf\xbf", codepoint: 0xffff);
123 test_valid_utf(str: "\xf0\x90\x80\x80", codepoint: 0x10000);
124 test_valid_utf(str: "\xf4\x8f\xbf\xbf", codepoint: 0x10ffff);
125
126 // test that this actually works
127 test_from_utf(s: make2(v: 0x80), codepoint: 0x80);
128 test_from_utf(s: make2(v: 0x7ff), codepoint: 0x7ff);
129
130 test_from_utf(s: make3(v: 0x800), codepoint: 0x800);
131 test_from_utf(s: make3(v: 0xffff), codepoint: 0xffff);
132
133 test_from_utf(s: make4(v: 0x10000), codepoint: 0x10000);
134 test_from_utf(s: make4(v: 0x10ffff), codepoint: 0x10ffff);
135
136 std::cout << "-- Too big" << std::endl;
137 test_from_utf(s: "\xf4\x9f\x80\x80", codepoint: illegal); // 11 0000
138 test_from_utf(s: "\xfb\xbf\xbf\xbf", codepoint: illegal); // 3ff ffff
139 test_from_utf(s: "\xf8\x90\x80\x80\x80", codepoint: illegal); // 400 0000
140 test_from_utf(s: "\xfd\xbf\xbf\xbf\xbf\xbf", codepoint: illegal); // 7fff ffff
141
142 std::cout << "-- Invalid length" << std::endl;
143
144 test_from_utf(s: make2(v: 0), codepoint: illegal);
145 test_from_utf(s: make3(v: 0), codepoint: illegal);
146 test_from_utf(s: make4(v: 0), codepoint: illegal);
147 test_from_utf(s: make2(v: 0x7f), codepoint: illegal);
148 test_from_utf(s: make3(v: 0x7f), codepoint: illegal);
149 test_from_utf(s: make4(v: 0x7f), codepoint: illegal);
150
151 test_from_utf(s: make3(v: 0x80), codepoint: illegal);
152 test_from_utf(s: make4(v: 0x80), codepoint: illegal);
153 test_from_utf(s: make3(v: 0x7ff), codepoint: illegal);
154 test_from_utf(s: make4(v: 0x7ff), codepoint: illegal);
155
156 test_from_utf(s: make4(v: 0x8000), codepoint: illegal);
157 test_from_utf(s: make4(v: 0xffff), codepoint: illegal);
158 test_from_utf(s: make4(v: 0x110000), codepoint: illegal);
159 test_from_utf(s: make4(v: 0x1fffff), codepoint: illegal);
160
161 std::cout << "-- Invalid surrogate" << std::endl;
162
163 test_from_utf(s: make3(v: 0xd800), codepoint: illegal);
164 test_from_utf(s: make3(v: 0xdbff), codepoint: illegal);
165 test_from_utf(s: make3(v: 0xdc00), codepoint: illegal);
166 test_from_utf(s: make3(v: 0xdfff), codepoint: illegal);
167
168 test_from_utf(s: make4(v: 0xd800), codepoint: illegal);
169 test_from_utf(s: make4(v: 0xdbff), codepoint: illegal);
170 test_from_utf(s: make4(v: 0xdc00), codepoint: illegal);
171 test_from_utf(s: make4(v: 0xdfff), codepoint: illegal);
172
173 std::cout << "-- Incomplete" << std::endl;
174
175 test_from_utf(s: "", codepoint: incomplete);
176
177 test_from_utf(s: "\x80", codepoint: illegal);
178 test_from_utf(s: "\xc2", codepoint: incomplete);
179
180 test_from_utf(s: "\xdf", codepoint: incomplete);
181
182 test_from_utf(s: "\xe0", codepoint: incomplete);
183 test_from_utf(s: "\xe0\xa0", codepoint: incomplete);
184
185 test_from_utf(s: "\xef\xbf", codepoint: incomplete);
186 test_from_utf(s: "\xef", codepoint: incomplete);
187
188 test_from_utf(s: "\xf0\x90\x80", codepoint: incomplete);
189 test_from_utf(s: "\xf0\x90", codepoint: incomplete);
190 test_from_utf(s: "\xf0", codepoint: incomplete);
191
192 test_from_utf(s: "\xf4\x8f\xbf", codepoint: incomplete);
193 test_from_utf(s: "\xf4\x8f", codepoint: incomplete);
194 test_from_utf(s: "\xf4", codepoint: incomplete);
195}
196
197void test_utf16()
198{
199 std::cout << "- Test UTF-16\n";
200
201 std::cout << "-- Correct" << std::endl;
202 test_valid_utf(str: u16_seq(a: 0x10), codepoint: 0x10);
203 test_valid_utf(str: u16_seq(a: 0xffff), codepoint: 0xffff);
204 test_valid_utf(str: u16_seq(a: 0xD800, b: 0xDC00), codepoint: 0x10000);
205 test_valid_utf(str: u16_seq(a: 0xDBFF, b: 0xDFFF), codepoint: 0x10FFFF);
206
207 std::cout << "-- Invalid surrogate" << std::endl;
208 test_from_utf(s: u16_seq(a: 0xDFFF), codepoint: illegal);
209 test_from_utf(s: u16_seq(a: 0xDC00), codepoint: illegal);
210
211 std::cout << "-- Incomplete" << std::endl;
212 test_from_utf(s: u16_seq(a: 0), codepoint: incomplete);
213 test_from_utf(s: u16_seq(a: 0xD800), codepoint: incomplete);
214 test_from_utf(s: u16_seq(a: 0xDBFF), codepoint: incomplete);
215
216 std::cout << "-- Test char16_t" << std::endl;
217#if BOOST_WORKAROUND(BOOST_GCC_VERSION, < 50000)
218 test_valid_utf(u"\x0010", 0x10);
219 test_valid_utf(u"\xffff", 0xffff);
220#else
221 test_valid_utf(str: u"\u0010", codepoint: 0x10);
222 test_valid_utf(str: u"\uffff", codepoint: 0xffff);
223#endif
224 test_valid_utf(str: u"\U00010000", codepoint: 0x10000);
225 test_valid_utf(str: u"\U0010FFFF", codepoint: 0x10FFFF);
226 test_from_utf(s: c16_seq(a: 0xDFFF), codepoint: illegal);
227 test_from_utf(s: c16_seq(a: 0xDC00), codepoint: illegal);
228}
229
230void test_utf32()
231{
232 std::cout << "- Test UTF-32\n";
233
234 std::cout << "-- Correct" << std::endl;
235 test_valid_utf(str: u32_seq(a: 0x10), codepoint: 0x10);
236 test_valid_utf(str: u32_seq(a: 0xffff), codepoint: 0xffff);
237 test_valid_utf(str: u32_seq(a: 0x10000), codepoint: 0x10000);
238 test_valid_utf(str: u32_seq(a: 0x10ffff), codepoint: 0x10ffff);
239
240 std::cout << "-- Invalid surrogate" << std::endl;
241 test_from_utf(s: u32_seq(a: 0xD800), codepoint: illegal);
242 test_from_utf(s: u32_seq(a: 0xDBFF), codepoint: illegal);
243 test_from_utf(s: u32_seq(a: 0xDFFF), codepoint: illegal);
244 test_from_utf(s: u32_seq(a: 0xDC00), codepoint: illegal);
245 test_from_utf(s: u32_seq(a: 0x110000), codepoint: illegal);
246
247 std::cout << "-- Incomplete" << std::endl;
248 test_from_utf(s: u32_seq(a: 0), codepoint: incomplete);
249
250 std::cout << "-- Test char32_t" << std::endl;
251#if BOOST_WORKAROUND(BOOST_GCC_VERSION, < 50000)
252 test_valid_utf(U"\x0010", 0x10);
253#else
254 test_valid_utf(str: U"\U00000010", codepoint: 0x10);
255#endif
256 test_valid_utf(str: U"\U0000ffff", codepoint: 0xffff);
257 test_valid_utf(str: U"\U00010000", codepoint: 0x10000);
258 test_valid_utf(str: U"\U0010ffff", codepoint: 0x10ffff);
259 test_from_utf(s: c32_seq(a: 0xD800), codepoint: illegal);
260 test_from_utf(s: c32_seq(a: 0xDBFF), codepoint: illegal);
261 test_from_utf(s: c32_seq(a: 0xDFFF), codepoint: illegal);
262 test_from_utf(s: c32_seq(a: 0xDC00), codepoint: illegal);
263 test_from_utf(s: c32_seq(a: 0x110000), codepoint: illegal);
264}
265
266void test_main(int /*argc*/, char** /*argv*/)
267{
268 test_utf8();
269 test_utf16();
270 test_utf32();
271}
272
273// boostinspect:noascii
274

source code of boost/libs/locale/test/test_utf.cpp