1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
3 | // |
4 | // Distributed under the Boost Software License, Version 1.0. |
5 | // https://www.boost.org/LICENSE_1_0.txt |
6 | |
7 | #include <boost/locale/utf.hpp> |
8 | #include <boost/locale/util/string.hpp> |
9 | #include "boostLocale/test/tools.hpp" |
10 | #include "boostLocale/test/unit_test.hpp" |
11 | #include <boost/detail/workaround.hpp> |
12 | #include <cstring> |
13 | |
14 | using namespace boost::locale::utf; |
15 | |
// Builds a zero-terminated, single-unit UTF-32 sequence containing `a`.
// The result points into function-local static storage, so it is overwritten
// by the next call to this function.
const std::uint32_t* u32_seq(std::uint32_t a)
{
    static std::uint32_t storage[2];
    std::uint32_t* out = storage;
    *out++ = a;
    *out = 0; // terminator
    return storage;
}
23 | |
// Builds a zero-terminated, single-unit UTF-16 sequence containing `a`.
// The result points into function-local static storage, so it is overwritten
// by the next call to this overload.
const std::uint16_t* u16_seq(std::uint16_t a)
{
    static std::uint16_t storage[2];
    std::uint16_t* out = storage;
    *out++ = a;
    *out = 0; // terminator
    return storage;
}
31 | |
// Builds a zero-terminated, two-unit UTF-16 sequence `a, b` (used by the
// callers in this file for surrogate pairs).
// The result points into function-local static storage, so it is overwritten
// by the next call to this overload.
const std::uint16_t* u16_seq(std::uint16_t a, std::uint16_t b)
{
    static std::uint16_t storage[3];
    std::uint16_t* out = storage;
    *out++ = a;
    *out++ = b;
    *out = 0; // terminator
    return storage;
}
40 | |
// Same as the single-unit u16_seq, but yields `char16_t` code units.
// The result points into function-local static storage, so it is overwritten
// by the next call to this function.
const char16_t* c16_seq(std::uint16_t a)
{
    static char16_t storage[2];
    char16_t* out = storage;
    *out++ = static_cast<char16_t>(a);
    *out = 0; // terminator
    return storage;
}
48 | |
// Same as u32_seq, but yields `char32_t` code units.
// The result points into function-local static storage, so it is overwritten
// by the next call to this function.
const char32_t* c32_seq(std::uint32_t a)
{
    static char32_t storage[2];
    char32_t* out = storage;
    *out++ = static_cast<char32_t>(a);
    *out = 0; // terminator
    return storage;
}
56 | |
57 | template<typename CharType> |
58 | void test_from_utf(const CharType* const s, unsigned codepoint) |
59 | { |
60 | const CharType* cur = s; |
61 | const CharType* const end = boost::locale::util::str_end(s); |
62 | |
63 | typedef utf_traits<CharType> tr; |
64 | |
65 | static_assert(tr::max_width == 4 / sizeof(CharType), "Wrong max_width" ); |
66 | |
67 | TEST_EQ(tr::decode(cur, end), codepoint); |
68 | |
69 | if(codepoint != illegal) |
70 | TEST(cur == end); |
71 | |
72 | if(codepoint == incomplete) { |
73 | TEST(*s == 0 || tr::trail_length(*s) > 0); |
74 | TEST_GE(tr::trail_length(*s), end - s); |
75 | } |
76 | |
77 | if(codepoint != incomplete && codepoint != illegal) { |
78 | TEST(tr::is_lead(*s)); |
79 | TEST(!tr::is_trail(*s)); |
80 | cur = s; |
81 | while(++cur != end) { |
82 | TEST(tr::is_trail(*cur)); |
83 | TEST(!tr::is_lead(*cur)); |
84 | } |
85 | TEST_EQ(tr::width(codepoint), end - s); |
86 | TEST_EQ(tr::trail_length(*s), tr::width(codepoint) - 1); |
87 | cur = s; |
88 | TEST_EQ(tr::decode_valid(cur), codepoint); |
89 | TEST(cur == end); |
90 | } |
91 | } |
92 | |
93 | template<typename CharType> |
94 | void test_to_utf(const CharType* str, unsigned codepoint) |
95 | { |
96 | CharType buf[5] = {1, 1, 1, 1, 1}; |
97 | CharType* p = buf; |
98 | p = utf_traits<CharType>::encode(codepoint, p); |
99 | const CharType* const end = boost::locale::util::str_end(str); |
100 | TEST_EQ(end - str, p - buf); |
101 | TEST(*p); |
102 | *p = 0; |
103 | TEST_EQ(memcmp(str, buf, sizeof(CharType) * (end - str)), 0); |
104 | } |
105 | |
106 | template<typename CharType> |
107 | void test_valid_utf(const CharType* str, unsigned codepoint) |
108 | { |
109 | test_from_utf(str, codepoint); |
110 | test_to_utf(str, codepoint); |
111 | } |
112 | |
113 | void test_utf8() |
114 | { |
115 | std::cout << "- Test UTF-8\n" ; |
116 | |
117 | std::cout << "-- Correct" << std::endl; |
118 | test_valid_utf(str: "\x7f" , codepoint: 0x7f); |
119 | test_valid_utf(str: "\xc2\x80" , codepoint: 0x80); |
120 | test_valid_utf(str: "\xdf\xbf" , codepoint: 0x7ff); |
121 | test_valid_utf(str: "\xe0\xa0\x80" , codepoint: 0x800); |
122 | test_valid_utf(str: "\xef\xbf\xbf" , codepoint: 0xffff); |
123 | test_valid_utf(str: "\xf0\x90\x80\x80" , codepoint: 0x10000); |
124 | test_valid_utf(str: "\xf4\x8f\xbf\xbf" , codepoint: 0x10ffff); |
125 | |
126 | // test that this actually works |
127 | test_from_utf(s: make2(v: 0x80), codepoint: 0x80); |
128 | test_from_utf(s: make2(v: 0x7ff), codepoint: 0x7ff); |
129 | |
130 | test_from_utf(s: make3(v: 0x800), codepoint: 0x800); |
131 | test_from_utf(s: make3(v: 0xffff), codepoint: 0xffff); |
132 | |
133 | test_from_utf(s: make4(v: 0x10000), codepoint: 0x10000); |
134 | test_from_utf(s: make4(v: 0x10ffff), codepoint: 0x10ffff); |
135 | |
136 | std::cout << "-- Too big" << std::endl; |
137 | test_from_utf(s: "\xf4\x9f\x80\x80" , codepoint: illegal); // 11 0000 |
138 | test_from_utf(s: "\xfb\xbf\xbf\xbf" , codepoint: illegal); // 3ff ffff |
139 | test_from_utf(s: "\xf8\x90\x80\x80\x80" , codepoint: illegal); // 400 0000 |
140 | test_from_utf(s: "\xfd\xbf\xbf\xbf\xbf\xbf" , codepoint: illegal); // 7fff ffff |
141 | |
142 | std::cout << "-- Invalid length" << std::endl; |
143 | |
144 | test_from_utf(s: make2(v: 0), codepoint: illegal); |
145 | test_from_utf(s: make3(v: 0), codepoint: illegal); |
146 | test_from_utf(s: make4(v: 0), codepoint: illegal); |
147 | test_from_utf(s: make2(v: 0x7f), codepoint: illegal); |
148 | test_from_utf(s: make3(v: 0x7f), codepoint: illegal); |
149 | test_from_utf(s: make4(v: 0x7f), codepoint: illegal); |
150 | |
151 | test_from_utf(s: make3(v: 0x80), codepoint: illegal); |
152 | test_from_utf(s: make4(v: 0x80), codepoint: illegal); |
153 | test_from_utf(s: make3(v: 0x7ff), codepoint: illegal); |
154 | test_from_utf(s: make4(v: 0x7ff), codepoint: illegal); |
155 | |
156 | test_from_utf(s: make4(v: 0x8000), codepoint: illegal); |
157 | test_from_utf(s: make4(v: 0xffff), codepoint: illegal); |
158 | test_from_utf(s: make4(v: 0x110000), codepoint: illegal); |
159 | test_from_utf(s: make4(v: 0x1fffff), codepoint: illegal); |
160 | |
161 | std::cout << "-- Invalid surrogate" << std::endl; |
162 | |
163 | test_from_utf(s: make3(v: 0xd800), codepoint: illegal); |
164 | test_from_utf(s: make3(v: 0xdbff), codepoint: illegal); |
165 | test_from_utf(s: make3(v: 0xdc00), codepoint: illegal); |
166 | test_from_utf(s: make3(v: 0xdfff), codepoint: illegal); |
167 | |
168 | test_from_utf(s: make4(v: 0xd800), codepoint: illegal); |
169 | test_from_utf(s: make4(v: 0xdbff), codepoint: illegal); |
170 | test_from_utf(s: make4(v: 0xdc00), codepoint: illegal); |
171 | test_from_utf(s: make4(v: 0xdfff), codepoint: illegal); |
172 | |
173 | std::cout << "-- Incomplete" << std::endl; |
174 | |
175 | test_from_utf(s: "" , codepoint: incomplete); |
176 | |
177 | test_from_utf(s: "\x80" , codepoint: illegal); |
178 | test_from_utf(s: "\xc2" , codepoint: incomplete); |
179 | |
180 | test_from_utf(s: "\xdf" , codepoint: incomplete); |
181 | |
182 | test_from_utf(s: "\xe0" , codepoint: incomplete); |
183 | test_from_utf(s: "\xe0\xa0" , codepoint: incomplete); |
184 | |
185 | test_from_utf(s: "\xef\xbf" , codepoint: incomplete); |
186 | test_from_utf(s: "\xef" , codepoint: incomplete); |
187 | |
188 | test_from_utf(s: "\xf0\x90\x80" , codepoint: incomplete); |
189 | test_from_utf(s: "\xf0\x90" , codepoint: incomplete); |
190 | test_from_utf(s: "\xf0" , codepoint: incomplete); |
191 | |
192 | test_from_utf(s: "\xf4\x8f\xbf" , codepoint: incomplete); |
193 | test_from_utf(s: "\xf4\x8f" , codepoint: incomplete); |
194 | test_from_utf(s: "\xf4" , codepoint: incomplete); |
195 | } |
196 | |
197 | void test_utf16() |
198 | { |
199 | std::cout << "- Test UTF-16\n" ; |
200 | |
201 | std::cout << "-- Correct" << std::endl; |
202 | test_valid_utf(str: u16_seq(a: 0x10), codepoint: 0x10); |
203 | test_valid_utf(str: u16_seq(a: 0xffff), codepoint: 0xffff); |
204 | test_valid_utf(str: u16_seq(a: 0xD800, b: 0xDC00), codepoint: 0x10000); |
205 | test_valid_utf(str: u16_seq(a: 0xDBFF, b: 0xDFFF), codepoint: 0x10FFFF); |
206 | |
207 | std::cout << "-- Invalid surrogate" << std::endl; |
208 | test_from_utf(s: u16_seq(a: 0xDFFF), codepoint: illegal); |
209 | test_from_utf(s: u16_seq(a: 0xDC00), codepoint: illegal); |
210 | |
211 | std::cout << "-- Incomplete" << std::endl; |
212 | test_from_utf(s: u16_seq(a: 0), codepoint: incomplete); |
213 | test_from_utf(s: u16_seq(a: 0xD800), codepoint: incomplete); |
214 | test_from_utf(s: u16_seq(a: 0xDBFF), codepoint: incomplete); |
215 | |
216 | std::cout << "-- Test char16_t" << std::endl; |
217 | #if BOOST_WORKAROUND(BOOST_GCC_VERSION, < 50000) |
218 | test_valid_utf(u"\x0010" , 0x10); |
219 | test_valid_utf(u"\xffff" , 0xffff); |
220 | #else |
221 | test_valid_utf(str: u"\u0010" , codepoint: 0x10); |
222 | test_valid_utf(str: u"\uffff" , codepoint: 0xffff); |
223 | #endif |
224 | test_valid_utf(str: u"\U00010000" , codepoint: 0x10000); |
225 | test_valid_utf(str: u"\U0010FFFF" , codepoint: 0x10FFFF); |
226 | test_from_utf(s: c16_seq(a: 0xDFFF), codepoint: illegal); |
227 | test_from_utf(s: c16_seq(a: 0xDC00), codepoint: illegal); |
228 | } |
229 | |
230 | void test_utf32() |
231 | { |
232 | std::cout << "- Test UTF-32\n" ; |
233 | |
234 | std::cout << "-- Correct" << std::endl; |
235 | test_valid_utf(str: u32_seq(a: 0x10), codepoint: 0x10); |
236 | test_valid_utf(str: u32_seq(a: 0xffff), codepoint: 0xffff); |
237 | test_valid_utf(str: u32_seq(a: 0x10000), codepoint: 0x10000); |
238 | test_valid_utf(str: u32_seq(a: 0x10ffff), codepoint: 0x10ffff); |
239 | |
240 | std::cout << "-- Invalid surrogate" << std::endl; |
241 | test_from_utf(s: u32_seq(a: 0xD800), codepoint: illegal); |
242 | test_from_utf(s: u32_seq(a: 0xDBFF), codepoint: illegal); |
243 | test_from_utf(s: u32_seq(a: 0xDFFF), codepoint: illegal); |
244 | test_from_utf(s: u32_seq(a: 0xDC00), codepoint: illegal); |
245 | test_from_utf(s: u32_seq(a: 0x110000), codepoint: illegal); |
246 | |
247 | std::cout << "-- Incomplete" << std::endl; |
248 | test_from_utf(s: u32_seq(a: 0), codepoint: incomplete); |
249 | |
250 | std::cout << "-- Test char32_t" << std::endl; |
251 | #if BOOST_WORKAROUND(BOOST_GCC_VERSION, < 50000) |
252 | test_valid_utf(U"\x0010" , 0x10); |
253 | #else |
254 | test_valid_utf(str: U"\U00000010" , codepoint: 0x10); |
255 | #endif |
256 | test_valid_utf(str: U"\U0000ffff" , codepoint: 0xffff); |
257 | test_valid_utf(str: U"\U00010000" , codepoint: 0x10000); |
258 | test_valid_utf(str: U"\U0010ffff" , codepoint: 0x10ffff); |
259 | test_from_utf(s: c32_seq(a: 0xD800), codepoint: illegal); |
260 | test_from_utf(s: c32_seq(a: 0xDBFF), codepoint: illegal); |
261 | test_from_utf(s: c32_seq(a: 0xDFFF), codepoint: illegal); |
262 | test_from_utf(s: c32_seq(a: 0xDC00), codepoint: illegal); |
263 | test_from_utf(s: c32_seq(a: 0x110000), codepoint: illegal); |
264 | } |
265 | |
266 | void test_main(int /*argc*/, char** /*argv*/) |
267 | { |
268 | test_utf8(); |
269 | test_utf16(); |
270 | test_utf32(); |
271 | } |
272 | |
273 | // boostinspect:noascii |
274 | |