1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
3 | // |
4 | // Distributed under the Boost Software License, Version 1.0. |
5 | // https://www.boost.org/LICENSE_1_0.txt |
6 | |
7 | #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED |
8 | #define BOOST_LOCALE_UTF_HPP_INCLUDED |
9 | |
10 | #include <boost/locale/config.hpp> |
11 | #include <cstdint> |
12 | |
13 | namespace boost { namespace locale { |
14 | /// \brief Namespace that holds basic operations on UTF encoded sequences |
15 | /// |
/// None of the functions defined in this namespace require linking with the Boost.Locale library
17 | namespace utf { |
/// \brief The integral type that can hold a Unicode code point
using code_point = std::uint32_t;

/// \brief Special constant that defines illegal code point
///
/// The value is greater than 0x10FFFF, so is_valid_codepoint() rejects it.
constexpr code_point illegal = 0xFFFFFFFFu;
/// \brief Special constant that defines incomplete code point
///
/// The value is greater than 0x10FFFF, so is_valid_codepoint() rejects it.
constexpr code_point incomplete = 0xFFFFFFFEu;

/// Either a length/size or an error (illegal/incomplete)
using len_or_error = code_point;

/// \brief Checks if \a v is a valid code point
///
/// A value is valid when it does not exceed 0x10FFFF and does not lie in the
/// surrogate range [0xD800, 0xDFFF].
///
/// Written as a single return expression so it is usable in constant expressions.
///
/// \param v value to test
/// \return true if \a v is a valid code point, false otherwise
constexpr bool is_valid_codepoint(code_point v) noexcept
{
    return v <= 0x10FFFF && (v < 0xD800 || 0xDFFF < v);
}
38 | |
39 | #ifdef BOOST_LOCALE_DOXYGEN |
40 | |
41 | /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points |
42 | template<typename CharType, int size = sizeof(CharType)> |
43 | struct utf_traits { |
44 | /// The type of the character |
45 | typedef CharType char_type; |
46 | |
47 | /// Read one code point from the range [p,e) and return it. |
48 | /// |
49 | /// - If the sequence that was read is incomplete sequence returns \ref incomplete, |
50 | /// - If illegal sequence detected returns \ref illegal |
51 | /// |
52 | /// Requirements |
53 | /// |
54 | /// - Iterator is valid input iterator |
55 | /// |
56 | /// Postconditions |
57 | /// |
58 | /// - p points to the last consumed character |
59 | template<typename Iterator> |
60 | static code_point decode(Iterator& p, Iterator e); |
61 | |
62 | /// Maximal width of valid sequence in the code units: |
63 | /// |
64 | /// - UTF-8 - 4 |
65 | /// - UTF-16 - 2 |
66 | /// - UTF-32 - 1 |
67 | static constexpr int max_width; |
68 | |
69 | /// The width of specific code point in the code units. |
70 | /// |
71 | /// Requirement: value is a valid Unicode code point |
72 | /// Returns value in range [1..max_width] |
73 | static int width(code_point value); |
74 | |
75 | /// Get the size of the trail part of variable length encoded sequence. |
76 | /// |
77 | /// Returns -1 if C is not valid lead character |
78 | static int trail_length(char_type c); |
79 | /// Returns true if c is trail code unit, always false for UTF-32 |
80 | static bool is_trail(char_type c); |
81 | /// Returns true if c is lead code unit, always true of UTF-32 |
82 | static bool is_lead(char_type c); |
83 | |
84 | /// Convert valid Unicode code point \a value to the UTF sequence. |
85 | /// |
86 | /// Requirements: |
87 | /// |
88 | /// - \a value is valid code point |
89 | /// - \a out is an output iterator should be able to accept at least width(value) units |
90 | /// |
91 | /// Returns the iterator past the last written code unit. |
92 | template<typename Iterator> |
93 | static Iterator encode(code_point value, Iterator out); |
94 | |
95 | /// Decodes valid UTF sequence that is pointed by p into code point. |
96 | /// |
97 | /// If the sequence is invalid or points to end the behavior is undefined |
98 | template<typename Iterator> |
99 | static code_point decode_valid(Iterator& p); |
100 | }; |
101 | |
102 | #else |
103 | |
104 | template<typename CharType, int size = sizeof(CharType)> |
105 | struct utf_traits; |
106 | |
107 | template<typename CharType> |
108 | struct utf_traits<CharType, 1> { |
109 | typedef CharType char_type; |
110 | |
111 | static int trail_length(char_type ci) |
112 | { |
113 | unsigned char c = ci; |
114 | if(c < 128) |
115 | return 0; |
116 | if(BOOST_UNLIKELY(c < 194)) |
117 | return -1; |
118 | if(c < 224) |
119 | return 1; |
120 | if(c < 240) |
121 | return 2; |
122 | if(BOOST_LIKELY(c <= 244)) |
123 | return 3; |
124 | return -1; |
125 | } |
126 | |
127 | static constexpr int max_width = 4; |
128 | |
129 | static int width(code_point value) |
130 | { |
131 | if(value <= 0x7F) |
132 | return 1; |
133 | else if(value <= 0x7FF) |
134 | return 2; |
135 | else if(BOOST_LIKELY(value <= 0xFFFF)) |
136 | return 3; |
137 | else |
138 | return 4; |
139 | } |
140 | |
141 | static bool is_trail(char_type ci) |
142 | { |
143 | unsigned char c = ci; |
144 | return (c & 0xC0) == 0x80; |
145 | } |
146 | |
147 | static bool is_lead(char_type ci) { return !is_trail(ci); } |
148 | |
149 | template<typename Iterator> |
150 | static code_point decode(Iterator& p, Iterator e) |
151 | { |
152 | if(BOOST_UNLIKELY(p == e)) |
153 | return incomplete; |
154 | |
155 | unsigned char lead = *p++; |
156 | |
157 | // First byte is fully validated here |
158 | int trail_size = trail_length(ci: lead); |
159 | |
160 | if(BOOST_UNLIKELY(trail_size < 0)) |
161 | return illegal; |
162 | |
163 | // Ok as only ASCII may be of size = 0 |
164 | // also optimize for ASCII text |
165 | if(trail_size == 0) |
166 | return lead; |
167 | |
168 | code_point c = lead & ((1 << (6 - trail_size)) - 1); |
169 | |
170 | // Read the rest |
171 | unsigned char tmp; |
172 | switch(trail_size) { |
173 | case 3: |
174 | if(BOOST_UNLIKELY(p == e)) |
175 | return incomplete; |
176 | tmp = *p++; |
177 | if(!is_trail(ci: tmp)) |
178 | return illegal; |
179 | c = (c << 6) | (tmp & 0x3F); |
180 | BOOST_FALLTHROUGH; |
181 | case 2: |
182 | if(BOOST_UNLIKELY(p == e)) |
183 | return incomplete; |
184 | tmp = *p++; |
185 | if(!is_trail(ci: tmp)) |
186 | return illegal; |
187 | c = (c << 6) | (tmp & 0x3F); |
188 | BOOST_FALLTHROUGH; |
189 | case 1: |
190 | if(BOOST_UNLIKELY(p == e)) |
191 | return incomplete; |
192 | tmp = *p++; |
193 | if(!is_trail(ci: tmp)) |
194 | return illegal; |
195 | c = (c << 6) | (tmp & 0x3F); |
196 | } |
197 | |
198 | // Check code point validity: no surrogates and |
199 | // valid range |
200 | if(BOOST_UNLIKELY(!is_valid_codepoint(c))) |
201 | return illegal; |
202 | |
203 | // make sure it is the most compact representation |
204 | if(BOOST_UNLIKELY(width(c) != trail_size + 1)) |
205 | return illegal; |
206 | |
207 | return c; |
208 | } |
209 | |
210 | template<typename Iterator> |
211 | static code_point decode_valid(Iterator& p) |
212 | { |
213 | unsigned char lead = *p++; |
214 | if(lead < 192) |
215 | return lead; |
216 | |
217 | int trail_size; |
218 | |
219 | if(lead < 224) |
220 | trail_size = 1; |
221 | else if(BOOST_LIKELY(lead < 240)) // non-BMP rare |
222 | trail_size = 2; |
223 | else |
224 | trail_size = 3; |
225 | |
226 | code_point c = lead & ((1 << (6 - trail_size)) - 1); |
227 | |
228 | switch(trail_size) { |
229 | case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH; |
230 | case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH; |
231 | case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); |
232 | } |
233 | |
234 | return c; |
235 | } |
236 | |
237 | template<typename Iterator> |
238 | static Iterator encode(code_point value, Iterator out) |
239 | { |
240 | if(value <= 0x7F) |
241 | *out++ = static_cast<char_type>(value); |
242 | else if(value <= 0x7FF) { |
243 | *out++ = static_cast<char_type>((value >> 6) | 0xC0); |
244 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); |
245 | } else if(BOOST_LIKELY(value <= 0xFFFF)) { |
246 | *out++ = static_cast<char_type>((value >> 12) | 0xE0); |
247 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); |
248 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); |
249 | } else { |
250 | *out++ = static_cast<char_type>((value >> 18) | 0xF0); |
251 | *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80); |
252 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); |
253 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); |
254 | } |
255 | return out; |
256 | } |
257 | }; // utf8 |
258 | |
259 | template<typename CharType> |
260 | struct utf_traits<CharType, 2> { |
261 | typedef CharType char_type; |
262 | |
263 | // See RFC 2781 |
264 | static bool is_first_surrogate(uint16_t x) { return 0xD800 <= x && x <= 0xDBFF; } |
265 | static bool is_second_surrogate(uint16_t x) { return 0xDC00 <= x && x <= 0xDFFF; } |
266 | static code_point combine_surrogate(uint16_t w1, uint16_t w2) |
267 | { |
268 | return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; |
269 | } |
270 | static int trail_length(char_type c) |
271 | { |
272 | if(is_first_surrogate(x: c)) |
273 | return 1; |
274 | if(is_second_surrogate(x: c)) |
275 | return -1; |
276 | return 0; |
277 | } |
278 | |
279 | /// Returns true if c is trail code unit, always false for UTF-32 |
280 | static bool is_trail(char_type c) { return is_second_surrogate(x: c); } |
281 | /// Returns true if c is lead code unit, always true of UTF-32 |
282 | static bool is_lead(char_type c) { return !is_second_surrogate(x: c); } |
283 | |
284 | template<typename It> |
285 | static code_point decode(It& current, It last) |
286 | { |
287 | if(BOOST_UNLIKELY(current == last)) |
288 | return incomplete; |
289 | uint16_t w1 = *current++; |
290 | if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) |
291 | return w1; |
292 | if(w1 > 0xDBFF) |
293 | return illegal; |
294 | if(current == last) |
295 | return incomplete; |
296 | uint16_t w2 = *current++; |
297 | if(w2 < 0xDC00 || 0xDFFF < w2) |
298 | return illegal; |
299 | return combine_surrogate(w1, w2); |
300 | } |
301 | template<typename It> |
302 | static code_point decode_valid(It& current) |
303 | { |
304 | uint16_t w1 = *current++; |
305 | if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) |
306 | return w1; |
307 | uint16_t w2 = *current++; |
308 | return combine_surrogate(w1, w2); |
309 | } |
310 | |
311 | static constexpr int max_width = 2; |
312 | static int width(code_point u) { return u >= 0x10000 ? 2 : 1; } |
313 | template<typename It> |
314 | static It encode(code_point u, It out) |
315 | { |
316 | if(BOOST_LIKELY(u <= 0xFFFF)) |
317 | *out++ = static_cast<char_type>(u); |
318 | else { |
319 | u -= 0x10000; |
320 | *out++ = static_cast<char_type>(0xD800 | (u >> 10)); |
321 | *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF)); |
322 | } |
323 | return out; |
324 | } |
325 | }; // utf16; |
326 | |
327 | template<typename CharType> |
328 | struct utf_traits<CharType, 4> { |
329 | typedef CharType char_type; |
330 | static int trail_length(char_type c) |
331 | { |
332 | if(is_valid_codepoint(c)) |
333 | return 0; |
334 | return -1; |
335 | } |
336 | static bool is_trail(char_type /*c*/) { return false; } |
337 | static bool is_lead(char_type /*c*/) { return true; } |
338 | |
339 | template<typename It> |
340 | static code_point decode_valid(It& current) |
341 | { |
342 | return *current++; |
343 | } |
344 | |
345 | template<typename It> |
346 | static code_point decode(It& current, It last) |
347 | { |
348 | if(BOOST_UNLIKELY(current == last)) |
349 | return boost::locale::utf::incomplete; |
350 | code_point c = *current++; |
351 | if(BOOST_UNLIKELY(!is_valid_codepoint(c))) |
352 | return boost::locale::utf::illegal; |
353 | return c; |
354 | } |
355 | static constexpr int max_width = 1; |
356 | static int width(code_point /*u*/) { return 1; } |
357 | template<typename It> |
358 | static It encode(code_point u, It out) |
359 | { |
360 | *out++ = static_cast<char_type>(u); |
361 | return out; |
362 | } |
363 | |
364 | }; // utf32 |
365 | |
366 | #endif |
367 | |
368 | } // namespace utf |
369 | }} // namespace boost::locale |
370 | |
371 | #endif |
372 | |