utf.hpp source code [boost/libs/locale/include/boost/locale/utf.hpp]

1	//
2	// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3	//
4	// Distributed under the Boost Software License, Version 1.0.
5	// https://www.boost.org/LICENSE_1_0.txt
6
7	#ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
8	#define BOOST_LOCALE_UTF_HPP_INCLUDED
9
10	#include <boost/locale/config.hpp>
11	#include <cstdint>
12
13	namespace boost { namespace locale {
14	/// \brief Namespace that holds basic operations on UTF encoded sequences
15	///
16	/// All functions defined in this namespace do not require linking with Boost.Locale library
17	namespace utf {
/// \brief The integral type that can hold a Unicode code point
using code_point = uint32_t;

/// \brief Special constant that defines illegal code point
///
/// The value is above 0x10FFFF, so it is never returned as a decoded code point.
constexpr code_point illegal = 0xFFFFFFFFu;
/// \brief Special constant that defines incomplete code point
///
/// The value is above 0x10FFFF, so it is never returned as a decoded code point.
constexpr code_point incomplete = 0xFFFFFFFEu;

/// Either a length/size or an error (illegal/incomplete)
using len_or_error = code_point;

/// \brief the function checks if \a v is a valid code point
///
/// Returns false when \a v is greater than 0x10FFFF or lies in the
/// surrogate range [0xD800, 0xDFFF]; returns true otherwise.
inline bool is_valid_codepoint(code_point v)
{
    if(v > 0x10FFFF)
        return false;
    if(0xD800 <= v && v <= 0xDFFF) // surrogates
        return false;
    return true;
}
38
39	#ifdef BOOST_LOCALE_DOXYGEN
40
41	/// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
42	template<typename CharType, int size = sizeof(CharType)>
43	struct utf_traits {
44	/// The type of the character
45	typedef CharType char_type;
46
47	/// Read one code point from the range [p,e) and return it.
48	///
49	/// - If the sequence that was read is incomplete sequence returns \ref incomplete,
50	/// - If illegal sequence detected returns \ref illegal
51	///
52	/// Requirements
53	///
54	/// - Iterator is valid input iterator
55	///
56	/// Postconditions
57	///
58	/// - p points to the last consumed character
59	template<typename Iterator>
60	static code_point decode(Iterator& p, Iterator e);
61
62	/// Maximal width of valid sequence in the code units:
63	///
64	/// - UTF-8 - 4
65	/// - UTF-16 - 2
66	/// - UTF-32 - 1
67	static constexpr int max_width;
68
69	/// The width of specific code point in the code units.
70	///
71	/// Requirement: value is a valid Unicode code point
72	/// Returns value in range [1..max_width]
73	static int width(code_point value);
74
75	/// Get the size of the trail part of variable length encoded sequence.
76	///
77	/// Returns -1 if C is not valid lead character
78	static int trail_length(char_type c);
79	/// Returns true if c is trail code unit, always false for UTF-32
80	static bool is_trail(char_type c);
81	/// Returns true if c is lead code unit, always true of UTF-32
82	static bool is_lead(char_type c);
83
84	/// Convert valid Unicode code point \a value to the UTF sequence.
85	///
86	/// Requirements:
87	///
88	/// - \a value is valid code point
89	/// - \a out is an output iterator should be able to accept at least width(value) units
90	///
91	/// Returns the iterator past the last written code unit.
92	template<typename Iterator>
93	static Iterator encode(code_point value, Iterator out);
94
95	/// Decodes valid UTF sequence that is pointed by p into code point.
96	///
97	/// If the sequence is invalid or points to end the behavior is undefined
98	template<typename Iterator>
99	static code_point decode_valid(Iterator& p);
100	};
101
102	#else
103
104	template<typename CharType, int size = sizeof(CharType)>
105	struct utf_traits;
106
107	template<typename CharType>
108	struct utf_traits<CharType, `1`> {
109	typedef CharType char_type;
110
111	static int trail_length(char_type ci)
112	{
113	unsigned char c = ci;
114	if(c < `128`)
115	return `0`;
116	if(BOOST_UNLIKELY(c < `194`))
117	return -`1`;
118	if(c < `224`)
119	return `1`;
120	if(c < `240`)
121	return `2`;
122	if(BOOST_LIKELY(c <= `244`))
123	return `3`;
124	return -`1`;
125	}
126
127	static constexpr int max_width = `4`;
128
129	static int width(code_point value)
130	{
131	if(value <= `0x7F`)
132	return `1`;
133	else if(value <= `0x7FF`)
134	return `2`;
135	else if(BOOST_LIKELY(value <= `0xFFFF`))
136	return `3`;
137	else
138	return `4`;
139	}
140
141	static bool is_trail(char_type ci)
142	{
143	unsigned char c = ci;
144	return (c & `0xC0`) == `0x80`;
145	}
146
147	static bool is_lead(char_type ci) { return !is_trail(ci); }
148
149	template<typename Iterator>
150	static code_point decode(Iterator& p, Iterator e)
151	{
152	if(BOOST_UNLIKELY(p == e))
153	return incomplete;
154
155	unsigned char lead = *p++;
156
157	// First byte is fully validated here
158	int trail_size = trail_length(ci: lead);
159
160	if(BOOST_UNLIKELY(trail_size < `0`))
161	return illegal;
162
163	// Ok as only ASCII may be of size = 0
164	// also optimize for ASCII text
165	if(trail_size == `0`)
166	return lead;
167
168	code_point c = lead & ((`1` << (`6` - trail_size)) - `1`);
169
170	// Read the rest
171	unsigned char tmp;
172	switch(trail_size) {
173	case `3`:
174	if(BOOST_UNLIKELY(p == e))
175	return incomplete;
176	tmp = *p++;
177	if(!is_trail(ci: tmp))
178	return illegal;
179	c = (c << `6`) \| (tmp & `0x3F`);
180	BOOST_FALLTHROUGH;
181	case `2`:
182	if(BOOST_UNLIKELY(p == e))
183	return incomplete;
184	tmp = *p++;
185	if(!is_trail(ci: tmp))
186	return illegal;
187	c = (c << `6`) \| (tmp & `0x3F`);
188	BOOST_FALLTHROUGH;
189	case `1`:
190	if(BOOST_UNLIKELY(p == e))
191	return incomplete;
192	tmp = *p++;
193	if(!is_trail(ci: tmp))
194	return illegal;
195	c = (c << `6`) \| (tmp & `0x3F`);
196	}
197
198	// Check code point validity: no surrogates and
199	// valid range
200	if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
201	return illegal;
202
203	// make sure it is the most compact representation
204	if(BOOST_UNLIKELY(width(c) != trail_size + `1`))
205	return illegal;
206
207	return c;
208	}
209
210	template<typename Iterator>
211	static code_point decode_valid(Iterator& p)
212	{
213	unsigned char lead = *p++;
214	if(lead < `192`)
215	return lead;
216
217	int trail_size;
218
219	if(lead < `224`)
220	trail_size = `1`;
221	else if(BOOST_LIKELY(lead < `240`)) // non-BMP rare
222	trail_size = `2`;
223	else
224	trail_size = `3`;
225
226	code_point c = lead & ((`1` << (`6` - trail_size)) - `1`);
227
228	switch(trail_size) {
229	case `3`: c = (c << `6`) \| (static_cast<unsigned char>(*p++) & `0x3F`); BOOST_FALLTHROUGH;
230	case `2`: c = (c << `6`) \| (static_cast<unsigned char>(*p++) & `0x3F`); BOOST_FALLTHROUGH;
231	case `1`: c = (c << `6`) \| (static_cast<unsigned char>(*p++) & `0x3F`);
232	}
233
234	return c;
235	}
236
237	template<typename Iterator>
238	static Iterator encode(code_point value, Iterator out)
239	{
240	if(value <= `0x7F`)
241	out++ = static_cast*<char_type>(value);
242	else if(value <= `0x7FF`) {
243	out++ = static_cast*<char_type>((value >> `6`) \| `0xC0`);
244	out++ = static_cast*<char_type>((value & `0x3F`) \| `0x80`);
245	} else if(BOOST_LIKELY(value <= `0xFFFF`)) {
246	out++ = static_cast*<char_type>((value >> `12`) \| `0xE0`);
247	out++ = static_cast*<char_type>(((value >> `6`) & `0x3F`) \| `0x80`);
248	out++ = static_cast*<char_type>((value & `0x3F`) \| `0x80`);
249	} else {
250	out++ = static_cast*<char_type>((value >> `18`) \| `0xF0`);
251	out++ = static_cast*<char_type>(((value >> `12`) & `0x3F`) \| `0x80`);
252	out++ = static_cast*<char_type>(((value >> `6`) & `0x3F`) \| `0x80`);
253	out++ = static_cast*<char_type>((value & `0x3F`) \| `0x80`);
254	}
255	return out;
256	}
257	}; // utf8
258
259	template<typename CharType>
260	struct utf_traits<CharType, `2`> {
261	typedef CharType char_type;
262
263	// See RFC 2781
264	static bool is_first_surrogate(uint16_t x) { return `0xD800` <= x && x <= `0xDBFF`; }
265	static bool is_second_surrogate(uint16_t x) { return `0xDC00` <= x && x <= `0xDFFF`; }
266	static code_point combine_surrogate(uint16_t w1, uint16_t w2)
267	{
268	return ((code_point(w1 & `0x3FF`) << `10`) \| (w2 & `0x3FF`)) + `0x10000`;
269	}
270	static int trail_length(char_type c)
271	{
272	if(is_first_surrogate(x: c))
273	return `1`;
274	if(is_second_surrogate(x: c))
275	return -`1`;
276	return `0`;
277	}
278
279	/// Returns true if c is trail code unit, always false for UTF-32
280	static bool is_trail(char_type c) { return is_second_surrogate(x: c); }
281	/// Returns true if c is lead code unit, always true of UTF-32
282	static bool is_lead(char_type c) { return !is_second_surrogate(x: c); }
283
284	template<typename It>
285	static code_point decode(It& current, It last)
286	{
287	if(BOOST_UNLIKELY(current == last))
288	return incomplete;
289	uint16_t w1 = *current++;
290	if(BOOST_LIKELY(w1 < `0xD800` \|\| `0xDFFF` < w1))
291	return w1;
292	if(w1 > `0xDBFF`)
293	return illegal;
294	if(current == last)
295	return incomplete;
296	uint16_t w2 = *current++;
297	if(w2 < `0xDC00` \|\| `0xDFFF` < w2)
298	return illegal;
299	return combine_surrogate(w1, w2);
300	}
301	template<typename It>
302	static code_point decode_valid(It& current)
303	{
304	uint16_t w1 = *current++;
305	if(BOOST_LIKELY(w1 < `0xD800` \|\| `0xDFFF` < w1))
306	return w1;
307	uint16_t w2 = *current++;
308	return combine_surrogate(w1, w2);
309	}
310
311	static constexpr int max_width = `2`;
312	static int width(code_point u) { return u >= `0x10000` ? `2` : `1`; }
313	template<typename It>
314	static It encode(code_point u, It out)
315	{
316	if(BOOST_LIKELY(u <= `0xFFFF`))
317	out++ = static_cast*<char_type>(u);
318	else {
319	u -= `0x10000`;
320	out++ = static_cast*<char_type>(`0xD800` \| (u >> `10`));
321	out++ = static_cast*<char_type>(`0xDC00` \| (u & `0x3FF`));
322	}
323	return out;
324	}
325	}; // utf16;
326
327	template<typename CharType>
328	struct utf_traits<CharType, `4`> {
329	typedef CharType char_type;
330	static int trail_length(char_type c)
331	{
332	if(is_valid_codepoint(c))
333	return `0`;
334	return -`1`;
335	}
336	static bool is_trail(char_type /c/) { return false; }
337	static bool is_lead(char_type /c/) { return true; }
338
339	template<typename It>
340	static code_point decode_valid(It& current)
341	{
342	return *current++;
343	}
344
345	template<typename It>
346	static code_point decode(It& current, It last)
347	{
348	if(BOOST_UNLIKELY(current == last))
349	return boost::locale::utf::incomplete;
350	code_point c = *current++;
351	if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
352	return boost::locale::utf::illegal;
353	return c;
354	}
355	static constexpr int max_width = `1`;
356	static int width(code_point /u/) { return `1`; }
357	template<typename It>
358	static It encode(code_point u, It out)
359	{
360	out++ = static_cast*<char_type>(u);
361	return out;
362	}
363
364	}; // utf32
365
366	#endif
367
368	} // namespace utf
369	}} // namespace boost::locale
370
371	#endif
372

source code of boost/libs/locale/include/boost/locale/utf.hpp