1//
2// Copyright (c) 2015 Artyom Beilis (Tonkikh)
3// Copyright (c) 2021-2023 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0.
6// https://www.boost.org/LICENSE_1_0.txt
7
8#ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
9#define BOOST_LOCALE_GENERIC_CODECVT_HPP
10
11#include <boost/locale/utf.hpp>
12#include <cstdint>
13#include <locale>
14
15namespace boost { namespace locale {
16
// The UTF-16 facet below parks a pending leading surrogate (one 16-bit code unit) inside the
// caller's std::mbstate_t between calls, so that type has to offer at least two bytes of storage.
static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is too small to store a UTF-16 code unit");
namespace detail {
    /// Copy exactly two bytes from \a src to \a dst.
    ///
    /// Written by hand so that <cstring> does not have to be included just for std::memcpy.
    /// Both pointers must refer to at least two accessible bytes; no alignment is required
    /// because the copy is done byte by byte.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        unsigned char* cdst = static_cast<unsigned char*>(dst);
        const unsigned char* csrc = static_cast<const unsigned char*>(src);
        cdst[0] = csrc[0];
        cdst[1] = csrc[1];
    }

    /// Return the first two bytes of \a src interpreted as a 16-bit value (native byte order).
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t dst = 0;
        copy_uint16_t(&dst, &src);
        return dst;
    }

    /// Store \a src into the first two bytes of \a dst, leaving the remaining bytes untouched.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        copy_uint16_t(&dst, &src);
    }
} // namespace detail
38
/// \brief Base class that only provides the constants shared by the generic_codecvt specializations
class generic_codecvt_base {
public:
    /// Direction a conversion state is created for; this is the argument the facets pass to
    /// `initial_state` of the implementation class
    enum initial_convertion_state {
        to_unicode_state = 0,  ///< The state is going to be used by to_unicode functions
        from_unicode_state = 1 ///< The state is going to be used by from_unicode functions
    };
};
48
/// \brief Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t, char32_t
/// and char16_t
///
/// Implementations should derive (publicly) from this class, passing themselves as CodecvtImpl, and provide
/// the following members. The facet invokes all of them through a const reference to the implementation,
/// so the member functions have to be callable on a const object.
///
/// - `state_type` - a type of a special object that allows storing intermediate cached data, for example an
///   `iconv_t` descriptor
/// - `state_type initial_state(generic_codecvt_base::initial_convertion_state direction) const` - member function
///   that creates the initial state
/// - `int max_encoding_length() const` - the maximal number of chars used to represent one Unicode code point,
///   for example 4 for UTF-8 and 1 for ISO-8859-1
/// - `utf::code_point to_unicode(state_type& state, const char*& begin, const char* end) const` - extract the
///   first code point from the text in range [begin,end). On success `begin` shall point to the start of the
///   character sequence of the next code point. For an incomplete sequence utf::incomplete shall be returned,
///   for an invalid input sequence utf::illegal shall be returned; in both cases the facet ignores the value
///   left in `begin` and continues from the position it passed in
/// - `utf::len_or_error from_unicode(state_type& state, utf::code_point u, char* begin, const char* end) const` -
///   convert the Unicode code point `u` into a character sequence at [begin,end). Return the length of the
///   sequence in case of success, utf::incomplete if there is not enough room to encode the code point, or
///   utf::illegal if the conversion can not be performed
///
///
/// For example, an implementation of codecvt for the latin1/ISO-8859-1 character set:
///
/// \code
///
/// template<typename CharType>
/// class latin1_codecvt: public boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >
/// {
/// public:
///
///     /* Standard codecvt constructor */
///     latin1_codecvt(size_t refs = 0): boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >(refs)
///     {
///     }
///
///     /* State is unused but required by generic_codecvt */
///     struct state_type {};
///
///     state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
///     {
///         return state_type();
///     }
///
///     int max_encoding_length() const
///     {
///         return 1;
///     }
///
///     boost::locale::utf::code_point to_unicode(state_type&, const char*& begin, const char* end) const
///     {
///         if(begin == end)
///             return boost::locale::utf::incomplete;
///         /* go through unsigned char: plain char may be signed and bytes >= 0x80 must not sign-extend */
///         return static_cast<unsigned char>(*begin++);
///     }
///
///     boost::locale::utf::len_or_error from_unicode(state_type&, boost::locale::utf::code_point u,
///                                                   char* begin, const char* end) const
///     {
///         if(u >= 256)
///             return boost::locale::utf::illegal;
///         if(begin == end)
///             return boost::locale::utf::incomplete;
///         *begin = static_cast<char>(u);
///         return 1;
///     }
/// };
///
/// \endcode
///
/// When external tools are used for the encoding conversion, the `state_type` is useful to save objects used
/// for conversions. For example, an icu::UConverter can be saved in such a state for efficient use:
///
/// \code
/// template<typename CharType>
/// class icu_codecvt: public boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>
/// {
/// public:
///
///     /* Standard codecvt constructor */
///     icu_codecvt(std::string const &name, size_t refs = 0):
///         boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>(refs)
///     { ... }
///
///     using state_type = std::unique_ptr<UConverter,void (*)(UConverter*)>;
///
///     state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
///     {
///         UErrorCode err = U_ZERO_ERROR;
///         return state_type(ucnv_safeClone(converter_,0,0,&err),ucnv_close);
///     }
///
///     boost::locale::utf::code_point to_unicode(state_type &ptr,char const *&begin,char const *end) const
///     {
///         UErrorCode err = U_ZERO_ERROR;
///         boost::locale::utf::code_point cp = ucnv_getNextUChar(ptr.get(),&begin,end,&err);
///         ...
///     }
///     ...
/// };
/// \endcode
///
/// \tparam CharType character type of the "internal" side of the facet
/// \tparam CodecvtImpl the deriving class that supplies the members listed above
/// \tparam CharSize selects the specialization; it defaults to `sizeof(CharType)` and is not meant to be
///         passed explicitly
template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
class generic_codecvt;
152
153 /// \brief UTF-16 to/from narrow char codecvt facet to use with char16_t or wchar_t on Windows
154 ///
155 /// Note in order to fit the requirements of usability by std::wfstream it uses mbstate_t
156 /// to handle intermediate states in handling of variable length UTF-16 sequences
157 ///
158 /// Its member functions implement standard virtual functions of basic codecvt
159 template<typename CharType, typename CodecvtImpl>
160 class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
161 public generic_codecvt_base {
162 public:
163 typedef CharType uchar;
164
165 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
166 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
167
168 protected:
169 std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
170 {
171 if(*reinterpret_cast<char*>(&s) != 0)
172 return std::codecvt_base::error;
173 next = from;
174 return std::codecvt_base::ok;
175 }
176 int do_encoding() const noexcept override { return 0; }
177 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
178 bool do_always_noconv() const noexcept override { return false; }
179
180 int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
181 {
182 bool state = *reinterpret_cast<char*>(&std_state) != 0;
183 const char* save_from = from;
184
185 auto cvt_state = implementation().initial_state(to_unicode_state);
186 while(max > 0 && from < from_end) {
187 const char* prev_from = from;
188 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
189 if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
190 from = prev_from;
191 break;
192 }
193 max--;
194 if(ch > 0xFFFF) {
195 if(!state)
196 from = prev_from;
197 state = !state;
198 }
199 }
200 *reinterpret_cast<char*>(&std_state) = state;
201 return static_cast<int>(from - save_from);
202 }
203
204 std::codecvt_base::result do_in(std::mbstate_t& std_state,
205 const char* from,
206 const char* from_end,
207 const char*& from_next,
208 uchar* to,
209 uchar* to_end,
210 uchar*& to_next) const override
211 {
212 std::codecvt_base::result r = std::codecvt_base::ok;
213
214 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
215 // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
216 //
217 // if 0/false no codepoint above >0xFFFF observed, else a codepoint above 0xFFFF was observed
218 // and first pair is written, but no input consumed
219 bool state = *reinterpret_cast<char*>(&std_state) != 0;
220 auto cvt_state = implementation().initial_state(to_unicode_state);
221 while(to < to_end && from < from_end) {
222 const char* from_saved = from;
223
224 utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
225
226 if(ch == boost::locale::utf::illegal) {
227 from = from_saved;
228 r = std::codecvt_base::error;
229 break;
230 }
231 if(ch == boost::locale::utf::incomplete) {
232 from = from_saved;
233 r = std::codecvt_base::partial;
234 break;
235 }
236 // Normal codepoints go directly to stream
237 if(ch <= 0xFFFF)
238 *to++ = static_cast<uchar>(ch);
239 else {
240 // For other codepoints we do the following
241 //
242 // 1. We can't consume our input as we may find ourselves
243 // in state where all input consumed but not all output written,i.e. only
244 // 1st pair is written
245 // 2. We only write first pair and mark this in the state, we also revert back
246 // the from pointer in order to make sure this codepoint would be read
247 // once again and then we would consume our input together with writing
248 // second surrogate pair
249 ch -= 0x10000;
250 std::uint16_t w1 = static_cast<std::uint16_t>(0xD800 | (ch >> 10));
251 std::uint16_t w2 = static_cast<std::uint16_t>(0xDC00 | (ch & 0x3FF));
252 if(!state) {
253 from = from_saved;
254 *to++ = w1;
255 } else
256 *to++ = w2;
257 state = !state;
258 }
259 }
260 from_next = from;
261 to_next = to;
262 if(r == std::codecvt_base::ok && (from != from_end || state))
263 r = std::codecvt_base::partial;
264 *reinterpret_cast<char*>(&std_state) = state;
265 return r;
266 }
267
268 std::codecvt_base::result do_out(std::mbstate_t& std_state,
269 const uchar* from,
270 const uchar* from_end,
271 const uchar*& from_next,
272 char* to,
273 char* to_end,
274 char*& to_next) const override
275 {
276 std::codecvt_base::result r = std::codecvt_base::ok;
277 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
278 // according to standard. We assume that sizeof(mbstate_t) >=2 in order
279 // to be able to store first observed surrogate pair
280 //
281 // State: state!=0 - a first surrogate pair was observed (state = first pair),
282 // we expect the second one to come and then zero the state
283 std::uint16_t state = detail::read_state(src: std_state);
284 auto cvt_state = implementation().initial_state(from_unicode_state);
285 while(to < to_end && from < from_end) {
286 utf::code_point ch = 0;
287 if(state != 0) {
288 // if the state indicates that 1st surrogate pair was written
289 // we should make sure that the second one that comes is actually
290 // second surrogate
291 std::uint16_t w1 = state;
292 std::uint16_t w2 = *from;
293 // we don't forward from as writing may fail to incomplete or
294 // partial conversion
295 if(0xDC00 <= w2 && w2 <= 0xDFFF) {
296 std::uint16_t vh = w1 - 0xD800;
297 std::uint16_t vl = w2 - 0xDC00;
298 ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
299 } else {
300 // Invalid surrogate
301 r = std::codecvt_base::error;
302 break;
303 }
304 } else {
305 ch = *from;
306 if(0xD800 <= ch && ch <= 0xDBFF) {
307 // if this is a first surrogate pair we put
308 // it into the state and consume it, note we don't
309 // go forward as it should be illegal so we increase
310 // the from pointer manually
311 state = static_cast<uint16_t>(ch);
312 from++;
313 continue;
314 } else if(0xDC00 <= ch && ch <= 0xDFFF) {
315 // if we observe second surrogate pair and
316 // first only may be expected we should break from the loop with error
317 // as it is illegal input
318 r = std::codecvt_base::error;
319 break;
320 }
321 }
322 if(!boost::locale::utf::is_valid_codepoint(v: ch)) {
323 r = std::codecvt_base::error;
324 break;
325 }
326 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
327 if(len == boost::locale::utf::incomplete) {
328 r = std::codecvt_base::partial;
329 break;
330 } else if(len == boost::locale::utf::illegal) {
331 r = std::codecvt_base::error;
332 break;
333 } else
334 to += len;
335 state = 0;
336 from++;
337 }
338 from_next = from;
339 to_next = to;
340 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
341 r = std::codecvt_base::partial;
342 detail::write_state(dst&: std_state, src: state);
343 return r;
344 }
345 };
346
347 /// \brief UTF-32 to/from narrow char codecvt facet to use with char32_t or wchar_t on POSIX platforms
348 ///
349 /// Its member functions implement standard virtual functions of basic codecvt.
350 /// mbstate_t is not used for UTF-32 handling due to fixed length encoding
351 template<typename CharType, typename CodecvtImpl>
352 class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
353 public generic_codecvt_base {
354 public:
355 typedef CharType uchar;
356
357 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
358
359 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
360
361 protected:
362 std::codecvt_base::result
363 do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
364 {
365 next = from;
366 return std::codecvt_base::ok;
367 }
368 int do_encoding() const noexcept override { return 0; }
369 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
370 bool do_always_noconv() const noexcept override { return false; }
371
372 int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
373 {
374 const char* start_from = from;
375 auto cvt_state = implementation().initial_state(to_unicode_state);
376 while(max > 0 && from < from_end) {
377 const char* save_from = from;
378 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
379 if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
380 from = save_from;
381 break;
382 }
383 max--;
384 }
385
386 return static_cast<int>(from - start_from);
387 }
388
389 std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
390 const char* from,
391 const char* from_end,
392 const char*& from_next,
393 uchar* to,
394 uchar* to_end,
395 uchar*& to_next) const override
396 {
397 std::codecvt_base::result r = std::codecvt_base::ok;
398
399 auto cvt_state = implementation().initial_state(to_unicode_state);
400 while(to < to_end && from < from_end) {
401 const char* from_saved = from;
402
403 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
404
405 if(ch == boost::locale::utf::illegal) {
406 r = std::codecvt_base::error;
407 from = from_saved;
408 break;
409 }
410 if(ch == boost::locale::utf::incomplete) {
411 r = std::codecvt_base::partial;
412 from = from_saved;
413 break;
414 }
415 *to++ = ch;
416 }
417 from_next = from;
418 to_next = to;
419 if(r == std::codecvt_base::ok && from != from_end)
420 r = std::codecvt_base::partial;
421 return r;
422 }
423
424 std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
425 const uchar* from,
426 const uchar* from_end,
427 const uchar*& from_next,
428 char* to,
429 char* to_end,
430 char*& to_next) const override
431 {
432 std::codecvt_base::result r = std::codecvt_base::ok;
433 auto cvt_state = implementation().initial_state(from_unicode_state);
434 while(to < to_end && from < from_end) {
435 const std::uint32_t ch = *from;
436 if(!boost::locale::utf::is_valid_codepoint(v: ch)) {
437 r = std::codecvt_base::error;
438 break;
439 }
440 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
441 if(len == boost::locale::utf::incomplete) {
442 r = std::codecvt_base::partial;
443 break;
444 } else if(len == boost::locale::utf::illegal) {
445 r = std::codecvt_base::error;
446 break;
447 }
448 to += len;
449 from++;
450 }
451 from_next = from;
452 to_next = to;
453 if(r == std::codecvt_base::ok && from != from_end)
454 r = std::codecvt_base::partial;
455 return r;
456 }
457 };
458
459 template<typename CodecvtImpl>
460 class generic_codecvt<char, CodecvtImpl, 1> : public std::codecvt<char, char, std::mbstate_t>,
461 public generic_codecvt_base {
462 public:
463 typedef char uchar;
464
465 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
466
467 generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
468 };
469
470}} // namespace boost::locale
471
472#endif
473

// Source: boost/libs/locale/include/boost/locale/generic_codecvt.hpp