1 | /*============================================================================= |
2 | Copyright (c) 2001-2011 Joel de Guzman |
3 | Copyright (c) 2023 Nikita Kniazev |
4 | |
5 | Distributed under the Boost Software License, Version 1.0. (See accompanying |
6 | file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
7 | ==============================================================================*/ |
8 | #if !defined(BOOST_SPIRIT_UC_TYPES_NOVEMBER_23_2008_0840PM) |
9 | #define BOOST_SPIRIT_UC_TYPES_NOVEMBER_23_2008_0840PM |
10 | |
11 | #if defined(_MSC_VER) |
12 | #pragma once |
13 | #endif |
14 | |
15 | #include <boost/config.hpp> |
16 | #include <boost/cstdint.hpp> |
17 | #include <boost/type_traits/make_unsigned.hpp> |
18 | #include <string> |
19 | |
20 | namespace boost { namespace spirit |
21 | { |
22 | typedef ::boost::uint32_t ucs4_char; |
23 | typedef char utf8_char; |
24 | typedef std::basic_string<ucs4_char> ucs4_string; |
25 | typedef std::basic_string<utf8_char> utf8_string; |
26 | |
27 | namespace detail { |
28 | inline void utf8_put_encode(utf8_string& out, ucs4_char x) |
29 | { |
30 | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf D90 |
31 | if (BOOST_UNLIKELY(x > 0x10FFFFul || (0xD7FFul < x && x < 0xE000ul))) |
32 | x = 0xFFFDul; |
33 | |
34 | // Table 3-6. UTF-8 Bit Distribution |
35 | if (x < 0x80ul) { |
36 | out.push_back(c: static_cast<unsigned char>(x)); |
37 | } |
38 | else if (x < 0x800ul) { |
39 | out.push_back(c: static_cast<unsigned char>(0xC0ul + (x >> 6))); |
40 | out.push_back(c: static_cast<unsigned char>(0x80ul + (x & 0x3Ful))); |
41 | } |
42 | else if (x < 0x10000ul) { |
43 | out.push_back(c: static_cast<unsigned char>(0xE0ul + (x >> 12))); |
44 | out.push_back(c: static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful))); |
45 | out.push_back(c: static_cast<unsigned char>(0x80ul + (x & 0x3Ful))); |
46 | } |
47 | else { |
48 | out.push_back(c: static_cast<unsigned char>(0xF0ul + (x >> 18))); |
49 | out.push_back(c: static_cast<unsigned char>(0x80ul + ((x >> 12) & 0x3Ful))); |
50 | out.push_back(c: static_cast<unsigned char>(0x80ul + ((x >> 6) & 0x3Ful))); |
51 | out.push_back(c: static_cast<unsigned char>(0x80ul + (x & 0x3Ful))); |
52 | } |
53 | } |
54 | } |
55 | |
56 | template <typename Char> |
57 | inline utf8_string to_utf8(Char value) |
58 | { |
59 | utf8_string result; |
60 | typedef typename make_unsigned<Char>::type UChar; |
61 | detail::utf8_put_encode(out&: result, x: static_cast<UChar>(value)); |
62 | return result; |
63 | } |
64 | |
65 | template <typename Char> |
66 | inline utf8_string to_utf8(Char const* str) |
67 | { |
68 | utf8_string result; |
69 | typedef typename make_unsigned<Char>::type UChar; |
70 | while (*str) |
71 | detail::utf8_put_encode(out&: result, x: static_cast<UChar>(*str++)); |
72 | return result; |
73 | } |
74 | |
75 | template <typename Char, typename Traits, typename Allocator> |
76 | inline utf8_string |
77 | to_utf8(std::basic_string<Char, Traits, Allocator> const& str) |
78 | { |
79 | utf8_string result; |
80 | typedef typename make_unsigned<Char>::type UChar; |
81 | for (Char const* ptr = str.data(), |
82 | * end = ptr + str.size(); ptr < end; ++ptr) |
83 | detail::utf8_put_encode(out&: result, x: static_cast<UChar>(*ptr)); |
84 | return result; |
85 | } |
86 | |
87 | // Assume wchar_t content is UTF-16 on MSVC, or mingw/wineg++ with -fshort-wchar |
88 | #if defined(_MSC_VER) || defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 2 |
89 | inline utf8_string to_utf8(wchar_t value) |
90 | { |
91 | utf8_string result; |
92 | detail::utf8_put_encode(result, static_cast<make_unsigned<wchar_t>::type>(value)); |
93 | return result; |
94 | } |
95 | |
96 | namespace detail { |
97 | inline ucs4_char decode_utf16(wchar_t const*& s) |
98 | { |
99 | typedef make_unsigned<wchar_t>::type uwchar_t; |
100 | |
101 | uwchar_t x(*s); |
102 | if (x < 0xD800ul || x > 0xDFFFul) |
103 | return x; |
104 | |
105 | // expected high-surrogate |
106 | if (BOOST_UNLIKELY((x >> 10) != 0x36ul)) |
107 | return 0xFFFDul; |
108 | |
109 | uwchar_t y(*++s); |
110 | // expected low-surrogate |
111 | if (BOOST_UNLIKELY((y >> 10) != 0x37ul)) |
112 | return 0xFFFDul; |
113 | |
114 | return ((x & 0x3FFul) << 10) + (y & 0x3FFul) + 0x10000ul; |
115 | } |
116 | } |
117 | |
118 | inline utf8_string to_utf8(wchar_t const* str) |
119 | { |
120 | utf8_string result; |
121 | for (ucs4_char c; (c = detail::decode_utf16(str)) != ucs4_char(); ++str) |
122 | detail::utf8_put_encode(result, c); |
123 | return result; |
124 | } |
125 | |
126 | template <typename Traits, typename Allocator> |
127 | inline utf8_string |
128 | to_utf8(std::basic_string<wchar_t, Traits, Allocator> const& str) |
129 | { |
130 | return to_utf8(str.c_str()); |
131 | } |
132 | #endif |
133 | }} |
134 | |
135 | #endif |
136 | |