1 | #ifndef BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WIDE_ENCODING_HPP |
2 | #define BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_WIDE_ENCODING_HPP |
3 | |
4 | #include <boost/assert.hpp> |
5 | #include <boost/range/iterator_range_core.hpp> |
6 | |
7 | #include <utility> |
8 | |
9 | namespace boost { namespace property_tree { |
10 | namespace json_parser { namespace detail |
11 | { |
12 | |
// Character classification for JSON text whose code units are wchar_t.
// Every member compares its argument against wide character literals; the
// struct holds no data, so all members are const-qualified.
struct external_wide_encoding
{
    typedef wchar_t external_char;

    // Line break and whitespace (space, tab, line feed, carriage return).
    bool is_nl(wchar_t c) const { return c == L'\n'; }
    bool is_ws(wchar_t c) const {
        return c == L' ' || c == L'\t' || c == L'\n' || c == L'\r';
    }

    // Characters that make up a number.
    bool is_minus(wchar_t c) const { return c == L'-'; }
    bool is_plusminus(wchar_t c) const { return c == L'+' || c == L'-'; }
    bool is_dot(wchar_t c) const { return c == L'.'; }
    bool is_eE(wchar_t c) const { return c == L'e' || c == L'E'; }
    bool is_0(wchar_t c) const { return c == L'0'; }
    bool is_digit(wchar_t c) const { return c >= L'0' && c <= L'9'; }
    // Non-zero digit: '1' through '9'.
    bool is_digit0(wchar_t c) const { return c >= L'1' && c <= L'9'; }

    // String delimiters and escape-related characters.
    bool is_quote(wchar_t c) const { return c == L'"'; }
    bool is_backslash(wchar_t c) const { return c == L'\\'; }
    bool is_slash(wchar_t c) const { return c == L'/'; }

    // Structural punctuation.
    bool is_comma(wchar_t c) const { return c == L','; }
    bool is_open_bracket(wchar_t c) const { return c == L'['; }
    bool is_close_bracket(wchar_t c) const { return c == L']'; }
    bool is_colon(wchar_t c) const { return c == L':'; }
    bool is_open_brace(wchar_t c) const { return c == L'{'; }
    bool is_close_brace(wchar_t c) const { return c == L'}'; }

    // Individual lower-case letters.
    bool is_a(wchar_t c) const { return c == L'a'; }
    bool is_b(wchar_t c) const { return c == L'b'; }
    bool is_e(wchar_t c) const { return c == L'e'; }
    bool is_f(wchar_t c) const { return c == L'f'; }
    bool is_l(wchar_t c) const { return c == L'l'; }
    bool is_n(wchar_t c) const { return c == L'n'; }
    bool is_r(wchar_t c) const { return c == L'r'; }
    bool is_s(wchar_t c) const { return c == L's'; }
    bool is_t(wchar_t c) const { return c == L't'; }
    bool is_u(wchar_t c) const { return c == L'u'; }

    // Returns the value (0-15) of the hexadecimal digit c, accepting both
    // upper-case and lower-case letters, or -1 when c is not a hex digit.
    // Const-qualified like every other member so that it can be called
    // through a const encoding object.
    int decode_hexdigit(wchar_t c) const {
        if (c >= L'0' && c <= L'9') return c - L'0';
        if (c >= L'A' && c <= L'F') return c - L'A' + 10;
        if (c >= L'a' && c <= L'f') return c - L'a' + 10;
        return -1;
    }
};
59 | |
// Empty tag type keyed on a compile-time boolean. It is passed as an extra
// argument to choose between the is_utf16<true> and is_utf16<false>
// overloads of a function.
template <bool IsUtf16>
struct is_utf16
{
};
61 | |
62 | class wide_wide_encoding : public external_wide_encoding |
63 | { |
64 | typedef is_utf16<sizeof(wchar_t) == 2> test_utf16; |
65 | public: |
66 | typedef wchar_t internal_char; |
67 | |
68 | template <typename Iterator> |
69 | boost::iterator_range<Iterator> |
70 | to_internal(Iterator first, Iterator last) const { |
71 | return boost::make_iterator_range(first, last); |
72 | } |
73 | |
74 | wchar_t to_internal_trivial(wchar_t c) const { |
75 | BOOST_ASSERT(!is_surrogate_high(c) && !is_surrogate_low(c)); |
76 | return c; |
77 | } |
78 | |
79 | template <typename Iterator, typename Sentinel, |
80 | typename EncodingErrorFn> |
81 | void skip_codepoint(Iterator& cur, Sentinel end, |
82 | EncodingErrorFn error_fn) const { |
83 | transcode_codepoint(cur, end, DoNothing(), error_fn); |
84 | } |
85 | |
86 | template <typename Iterator, typename Sentinel, typename TranscodedFn, |
87 | typename EncodingErrorFn> |
88 | void transcode_codepoint(Iterator& cur, Sentinel end, |
89 | TranscodedFn transcoded_fn, EncodingErrorFn error_fn) const { |
90 | return transcode_codepoint(cur, end, transcoded_fn, error_fn, |
91 | test_utf16()); |
92 | } |
93 | |
94 | template <typename TranscodedFn> |
95 | void feed_codepoint(unsigned codepoint, |
96 | TranscodedFn transcoded_fn) const { |
97 | feed_codepoint(codepoint, transcoded_fn, test_utf16()); |
98 | } |
99 | |
100 | template <typename Iterator, typename Sentinel> |
101 | void skip_introduction(Iterator& cur, Sentinel end) const { |
102 | // Endianness is already decoded at this level. |
103 | if (cur != end && *cur == 0xfeff) { |
104 | ++cur; |
105 | } |
106 | } |
107 | |
108 | private: |
109 | struct DoNothing { |
110 | void operator ()(wchar_t) const {} |
111 | }; |
112 | |
113 | template <typename Iterator, typename Sentinel, typename TranscodedFn, |
114 | typename EncodingErrorFn> |
115 | void transcode_codepoint(Iterator& cur, Sentinel, |
116 | TranscodedFn transcoded_fn, |
117 | EncodingErrorFn error_fn, |
118 | is_utf16<false>) const { |
119 | wchar_t c = *cur; |
120 | if (c < 0x20) { |
121 | error_fn(); |
122 | } |
123 | transcoded_fn(c); |
124 | ++cur; |
125 | } |
126 | template <typename Iterator, typename Sentinel, typename TranscodedFn, |
127 | typename EncodingErrorFn> |
128 | void transcode_codepoint(Iterator& cur, Sentinel end, |
129 | TranscodedFn transcoded_fn, |
130 | EncodingErrorFn error_fn, |
131 | is_utf16<true>) const { |
132 | wchar_t c = *cur; |
133 | if (c < 0x20) { |
134 | error_fn(); |
135 | } |
136 | if (is_surrogate_low(codepoint: c)) { |
137 | error_fn(); |
138 | } |
139 | transcoded_fn(c); |
140 | ++cur; |
141 | if (is_surrogate_high(codepoint: c)) { |
142 | if (cur == end) { |
143 | error_fn(); |
144 | } |
145 | c = *cur; |
146 | if (!is_surrogate_low(codepoint: c)) { |
147 | error_fn(); |
148 | } |
149 | transcoded_fn(c); |
150 | ++cur; |
151 | } |
152 | } |
153 | |
154 | template <typename TranscodedFn> |
155 | void feed_codepoint(unsigned codepoint, TranscodedFn transcoded_fn, |
156 | is_utf16<false>) const { |
157 | transcoded_fn(static_cast<wchar_t>(codepoint)); |
158 | } |
159 | template <typename TranscodedFn> |
160 | void feed_codepoint(unsigned codepoint, TranscodedFn transcoded_fn, |
161 | is_utf16<true>) const { |
162 | if (codepoint < 0x10000) { |
163 | transcoded_fn(static_cast<wchar_t>(codepoint)); |
164 | } else { |
165 | codepoint -= 0x10000; |
166 | transcoded_fn(static_cast<wchar_t>((codepoint >> 10) | 0xd800)); |
167 | transcoded_fn(static_cast<wchar_t>( |
168 | (codepoint & 0x3ff) | 0xdc00)); |
169 | } |
170 | } |
171 | |
172 | static bool is_surrogate_high(unsigned codepoint) { |
173 | return (codepoint & 0xfc00) == 0xd800; |
174 | } |
175 | static bool is_surrogate_low(unsigned codepoint) { |
176 | return (codepoint & 0xfc00) == 0xdc00; |
177 | } |
178 | }; |
179 | |
180 | }}}} |
181 | |
182 | #endif |
183 | |