| 1 | // |
| 2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
| 3 | // |
| 4 | // Distributed under the Boost Software License, Version 1.0. |
| 5 | // https://www.boost.org/LICENSE_1_0.txt |
| 6 | |
| 7 | #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED |
| 8 | #define BOOST_LOCALE_UTF_HPP_INCLUDED |
| 9 | |
| 10 | #include <boost/locale/config.hpp> |
| 11 | #include <cstdint> |
| 12 | |
| 13 | namespace boost { namespace locale { |
/// \brief Namespace that holds basic operations on UTF encoded sequences
///
/// None of the functions defined in this namespace require linking with the Boost.Locale library
| 17 | namespace utf { |
/// \brief The integral type that can hold a Unicode code point
///
/// Spelled as std::uint32_t because <cstdint> only guarantees the fixed-width
/// type names inside namespace std.
using code_point = std::uint32_t;

/// \brief Special constant that defines illegal code point
///
/// The value is greater than 0x10FFFF, so it can be told apart from any
/// code point accepted by is_valid_codepoint().
constexpr code_point illegal = 0xFFFFFFFFu;
/// \brief Special constant that defines incomplete code point
///
/// Like \ref illegal, the value is greater than 0x10FFFF.
constexpr code_point incomplete = 0xFFFFFFFEu;

/// Either a length/size or an error (illegal/incomplete)
using len_or_error = code_point;
| 28 | |
| 29 | /// \brief the function checks if \a v is a valid code point |
| 30 | inline bool is_valid_codepoint(code_point v) |
| 31 | { |
| 32 | if(v > 0x10FFFF) |
| 33 | return false; |
| 34 | if(0xD800 <= v && v <= 0xDFFF) // surrogates |
| 35 | return false; |
| 36 | return true; |
| 37 | } |
| 38 | |
| 39 | #ifdef BOOST_LOCALE_DOXYGEN |
| 40 | |
/// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
///
/// This declaration is only seen by the documentation generator (it is guarded by
/// BOOST_LOCALE_DOXYGEN); the implementation is provided by the specializations
/// for code units of size 1, 2 and 4.
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits {
    /// The type of the character (code unit)
    typedef CharType char_type;

    /// Read one code point from the range [p,e) and return it.
    ///
    /// - If the sequence that was read is incomplete, returns \ref incomplete
    /// - If an illegal sequence is detected, returns \ref illegal
    ///
    /// Requirements
    ///
    /// - Iterator is a valid input iterator
    ///
    /// Postconditions
    ///
    /// - p is advanced past every code unit that was read, i.e. it points
    ///   one past the last consumed code unit
    template<typename Iterator>
    static code_point decode(Iterator& p, Iterator e);

    /// Maximal width of a valid sequence in code units:
    ///
    /// - UTF-8 - 4
    /// - UTF-16 - 2
    /// - UTF-32 - 1
    static constexpr int max_width;

    /// The width of a specific code point in code units.
    ///
    /// Requirement: value is a valid Unicode code point
    /// Returns a value in the range [1..max_width]
    static int width(code_point value);

    /// Get the size of the trail part of a variable length encoded sequence.
    ///
    /// Returns -1 if \a c is not a valid lead code unit
    static int trail_length(char_type c);
    /// Returns true if c is a trail code unit, always false for UTF-32
    static bool is_trail(char_type c);
    /// Returns true if c is a lead code unit, always true for UTF-32
    static bool is_lead(char_type c);

    /// Convert a valid Unicode code point \a value to the UTF sequence.
    ///
    /// Requirements:
    ///
    /// - \a value is a valid code point
    /// - \a out is an output iterator that is able to accept at least width(value) units
    ///
    /// Returns the iterator past the last written code unit.
    template<typename Iterator>
    static Iterator encode(code_point value, Iterator out);

    /// Decodes a valid UTF sequence that is pointed to by p into a code point.
    ///
    /// If the sequence is invalid or p points to the end, the behavior is undefined
    template<typename Iterator>
    static code_point decode_valid(Iterator& p);
};
| 101 | |
| 102 | #else |
| 103 | |
// Primary template: declared only. The default for \a size is sizeof(CharType);
// the partial specializations for size 1, 2 and 4 provide the implementations.
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
| 106 | |
| 107 | template<typename CharType> |
| 108 | struct utf_traits<CharType, 1> { |
| 109 | typedef CharType char_type; |
| 110 | |
| 111 | static int trail_length(char_type ci) |
| 112 | { |
| 113 | unsigned char c = ci; |
| 114 | if(c < 128) |
| 115 | return 0; |
| 116 | if(BOOST_UNLIKELY(c < 194)) |
| 117 | return -1; |
| 118 | if(c < 224) |
| 119 | return 1; |
| 120 | if(c < 240) |
| 121 | return 2; |
| 122 | if(BOOST_LIKELY(c <= 244)) |
| 123 | return 3; |
| 124 | return -1; |
| 125 | } |
| 126 | |
| 127 | static constexpr int max_width = 4; |
| 128 | |
| 129 | static int width(code_point value) |
| 130 | { |
| 131 | if(value <= 0x7F) |
| 132 | return 1; |
| 133 | else if(value <= 0x7FF) |
| 134 | return 2; |
| 135 | else if(BOOST_LIKELY(value <= 0xFFFF)) |
| 136 | return 3; |
| 137 | else |
| 138 | return 4; |
| 139 | } |
| 140 | |
| 141 | static bool is_trail(char_type ci) |
| 142 | { |
| 143 | unsigned char c = ci; |
| 144 | return (c & 0xC0) == 0x80; |
| 145 | } |
| 146 | |
| 147 | static bool is_lead(char_type ci) { return !is_trail(ci); } |
| 148 | |
| 149 | template<typename Iterator> |
| 150 | static code_point decode(Iterator& p, Iterator e) |
| 151 | { |
| 152 | if(BOOST_UNLIKELY(p == e)) |
| 153 | return incomplete; |
| 154 | |
| 155 | unsigned char lead = *p++; |
| 156 | |
| 157 | // First byte is fully validated here |
| 158 | int trail_size = trail_length(ci: lead); |
| 159 | |
| 160 | if(BOOST_UNLIKELY(trail_size < 0)) |
| 161 | return illegal; |
| 162 | |
| 163 | // Ok as only ASCII may be of size = 0 |
| 164 | // also optimize for ASCII text |
| 165 | if(trail_size == 0) |
| 166 | return lead; |
| 167 | |
| 168 | code_point c = lead & ((1 << (6 - trail_size)) - 1); |
| 169 | |
| 170 | // Read the rest |
| 171 | unsigned char tmp; |
| 172 | switch(trail_size) { |
| 173 | case 3: |
| 174 | if(BOOST_UNLIKELY(p == e)) |
| 175 | return incomplete; |
| 176 | tmp = *p++; |
| 177 | if(!is_trail(ci: tmp)) |
| 178 | return illegal; |
| 179 | c = (c << 6) | (tmp & 0x3F); |
| 180 | BOOST_FALLTHROUGH; |
| 181 | case 2: |
| 182 | if(BOOST_UNLIKELY(p == e)) |
| 183 | return incomplete; |
| 184 | tmp = *p++; |
| 185 | if(!is_trail(ci: tmp)) |
| 186 | return illegal; |
| 187 | c = (c << 6) | (tmp & 0x3F); |
| 188 | BOOST_FALLTHROUGH; |
| 189 | case 1: |
| 190 | if(BOOST_UNLIKELY(p == e)) |
| 191 | return incomplete; |
| 192 | tmp = *p++; |
| 193 | if(!is_trail(ci: tmp)) |
| 194 | return illegal; |
| 195 | c = (c << 6) | (tmp & 0x3F); |
| 196 | } |
| 197 | |
| 198 | // Check code point validity: no surrogates and |
| 199 | // valid range |
| 200 | if(BOOST_UNLIKELY(!is_valid_codepoint(c))) |
| 201 | return illegal; |
| 202 | |
| 203 | // make sure it is the most compact representation |
| 204 | if(BOOST_UNLIKELY(width(c) != trail_size + 1)) |
| 205 | return illegal; |
| 206 | |
| 207 | return c; |
| 208 | } |
| 209 | |
| 210 | template<typename Iterator> |
| 211 | static code_point decode_valid(Iterator& p) |
| 212 | { |
| 213 | unsigned char lead = *p++; |
| 214 | if(lead < 192) |
| 215 | return lead; |
| 216 | |
| 217 | int trail_size; |
| 218 | |
| 219 | if(lead < 224) |
| 220 | trail_size = 1; |
| 221 | else if(BOOST_LIKELY(lead < 240)) // non-BMP rare |
| 222 | trail_size = 2; |
| 223 | else |
| 224 | trail_size = 3; |
| 225 | |
| 226 | code_point c = lead & ((1 << (6 - trail_size)) - 1); |
| 227 | |
| 228 | switch(trail_size) { |
| 229 | case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH; |
| 230 | case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH; |
| 231 | case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); |
| 232 | } |
| 233 | |
| 234 | return c; |
| 235 | } |
| 236 | |
| 237 | template<typename Iterator> |
| 238 | static Iterator encode(code_point value, Iterator out) |
| 239 | { |
| 240 | if(value <= 0x7F) |
| 241 | *out++ = static_cast<char_type>(value); |
| 242 | else if(value <= 0x7FF) { |
| 243 | *out++ = static_cast<char_type>((value >> 6) | 0xC0); |
| 244 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); |
| 245 | } else if(BOOST_LIKELY(value <= 0xFFFF)) { |
| 246 | *out++ = static_cast<char_type>((value >> 12) | 0xE0); |
| 247 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); |
| 248 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); |
| 249 | } else { |
| 250 | *out++ = static_cast<char_type>((value >> 18) | 0xF0); |
| 251 | *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80); |
| 252 | *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); |
| 253 | *out++ = static_cast<char_type>((value & 0x3F) | 0x80); |
| 254 | } |
| 255 | return out; |
| 256 | } |
| 257 | }; // utf8 |
| 258 | |
| 259 | template<typename CharType> |
| 260 | struct utf_traits<CharType, 2> { |
| 261 | typedef CharType char_type; |
| 262 | |
| 263 | // See RFC 2781 |
| 264 | static bool is_first_surrogate(uint16_t x) { return 0xD800 <= x && x <= 0xDBFF; } |
| 265 | static bool is_second_surrogate(uint16_t x) { return 0xDC00 <= x && x <= 0xDFFF; } |
| 266 | static code_point combine_surrogate(uint16_t w1, uint16_t w2) |
| 267 | { |
| 268 | return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; |
| 269 | } |
| 270 | static int trail_length(char_type c) |
| 271 | { |
| 272 | if(is_first_surrogate(x: c)) |
| 273 | return 1; |
| 274 | if(is_second_surrogate(x: c)) |
| 275 | return -1; |
| 276 | return 0; |
| 277 | } |
| 278 | |
| 279 | /// Returns true if c is trail code unit, always false for UTF-32 |
| 280 | static bool is_trail(char_type c) { return is_second_surrogate(x: c); } |
| 281 | /// Returns true if c is lead code unit, always true of UTF-32 |
| 282 | static bool is_lead(char_type c) { return !is_second_surrogate(x: c); } |
| 283 | |
| 284 | template<typename It> |
| 285 | static code_point decode(It& current, It last) |
| 286 | { |
| 287 | if(BOOST_UNLIKELY(current == last)) |
| 288 | return incomplete; |
| 289 | uint16_t w1 = *current++; |
| 290 | if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) |
| 291 | return w1; |
| 292 | if(w1 > 0xDBFF) |
| 293 | return illegal; |
| 294 | if(current == last) |
| 295 | return incomplete; |
| 296 | uint16_t w2 = *current++; |
| 297 | if(w2 < 0xDC00 || 0xDFFF < w2) |
| 298 | return illegal; |
| 299 | return combine_surrogate(w1, w2); |
| 300 | } |
| 301 | template<typename It> |
| 302 | static code_point decode_valid(It& current) |
| 303 | { |
| 304 | uint16_t w1 = *current++; |
| 305 | if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) |
| 306 | return w1; |
| 307 | uint16_t w2 = *current++; |
| 308 | return combine_surrogate(w1, w2); |
| 309 | } |
| 310 | |
| 311 | static constexpr int max_width = 2; |
| 312 | static int width(code_point u) { return u >= 0x10000 ? 2 : 1; } |
| 313 | template<typename It> |
| 314 | static It encode(code_point u, It out) |
| 315 | { |
| 316 | if(BOOST_LIKELY(u <= 0xFFFF)) |
| 317 | *out++ = static_cast<char_type>(u); |
| 318 | else { |
| 319 | u -= 0x10000; |
| 320 | *out++ = static_cast<char_type>(0xD800 | (u >> 10)); |
| 321 | *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF)); |
| 322 | } |
| 323 | return out; |
| 324 | } |
| 325 | }; // utf16; |
| 326 | |
| 327 | template<typename CharType> |
| 328 | struct utf_traits<CharType, 4> { |
| 329 | typedef CharType char_type; |
| 330 | static int trail_length(char_type c) |
| 331 | { |
| 332 | if(is_valid_codepoint(c)) |
| 333 | return 0; |
| 334 | return -1; |
| 335 | } |
| 336 | static bool is_trail(char_type /*c*/) { return false; } |
| 337 | static bool is_lead(char_type /*c*/) { return true; } |
| 338 | |
| 339 | template<typename It> |
| 340 | static code_point decode_valid(It& current) |
| 341 | { |
| 342 | return *current++; |
| 343 | } |
| 344 | |
| 345 | template<typename It> |
| 346 | static code_point decode(It& current, It last) |
| 347 | { |
| 348 | if(BOOST_UNLIKELY(current == last)) |
| 349 | return boost::locale::utf::incomplete; |
| 350 | code_point c = *current++; |
| 351 | if(BOOST_UNLIKELY(!is_valid_codepoint(c))) |
| 352 | return boost::locale::utf::illegal; |
| 353 | return c; |
| 354 | } |
| 355 | static constexpr int max_width = 1; |
| 356 | static int width(code_point /*u*/) { return 1; } |
| 357 | template<typename It> |
| 358 | static It encode(code_point u, It out) |
| 359 | { |
| 360 | *out++ = static_cast<char_type>(u); |
| 361 | return out; |
| 362 | } |
| 363 | |
| 364 | }; // utf32 |
| 365 | |
| 366 | #endif |
| 367 | |
| 368 | } // namespace utf |
| 369 | }} // namespace boost::locale |
| 370 | |
| 371 | #endif |
| 372 | |