| 1 | // |
| 2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
| 3 | // Copyright (c) 2022-2023 Alexander Grund |
| 4 | // |
| 5 | // Distributed under the Boost Software License, Version 1.0. |
| 6 | // https://www.boost.org/LICENSE_1_0.txt |
| 7 | |
| 8 | #ifndef BOOST_LOCALE_UTIL_HPP |
| 9 | #define BOOST_LOCALE_UTIL_HPP |
| 10 | |
| 11 | #include <boost/locale/generator.hpp> |
| 12 | #include <boost/locale/utf.hpp> |
| 13 | #include <boost/assert.hpp> |
| 14 | #include <cstdint> |
| 15 | #include <locale> |
| 16 | #include <memory> |
| 17 | #include <typeinfo> |
| 18 | |
| 19 | namespace boost { namespace locale { |
/// \brief This namespace provides various utility functions useful for Boost.Locale's backend
/// implementations
| 22 | namespace util { |
| 23 | |
/// \brief Return the default system locale name in POSIX format.
///
/// This function tries to detect the locale using the LC_ALL, LC_CTYPE and LANG environment
/// variables, in this order. If all of them are unset, it returns "C" on POSIX platforms.
/// On Windows, in addition to the above environment variables, this function
/// tries to create the locale name from the ISO-639 and ISO-3166 country codes defined
/// for the user's default locale.
///
/// \param use_utf8_on_windows If true, the encoding is set to UTF-8. Otherwise, if the system
///   locale supports ANSI codepages, the ANSI encoding (e.g. windows-1252) is used, and if no
///   ANSI codepage is available UTF-8 is used.
/// \return The name of the detected locale
BOOST_LOCALE_DECL
std::string get_system_locale(bool use_utf8_on_windows = false);
| 36 | |
/// \brief Installs the information facet into locale \a in based on the locale name \a name
///
/// This function installs the boost::locale::info facet into the locale \a in and returns
/// the newly created locale.
///
/// Note: all information is based only on parsing of the string \a name.
///
/// The name has the following format: language[_COUNTRY][.encoding][\@variant]
/// where language is an ISO-639 language code like "en" or "ru", COUNTRY is an ISO-3166
/// country identifier like "US" or "RU", encoding is a character set name
/// like UTF-8 or ISO-8859-1 and variant is a backend specific variant like \c euro or
/// calendar=hebrew.
///
/// If some parameters are missing they are specified as blanks, the default encoding
/// is assumed to be US-ASCII and a missing language is assumed to be "C".
///
/// \param in Locale the returned locale is based on
/// \param name Locale name to parse, in the format described above
/// \return The newly created locale with the info facet installed
BOOST_LOCALE_DECL
std::locale create_info(const std::locale& in, const std::string& name);
| 54 | |
| 55 | /// \brief This class represent a simple stateless converter from UCS-4 and to UCS-4 for |
| 56 | /// each single code point |
| 57 | /// |
| 58 | /// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding |
| 59 | /// to encoding supported by this converter |
| 60 | /// |
| 61 | /// Please note, this converter should be fully stateless. Fully stateless means it should |
| 62 | /// never assume that it is called in any specific order on the text. Even if the |
| 63 | /// encoding itself seems to be stateless like windows-1255 or shift-jis, some |
| 64 | /// encoders (most notably iconv) can actually compose several code-point into one or |
| 65 | /// decompose them in case composite characters are found. So be very careful when implementing |
| 66 | /// these converters for certain character set. |
| 67 | class BOOST_LOCALE_DECL base_converter { |
| 68 | public: |
| 69 | /// This value should be returned when an illegal input sequence or code-point is observed: |
| 70 | /// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates |
| 71 | /// or an invalid UTF-8 sequence is found |
| 72 | static constexpr utf::code_point illegal = utf::illegal; |
| 73 | |
| 74 | /// This value is returned in following cases: An incomplete input sequence was found or |
| 75 | /// insufficient output buffer was provided so complete output could not be written. |
| 76 | static constexpr utf::code_point incomplete = utf::incomplete; |
| 77 | |
| 78 | virtual ~base_converter(); |
| 79 | |
| 80 | /// Return the maximal length that one Unicode code-point can be converted to, for example |
| 81 | /// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1 |
| 82 | virtual int max_len() const { return 1; } |
| 83 | |
| 84 | /// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe. |
| 85 | /// |
| 86 | /// Rule of thumb: if this class' implementation uses simple tables that are unchanged |
| 87 | /// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for |
| 88 | /// independent to_unicode, from_unicode calls, you may set it to true, otherwise, |
| 89 | /// for example if you use iconv_t descriptor or UConverter as conversion object return false, |
| 90 | /// and this object will be cloned for each use. |
| 91 | virtual bool is_thread_safe() const { return false; } |
| 92 | |
| 93 | /// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false |
| 94 | virtual base_converter* clone() const |
| 95 | { |
| 96 | BOOST_ASSERT(typeid(*this) == typeid(base_converter)); |
| 97 | return new base_converter(); |
| 98 | } |
| 99 | |
| 100 | /// Convert a single character starting at begin and ending at most at end to Unicode code-point. |
| 101 | /// |
| 102 | /// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a |
| 103 | /// code_point_end <= \a end it is converted to its Unicode code point equivalent, \a begin is set to \a |
| 104 | /// code_point_end |
| 105 | /// |
| 106 | /// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a |
| 107 | /// code_point_end > \a end and [\a begin, \a code_point_end) would be valid input sequence, then \a |
| 108 | /// incomplete is returned begin stays unchanged, for example for UTF-8 conversion a *begin = 0xc2, \a begin |
| 109 | /// +1 = \a end is such situation. |
| 110 | /// |
| 111 | /// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a |
| 112 | /// code_point_end <= \a end that is illegal for this encoding, \a illegal is returned and begin stays |
| 113 | /// unchanged. For example if *begin = 0xFF and begin < end for UTF-8, then \a illegal is returned. |
| 114 | virtual utf::code_point to_unicode(const char*& begin, const char* end) |
| 115 | { |
| 116 | if(begin == end) |
| 117 | return incomplete; // LCOV_EXCL_LINE |
| 118 | unsigned char cp = *begin; |
| 119 | if(cp <= 0x7F) { |
| 120 | begin++; |
| 121 | return cp; |
| 122 | } |
| 123 | return illegal; |
| 124 | } |
| 125 | |
| 126 | /// Convert a single code-point \a u into encoding and store it in [begin,end) range. |
| 127 | /// |
| 128 | /// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set, |
| 129 | /// \a illegal should be returned |
| 130 | /// |
| 131 | /// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then |
| 132 | /// |
| 133 | /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned |
| 134 | /// -# If end - begin < N, incomplete is returned, it is unspecified what would be |
| 135 | /// stored in bytes in range [begin,end) |
| 136 | virtual utf::len_or_error from_unicode(utf::code_point u, char* begin, const char* end) |
| 137 | { |
| 138 | if(begin == end) |
| 139 | return incomplete; // LCOV_EXCL_LINE |
| 140 | if(u >= 0x80) |
| 141 | return illegal; |
| 142 | *begin = static_cast<char>(u); |
| 143 | return 1; |
| 144 | } |
| 145 | }; |
| 146 | |
/// This function creates a \a base_converter that can be used for conversion between UTF-8 and
/// Unicode code points.
///
/// \return Owning pointer to the created converter
BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter();
| 150 | |
| 151 | BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'" ) |
| 152 | inline std::unique_ptr<base_converter> create_utf8_converter_unique_ptr() |
| 153 | { |
| 154 | return create_utf8_converter(); |
| 155 | } |
| 156 | |
/// This function creates a \a base_converter that can be used for conversion between single byte
/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
///
/// \param encoding Name of the single byte character encoding
/// \return Owning pointer to the created converter. If \a encoding is not supported, an empty
///   pointer is returned, so you should check whether the returned pointer is valid/non-null.
BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter(const std::string& encoding);
| 163 | |
| 164 | BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'" ) |
| 165 | inline std::unique_ptr<base_converter> create_simple_converter_unique_ptr(const std::string& encoding) |
| 166 | { |
| 167 | return create_simple_converter(encoding); |
| 168 | } |
| 169 | |
/// Install a codecvt facet into locale \a in and return a new locale that is based on \a in and
/// uses the new facet.
///
/// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the
/// \a cvt converter. If \a cvt is a null pointer, an always-failing conversion is used that
/// fails on every first input or output.
///
/// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows how to break
/// and join Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt
/// should be unaware of the wide encoding type.
///
/// \param in Locale the returned locale is based on
/// \param cvt Converter used by the facet, passed by value as an owning pointer; may be null
/// \param type Character facet type(s) to install the codecvt facet for
/// \return New locale based on \a in that uses the new facet
BOOST_LOCALE_DECL
std::locale create_codecvt(const std::locale& in, std::unique_ptr<base_converter> cvt, char_facet_t type);
| 182 | |
| 183 | BOOST_DEPRECATED("This function is deprecated, use 'create_codecvt()'" ) |
| 184 | inline std::locale create_codecvt_from_pointer(const std::locale& in, base_converter* cvt, char_facet_t type) |
| 185 | { |
| 186 | return create_codecvt(in, cvt: std::unique_ptr<base_converter>(cvt), type); |
| 187 | } |
| 188 | |
/// Deprecated variant of create_utf8_converter() that returns a raw pointer.
/// NOTE(review): the name suggests the converter is allocated with new and has to be deleted
/// by the caller - confirm against the implementation.
BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'" )
BOOST_LOCALE_DECL base_converter* create_utf8_converter_new_ptr();
| 191 | |
/// Deprecated variant of create_simple_converter() that returns a raw pointer.
/// NOTE(review): the name suggests the converter is allocated with new and has to be deleted
/// by the caller, and the result is presumably null for an unsupported \a encoding - confirm
/// against the implementation.
BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'" )
BOOST_LOCALE_DECL base_converter* create_simple_converter_new_ptr(const std::string& encoding);
| 194 | |
/// Install a UTF-8 codecvt facet converting to UTF-16 or UTF-32 into locale \a in and return
/// a new locale that is based on \a in and uses the new facet.
///
/// \param in Locale the returned locale is based on
/// \param type Character facet type(s) to install the codecvt facet for
/// \return New locale based on \a in that uses the new facet
BOOST_LOCALE_DECL
std::locale create_utf8_codecvt(const std::locale& in, char_facet_t type);
| 199 | |
/// This function installs a codecvt facet that can be used for conversion between single byte
/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
///
/// \param in Locale the returned locale is based on
/// \param encoding Name of the single byte character encoding
/// \param type Character facet type(s) to install the codecvt facet for
/// \return The resulting locale
/// \throws boost::locale::conv::invalid_charset_error: Character set is not supported or isn't a single
/// byte character set
BOOST_LOCALE_DECL
std::locale create_simple_codecvt(const std::locale& in, const std::string& encoding, char_facet_t type);
| 207 | } // namespace util |
| 208 | }} // namespace boost::locale |
| 209 | |
| 210 | #endif |
| 211 | |