util.hpp source code [boost/libs/locale/include/boost/locale/util.hpp]

1	//
2	// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3	// Copyright (c) 2022-2023 Alexander Grund
4	//
5	// Distributed under the Boost Software License, Version 1.0.
6	// https://www.boost.org/LICENSE_1_0.txt
7
8	#ifndef BOOST_LOCALE_UTIL_HPP
9	#define BOOST_LOCALE_UTIL_HPP
10
11	#include <boost/locale/generator.hpp>
12	#include <boost/locale/utf.hpp>
13	#include <boost/assert.hpp>
14	#include <cstdint>
15	#include <locale>
16	#include <memory>
17	#include <typeinfo>
18
19	namespace boost { namespace locale {
20	/// \brief This namespace provides various utility functions useful for Boost.Locale's backend
21	/// implementations
22	namespace util {
23
24	/// \brief Return default system locale name in POSIX format.
25	///
26	/// This function tries to detect the locale using LC_ALL, LC_CTYPE and LANG environment
27	/// variables in this order and if all of them are unset, on POSIX platforms it returns "C".
28	/// On Windows additionally to the above environment variables, this function
29	/// tries to create the locale name from ISO-639 and ISO-3166 country codes defined
30	/// for the users default locale.
31	/// If \a use_utf8_on_windows is true it sets the encoding to UTF-8,
32	/// otherwise, if the system locale supports ANSI codepages it defines the ANSI encoding, e.g. windows-1252,
33	/// otherwise (if ANSI codepage is not available) it uses UTF-8 encoding.
34	BOOST_LOCALE_DECL
35	std::string get_system_locale(bool use_utf8_on_windows = false);
36
37	/// \brief Installs information facet to locale \a in based on locale name \a name
38	///
39	/// This function installs boost::locale::info facet into the locale \a in and returns
40	/// newly created locale.
41	///
42	/// Note: all information is based only on parsing of string \a name;
43	///
44	/// The name has following format: language[_COUNTRY][.encoding][\@variant]
45	/// Where language is ISO-639 language code like "en" or "ru", COUNTRY is ISO-3166
46	/// country identifier like "US" or "RU". the Encoding is a character set name
47	/// like UTF-8 or ISO-8859-1. Variant is backend specific variant like \c euro or
48	/// calendar=hebrew.
49	///
50	/// If some parameters are missing they are specified as blanks, default encoding
51	/// is assumed to be US-ASCII and missing language is assumed to be "C"
52	BOOST_LOCALE_DECL
53	std::locale create_info(const std::locale& in, const std::string& name);
54
55	/// \brief This class represent a simple stateless converter from UCS-4 and to UCS-4 for
56	/// each single code point
57	///
58	/// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding
59	/// to encoding supported by this converter
60	///
61	/// Please note, this converter should be fully stateless. Fully stateless means it should
62	/// never assume that it is called in any specific order on the text. Even if the
63	/// encoding itself seems to be stateless like windows-1255 or shift-jis, some
64	/// encoders (most notably iconv) can actually compose several code-point into one or
65	/// decompose them in case composite characters are found. So be very careful when implementing
66	/// these converters for certain character set.
67	class BOOST_LOCALE_DECL base_converter {
68	public:
69	/// This value should be returned when an illegal input sequence or code-point is observed:
70	/// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates
71	/// or an invalid UTF-8 sequence is found
72	static constexpr utf::code_point illegal = utf::illegal;
73
74	/// This value is returned in following cases: An incomplete input sequence was found or
75	/// insufficient output buffer was provided so complete output could not be written.
76	static constexpr utf::code_point incomplete = utf::incomplete;
77
78	virtual ~base_converter();
79
80	/// Return the maximal length that one Unicode code-point can be converted to, for example
81	/// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1
82	virtual int max_len() const { return `1`; }
83
84	/// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe.
85	///
86	/// Rule of thumb: if this class' implementation uses simple tables that are unchanged
87	/// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for
88	/// independent to_unicode, from_unicode calls, you may set it to true, otherwise,
89	/// for example if you use iconv_t descriptor or UConverter as conversion object return false,
90	/// and this object will be cloned for each use.
91	virtual bool is_thread_safe() const { return false; }
92
93	/// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false
94	virtual base_converter* clone() const
95	{
96	BOOST_ASSERT(typeid(*this) == typeid(base_converter));
97	return new base_converter ();
98	}
99
100	/// Convert a single character starting at begin and ending at most at end to Unicode code-point.
101	///
102	/// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a
103	/// code_point_end <= \a end it is converted to its Unicode code point equivalent, \a begin is set to \a
104	/// code_point_end
105	///
106	/// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a
107	/// code_point_end > \a end and [\a begin, \a code_point_end) would be valid input sequence, then \a
108	/// incomplete is returned begin stays unchanged, for example for UTF-8 conversion a begin = 0xc2, \a begin*
109	/// +1 = \a end is such situation.
110	///
111	/// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a
112	/// code_point_end <= \a end that is illegal for this encoding, \a illegal is returned and begin stays
113	/// unchanged. For example if begin = 0xFF and begin < end for UTF-8, then \a illegal is returned.*
114	virtual utf::code_point to_unicode(const char& begin, const* char* end)
115	{
116	if(begin == end)
117	return incomplete; // LCOV_EXCL_LINE
118	unsigned char cp = *begin;
119	if(cp <= `0x7F`) {
120	begin++;
121	return cp;
122	}
123	return illegal;
124	}
125
126	/// Convert a single code-point \a u into encoding and store it in [begin,end) range.
127	///
128	/// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set,
129	/// \a illegal should be returned
130	///
131	/// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then
132	///
133	/// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned
134	/// -# If end - begin < N, incomplete is returned, it is unspecified what would be
135	/// stored in bytes in range [begin,end)
136	virtual utf::len_or_error from_unicode(utf::code_point u, char* begin, const char* end)
137	{
138	if(begin == end)
139	return incomplete; // LCOV_EXCL_LINE
140	if(u >= `0x80`)
141	return illegal;
142	begin = static_cast<char*>(u);
143	return `1`;
144	}
145	};
146
147	/// This function creates a \a base_converter that can be used for conversion between UTF-8 and
148	/// Unicode code points
149	BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter();
150
151	BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'")
152	inline std::unique_ptr<base_converter> create_utf8_converter_unique_ptr()
153	{
154	return create_utf8_converter();
155	}
156
157	/// This function creates a \a base_converter that can be used for conversion between single byte
158	/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
159	///
160	/// If \a encoding is not supported, empty pointer is returned.
161	/// So you should check whether the returned pointer is valid/non-NULL
162	BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter(const std::string& encoding);
163
164	BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'")
165	inline std::unique_ptr<base_converter> create_simple_converter_unique_ptr(const std::string& encoding)
166	{
167	return create_simple_converter(encoding);
168	}
169
170	/// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new
171	/// facet.
172	///
173	/// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter.
174	/// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or
175	/// output.
176	///
177	/// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join
178	/// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
179	/// of wide encoding type
180	BOOST_LOCALE_DECL
181	std::locale create_codecvt(const std::locale& in, std::unique_ptr<base_converter> cvt, char_facet_t type);
182
183	BOOST_DEPRECATED("This function is deprecated, use 'create_codecvt()'")
184	inline std::locale create_codecvt_from_pointer(const std::locale& in, base_converter* cvt, char_facet_t type)
185	{
186	return create_codecvt(in, cvt: std::unique_ptr<base_converter>(cvt), type);
187	}
188
189	BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'")
190	BOOST_LOCALE_DECL base_converter* create_utf8_converter_new_ptr();
191
192	BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'")
193	BOOST_LOCALE_DECL base_converter* create_simple_converter_new_ptr(const std::string& encoding);
194
195	/// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return
196	/// new locale that is based on \a in and uses new facet.
197	BOOST_LOCALE_DECL
198	std::locale create_utf8_codecvt(const std::locale& in, char_facet_t type);
199
200	/// This function installs codecvt that can be used for conversion between single byte
201	/// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
202	///
203	/// \throws boost::locale::conv::invalid_charset_error: Character set is not supported or isn't a single
204	/// byte character set
205	BOOST_LOCALE_DECL
206	std::locale create_simple_codecvt(const std::locale& in, const std::string& encoding, char_facet_t type);
207	} // namespace util
208	}} // namespace boost::locale
209
210	#endif
211

source code of boost/libs/locale/include/boost/locale/util.hpp