1#ifndef BOOST_UTF8_CODECVT_FACET_HPP
2#define BOOST_UTF8_CODECVT_FACET_HPP
3
4#include <boost/iostreams/detail/config/wide_streams.hpp>
5#ifdef BOOST_IOSTREAMS_NO_WIDE_STREAMS
6# error wide streams not supported on this platform
7#endif
8
9// MS compatible compilers support #pragma once
10#if defined(_MSC_VER)
11# pragma once
12#endif
13
14/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
15// utf8_codecvt_facet.hpp
16
17// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
18// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
19// Distributed under the Boost Software License, Version 1.0. (See accompany-
20// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
21
22// Note:(Robert Ramey). I have made the following alterations in the original
23// code.
24// a) Rendered utf8_codecvt<wchar_t, char> with using templates
25// b) Move longer functions outside class definition to prevent inlining
26// and make code smaller
27// c) added on a derived class to permit translation to/from current
28// locale to utf8
29
30// See http://www.boost.org for updates, documentation, and revision history.
31
// archives stored as text - note these are templated on the basic
// stream templates to accommodate wide (and other?) kinds of characters
//
// note the fact that on libraries without wide characters, ostream
// is not a specialization of basic_ostream which in fact is not defined
// in such cases. So we can't use basic_ostream<OStream::char_type> but rather
// use two template parameters
39//
40// utf8_codecvt_facet
41// This is an implementation of a std::codecvt facet for translating
42// from UTF-8 externally to UCS-4. Note that this is not tied to
43// any specific types in order to allow customization on platforms
44// where wchar_t is not big enough.
45//
// NOTES: The current implementation jumps through some unpleasant hoops in
// order to deal with signed character types. As a result,
// it is necessary for the ExternType to be convertible to unsigned char.
// I chose not to tie the extern_type explicitly to char. But if any combination
// of types other than <wchar_t, char> is used, then std::codecvt must be
// specialized on those types for this to work.
52
53#include <locale>
54#include <cstddef> // size_t
55#include <cwchar> // mbstate_t
56#include <boost/integer_traits.hpp>
57#include <boost/iostreams/detail/config/wide_streams.hpp>
58#include <boost/iostreams/detail/codecvt_helper.hpp>
59
// Upper bound on the length of a multibyte string.
#define MB_LENGTH_MAX 8
62
63struct utf8_codecvt_facet_wchar_t
64 : public boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
65{
66public:
67 explicit utf8_codecvt_facet_wchar_t(std::size_t no_locale_manage = 0)
68 : boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
69 (no_locale_manage)
70 { }
71protected:
72 virtual std::codecvt_base::result do_in(
73 std::mbstate_t& state,
74 const char * from,
75 const char * from_end,
76 const char * & from_next,
77 wchar_t * to,
78 wchar_t * to_end,
79 wchar_t*& to_next
80 ) const;
81
82 virtual std::codecvt_base::result do_out(
83 std::mbstate_t & state, const wchar_t * from,
84 const wchar_t * from_end, const wchar_t* & from_next,
85 char * to, char * to_end, char * & to_next
86 ) const;
87
88 bool invalid_continuing_octet(unsigned char octet_1) const {
89 return (octet_1 < 0x80|| 0xbf< octet_1);
90 }
91
92 bool invalid_leading_octet(unsigned char octet_1) const {
93 return (0x7f < octet_1 && octet_1 < 0xc0) ||
94 (octet_1 > 0xfd);
95 }
96
97 // continuing octets = octets except for the leading octet
98 static unsigned int get_cont_octet_count(unsigned char lead_octet) {
99 return get_octet_count(lead_octet) - 1;
100 }
101
102 static unsigned int get_octet_count(unsigned char lead_octet);
103
104 // How many "continuing octets" will be needed for this word
105 // == total octets - 1.
106 int get_cont_octet_out_count(wchar_t word) const ;
107
108 virtual bool do_always_noconv() const throw() { return false; }
109
110 // UTF-8 isn't really stateful since we rewind on partial conversions
111 virtual std::codecvt_base::result do_unshift(
112 std::mbstate_t&,
113 char * from,
114 char * /* to */,
115 char * & next
116 ) const{
117 next = from;
118 return ok;
119 }
120
121 virtual int do_encoding() const throw() {
122 const int variable_byte_external_encoding=0;
123 return variable_byte_external_encoding;
124 }
125
126 // How many char objects can I process to get <= max_limit
127 // wchar_t objects?
128 virtual int do_length(
129 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
130 const char * from,
131 const char * from_end,
132 std::size_t max_limit
133 ) const throw();
134
135 // Largest possible value do_length(state,from,from_end,1) could return.
136 virtual int do_max_length() const throw () {
137 return 6; // largest UTF-8 encoding of a UCS-4 character
138 }
139};
140
141#if 0 // not used - incorrect in any case
142// Robert Ramey - use the above to make a code converter from multi-byte
143// char strings to utf8 encoding
144struct utf8_codecvt_facet_char : public utf8_codecvt_facet_wchar_t
145{
146 typedef utf8_codecvt_facet_wchar_t base_class;
147public:
148 explicit utf8_codecvt_facet_char(std::size_t no_locale_manage=0)
149 : base_class(no_locale_manage)
150 {}
151protected:
152 virtual std::codecvt_base::result do_in(
153 std::mbstate_t & state,
154 const char * from,
155 const char * from_end,
156 const char * & from_next,
157 char * to,
158 char * to_end,
159 char * & to_next
160 ) const;
161
162 virtual std::codecvt_base::result do_out(
163 std::mbstate_t & state,
164 const char * from,
165 const char * from_end,
166 const char* & from_next,
167 char * to,
168 char * to_end,
169 char * & to_next
170 ) const;
171
172 // How many char objects can I process to get <= max_limit
173 // char objects?
174 virtual int do_length(
175 const std::mbstate_t&,
176 const char * from,
177 const char * from_end,
178 std::size_t max_limit
179 ) const;
180};
181#endif
182
// Primary template: it has no members. A usable facet is supplied only
// by explicit specializations for particular <Internal, External> pairs.
template<typename Internal, typename External>
struct utf8_codecvt_facet
{
};
186
187template<>
188struct utf8_codecvt_facet<wchar_t, char>
189 : public utf8_codecvt_facet_wchar_t
190{};
191
192#if 0
193template<>
194struct utf8_codecvt_facet<char, char>
195 : public utf8_codecvt_facet_char
196{};
197#endif
198
199#endif // BOOST_UTF8_CODECVT_FACET_HPP
200
201

// Source: boost/libs/iostreams/test/detail/utf8_codecvt_facet.hpp