1//
2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3// Copyright (c) 2021-2022 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0.
6// https://www.boost.org/LICENSE_1_0.txt
7
8#include <boost/locale/boundary.hpp>
9#include <boost/locale/generator.hpp>
10#include "boost/locale/icu/all_generator.hpp"
11#include "boost/locale/icu/cdata.hpp"
12#include "boost/locale/icu/icu_util.hpp"
13#include "boost/locale/icu/uconv.hpp"
14#include "boost/locale/util/encoding.hpp"
15#if BOOST_LOCALE_ICU_VERSION >= 5502
16# include <unicode/utext.h>
17#endif
18#include <memory>
19#include <unicode/brkiter.h>
20#include <unicode/rbbi.h>
21#include <vector>
22
23#ifdef BOOST_MSVC
24# pragma warning(disable : 4244) // 'argument' : conversion from 'int'
25# pragma warning(disable : 4267) // 'argument' : conversion from 'size_t'
26#endif
27
28#if BOOST_LOCALE_ICU_VERSION >= 5502
29namespace std {
30template<>
31struct default_delete<UText> {
32 using pointer = UText*;
33 void operator()(pointer ptr) { utext_close(ut: ptr); }
34};
35} // namespace std
36#endif
37
38namespace boost { namespace locale {
39 namespace boundary { namespace impl_icu {
40
41 using namespace boost::locale::impl_icu;
42
43 index_type map_direct(boundary_type t, icu::BreakIterator* it, int reserve)
44 {
45 index_type indx;
46 indx.reserve(n: reserve);
47#if U_ICU_VERSION_MAJOR_NUM >= 52
48 icu::BreakIterator* rbbi = it;
49#else
50 icu::RuleBasedBreakIterator* rbbi = icu_cast<icu::RuleBasedBreakIterator>(it);
51#endif
52
53 indx.push_back(x: break_info());
54 it->first();
55 int pos = 0;
56 while((pos = it->next()) != icu::BreakIterator::DONE) {
57 indx.push_back(x: break_info(pos));
58 // Character does not have any specific break types
59 if(t != character && rbbi) {
60 std::vector<int32_t> buffer;
61 int32_t membuf[8] = {0}; // try not to use memory allocation if possible
62 int32_t* buf = membuf;
63
64 UErrorCode err = U_ZERO_ERROR;
65 int n = rbbi->getRuleStatusVec(fillInVec: buf, capacity: 8, status&: err);
66
67 if(err == U_BUFFER_OVERFLOW_ERROR) {
68 buffer.resize(new_size: n, x: 0);
69 buf = buffer.data();
70 n = rbbi->getRuleStatusVec(fillInVec: buf, capacity: buffer.size(), status&: err);
71 }
72
73 check_and_throw_icu_error(err);
74
75 for(int i = 0; i < n; i++) {
76 switch(t) {
77 case word:
78 if(UBRK_WORD_NONE <= buf[i] && buf[i] < UBRK_WORD_NONE_LIMIT)
79 indx.back().rule |= word_none;
80 else if(UBRK_WORD_NUMBER <= buf[i] && buf[i] < UBRK_WORD_NUMBER_LIMIT)
81 indx.back().rule |= word_number;
82 else if(UBRK_WORD_LETTER <= buf[i] && buf[i] < UBRK_WORD_LETTER_LIMIT)
83 indx.back().rule |= word_letter;
84 else if(UBRK_WORD_KANA <= buf[i] && buf[i] < UBRK_WORD_KANA_LIMIT)
85 indx.back().rule |= word_kana;
86 else if(UBRK_WORD_IDEO <= buf[i] && buf[i] < UBRK_WORD_IDEO_LIMIT)
87 indx.back().rule |= word_ideo;
88 break;
89
90 case line:
91 if(UBRK_LINE_SOFT <= buf[i] && buf[i] < UBRK_LINE_SOFT_LIMIT)
92 indx.back().rule |= line_soft;
93 else if(UBRK_LINE_HARD <= buf[i] && buf[i] < UBRK_LINE_HARD_LIMIT)
94 indx.back().rule |= line_hard;
95 break;
96
97 case sentence:
98 if(UBRK_SENTENCE_TERM <= buf[i] && buf[i] < UBRK_SENTENCE_TERM_LIMIT)
99 indx.back().rule |= sentence_term;
100 else if(UBRK_SENTENCE_SEP <= buf[i] && buf[i] < UBRK_SENTENCE_SEP_LIMIT)
101 indx.back().rule |= sentence_sep;
102 break;
103 case character: BOOST_UNREACHABLE_RETURN(0);
104 }
105 }
106 } else
107 indx.back().rule |= character_any; // Basic mark... for character
108 }
109 return indx;
110 }
111
112 std::unique_ptr<icu::BreakIterator> get_iterator(boundary_type t, const icu::Locale& loc)
113 {
114 UErrorCode err = U_ZERO_ERROR;
115 std::unique_ptr<icu::BreakIterator> bi;
116 switch(t) {
117 case character: bi.reset(p: icu::BreakIterator::createCharacterInstance(where: loc, status&: err)); break;
118 case word: bi.reset(p: icu::BreakIterator::createWordInstance(where: loc, status&: err)); break;
119 case sentence: bi.reset(p: icu::BreakIterator::createSentenceInstance(where: loc, status&: err)); break;
120 case line: bi.reset(p: icu::BreakIterator::createLineInstance(where: loc, status&: err)); break;
121 }
122 check_and_throw_icu_error(err);
123 if(!bi)
124 throw std::runtime_error("Failed to create break iterator");
125 return bi;
126 }
127
128 template<typename CharType>
129 index_type do_map(boundary_type t,
130 const CharType* begin,
131 const CharType* end,
132 const icu::Locale& loc,
133 const std::string& encoding)
134 {
135 std::unique_ptr<icu::BreakIterator> bi = get_iterator(t, loc);
136 // Versions prior to ICU 55.2 returned wrong splits when used with UText input
137#if BOOST_LOCALE_ICU_VERSION >= 5502
138 UErrorCode err = U_ZERO_ERROR;
139 BOOST_LOCALE_START_CONST_CONDITION
140 if(sizeof(CharType) == 2 || util::is_char8_t<CharType>::value
141 || (sizeof(CharType) == 1 && util::normalize_encoding(encoding) == "utf8"))
142 {
143 UText ut_stack = UTEXT_INITIALIZER;
144 std::unique_ptr<UText> ut;
145 if(sizeof(CharType) == 1)
146 ut.reset(utext_openUTF8(&ut_stack, reinterpret_cast<const char*>(begin), end - begin, &err));
147 else {
148 static_assert(sizeof(UChar) == 2, "!");
149 ut.reset(utext_openUChars(&ut_stack, reinterpret_cast<const UChar*>(begin), end - begin, &err));
150 }
151 BOOST_LOCALE_END_CONST_CONDITION
152
153 check_and_throw_icu_error(err);
154 err = U_ZERO_ERROR;
155 if(!ut)
156 throw std::runtime_error("Failed to create UText");
157 bi->setText(text: ut.get(), status&: err);
158 check_and_throw_icu_error(err);
159 return map_direct(t, bi.get(), end - begin);
160 } else
161#endif
162 {
163 icu_std_converter<CharType> cvt(encoding);
164 const icu::UnicodeString str = cvt.icu(begin, end);
165 bi->setText(str);
166 const index_type indirect = map_direct(t, it: bi.get(), reserve: str.length());
167 index_type indx = indirect;
168 for(size_t i = 1; i < indirect.size(); i++) {
169 const size_t offset_indirect = indirect[i - 1].offset;
170 const size_t diff = indirect[i].offset - offset_indirect;
171 const size_t offset_direct = indx[i - 1].offset;
172 indx[i].offset = offset_direct + cvt.cut(str, begin, end, diff, offset_indirect, offset_direct);
173 }
174 return indx;
175 }
176 } // do_map
177
178 template<typename CharType>
179 class boundary_indexing_impl : public boundary_indexing<CharType> {
180 public:
181 boundary_indexing_impl(const cdata& data) : locale_(data.locale()), encoding_(data.encoding()) {}
182 index_type map(boundary_type t, const CharType* begin, const CharType* end) const
183 {
184 return do_map<CharType>(t, begin, end, locale_, encoding_);
185 }
186
187 private:
188 icu::Locale locale_;
189 std::string encoding_;
190 };
191
192 }} // namespace boundary::impl_icu
193
194 namespace impl_icu {
195 std::locale create_boundary(const std::locale& in, const cdata& cd, char_facet_t type)
196 {
197 using namespace boost::locale::boundary::impl_icu;
198 switch(type) {
199 case char_facet_t::nochar: break;
200 case char_facet_t::char_f: return std::locale(in, new boundary_indexing_impl<char>(cd));
201 case char_facet_t::wchar_f: return std::locale(in, new boundary_indexing_impl<wchar_t>(cd));
202#ifdef __cpp_char8_t
203 case char_facet_t::char8_f: return std::locale(in, new boundary_indexing_impl<char8_t>(cd));
204#endif
205#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
206 case char_facet_t::char16_f: return std::locale(in, new boundary_indexing_impl<char16_t>(cd));
207#endif
208#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
209 case char_facet_t::char32_f: return std::locale(in, new boundary_indexing_impl<char32_t>(cd));
210#endif
211 }
212 return in;
213 }
214 } // namespace impl_icu
215
216}} // namespace boost::locale
217

// source code of boost/libs/locale/src/boost/locale/icu/boundary.cpp