| 1 | // |
| 2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
| 3 | // Copyright (c) 2021-2022 Alexander Grund |
| 4 | // |
| 5 | // Distributed under the Boost Software License, Version 1.0. |
| 6 | // https://www.boost.org/LICENSE_1_0.txt |
| 7 | |
| 8 | #include <boost/locale/boundary.hpp> |
| 9 | #include <boost/locale/generator.hpp> |
| 10 | #include "boost/locale/icu/all_generator.hpp" |
| 11 | #include "boost/locale/icu/cdata.hpp" |
| 12 | #include "boost/locale/icu/icu_util.hpp" |
| 13 | #include "boost/locale/icu/uconv.hpp" |
| 14 | #include "boost/locale/util/encoding.hpp" |
| 15 | #if BOOST_LOCALE_ICU_VERSION >= 5502 |
| 16 | # include <unicode/utext.h> |
| 17 | #endif |
| 18 | #include <memory> |
| 19 | #include <unicode/brkiter.h> |
| 20 | #include <unicode/rbbi.h> |
| 21 | #include <vector> |
| 22 | |
| 23 | #ifdef BOOST_MSVC |
| 24 | # pragma warning(disable : 4244) // 'argument' : conversion from 'int' |
| 25 | # pragma warning(disable : 4267) // 'argument' : conversion from 'size_t' |
| 26 | #endif |
| 27 | |
| 28 | #if BOOST_LOCALE_ICU_VERSION >= 5502 |
| 29 | namespace std { |
| 30 | template<> |
| 31 | struct default_delete<UText> { |
| 32 | using pointer = UText*; |
| 33 | void operator()(pointer ptr) { utext_close(ut: ptr); } |
| 34 | }; |
| 35 | } // namespace std |
| 36 | #endif |
| 37 | |
| 38 | namespace boost { namespace locale { |
| 39 | namespace boundary { namespace impl_icu { |
| 40 | |
| 41 | using namespace boost::locale::impl_icu; |
| 42 | |
| 43 | index_type map_direct(boundary_type t, icu::BreakIterator* it, int reserve) |
| 44 | { |
| 45 | index_type indx; |
| 46 | indx.reserve(n: reserve); |
| 47 | #if U_ICU_VERSION_MAJOR_NUM >= 52 |
| 48 | icu::BreakIterator* rbbi = it; |
| 49 | #else |
| 50 | icu::RuleBasedBreakIterator* rbbi = icu_cast<icu::RuleBasedBreakIterator>(it); |
| 51 | #endif |
| 52 | |
| 53 | indx.push_back(x: break_info()); |
| 54 | it->first(); |
| 55 | int pos = 0; |
| 56 | while((pos = it->next()) != icu::BreakIterator::DONE) { |
| 57 | indx.push_back(x: break_info(pos)); |
| 58 | // Character does not have any specific break types |
| 59 | if(t != character && rbbi) { |
| 60 | std::vector<int32_t> buffer; |
| 61 | int32_t membuf[8] = {0}; // try not to use memory allocation if possible |
| 62 | int32_t* buf = membuf; |
| 63 | |
| 64 | UErrorCode err = U_ZERO_ERROR; |
| 65 | int n = rbbi->getRuleStatusVec(fillInVec: buf, capacity: 8, status&: err); |
| 66 | |
| 67 | if(err == U_BUFFER_OVERFLOW_ERROR) { |
| 68 | buffer.resize(new_size: n, x: 0); |
| 69 | buf = buffer.data(); |
| 70 | n = rbbi->getRuleStatusVec(fillInVec: buf, capacity: buffer.size(), status&: err); |
| 71 | } |
| 72 | |
| 73 | check_and_throw_icu_error(err); |
| 74 | |
| 75 | for(int i = 0; i < n; i++) { |
| 76 | switch(t) { |
| 77 | case word: |
| 78 | if(UBRK_WORD_NONE <= buf[i] && buf[i] < UBRK_WORD_NONE_LIMIT) |
| 79 | indx.back().rule |= word_none; |
| 80 | else if(UBRK_WORD_NUMBER <= buf[i] && buf[i] < UBRK_WORD_NUMBER_LIMIT) |
| 81 | indx.back().rule |= word_number; |
| 82 | else if(UBRK_WORD_LETTER <= buf[i] && buf[i] < UBRK_WORD_LETTER_LIMIT) |
| 83 | indx.back().rule |= word_letter; |
| 84 | else if(UBRK_WORD_KANA <= buf[i] && buf[i] < UBRK_WORD_KANA_LIMIT) |
| 85 | indx.back().rule |= word_kana; |
| 86 | else if(UBRK_WORD_IDEO <= buf[i] && buf[i] < UBRK_WORD_IDEO_LIMIT) |
| 87 | indx.back().rule |= word_ideo; |
| 88 | break; |
| 89 | |
| 90 | case line: |
| 91 | if(UBRK_LINE_SOFT <= buf[i] && buf[i] < UBRK_LINE_SOFT_LIMIT) |
| 92 | indx.back().rule |= line_soft; |
| 93 | else if(UBRK_LINE_HARD <= buf[i] && buf[i] < UBRK_LINE_HARD_LIMIT) |
| 94 | indx.back().rule |= line_hard; |
| 95 | break; |
| 96 | |
| 97 | case sentence: |
| 98 | if(UBRK_SENTENCE_TERM <= buf[i] && buf[i] < UBRK_SENTENCE_TERM_LIMIT) |
| 99 | indx.back().rule |= sentence_term; |
| 100 | else if(UBRK_SENTENCE_SEP <= buf[i] && buf[i] < UBRK_SENTENCE_SEP_LIMIT) |
| 101 | indx.back().rule |= sentence_sep; |
| 102 | break; |
| 103 | case character: BOOST_UNREACHABLE_RETURN(0); |
| 104 | } |
| 105 | } |
| 106 | } else |
| 107 | indx.back().rule |= character_any; // Basic mark... for character |
| 108 | } |
| 109 | return indx; |
| 110 | } |
| 111 | |
| 112 | std::unique_ptr<icu::BreakIterator> get_iterator(boundary_type t, const icu::Locale& loc) |
| 113 | { |
| 114 | UErrorCode err = U_ZERO_ERROR; |
| 115 | std::unique_ptr<icu::BreakIterator> bi; |
| 116 | switch(t) { |
| 117 | case character: bi.reset(p: icu::BreakIterator::createCharacterInstance(where: loc, status&: err)); break; |
| 118 | case word: bi.reset(p: icu::BreakIterator::createWordInstance(where: loc, status&: err)); break; |
| 119 | case sentence: bi.reset(p: icu::BreakIterator::createSentenceInstance(where: loc, status&: err)); break; |
| 120 | case line: bi.reset(p: icu::BreakIterator::createLineInstance(where: loc, status&: err)); break; |
| 121 | } |
| 122 | check_and_throw_icu_error(err); |
| 123 | if(!bi) |
| 124 | throw std::runtime_error("Failed to create break iterator" ); |
| 125 | return bi; |
| 126 | } |
| 127 | |
| 128 | template<typename CharType> |
| 129 | index_type do_map(boundary_type t, |
| 130 | const CharType* begin, |
| 131 | const CharType* end, |
| 132 | const icu::Locale& loc, |
| 133 | const std::string& encoding) |
| 134 | { |
| 135 | std::unique_ptr<icu::BreakIterator> bi = get_iterator(t, loc); |
| 136 | // Versions prior to ICU 55.2 returned wrong splits when used with UText input |
| 137 | #if BOOST_LOCALE_ICU_VERSION >= 5502 |
| 138 | UErrorCode err = U_ZERO_ERROR; |
| 139 | BOOST_LOCALE_START_CONST_CONDITION |
| 140 | if(sizeof(CharType) == 2 || util::is_char8_t<CharType>::value |
| 141 | || (sizeof(CharType) == 1 && util::normalize_encoding(encoding) == "utf8" )) |
| 142 | { |
| 143 | UText ut_stack = UTEXT_INITIALIZER; |
| 144 | std::unique_ptr<UText> ut; |
| 145 | if(sizeof(CharType) == 1) |
| 146 | ut.reset(utext_openUTF8(&ut_stack, reinterpret_cast<const char*>(begin), end - begin, &err)); |
| 147 | else { |
| 148 | static_assert(sizeof(UChar) == 2, "!" ); |
| 149 | ut.reset(utext_openUChars(&ut_stack, reinterpret_cast<const UChar*>(begin), end - begin, &err)); |
| 150 | } |
| 151 | BOOST_LOCALE_END_CONST_CONDITION |
| 152 | |
| 153 | check_and_throw_icu_error(err); |
| 154 | err = U_ZERO_ERROR; |
| 155 | if(!ut) |
| 156 | throw std::runtime_error("Failed to create UText" ); |
| 157 | bi->setText(text: ut.get(), status&: err); |
| 158 | check_and_throw_icu_error(err); |
| 159 | return map_direct(t, bi.get(), end - begin); |
| 160 | } else |
| 161 | #endif |
| 162 | { |
| 163 | icu_std_converter<CharType> cvt(encoding); |
| 164 | const icu::UnicodeString str = cvt.icu(begin, end); |
| 165 | bi->setText(str); |
| 166 | const index_type indirect = map_direct(t, it: bi.get(), reserve: str.length()); |
| 167 | index_type indx = indirect; |
| 168 | for(size_t i = 1; i < indirect.size(); i++) { |
| 169 | const size_t offset_indirect = indirect[i - 1].offset; |
| 170 | const size_t diff = indirect[i].offset - offset_indirect; |
| 171 | const size_t offset_direct = indx[i - 1].offset; |
| 172 | indx[i].offset = offset_direct + cvt.cut(str, begin, end, diff, offset_indirect, offset_direct); |
| 173 | } |
| 174 | return indx; |
| 175 | } |
| 176 | } // do_map |
| 177 | |
| 178 | template<typename CharType> |
| 179 | class boundary_indexing_impl : public boundary_indexing<CharType> { |
| 180 | public: |
| 181 | boundary_indexing_impl(const cdata& data) : locale_(data.locale()), encoding_(data.encoding()) {} |
| 182 | index_type map(boundary_type t, const CharType* begin, const CharType* end) const |
| 183 | { |
| 184 | return do_map<CharType>(t, begin, end, locale_, encoding_); |
| 185 | } |
| 186 | |
| 187 | private: |
| 188 | icu::Locale locale_; |
| 189 | std::string encoding_; |
| 190 | }; |
| 191 | |
| 192 | }} // namespace boundary::impl_icu |
| 193 | |
| 194 | namespace impl_icu { |
| 195 | std::locale create_boundary(const std::locale& in, const cdata& cd, char_facet_t type) |
| 196 | { |
| 197 | using namespace boost::locale::boundary::impl_icu; |
| 198 | switch(type) { |
| 199 | case char_facet_t::nochar: break; |
| 200 | case char_facet_t::char_f: return std::locale(in, new boundary_indexing_impl<char>(cd)); |
| 201 | case char_facet_t::wchar_f: return std::locale(in, new boundary_indexing_impl<wchar_t>(cd)); |
| 202 | #ifdef __cpp_char8_t |
| 203 | case char_facet_t::char8_f: return std::locale(in, new boundary_indexing_impl<char8_t>(cd)); |
| 204 | #endif |
| 205 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
| 206 | case char_facet_t::char16_f: return std::locale(in, new boundary_indexing_impl<char16_t>(cd)); |
| 207 | #endif |
| 208 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
| 209 | case char_facet_t::char32_f: return std::locale(in, new boundary_indexing_impl<char32_t>(cd)); |
| 210 | #endif |
| 211 | } |
| 212 | return in; |
| 213 | } |
| 214 | } // namespace impl_icu |
| 215 | |
| 216 | }} // namespace boost::locale |
| 217 | |