1//
2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3//
4// Distributed under the Boost Software License, Version 1.0.
5// https://www.boost.org/LICENSE_1_0.txt
6
7#define BOOST_LOCALE_ERROR_LIMIT 100000
8
9#include <boost/locale/boundary.hpp>
10#include <boost/locale/generator.hpp>
11#include <boost/locale/localization_backend.hpp>
12#include "boostLocale/test/tools.hpp"
13#include "boostLocale/test/unit_test.hpp"
14#include <boost/assert.hpp>
15#include <iostream>
16#include <list>
17#include <vector>
18#ifdef BOOST_LOCALE_WITH_ICU
19# include <unicode/uversion.h>
20#endif
21
// Shorthand for the boundary-analysis namespace exercised by this test.
namespace lb = boost::locale::boundary;
// Expected segment texts; compared against segment::str() by the helpers below.
template<typename Char>
using chunks_t = std::vector<std::basic_string<Char>>;
// Expected rule() values, one per segment or boundary point.
using masks_t = std::vector<unsigned>;
// Offsets (counted in code units from the start of the text) at which each expected segment ends.
using positions_t = std::vector<size_t>;
27
28template<typename Iterator, typename Char>
29void run_segment_iterator_test(const lb::segment_index<Iterator>& map,
30 const Iterator begin,
31 const Iterator end,
32 const chunks_t<Char>& chunks,
33 const masks_t& masks,
34 const positions_t& pos)
35{
36 {
37 unsigned i = 0;
38 typename lb::segment_index<Iterator>::iterator p;
39 for(p = map.begin(); p != map.end(); ++p, i++) {
40 TEST_REQUIRE(i < masks.size());
41 TEST_EQ(p->str(), chunks[i]);
42 TEST_EQ(p->rule(), masks[i]);
43 }
44 TEST_EQ(i, chunks.size());
45
46 for(;;) {
47 if(p == map.begin()) {
48 TEST_EQ(i, 0u);
49 break;
50 } else {
51 --p, --i;
52 TEST_EQ(p->str(), chunks[i]);
53 TEST_EQ(p->rule(), masks[i]);
54 }
55 }
56 for(i = 0, p = map.end(); i < chunks.size(); i++) {
57 --p;
58 size_t index = chunks.size() - i - 1;
59 TEST_EQ(p->str(), chunks[index]);
60 TEST_EQ(p->rule(), masks[index]);
61 }
62 TEST(p == map.begin());
63 }
64
65 {
66 size_t chunk_ptr = 0, i = 0;
67 for(Iterator optr = begin; optr != end; optr++, i++) {
68 const auto p = map.find(optr);
69 if(chunk_ptr < pos.size() && i >= pos[chunk_ptr])
70 chunk_ptr++;
71 if(chunk_ptr >= pos.size())
72 TEST(p == map.end());
73 else {
74 TEST_EQ(p->str(), chunks[chunk_ptr]);
75 TEST_EQ(p->rule(), unsigned(masks[chunk_ptr]));
76 }
77 }
78 }
79}
80
81template<typename Iterator>
82void run_break_iterator_test(const lb::boundary_point_index<Iterator>& map,
83 const Iterator begin,
84 const Iterator end,
85 const std::vector<Iterator>& iters,
86 const masks_t& masks)
87{
88 unsigned i = 0;
89 typename lb::boundary_point_index<Iterator>::iterator p;
90 for(p = map.begin(); p != map.end(); ++p, i++) {
91 TEST_REQUIRE(i < masks.size());
92 TEST(p->iterator() == iters[i]);
93 TEST_EQ(p->rule(), masks[i]);
94 }
95
96 TEST_EQ(i, iters.size());
97
98 do {
99 --p;
100 --i;
101 TEST(p->iterator() == iters.at(i));
102 } while(p != map.begin());
103 TEST_EQ(i, 0u);
104
105 unsigned iters_ptr = 0;
106 for(Iterator optr = begin; optr != end; optr++) {
107 p = map.find(optr);
108 TEST(p->iterator() == iters[iters_ptr]);
109 if(iters.at(iters_ptr) == optr)
110 iters_ptr++;
111 }
112}
113
114template<typename Iterator>
115void verify_index(const lb::boundary_point_index<Iterator>& map,
116 const std::vector<Iterator>& iters,
117 const masks_t& masks)
118{
119 BOOST_ASSERT(iters.size() == masks.size());
120 TEST_REQUIRE(static_cast<size_t>(std::distance(map.begin(), map.end())) == masks.size());
121 size_t i = 0;
122 for(const auto& b_point : map) {
123 TEST(b_point.iterator() == iters[i]);
124 TEST_EQ(b_point.rule(), masks[i]);
125 ++i;
126 }
127}
128
129template<typename Iterator, typename Char>
130void verify_index(const lb::segment_index<Iterator>& map, const chunks_t<Char>& chunks, const masks_t& masks)
131{
132 BOOST_ASSERT(chunks.size() == masks.size());
133 TEST_REQUIRE(static_cast<size_t>(std::distance(map.begin(), map.end())) == masks.size());
134 size_t i = 0;
135 for(const auto& seg : map) {
136 TEST_EQ(seg.str(), chunks[i]);
137 TEST_EQ(seg.rule(), masks[i]);
138 ++i;
139 }
140}
141
142template<typename Char, typename Iterator>
143void test_word_container(Iterator begin,
144 Iterator end,
145 const std::vector<size_t>& ipos,
146 const std::vector<unsigned>& imasks,
147 const std::vector<std::basic_string<Char>>& ichunks,
148 std::locale l,
149 lb::boundary_type bt = lb::word)
150{
151 using segments_t = lb::segment_index<Iterator>;
152 using boundaries_t = lb::boundary_point_index<Iterator>;
153 for(int sm = (bt == lb::word ? 31 : 3); sm >= 0; sm--) {
154 unsigned mask = ((sm & 1) != 0) * 0xF + ((sm & 2) != 0) * 0xF0 + ((sm & 4) != 0) * 0xF00
155 + ((sm & 8) != 0) * 0xF000 + ((sm & 16) != 0) * 0xF0000;
156
157 masks_t masks;
158 std::vector<size_t> pos;
159 std::vector<unsigned> boundary_masks;
160 std::basic_string<Char> empty_chunk;
161
162 chunks_t<Char> chunks;
163 chunks_t<Char> full_chunks;
164 std::vector<Iterator> iters;
165 iters.push_back(begin);
166 boundary_masks.push_back(x: 0);
167
168 for(unsigned i = 0; i < imasks.size(); i++) {
169 if(imasks[i] & mask) {
170 masks.push_back(x: imasks[i]);
171 chunks.push_back(ichunks[i]);
172 full_chunks.push_back(empty_chunk + ichunks[i]);
173 empty_chunk.clear();
174 pos.push_back(x: ipos[i]);
175 } else
176 empty_chunk += ichunks[i];
177
178 if((imasks[i] & mask) || i == imasks.size() - 1) {
179 Iterator ptr = begin;
180 std::advance(ptr, ipos[i]);
181 iters.push_back(ptr);
182 boundary_masks.push_back(x: imasks[i]);
183 }
184 }
185 {
186 segments_t map(bt, begin, end, l);
187 map.rule(mask);
188 map.full_select(false);
189 run_segment_iterator_test(map, begin, end, chunks, masks, pos);
190 map.full_select(true);
191 run_segment_iterator_test(map, begin, end, full_chunks, masks, pos);
192 }
193 {
194 boundaries_t map(bt, begin, end, l);
195 map.rule(mask);
196 run_break_iterator_test(map, begin, end, iters, boundary_masks);
197 }
198
199 std::cout << "-- Copy from segment_index\n";
200 {
201 segments_t ti(bt, begin, end, l);
202 ti.rule(mask);
203 std::cout << "---- Construct boundary_point_index\n";
204 {
205 boundaries_t bi(ti);
206 bi.rule(mask);
207 verify_index(bi, iters, boundary_masks);
208 }
209 std::cout << "---- Assign boundary_point_index\n";
210 {
211 boundaries_t bi;
212 bi.rule(mask);
213 bi = ti;
214 verify_index(bi, iters, boundary_masks);
215 }
216 std::cout << "---- Construct segment_index\n";
217 {
218 segments_t ti2(ti);
219 verify_index(ti2, chunks, masks);
220 }
221 std::cout << "---- Assign segment_index\n";
222 {
223 segments_t ti2;
224 ti2 = ti;
225 verify_index(ti2, chunks, masks);
226 }
227 }
228 std::cout << "-- Copy from boundary_point_index\n";
229 {
230 boundaries_t bi(bt, begin, end, l);
231 bi.rule(mask);
232 std::cout << "---- Construct boundary_point_index\n";
233 {
234 boundaries_t bi2(bi);
235 verify_index(bi2, iters, boundary_masks);
236 }
237 std::cout << "---- Assign boundary_point_index\n";
238 {
239 boundaries_t bi2;
240 bi2 = bi;
241 verify_index(bi2, iters, boundary_masks);
242 }
243 std::cout << "---- Construct segment_index\n";
244 {
245 segments_t ti(bi);
246 ti.rule(mask);
247 verify_index(ti, chunks, masks);
248 }
249 std::cout << "---- Assign segment_index\n";
250 {
251 segments_t ti;
252 ti.rule(mask);
253 ti = bi;
254 verify_index(ti, chunks, masks);
255 }
256 }
257 }
258}
259
260template<typename Char>
261void run_word(std::string* original,
262 const int* none,
263 const int* num,
264 const int* word,
265 const int* kana,
266 const int* ideo,
267 std::locale l,
268 lb::boundary_type b = lb::word)
269{
270 std::vector<size_t> pos;
271 std::vector<std::basic_string<Char>> chunks;
272 std::vector<unsigned> masks;
273 std::basic_string<Char> test_string;
274 for(int i = 0; !original[i].empty(); i++) {
275 chunks.push_back(to_correct_string<Char>(original[i], l));
276 test_string += chunks.back();
277 pos.push_back(test_string.size());
278 masks.push_back(x: (none && none[i] ? 0xFu : 0u) | (num && num[i] ? 0xF0u : 0u) | (word && word[i] ? 0xF00u : 0u)
279 | (kana && kana[i] ? 0xF000u : 0u) | (ideo && ideo[i] ? 0xF0000u : 0u));
280 }
281
282 std::list<Char> lst(test_string.begin(), test_string.end());
283 test_word_container<Char>(lst.begin(), lst.end(), pos, masks, chunks, l, b);
284 test_word_container<Char>(test_string.begin(), test_string.end(), pos, masks, chunks, l, b);
285}
286
// Expected chunks for the character-boundary test; the empty string terminates the list.
std::string character[] = {"שָ", "ל", "וֹ", "ם", "!", ""};
// All-ones flag table, passed as the first flag table together with `character`.
int nones[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

// Each flag row below lines up with the chunk list above it; the trailing "" terminates the list.
// The `a` rows are passed as the first flag table of test_boundaries, the `b` rows as the second.
// clang-format off
std::string sentence1[] = {"To be\n", "or not\n", "to be?\n", " That is the question. ", "Or maybe not", ""};
int sentence1a[]        = {0,         0,          1,          1,                         0,              0};
int sentence1b[]        = {1,         1,          0,          0,                         1,              0};

std::string line1[] = {"To ", "be\n", "or ", "not\n", "to ", "be", ""};
int line1a[]        = {1,     0,      1,     0,       1,     1,    0};
int line1b[]        = {0,     1,      0,     1,       0,     0,    0};
// clang-format on
299
300void test_boundaries(std::string* all, int* first, int* second, lb::boundary_type t)
301{
302 boost::locale::generator g;
303 std::cout << " char UTF-8" << std::endl;
304 run_word<char>(original: all, none: first, num: second, word: nullptr, kana: nullptr, ideo: nullptr, l: g("he_IL.UTF-8"), b: t);
305 std::cout << " char CP1255" << std::endl;
306 run_word<char>(original: all, none: first, num: second, word: nullptr, kana: nullptr, ideo: nullptr, l: g("he_IL.cp1255"), b: t);
307 std::cout << " wchar_t" << std::endl;
308 run_word<wchar_t>(original: all, none: first, num: second, word: nullptr, kana: nullptr, ideo: nullptr, l: g("he_IL.UTF-8"), b: t);
309#ifndef BOOST_LOCALE_NO_CXX20_STRING8
310 std::cout << " char8_t" << std::endl;
311 run_word<char8_t>(all, first, second, nullptr, nullptr, nullptr, g("he_IL.UTF-8"), t);
312#endif
313#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
314 std::cout << " char16_t" << std::endl;
315 run_word<char16_t>(all, first, second, nullptr, nullptr, nullptr, g("he_IL.UTF-8"), t);
316#endif
317#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
318 std::cout << " char32_t" << std::endl;
319 run_word<char32_t>(all, first, second, nullptr, nullptr, nullptr, g("he_IL.UTF-8"), t);
320#endif
321}
322
323void word_boundary()
324{
325 boost::locale::generator g;
326 int zero[25] = {0};
327 std::string txt_empty[] = {""};
328
329 // clang-format off
330 std::string txt_simple[] = {" ","Hello",",","World","!"," ",""};
331 int none_simple[] = { 1, 0, 1, 0, 1, 1, 0};
332 int word_simple[] = { 0, 1, 0, 1, 0, 0, 0};
333
334 std::string txt_all[] = {"10"," ","Hello"," ","Windows7"," ","He22o"," ","平仮名","アヒル",""};
335 int none_all[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0};
336#if U_ICU_VERSION_MAJOR_NUM >= 62
337 // ICU 62+ returns only the number classification if there is a number at the boundary
338 int num_all[] = { 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0};
339 int word_all[] = { 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0};
340#else
341 // ICU < 62 combines the word and number classification if there is a number at the boundary
342 int num_all[] = { 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}; // LCOV_EXCL_LINE
343 int word_all[] = { 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0}; // LCOV_EXCL_LINE
344#endif
345#if U_ICU_VERSION_MAJOR_NUM >= 50
346 int kana_all[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
347 int ideo_all[] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
348#else
349 int kana_all[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}; // LCOV_EXCL_LINE
350 int ideo_all[] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}; // LCOV_EXCL_LINE
351#endif
352 // clang-format on
353
354 std::cout << " char UTF-8" << std::endl;
355 const std::locale utf8_en_locale = g("en_US.UTF-8");
356 const std::locale utf8_jp_locale = g("ja_JP.UTF-8");
357 run_word<char>(original: txt_empty, none: zero, num: zero, word: zero, kana: zero, ideo: zero, l: utf8_en_locale);
358 run_word<char>(original: txt_simple, none: none_simple, num: zero, word: word_simple, kana: zero, ideo: zero, l: utf8_en_locale);
359 run_word<char>(original: txt_all, none: none_all, num: num_all, word: word_all, kana: kana_all, ideo: ideo_all, l: utf8_jp_locale);
360
361 std::cout << " char Shift-JIS" << std::endl;
362 const std::locale sjis_jp_locale = g("ja_JP.SJIS");
363 run_word<char>(original: txt_empty, none: zero, num: zero, word: zero, kana: zero, ideo: zero, l: sjis_jp_locale);
364 run_word<char>(original: txt_simple, none: none_simple, num: zero, word: word_simple, kana: zero, ideo: zero, l: sjis_jp_locale);
365 run_word<char>(original: txt_all, none: none_all, num: num_all, word: word_all, kana: kana_all, ideo: ideo_all, l: sjis_jp_locale);
366
367 std::cout << " wchar_t" << std::endl;
368 run_word<wchar_t>(original: txt_empty, none: zero, num: zero, word: zero, kana: zero, ideo: zero, l: utf8_en_locale);
369 run_word<wchar_t>(original: txt_simple, none: none_simple, num: zero, word: word_simple, kana: zero, ideo: zero, l: utf8_en_locale);
370 run_word<wchar_t>(original: txt_all, none: none_all, num: num_all, word: word_all, kana: kana_all, ideo: ideo_all, l: utf8_jp_locale);
371
372#ifndef BOOST_LOCALE_NO_CXX20_STRING8
373 std::cout << " char8_t" << std::endl;
374 run_word<char8_t>(txt_empty, zero, zero, zero, zero, zero, g("ja_JP.UTF-8"));
375 run_word<char8_t>(txt_simple, none_simple, zero, word_simple, zero, zero, utf8_en_locale);
376 run_word<char8_t>(txt_all, none_all, num_all, word_all, kana_all, ideo_all, utf8_jp_locale);
377#endif
378
379#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
380 std::cout << " char16_t" << std::endl;
381 run_word<char16_t>(txt_empty, zero, zero, zero, zero, zero, g("ja_JP.UTF-8"));
382 run_word<char16_t>(txt_simple, none_simple, zero, word_simple, zero, zero, utf8_en_locale);
383 run_word<char16_t>(txt_all, none_all, num_all, word_all, kana_all, ideo_all, utf8_jp_locale);
384#endif
385
386#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
387 std::cout << " char32_t" << std::endl;
388 run_word<char32_t>(txt_empty, zero, zero, zero, zero, zero, g("ja_JP.UTF-8"));
389 run_word<char32_t>(txt_simple, none_simple, zero, word_simple, zero, zero, utf8_en_locale);
390 run_word<char32_t>(txt_all, none_all, num_all, word_all, kana_all, ideo_all, utf8_jp_locale);
391#endif
392}
393void test_op_one_side(const std::string& sl, const std::string& sr, int val)
394{
395 boost::locale::boundary::ssegment l(sl.begin(), sl.end(), 0), r(sr.begin(), sr.end(), 0);
396
397#if BOOST_LOCALE_SPACESHIP_NULLPTR_WARNING
398# pragma clang diagnostic push
399# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
400#endif
401
402 // segment
403 TEST_EQ((l == r), (val == 0));
404 TEST_EQ((l != r), (val != 0));
405 TEST_EQ((l <= r), (val <= 0));
406 TEST_EQ((l < r), (val < 0));
407 TEST_EQ((l >= r), (val >= 0));
408 TEST_EQ((l > r), (val > 0));
409
410 // C string
411 TEST_EQ((l == sr.c_str()), (val == 0));
412 TEST_EQ((l != sr.c_str()), (val != 0));
413 TEST_EQ((l <= sr.c_str()), (val <= 0));
414 TEST_EQ((l < sr.c_str()), (val < 0));
415 TEST_EQ((l >= sr.c_str()), (val >= 0));
416 TEST_EQ((l > sr.c_str()), (val > 0));
417
418 TEST_EQ((sl.c_str() == r), (val == 0));
419 TEST_EQ((sl.c_str() != r), (val != 0));
420 TEST_EQ((sl.c_str() <= r), (val <= 0));
421 TEST_EQ((sl.c_str() < r), (val < 0));
422 TEST_EQ((sl.c_str() >= r), (val >= 0));
423 TEST_EQ((sl.c_str() > r), (val > 0));
424
425 // C++ string
426 TEST_EQ((l == sr), (val == 0));
427 TEST_EQ((l != sr), (val != 0));
428 TEST_EQ((l <= sr), (val <= 0));
429 TEST_EQ((l < sr), (val < 0));
430 TEST_EQ((l >= sr), (val >= 0));
431 TEST_EQ((l > sr), (val > 0));
432
433 TEST_EQ((sl == r), (val == 0));
434 TEST_EQ((sl != r), (val != 0));
435 TEST_EQ((sl <= r), (val <= 0));
436 TEST_EQ((sl < r), (val < 0));
437 TEST_EQ((sl >= r), (val >= 0));
438 TEST_EQ((sl > r), (val > 0));
439 // self check
440 TEST_EQ((sl == sr), (val == 0));
441 TEST_EQ((sl != sr), (val != 0));
442 TEST_EQ((sl <= sr), (val <= 0));
443 TEST_EQ((sl < sr), (val < 0));
444 TEST_EQ((sl >= sr), (val >= 0));
445 TEST_EQ((sl > sr), (val > 0));
446
447#if BOOST_LOCALE_SPACESHIP_NULLPTR_WARNING
448# pragma clang diagnostic pop
449#endif
450}
451
452void test_op(const std::string& sl, const std::string& sr, int val)
453{
454 test_op_one_side(sl, sr, val);
455 test_op_one_side(sl: sr, sr: sl, val: -val);
456}
457void segment_operator()
458{
459 test_op(sl: "", sr: "a", val: -1);
460 test_op(sl: "", sr: "", val: 0);
461 test_op(sl: "aa", sr: "aaa", val: -1);
462 test_op(sl: "aa", sr: "ab", val: -1);
463}
464
465BOOST_LOCALE_DISABLE_UNREACHABLE_CODE_WARNING
466void test_main(int /*argc*/, char** /*argv*/)
467{
468#ifndef BOOST_LOCALE_NO_STD_BACKEND
469 {
470 namespace bl = boost::locale;
471 const bl::localization_backend_manager orig_backend = bl::localization_backend_manager::global();
472 bl::localization_backend_manager tmp_backend = bl::localization_backend_manager::global();
473 tmp_backend.select(backend_name: "std");
474 bl::localization_backend_manager::global(tmp_backend);
475
476 bl::generator g;
477 const std::string text = "To be or not to be, that is the question.";
478 // Std backend doesn't support segmentation, expect reasonable error
479 TEST_THROWS(bl::boundary::ssegment_index map(bl::boundary::word, text.begin(), text.end(), g("en_US.UTF-8")),
480 std::runtime_error);
481 bl::localization_backend_manager::global(orig_backend);
482 }
483#endif
484#ifndef BOOST_LOCALE_WITH_ICU
485 std::cout << "ICU is not build... Skipping\n";
486 return;
487#endif // !BOOST_LOCALE_WITH_ICU
488 std::cout << "Testing segment operators" << std::endl;
489 segment_operator();
490 std::cout << "Testing word boundary" << std::endl;
491 word_boundary();
492 std::cout << "Testing character boundary" << std::endl;
493 test_boundaries(all: character, first: nones, second: nullptr, t: lb::character);
494 std::cout << "Testing sentence boundary" << std::endl;
495 test_boundaries(all: sentence1, first: sentence1a, second: sentence1b, t: lb::sentence);
496 std::cout << "Testing line boundary" << std::endl;
497 test_boundaries(all: line1, first: line1a, second: line1b, t: lb::line);
498}
499
500// boostinspect:noascii
501

// Source: boost/libs/locale/test/test_boundary.cpp