1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
3 | // |
4 | // Distributed under the Boost Software License, Version 1.0. |
5 | // https://www.boost.org/LICENSE_1_0.txt |
6 | |
7 | #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED |
8 | #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED |
9 | |
10 | #include <boost/locale/boundary/boundary_point.hpp> |
11 | #include <boost/locale/boundary/facets.hpp> |
12 | #include <boost/locale/boundary/segment.hpp> |
13 | #include <boost/locale/boundary/types.hpp> |
14 | #include <boost/iterator/iterator_facade.hpp> |
15 | #include <algorithm> |
16 | #include <cstdint> |
17 | #include <iterator> |
18 | #include <locale> |
19 | #include <memory> |
20 | #include <stdexcept> |
21 | #include <string> |
22 | #include <type_traits> |
23 | #include <vector> |
24 | |
25 | #ifdef BOOST_MSVC |
26 | # pragma warning(push) |
27 | # pragma warning(disable : 4275 4251 4231 4660) |
28 | #endif |
29 | |
30 | namespace boost { namespace locale { namespace boundary { |
31 | /// |
32 | /// \defgroup boundary Boundary Analysis |
33 | /// |
34 | /// This module contains all operations required for %boundary analysis of text: character, word, line and sentence |
35 | /// boundaries |
36 | /// |
37 | /// @{ |
38 | /// |
39 | |
40 | /// \cond INTERNAL |
41 | |
42 | namespace detail { |
43 | template<typename Char> |
44 | const boundary_indexing<Char>& get_boundary_indexing(const std::locale& l) |
45 | { |
46 | using facet_type = boundary_indexing<Char>; |
47 | if(!std::has_facet<facet_type>(l)) |
48 | throw std::runtime_error("Locale was generated without segmentation support!" ); |
49 | return std::use_facet<facet_type>(l); |
50 | } |
51 | |
52 | template<typename IteratorType, |
53 | typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category> |
54 | struct mapping_traits { |
55 | typedef typename std::iterator_traits<IteratorType>::value_type char_type; |
56 | static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l) |
57 | { |
58 | std::basic_string<char_type> str(b, e); |
59 | return get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size()); |
60 | } |
61 | }; |
62 | |
/// Compile-time test whether \a SomeIteratorType is one of the iterator types that are treated
/// as addressing contiguous \a CharType storage: raw pointers and the (const) iterators of
/// std::basic_string<CharType> and std::vector<CharType>.
template<typename CharType, typename SomeIteratorType>
struct linear_iterator_traits {
private:
    using string_type = std::basic_string<CharType>;
    using vector_type = std::vector<CharType>;

    static constexpr bool is_raw_pointer =
      std::is_same<SomeIteratorType, CharType*>::value || std::is_same<SomeIteratorType, const CharType*>::value;
    static constexpr bool is_string_iterator =
      std::is_same<SomeIteratorType, typename string_type::iterator>::value
      || std::is_same<SomeIteratorType, typename string_type::const_iterator>::value;
    static constexpr bool is_vector_iterator =
      std::is_same<SomeIteratorType, typename vector_type::iterator>::value
      || std::is_same<SomeIteratorType, typename vector_type::const_iterator>::value;

public:
    /// true when the iterator is one of the types listed above
    static constexpr bool is_linear = is_raw_pointer || is_string_iterator || is_vector_iterator;
};
72 | |
73 | template<typename IteratorType> |
74 | struct mapping_traits<IteratorType, std::random_access_iterator_tag> { |
75 | typedef typename std::iterator_traits<IteratorType>::value_type char_type; |
76 | |
77 | static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l) |
78 | { |
79 | index_type result; |
80 | |
81 | // Optimize for most common cases |
82 | // |
83 | // C++11 requires that string is continuous in memory and all known |
84 | // string implementations do this because of c_str() support. |
85 | |
86 | if(linear_iterator_traits<char_type, IteratorType>::is_linear && b != e) { |
87 | const char_type* begin = &*b; |
88 | const char_type* end = begin + (e - b); |
89 | index_type tmp = get_boundary_indexing<char_type>(l).map(t, begin, end); |
90 | result.swap(x&: tmp); |
91 | } else { |
92 | std::basic_string<char_type> str(b, e); |
93 | index_type tmp = get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size()); |
94 | result.swap(x&: tmp); |
95 | } |
96 | return result; |
97 | } |
98 | }; |
99 | |
100 | template<typename BaseIterator> |
101 | class mapping { |
102 | public: |
103 | typedef BaseIterator base_iterator; |
104 | typedef typename std::iterator_traits<base_iterator>::value_type char_type; |
105 | |
106 | mapping(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc) : |
107 | index_(new index_type()), begin_(begin), end_(end) |
108 | { |
109 | index_type idx = detail::mapping_traits<base_iterator>::map(type, begin, end, loc); |
110 | index_->swap(x&: idx); |
111 | } |
112 | |
113 | mapping() {} |
114 | |
115 | const index_type& index() const { return *index_; } |
116 | |
117 | base_iterator begin() const { return begin_; } |
118 | |
119 | base_iterator end() const { return end_; } |
120 | |
121 | private: |
122 | std::shared_ptr<index_type> index_; |
123 | base_iterator begin_, end_; |
124 | }; |
125 | |
126 | template<typename BaseIterator> |
127 | class segment_index_iterator : public boost::iterator_facade<segment_index_iterator<BaseIterator>, |
128 | segment<BaseIterator>, |
129 | boost::bidirectional_traversal_tag, |
130 | const segment<BaseIterator>&> { |
131 | public: |
132 | typedef BaseIterator base_iterator; |
133 | typedef mapping<base_iterator> mapping_type; |
134 | typedef segment<base_iterator> segment_type; |
135 | |
136 | segment_index_iterator() : current_(0, 0), map_(nullptr), mask_(0), full_select_(false) {} |
137 | |
138 | segment_index_iterator(base_iterator p, const mapping_type* map, rule_type mask, bool full_select) : |
139 | map_(map), mask_(mask), full_select_(full_select) |
140 | { |
141 | set(p); |
142 | } |
143 | segment_index_iterator(bool is_begin, const mapping_type* map, rule_type mask, bool full_select) : |
144 | map_(map), mask_(mask), full_select_(full_select) |
145 | { |
146 | if(is_begin) |
147 | set_begin(); |
148 | else |
149 | set_end(); |
150 | } |
151 | |
152 | const segment_type& dereference() const { return value_; } |
153 | |
154 | bool equal(const segment_index_iterator& other) const |
155 | { |
156 | return map_ == other.map_ && current_.second == other.current_.second; |
157 | } |
158 | |
159 | void increment() |
160 | { |
161 | std::pair<size_t, size_t> next = current_; |
162 | if(full_select_) { |
163 | next.first = next.second; |
164 | while(next.second < size()) { |
165 | next.second++; |
166 | if(valid_offset(offset: next.second)) |
167 | break; |
168 | } |
169 | if(next.second == size()) |
170 | next.first = next.second - 1; |
171 | } else { |
172 | while(next.second < size()) { |
173 | next.first = next.second; |
174 | next.second++; |
175 | if(valid_offset(offset: next.second)) |
176 | break; |
177 | } |
178 | } |
179 | update_current(pos: next); |
180 | } |
181 | |
182 | void decrement() |
183 | { |
184 | std::pair<size_t, size_t> next = current_; |
185 | if(full_select_) { |
186 | while(next.second > 1) { |
187 | next.second--; |
188 | if(valid_offset(offset: next.second)) |
189 | break; |
190 | } |
191 | next.first = next.second; |
192 | while(next.first > 0) { |
193 | next.first--; |
194 | if(valid_offset(offset: next.first)) |
195 | break; |
196 | } |
197 | } else { |
198 | while(next.second > 1) { |
199 | next.second--; |
200 | if(valid_offset(offset: next.second)) |
201 | break; |
202 | } |
203 | next.first = next.second - 1; |
204 | } |
205 | update_current(pos: next); |
206 | } |
207 | |
208 | private: |
209 | void set_end() |
210 | { |
211 | current_.first = size() - 1; |
212 | current_.second = size(); |
213 | value_ = segment_type(map_->end(), map_->end(), 0); |
214 | } |
215 | void set_begin() |
216 | { |
217 | current_.first = current_.second = 0; |
218 | value_ = segment_type(map_->begin(), map_->begin(), 0); |
219 | increment(); |
220 | } |
221 | |
222 | void set(base_iterator p) |
223 | { |
224 | const auto b = map_->index().begin(), e = map_->index().end(); |
225 | auto boundary_point = std::upper_bound(b, e, break_info(std::distance(map_->begin(), p))); |
226 | while(boundary_point != e && (boundary_point->rule & mask_) == 0) |
227 | ++boundary_point; |
228 | |
229 | current_.first = current_.second = boundary_point - b; |
230 | |
231 | if(full_select_) { |
232 | while(current_.first > 0) { |
233 | current_.first--; |
234 | if(valid_offset(offset: current_.first)) |
235 | break; |
236 | } |
237 | } else { |
238 | if(current_.first > 0) |
239 | current_.first--; |
240 | } |
241 | value_.first = map_->begin(); |
242 | std::advance(value_.first, get_offset(ind: current_.first)); |
243 | value_.second = value_.first; |
244 | std::advance(value_.second, get_offset(ind: current_.second) - get_offset(ind: current_.first)); |
245 | |
246 | update_rule(); |
247 | } |
248 | |
249 | void update_current(std::pair<size_t, size_t> pos) |
250 | { |
251 | std::ptrdiff_t first_diff = get_offset(ind: pos.first) - get_offset(ind: current_.first); |
252 | std::ptrdiff_t second_diff = get_offset(ind: pos.second) - get_offset(ind: current_.second); |
253 | std::advance(value_.first, first_diff); |
254 | std::advance(value_.second, second_diff); |
255 | current_ = pos; |
256 | update_rule(); |
257 | } |
258 | |
259 | void update_rule() |
260 | { |
261 | if(current_.second != size()) |
262 | value_.rule(index()[current_.second].rule); |
263 | } |
264 | size_t get_offset(size_t ind) const |
265 | { |
266 | if(ind == size()) |
267 | return index().back().offset; |
268 | return index()[ind].offset; |
269 | } |
270 | |
271 | bool valid_offset(size_t offset) const |
272 | { |
273 | return offset == 0 || offset == size() // make sure we not acess index[size] |
274 | || (index()[offset].rule & mask_) != 0; |
275 | } |
276 | |
277 | size_t size() const { return index().size(); } |
278 | |
279 | const index_type& index() const { return map_->index(); } |
280 | |
281 | segment_type value_; |
282 | std::pair<size_t, size_t> current_; |
283 | const mapping_type* map_; |
284 | rule_type mask_; |
285 | bool full_select_; |
286 | }; |
287 | |
288 | template<typename BaseIterator> |
289 | class boundary_point_index_iterator : public boost::iterator_facade<boundary_point_index_iterator<BaseIterator>, |
290 | boundary_point<BaseIterator>, |
291 | boost::bidirectional_traversal_tag, |
292 | const boundary_point<BaseIterator>&> { |
293 | public: |
294 | typedef BaseIterator base_iterator; |
295 | typedef mapping<base_iterator> mapping_type; |
296 | typedef boundary_point<base_iterator> boundary_point_type; |
297 | |
298 | boundary_point_index_iterator() : current_(0), map_(nullptr), mask_(0) {} |
299 | |
300 | boundary_point_index_iterator(bool is_begin, const mapping_type* map, rule_type mask) : |
301 | map_(map), mask_(mask) |
302 | { |
303 | if(is_begin) |
304 | set_begin(); |
305 | else |
306 | set_end(); |
307 | } |
308 | boundary_point_index_iterator(base_iterator p, const mapping_type* map, rule_type mask) : |
309 | map_(map), mask_(mask) |
310 | { |
311 | set(p); |
312 | } |
313 | |
314 | const boundary_point_type& dereference() const { return value_; } |
315 | |
316 | bool equal(const boundary_point_index_iterator& other) const |
317 | { |
318 | return map_ == other.map_ && current_ == other.current_; |
319 | } |
320 | |
321 | void increment() |
322 | { |
323 | size_t next = current_; |
324 | while(next < size()) { |
325 | next++; |
326 | if(valid_offset(offset: next)) |
327 | break; |
328 | } |
329 | update_current(pos: next); |
330 | } |
331 | |
332 | void decrement() |
333 | { |
334 | size_t next = current_; |
335 | while(next > 0) { |
336 | next--; |
337 | if(valid_offset(offset: next)) |
338 | break; |
339 | } |
340 | update_current(pos: next); |
341 | } |
342 | |
343 | private: |
344 | void set_end() |
345 | { |
346 | current_ = size(); |
347 | value_ = boundary_point_type(map_->end(), 0); |
348 | } |
349 | void set_begin() |
350 | { |
351 | current_ = 0; |
352 | value_ = boundary_point_type(map_->begin(), 0); |
353 | } |
354 | |
355 | void set(base_iterator p) |
356 | { |
357 | size_t dist = std::distance(map_->begin(), p); |
358 | |
359 | const auto b = index().begin(), e = index().end(); |
360 | const auto ptr = std::lower_bound(b, e, break_info(dist)); |
361 | |
362 | if(ptr == e) |
363 | current_ = size() - 1; |
364 | else |
365 | current_ = ptr - b; |
366 | |
367 | while(!valid_offset(offset: current_)) |
368 | current_++; |
369 | |
370 | std::ptrdiff_t diff = get_offset(ind: current_) - dist; |
371 | std::advance(p, diff); |
372 | value_.iterator(p); |
373 | update_rule(); |
374 | } |
375 | |
376 | void update_current(size_t pos) |
377 | { |
378 | std::ptrdiff_t diff = get_offset(ind: pos) - get_offset(ind: current_); |
379 | base_iterator i = value_.iterator(); |
380 | std::advance(i, diff); |
381 | current_ = pos; |
382 | value_.iterator(i); |
383 | update_rule(); |
384 | } |
385 | |
386 | void update_rule() |
387 | { |
388 | if(current_ != size()) |
389 | value_.rule(index()[current_].rule); |
390 | } |
391 | size_t get_offset(size_t ind) const |
392 | { |
393 | if(ind == size()) |
394 | return index().back().offset; |
395 | return index()[ind].offset; |
396 | } |
397 | |
398 | bool valid_offset(size_t offset) const |
399 | { |
400 | return offset == 0 || offset + 1 >= size() // last and first are always valid regardless of mark |
401 | || (index()[offset].rule & mask_) != 0; |
402 | } |
403 | |
404 | size_t size() const { return index().size(); } |
405 | |
406 | const index_type& index() const { return map_->index(); } |
407 | |
408 | boundary_point_type value_; |
409 | size_t current_; |
410 | const mapping_type* map_; |
411 | rule_type mask_; |
412 | }; |
413 | |
414 | } // namespace detail |
415 | |
416 | /// \endcond |
417 | |
418 | template<typename BaseIterator> |
419 | class segment_index; |
420 | |
421 | template<typename BaseIterator> |
422 | class boundary_point_index; |
423 | |
424 | /// \brief This class holds an index of segments in the text range and allows to iterate over them |
425 | /// |
/// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
427 | /// to the \ref segment objects. |
428 | /// |
/// It provides two options that control how segments are selected:
430 | /// |
431 | /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to |
432 | /// various masks %as \ref word_any. |
433 | /// \n |
434 | /// The default is to select any types of boundaries. |
435 | /// \n |
436 | /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators |
437 | /// would iterate only over the words containing Kana letters and \ref word_any would select all types of |
438 | /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text |
439 | /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", |
440 | /// instead of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?". |
441 | /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous |
442 | /// %boundary point does not fit the selected rule. |
443 | /// \n |
444 | /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?". |
445 | /// \n |
446 | /// This text contains three %boundary points separating it to sentences by different rules: |
447 | /// - The exclamation mark "!" ends the sentence "Hello!" |
448 | /// - The line feed that splits the sentence "How\nare you?" into two parts. |
449 | /// - The question mark that ends the second sentence. |
450 | /// \n |
/// If you only change the \ref rule() to \ref sentence_term then the segment_index would
/// provide two sentences "Hello!" and "are you?" %as only they are actually terminated with the required
/// terminator "!" or "?". But after changing \ref full_select() to true, the selected segment would include
/// all the text up to the previous valid %boundary point and would return the two expected sentences:
/// "Hello!" and "How\nare you?".
456 | /// |
457 | /// This class allows to find a segment according to the given iterator in range using \ref find() member |
458 | /// function. |
459 | /// |
460 | /// \note |
461 | /// |
/// - Changing any of the options - \ref rule() or \ref full_select() - and of course re-indexing the text
///   invalidates existing iterators and they can't be used any more.
/// - segment_index can be created from boundary_point_index or another segment_index that was created with
///   the same \ref boundary_type. This is a very fast operation %as they share the same index
///   and it does not require its regeneration.
467 | /// |
468 | /// \see |
469 | /// |
470 | /// - \ref boundary_point_index |
471 | /// - \ref segment |
472 | /// - \ref boundary_point |
473 | |
474 | template<typename BaseIterator> |
475 | class segment_index { |
476 | public: |
477 | /// The type of the iterator used to iterate over the original text |
478 | typedef BaseIterator base_iterator; |
479 | |
480 | #ifdef BOOST_LOCALE_DOXYGEN |
481 | /// The bidirectional iterator that iterates over \ref value_type objects. |
482 | /// |
483 | /// - The iterators may be invalidated by use of any non-const member function |
484 | /// including but not limited to \ref rule(rule_type) and \ref full_select(bool). |
485 | /// - The returned value_type object is valid %as long %as iterator points to it. |
486 | /// So this following code is wrong %as t used after p was updated: |
487 | /// \code |
488 | /// segment_index<some_iterator>::iterator p=index.begin(); |
489 | /// segment<some_iterator> &t = *p; |
490 | /// ++p; |
491 | /// std::cout << t.str() << std::endl; |
492 | /// \endcode |
493 | typedef unspecified_iterator_type iterator; |
494 | /// \copydoc iterator |
495 | typedef unspecified_iterator_type const_iterator; |
496 | #else |
497 | typedef detail::segment_index_iterator<base_iterator> iterator; |
498 | typedef detail::segment_index_iterator<base_iterator> const_iterator; |
499 | #endif |
500 | /// The type dereferenced by the \ref iterator and \ref const_iterator. It is |
501 | /// an object that represents selected segment. |
502 | typedef segment<base_iterator> value_type; |
503 | |
504 | /// Default constructor. |
505 | /// |
506 | /// \note |
507 | /// |
508 | /// When this object is constructed by default it does not include a valid index, thus |
509 | /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined |
510 | /// behavior |
511 | segment_index() : mask_(0xFFFFFFFFu), full_select_(false) {} |
512 | /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
513 | /// in range [begin,end) using a rule \a mask for locale \a loc. |
514 | segment_index(boundary_type type, |
515 | base_iterator begin, |
516 | base_iterator end, |
517 | rule_type mask, |
518 | const std::locale& loc = std::locale()) : |
519 | map_(type, begin, end, loc), |
520 | mask_(mask), full_select_(false) |
521 | {} |
522 | /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
523 | /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc. |
524 | segment_index(boundary_type type, |
525 | base_iterator begin, |
526 | base_iterator end, |
527 | const std::locale& loc = std::locale()) : |
528 | map_(type, begin, end, loc), |
529 | mask_(0xFFFFFFFFu), full_select_(false) |
530 | {} |
531 | |
532 | /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information |
533 | /// and used default rule (all possible segments) |
534 | /// |
535 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text |
536 | /// range it is much better to create one from another rather then indexing the same |
537 | /// range twice. |
538 | /// |
539 | /// \note \ref rule() flags are not copied |
540 | segment_index(const boundary_point_index<base_iterator>&); |
541 | |
542 | /// Copy an index from a \ref boundary_point_index. It copies all indexing information |
543 | /// and uses the default rule (all possible segments) |
544 | /// |
545 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text |
546 | /// range it is much better to create one from another rather then indexing the same |
547 | /// range twice. |
548 | /// |
549 | /// \note \ref rule() flags are not copied |
550 | segment_index& operator=(const boundary_point_index<base_iterator>&); |
551 | |
552 | /// Create a new index for %boundary analysis \ref boundary_type "type" of the text |
553 | /// in range [begin,end) for locale \a loc. |
554 | /// |
555 | /// \note \ref rule() and \ref full_select() remain unchanged. |
556 | void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale()) |
557 | { |
558 | map_ = mapping_type(type, begin, end, loc); |
559 | } |
560 | |
561 | /// Get the \ref iterator on the beginning of the segments range. |
562 | /// |
563 | /// Preconditions: the segment_index should have a mapping |
564 | /// |
565 | /// \note |
566 | /// |
567 | /// The returned iterator is invalidated by access to any non-const member functions of this object |
568 | iterator begin() const |
569 | { |
570 | return iterator(true, &map_, mask_, full_select_); |
571 | } |
572 | |
573 | /// Get the \ref iterator on the ending of the segments range. |
574 | /// |
575 | /// Preconditions: the segment_index should have a mapping |
576 | /// |
577 | /// The returned iterator is invalidated by access to any non-const member functions of this object |
578 | iterator end() const |
579 | { |
580 | return iterator(false, &map_, mask_, full_select_); |
581 | } |
582 | |
583 | /// Find a first valid segment following a position \a p. |
584 | /// |
585 | /// If \a p is inside a valid segment this segment is selected: |
586 | /// |
587 | /// For example: For \ref word %boundary analysis with \ref word_any rule(): |
588 | /// |
589 | /// - "to| be or ", would point to "be", |
590 | /// - "t|o be or ", would point to "to", |
591 | /// - "to be or| ", would point to end. |
592 | /// |
593 | /// |
594 | /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator |
595 | /// to the text in the mapped range. |
596 | /// |
597 | /// The returned iterator is invalidated by access to any non-const member functions of this object |
598 | iterator find(base_iterator p) const |
599 | { |
600 | return iterator(p, &map_, mask_, full_select_); |
601 | } |
602 | |
603 | /// Get the mask of rules that are used |
604 | rule_type rule() const |
605 | { |
606 | return mask_; |
607 | } |
608 | /// Set the mask of rules that are used |
609 | void rule(rule_type v) |
610 | { |
611 | mask_ = v; |
612 | } |
613 | |
614 | /// Get the full_select property value - should segment include in the range |
615 | /// values that not belong to specific \ref rule() or not. |
616 | /// |
617 | /// The default value is false. |
618 | /// |
619 | /// For example for \ref sentence %boundary with rule \ref sentence_term the segments |
620 | /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false |
621 | /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() |
622 | /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the |
623 | /// following part "are you?" |
624 | bool full_select() const |
625 | { |
626 | return full_select_; |
627 | } |
628 | |
629 | /// Set the full_select property value - should segment include in the range |
630 | /// values that not belong to specific \ref rule() or not. |
631 | /// |
632 | /// The default value is false. |
633 | /// |
634 | /// For example for \ref sentence %boundary with rule \ref sentence_term the segments |
635 | /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false |
636 | /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() |
637 | /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the |
638 | /// following part "are you?" |
639 | void full_select(bool v) |
640 | { |
641 | full_select_ = v; |
642 | } |
643 | |
644 | private: |
645 | friend class boundary_point_index<base_iterator>; |
646 | typedef detail::mapping<base_iterator> mapping_type; |
647 | mapping_type map_; |
648 | rule_type mask_; |
649 | bool full_select_; |
650 | }; |
651 | |
652 | /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating |
653 | /// over them. |
654 | /// |
/// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
656 | /// to the \ref boundary_point objects. |
657 | /// |
658 | /// It provides an option that affects selecting %boundary points according to different rules: |
659 | /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific |
660 | /// types of %boundary points like \ref sentence_term. |
661 | /// |
662 | /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default |
663 | /// rule is used the %boundary points would be: |
664 | /// |
665 | /// - "|Hello! How\nare you?" |
666 | /// - "Hello! |How\nare you?" |
667 | /// - "Hello! How\n|are you?" |
668 | /// - "Hello! How\nare you?|" |
669 | /// |
670 | /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be: |
671 | /// |
672 | /// - "|Hello! How\nare you?" |
673 | /// - "Hello! |How\nare you?" |
674 | /// - "Hello! How\nare you?|" |
675 | /// |
676 | /// Such that a %boundary point defined by a line feed character would be ignored. |
677 | /// |
678 | /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member |
679 | /// function. |
680 | /// |
681 | /// \note |
/// - Even an empty text range [x,x) is considered to have one %boundary point x.
/// - \a a and \a b points of the range [a,b) are always considered %boundary points
///   regardless of the rules used.
/// - Changing the \ref rule() option or, of course, re-indexing the text
///   invalidates existing iterators and they can't be used any more.
/// - boundary_point_index can be created from segment_index or another boundary_point_index that was created with
///   the same \ref boundary_type. This is a very fast operation %as they share the same index
///   and it does not require its regeneration.
690 | /// |
691 | /// \see |
692 | /// |
693 | /// - \ref segment_index |
694 | /// - \ref boundary_point |
695 | /// - \ref segment |
696 | template<typename BaseIterator> |
697 | class boundary_point_index { |
698 | public: |
699 | /// The type of the iterator used to iterate over the original text |
700 | typedef BaseIterator base_iterator; |
701 | |
702 | #ifdef BOOST_LOCALE_DOXYGEN |
703 | /// The bidirectional iterator that iterates over \ref value_type objects. |
704 | /// |
705 | /// - The iterators may be invalidated by use of any non-const member function |
706 | /// including but not limited to \ref rule(rule_type) member function. |
707 | /// - The returned value_type object is valid %as long %as iterator points to it. |
708 | /// So this following code is wrong %as t used after p was updated: |
709 | /// \code |
710 | /// boundary_point_index<some_iterator>::iterator p=index.begin(); |
711 | /// boundary_point<some_iterator> &t = *p; |
712 | /// ++p; |
713 | /// rule_type r = t->rule(); |
714 | /// \endcode |
715 | /// |
716 | typedef unspecified_iterator_type iterator; |
717 | /// \copydoc iterator |
718 | typedef unspecified_iterator_type const_iterator; |
719 | #else |
720 | typedef detail::boundary_point_index_iterator<base_iterator> iterator; |
721 | typedef detail::boundary_point_index_iterator<base_iterator> const_iterator; |
722 | #endif |
723 | /// The type dereferenced by the \ref iterator and \ref const_iterator. It is |
724 | /// an object that represents the selected \ref boundary_point "boundary point". |
725 | typedef boundary_point<base_iterator> value_type; |
726 | |
727 | /// Default constructor. |
728 | /// |
729 | /// \note |
730 | /// |
731 | /// When this object is constructed by default it does not include a valid index, thus |
732 | /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined |
733 | /// behavior |
734 | boundary_point_index() : mask_(0xFFFFFFFFu) {} |
735 | |
736 | /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
737 | /// in range [begin,end) using a rule \a mask for locale \a loc. |
738 | boundary_point_index(boundary_type type, |
739 | base_iterator begin, |
740 | base_iterator end, |
741 | rule_type mask, |
742 | const std::locale& loc = std::locale()) : |
743 | map_(type, begin, end, loc), |
744 | mask_(mask) |
745 | {} |
746 | /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
747 | /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc. |
748 | boundary_point_index(boundary_type type, |
749 | base_iterator begin, |
750 | base_iterator end, |
751 | const std::locale& loc = std::locale()) : |
752 | map_(type, begin, end, loc), |
753 | mask_(0xFFFFFFFFu) |
754 | {} |
755 | |
756 | /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information |
757 | /// and uses the default rule (all possible %boundary points) |
758 | /// |
759 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text |
760 | /// range it is much better to create one from another rather then indexing the same |
761 | /// range twice. |
762 | /// |
763 | /// \note \ref rule() flags are not copied |
764 | boundary_point_index(const segment_index<base_iterator>& other); |
765 | /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information |
766 | /// and keeps the current \ref rule() unchanged |
767 | /// |
768 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text |
769 | /// range it is much better to create one from another rather then indexing the same |
770 | /// range twice. |
771 | /// |
772 | /// \note \ref rule() flags are not copied |
773 | boundary_point_index& operator=(const segment_index<base_iterator>& other); |
774 | |
775 | /// Create a new index for %boundary analysis \ref boundary_type "type" of the text |
776 | /// in range [begin,end) for locale \a loc. |
777 | /// |
778 | /// \note \ref rule() remains unchanged. |
779 | void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale()) |
780 | { |
781 | map_ = mapping_type(type, begin, end, loc); |
782 | } |
783 | |
784 | /// Get the \ref iterator on the beginning of the %boundary points range. |
785 | /// |
786 | /// Preconditions: this boundary_point_index should have a mapping |
787 | /// |
788 | /// \note |
789 | /// |
790 | /// The returned iterator is invalidated by access to any non-const member functions of this object |
791 | iterator begin() const |
792 | { |
793 | return iterator(true, &map_, mask_); |
794 | } |
795 | |
796 | /// Get the \ref iterator on the ending of the %boundary points range. |
797 | /// |
798 | /// Preconditions: this boundary_point_index should have a mapping |
799 | /// |
800 | /// \note |
801 | /// |
802 | /// The returned iterator is invalidated by access to any non-const member functions of this object |
803 | iterator end() const |
804 | { |
805 | return iterator(false, &map_, mask_); |
806 | } |
807 | |
808 | /// Find a first valid %boundary point on a position \a p or following it. |
809 | /// |
810 | /// For example: For \ref word %boundary analysis of the text "to be or" |
811 | /// |
812 | /// - "|to be", would return %boundary point at "|to be", |
813 | /// - "t|o be", would point to "to| be" |
814 | /// |
815 | /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator |
816 | /// to the text in the mapped range. |
817 | /// |
818 | /// The returned iterator is invalidated by access to any non-const member functions of this object |
819 | iterator find(base_iterator p) const |
820 | { |
821 | return iterator(p, &map_, mask_); |
822 | } |
823 | |
824 | /// Get the mask of rules that are used |
825 | rule_type rule() const |
826 | { |
827 | return mask_; |
828 | } |
829 | /// Set the mask of rules that are used |
830 | void rule(rule_type v) |
831 | { |
832 | mask_ = v; |
833 | } |
834 | |
835 | private: |
836 | friend class segment_index<base_iterator>; |
837 | typedef detail::mapping<base_iterator> mapping_type; |
838 | mapping_type map_; |
839 | rule_type mask_; |
840 | }; |
841 | |
842 | /// \cond INTERNAL |
843 | template<typename BaseIterator> |
844 | segment_index<BaseIterator>::segment_index(const boundary_point_index<BaseIterator>& other) : |
845 | map_(other.map_), mask_(0xFFFFFFFFu), full_select_(false) |
846 | {} |
847 | |
848 | template<typename BaseIterator> |
849 | boundary_point_index<BaseIterator>::boundary_point_index(const segment_index<BaseIterator>& other) : |
850 | map_(other.map_), mask_(0xFFFFFFFFu) |
851 | {} |
852 | |
853 | template<typename BaseIterator> |
854 | segment_index<BaseIterator>& segment_index<BaseIterator>::operator=(const boundary_point_index<BaseIterator>& other) |
855 | { |
856 | map_ = other.map_; |
857 | return *this; |
858 | } |
859 | |
860 | template<typename BaseIterator> |
861 | boundary_point_index<BaseIterator>& |
862 | boundary_point_index<BaseIterator>::operator=(const segment_index<BaseIterator>& other) |
863 | { |
864 | map_ = other.map_; |
865 | return *this; |
866 | } |
867 | /// \endcond |
868 | |
869 | typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef |
870 | typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef |
871 | #ifndef BOOST_LOCALE_NO_CXX20_STRING8 |
872 | typedef segment_index<std::u8string::const_iterator> u8ssegment_index; ///< convenience typedef |
873 | #endif |
874 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
875 | typedef segment_index<std::u16string::const_iterator> u16ssegment_index; ///< convenience typedef |
876 | #endif |
877 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
878 | typedef segment_index<std::u32string::const_iterator> u32ssegment_index; ///< convenience typedef |
879 | #endif |
880 | |
881 | typedef segment_index<const char*> csegment_index; ///< convenience typedef |
882 | typedef segment_index<const wchar_t*> wcsegment_index; ///< convenience typedef |
883 | #ifdef __cpp_char8_t |
884 | typedef segment_index<const char8_t*> u8csegment_index; ///< convenience typedef |
885 | #endif |
886 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
887 | typedef segment_index<const char16_t*> u16csegment_index; ///< convenience typedef |
888 | #endif |
889 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
890 | typedef segment_index<const char32_t*> u32csegment_index; ///< convenience typedef |
891 | #endif |
892 | |
893 | typedef boundary_point_index<std::string::const_iterator> sboundary_point_index; ///< convenience typedef |
894 | typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index; ///< convenience typedef |
895 | #ifndef BOOST_LOCALE_NO_CXX20_STRING8 |
896 | typedef boundary_point_index<std::u8string::const_iterator> u8sboundary_point_index; ///< convenience typedef |
897 | #endif |
898 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
899 | typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index; ///< convenience typedef |
900 | #endif |
901 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
902 | typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index; ///< convenience typedef |
903 | #endif |
904 | |
905 | typedef boundary_point_index<const char*> cboundary_point_index; ///< convenience typedef |
906 | typedef boundary_point_index<const wchar_t*> wcboundary_point_index; ///< convenience typedef |
907 | #ifdef __cpp_char8_t |
908 | typedef boundary_point_index<const char8_t*> u8cboundary_point_index; ///< convenience typedef |
909 | #endif |
910 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
911 | typedef boundary_point_index<const char16_t*> u16cboundary_point_index; ///< convenience typedef |
912 | #endif |
913 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
914 | typedef boundary_point_index<const char32_t*> u32cboundary_point_index; ///< convenience typedef |
915 | #endif |
916 | |
917 | }}} // namespace boost::locale::boundary |
918 | |
919 | /// |
920 | /// \example boundary.cpp |
921 | /// Example of using segment_index |
922 | /// \example wboundary.cpp |
923 | /// Example of using segment_index over wide strings |
924 | /// |
925 | |
926 | #ifdef BOOST_MSVC |
927 | # pragma warning(pop) |
928 | #endif |
929 | |
930 | #endif |
931 | |