1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12
13use crate::tables::grapheme::GraphemeCat;
14
15/// External iterator for grapheme clusters and byte offsets.
16///
17/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18/// trait. See its documentation for more.
19///
20/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22#[derive(Clone)]
23pub struct GraphemeIndices<'a> {
24 start_offset: usize,
25 iter: Graphemes<'a>,
26}
27
28impl<'a> GraphemeIndices<'a> {
29 #[inline]
30 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31 ///
32 /// ```rust
33 /// # use unicode_segmentation::UnicodeSegmentation;
34 /// let mut iter = "abc".grapheme_indices(true);
35 /// assert_eq!(iter.as_str(), "abc");
36 /// iter.next();
37 /// assert_eq!(iter.as_str(), "bc");
38 /// iter.next();
39 /// iter.next();
40 /// assert_eq!(iter.as_str(), "");
41 /// ```
42 pub fn as_str(&self) -> &'a str {
43 self.iter.as_str()
44 }
45}
46
47impl<'a> Iterator for GraphemeIndices<'a> {
48 type Item = (usize, &'a str);
49
50 #[inline]
51 fn next(&mut self) -> Option<(usize, &'a str)> {
52 self.iter
53 .next()
54 .map(|s: &str| (s.as_ptr() as usize - self.start_offset, s))
55 }
56
57 #[inline]
58 fn size_hint(&self) -> (usize, Option<usize>) {
59 self.iter.size_hint()
60 }
61}
62
63impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
64 #[inline]
65 fn next_back(&mut self) -> Option<(usize, &'a str)> {
66 self.iter
67 .next_back()
68 .map(|s: &str| (s.as_ptr() as usize - self.start_offset, s))
69 }
70}
71
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    // The string being segmented; both cursors hold byte offsets into it.
    string: &'a str,
    // Forward cursor: its offset is the start of the not-yet-yielded range.
    cursor: GraphemeCursor,
    // Backward cursor: its offset is the end of the not-yet-yielded range.
    cursor_back: GraphemeCursor,
}
86
87impl<'a> Graphemes<'a> {
88 #[inline]
89 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
90 ///
91 /// ```rust
92 /// # use unicode_segmentation::UnicodeSegmentation;
93 /// let mut iter = "abc".graphemes(true);
94 /// assert_eq!(iter.as_str(), "abc");
95 /// iter.next();
96 /// assert_eq!(iter.as_str(), "bc");
97 /// iter.next();
98 /// iter.next();
99 /// assert_eq!(iter.as_str(), "");
100 /// ```
101 pub fn as_str(&self) -> &'a str {
102 &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
103 }
104}
105
106impl<'a> Iterator for Graphemes<'a> {
107 type Item = &'a str;
108
109 #[inline]
110 fn size_hint(&self) -> (usize, Option<usize>) {
111 let slen: usize = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
112 (cmp::min(v1:slen, v2:1), Some(slen))
113 }
114
115 #[inline]
116 fn next(&mut self) -> Option<&'a str> {
117 let start: usize = self.cursor.cur_cursor();
118 if start == self.cursor_back.cur_cursor() {
119 return None;
120 }
121 let next: usize = self.cursor.next_boundary(self.string, chunk_start:0).unwrap().unwrap();
122 Some(&self.string[start..next])
123 }
124}
125
126impl<'a> DoubleEndedIterator for Graphemes<'a> {
127 #[inline]
128 fn next_back(&mut self) -> Option<&'a str> {
129 let end: usize = self.cursor_back.cur_cursor();
130 if end == self.cursor.cur_cursor() {
131 return None;
132 }
133 let prev: usize = self
134 .cursor_back
135 .prev_boundary(self.string, chunk_start:0)
136 .unwrap()
137 .unwrap();
138 Some(&self.string[prev..end])
139 }
140}
141
142#[inline]
143pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
144 let len: usize = s.len();
145 Graphemes {
146 string: s,
147 cursor: GraphemeCursor::new(offset:0, len, is_extended),
148 cursor_back: GraphemeCursor::new(offset:len, len, is_extended),
149 }
150}
151
152#[inline]
153pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
154 GraphemeIndices {
155 start_offset: s.as_ptr() as usize,
156 iter: new_graphemes(s, is_extended),
157 }
158}
159
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
//
// `Break` and `NotBreak` are final answers that `is_boundary` returns directly.
// `Regional` and `Emoji` mean the answer depends on text before the cursor;
// `provide_context` dispatches on them to the matching pre-context scanner.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    // No information is known.
    Unknown,
    // It is known to not be a boundary.
    NotBreak,
    // It is known to be a boundary.
    Break,
    // The codepoint after is a Regional Indicator Symbol, so a boundary iff
    // it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    // The codepoint after is Extended_Pictographic,
    // so whether it's a boundary depends on pre-context according to GB11.
    Emoji,
}
177
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other datastructures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    // Current cursor position, as a byte offset into the whole string.
    offset: usize,
    // Total length of the string, in bytes.
    len: usize,
    // A config flag indicating whether this cursor computes legacy or extended
    // grapheme cluster boundaries (enables GB9a and GB9b if set).
    is_extended: bool,
    // Information about the potential boundary at `offset`
    state: GraphemeState,
    // Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    // Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    // If set, at least one more codepoint immediately preceding this offset
    // is needed to resolve whether there's a boundary at `offset`.
    pre_context_offset: Option<usize>,
    // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    // is set, then counts the number of RIS between that and `offset`, otherwise
    // is an accurate count relative to the string.
    ris_count: Option<usize>,
    // Set if a call to `prev_boundary` or `next_boundary` was suspended due
    // to needing more input.
    resuming: bool,
    // Cached grapheme category and associated scalar value range, stored as
    // (range start, range end, category); both range ends are inclusive.
    grapheme_cat_cache: (u32, u32, GraphemeCat),
}
210
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // requesting chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    /// Produced by the range check at the start of `is_boundary`.
    InvalidOffset,
}
234
// An enum describing the result from lookup of a pair of categories.
// Produced by `check_pair` and interpreted by `is_boundary`; only the first two
// variants are final answers, the others need the cursor mode or pre-context.
#[derive(PartialEq, Eq)]
enum PairResult {
    NotBreak, // definitely not a break
    Break,    // definitely a break
    Extended, // a break iff not in extended mode
    Regional, // a break if preceded by an even number of RIS
    Emoji,    // a break if preceded by emoji base and (Extend)*
}
244
245#[inline]
246fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
247 use self::PairResult::*;
248 use crate::tables::grapheme::GraphemeCat::*;
249 match (before, after) {
250 (GC_CR, GC_LF) => NotBreak, // GB3
251 (GC_Control, _) => Break, // GB4
252 (GC_CR, _) => Break, // GB4
253 (GC_LF, _) => Break, // GB4
254 (_, GC_Control) => Break, // GB5
255 (_, GC_CR) => Break, // GB5
256 (_, GC_LF) => Break, // GB5
257 (GC_L, GC_L) => NotBreak, // GB6
258 (GC_L, GC_V) => NotBreak, // GB6
259 (GC_L, GC_LV) => NotBreak, // GB6
260 (GC_L, GC_LVT) => NotBreak, // GB6
261 (GC_LV, GC_V) => NotBreak, // GB7
262 (GC_LV, GC_T) => NotBreak, // GB7
263 (GC_V, GC_V) => NotBreak, // GB7
264 (GC_V, GC_T) => NotBreak, // GB7
265 (GC_LVT, GC_T) => NotBreak, // GB8
266 (GC_T, GC_T) => NotBreak, // GB8
267 (_, GC_Extend) => NotBreak, // GB9
268 (_, GC_ZWJ) => NotBreak, // GB9
269 (_, GC_SpacingMark) => Extended, // GB9a
270 (GC_Prepend, _) => Extended, // GB9b
271 (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
272 (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
273 (_, _) => Break, // GB999
274 }
275}
276
277impl GraphemeCursor {
278 /// Create a new cursor. The string and initial offset are given at creation
279 /// time, but the contents of the string are not. The `is_extended` parameter
280 /// controls whether extended grapheme clusters are selected.
281 ///
282 /// The `offset` parameter must be on a codepoint boundary.
283 ///
284 /// ```rust
285 /// # use unicode_segmentation::GraphemeCursor;
286 /// let s = "हिन्दी";
287 /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
288 /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
289 /// let mut extended = GraphemeCursor::new(0, s.len(), true);
290 /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
291 /// ```
292 pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
293 let state = if offset == 0 || offset == len {
294 GraphemeState::Break
295 } else {
296 GraphemeState::Unknown
297 };
298 GraphemeCursor {
299 offset: offset,
300 len: len,
301 state: state,
302 is_extended: is_extended,
303 cat_before: None,
304 cat_after: None,
305 pre_context_offset: None,
306 ris_count: None,
307 resuming: false,
308 grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
309 }
310 }
311
312 fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
313 use crate::tables::grapheme as gr;
314 use crate::tables::grapheme::GraphemeCat::*;
315
316 if ch <= '\u{7e}' {
317 // Special-case optimization for ascii, except U+007F. This
318 // improves performance even for many primarily non-ascii texts,
319 // due to use of punctuation and white space characters from the
320 // ascii range.
321 if ch >= '\u{20}' {
322 GC_Any
323 } else if ch == '\n' {
324 GC_LF
325 } else if ch == '\r' {
326 GC_CR
327 } else {
328 GC_Control
329 }
330 } else {
331 // If this char isn't within the cached range, update the cache to the
332 // range that includes it.
333 if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
334 self.grapheme_cat_cache = gr::grapheme_category(ch);
335 }
336 self.grapheme_cat_cache.2
337 }
338 }
339
340 // Not sure I'm gonna keep this, the advantage over new() seems thin.
341
342 /// Set the cursor to a new location in the same string.
343 ///
344 /// ```rust
345 /// # use unicode_segmentation::GraphemeCursor;
346 /// let s = "abcd";
347 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
348 /// assert_eq!(cursor.cur_cursor(), 0);
349 /// cursor.set_cursor(2);
350 /// assert_eq!(cursor.cur_cursor(), 2);
351 /// ```
352 pub fn set_cursor(&mut self, offset: usize) {
353 if offset != self.offset {
354 self.offset = offset;
355 self.state = if offset == 0 || offset == self.len {
356 GraphemeState::Break
357 } else {
358 GraphemeState::Unknown
359 };
360 // reset state derived from text around cursor
361 self.cat_before = None;
362 self.cat_after = None;
363 self.ris_count = None;
364 }
365 }
366
367 #[inline]
368 /// The current offset of the cursor. Equal to the last value provided to
369 /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
370 /// `prev_boundary()`.
371 ///
372 /// ```rust
373 /// # use unicode_segmentation::GraphemeCursor;
374 /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
375 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
376 /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
377 /// assert_eq!(cursor.cur_cursor(), 4);
378 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
379 /// assert_eq!(cursor.cur_cursor(), 8);
380 /// ```
381 pub fn cur_cursor(&self) -> usize {
382 self.offset
383 }
384
385 /// Provide additional pre-context when it is needed to decide a boundary.
386 /// The end of the chunk must coincide with the value given in the
387 /// `GraphemeIncomplete::PreContext` request.
388 ///
389 /// ```rust
390 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
391 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
392 /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
393 /// // Not enough pre-context to decide if there's a boundary between the two flags.
394 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
395 /// // Provide one more Regional Indicator Symbol of pre-context
396 /// cursor.provide_context(&flags[4..8], 4);
397 /// // Still not enough context to decide.
398 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
399 /// // Provide additional requested context.
400 /// cursor.provide_context(&flags[0..4], 0);
401 /// // That's enough to decide (it always is when context goes to the start of the string)
402 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
403 /// ```
404 pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
405 use crate::tables::grapheme as gr;
406 assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
407 self.pre_context_offset = None;
408 if self.is_extended && chunk_start + chunk.len() == self.offset {
409 let ch = chunk.chars().rev().next().unwrap();
410 if self.grapheme_category(ch) == gr::GC_Prepend {
411 self.decide(false); // GB9b
412 return;
413 }
414 }
415 match self.state {
416 GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
417 GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
418 _ => {
419 if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
420 let ch = chunk.chars().rev().next().unwrap();
421 self.cat_before = Some(self.grapheme_category(ch));
422 }
423 }
424 }
425 }
426
427 #[inline]
428 fn decide(&mut self, is_break: bool) {
429 self.state = if is_break {
430 GraphemeState::Break
431 } else {
432 GraphemeState::NotBreak
433 };
434 }
435
436 #[inline]
437 fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
438 self.decide(is_break);
439 Ok(is_break)
440 }
441
442 #[inline]
443 fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
444 if self.state == GraphemeState::Break {
445 Ok(true)
446 } else if self.state == GraphemeState::NotBreak {
447 Ok(false)
448 } else if let Some(pre_context_offset) = self.pre_context_offset {
449 Err(GraphemeIncomplete::PreContext(pre_context_offset))
450 } else {
451 unreachable!("inconsistent state");
452 }
453 }
454
455 #[inline]
456 fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
457 use crate::tables::grapheme as gr;
458 let mut ris_count = self.ris_count.unwrap_or(0);
459 for ch in chunk.chars().rev() {
460 if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
461 self.ris_count = Some(ris_count);
462 self.decide((ris_count % 2) == 0);
463 return;
464 }
465 ris_count += 1;
466 }
467 self.ris_count = Some(ris_count);
468 if chunk_start == 0 {
469 self.decide((ris_count % 2) == 0);
470 return;
471 }
472 self.pre_context_offset = Some(chunk_start);
473 self.state = GraphemeState::Regional;
474 }
475
476 #[inline]
477 fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
478 use crate::tables::grapheme as gr;
479 let mut iter = chunk.chars().rev();
480 if let Some(ch) = iter.next() {
481 if self.grapheme_category(ch) != gr::GC_ZWJ {
482 self.decide(true);
483 return;
484 }
485 }
486 for ch in iter {
487 match self.grapheme_category(ch) {
488 gr::GC_Extend => (),
489 gr::GC_Extended_Pictographic => {
490 self.decide(false);
491 return;
492 }
493 _ => {
494 self.decide(true);
495 return;
496 }
497 }
498 }
499 if chunk_start == 0 {
500 self.decide(true);
501 return;
502 }
503 self.pre_context_offset = Some(chunk_start);
504 self.state = GraphemeState::Emoji;
505 }
506
507 #[inline]
508 /// Determine whether the current cursor location is a grapheme cluster boundary.
509 /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
510 /// the length of `chunk` is not equal to `len` on creation, then this method
511 /// may return `GraphemeIncomplete::PreContext`. The caller should then
512 /// call `provide_context` with the requested chunk, then retry calling this
513 /// method.
514 ///
515 /// For partial chunks, if the cursor is not at the beginning or end of the
516 /// string, the chunk should contain at least the codepoint following the cursor.
517 /// If the string is nonempty, the chunk must be nonempty.
518 ///
519 /// All calls should have consistent chunk contents (ie, if a chunk provides
520 /// content for a given slice, all further chunks covering that slice must have
521 /// the same content for it).
522 ///
523 /// ```rust
524 /// # use unicode_segmentation::GraphemeCursor;
525 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
526 /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
527 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
528 /// cursor.set_cursor(12);
529 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
530 /// ```
531 pub fn is_boundary(
532 &mut self,
533 chunk: &str,
534 chunk_start: usize,
535 ) -> Result<bool, GraphemeIncomplete> {
536 use crate::tables::grapheme as gr;
537 if self.state == GraphemeState::Break {
538 return Ok(true);
539 }
540 if self.state == GraphemeState::NotBreak {
541 return Ok(false);
542 }
543 if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
544 if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
545 return Err(GraphemeIncomplete::InvalidOffset);
546 }
547 }
548 if let Some(pre_context_offset) = self.pre_context_offset {
549 return Err(GraphemeIncomplete::PreContext(pre_context_offset));
550 }
551 let offset_in_chunk = self.offset - chunk_start;
552 if self.cat_after.is_none() {
553 let ch = chunk[offset_in_chunk..].chars().next().unwrap();
554 self.cat_after = Some(self.grapheme_category(ch));
555 }
556 if self.offset == chunk_start {
557 let mut need_pre_context = true;
558 match self.cat_after.unwrap() {
559 gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
560 gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
561 _ => need_pre_context = self.cat_before.is_none(),
562 }
563 if need_pre_context {
564 self.pre_context_offset = Some(chunk_start);
565 return Err(GraphemeIncomplete::PreContext(chunk_start));
566 }
567 }
568 if self.cat_before.is_none() {
569 let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
570 self.cat_before = Some(self.grapheme_category(ch));
571 }
572 match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
573 PairResult::NotBreak => return self.decision(false),
574 PairResult::Break => return self.decision(true),
575 PairResult::Extended => {
576 let is_extended = self.is_extended;
577 return self.decision(!is_extended);
578 }
579 PairResult::Regional => {
580 if let Some(ris_count) = self.ris_count {
581 return self.decision((ris_count % 2) == 0);
582 }
583 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
584 self.is_boundary_result()
585 }
586 PairResult::Emoji => {
587 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
588 self.is_boundary_result()
589 }
590 }
591 }
592
    #[inline]
    /// Find the next boundary after the current cursor position. Only a part of
    /// the string need be supplied. If the chunk is incomplete, then this
    /// method might return `GraphemeIncomplete::PreContext` or
    /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
    /// call `provide_context` with the requested chunk, then retry. In the
    /// latter case, the caller should provide the chunk following the one
    /// given, then retry.
    ///
    /// Returns `Ok(None)` when the cursor is already at the end of the string.
    /// On success the cursor is left at the returned offset.
    ///
    /// See `is_boundary` for expectations on the provided chunk.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
    /// ```
    ///
    /// And an example that uses partial strings:
    ///
    /// ```rust
    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
    /// let s = "abcd";
    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
    /// ```
    pub fn next_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<Option<usize>, GraphemeIncomplete> {
        if self.offset == self.len {
            return Ok(None);
        }
        // Codepoints from the cursor to the end of the chunk; `ch` is the one
        // directly after the cursor. Panics if the chunk holds no codepoint there.
        let mut iter = chunk[self.offset - chunk_start..].chars();
        let mut ch = iter.next().unwrap();
        loop {
            if self.resuming {
                // Re-entered after an earlier call was suspended: the cursor was
                // already advanced, so only fill in the missing look-ahead.
                if self.cat_after.is_none() {
                    self.cat_after = Some(self.grapheme_category(ch));
                }
            } else {
                // Step over `ch`; what was "after" the cursor is now "before" it.
                self.offset += ch.len_utf8();
                self.state = GraphemeState::Unknown;
                self.cat_before = self.cat_after.take();
                if self.cat_before.is_none() {
                    self.cat_before = Some(self.grapheme_category(ch));
                }
                // Keep the count of preceding RIS in step with the move: extend
                // a known count, or restart at zero after a non-RIS codepoint.
                if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
                    self.ris_count = self.ris_count.map(|c| c + 1);
                } else {
                    self.ris_count = Some(0);
                }
                if let Some(next_ch) = iter.next() {
                    ch = next_ch;
                    self.cat_after = Some(self.grapheme_category(ch));
                } else if self.offset == self.len {
                    // End of the string is always a boundary.
                    self.decide(true);
                } else {
                    // The chunk ran out before the string did; remember that the
                    // advance has already happened and ask for the next chunk.
                    self.resuming = true;
                    return Err(GraphemeIncomplete::NextChunk);
                }
            }
            // Set before the check so that an early return through `?` leaves the
            // cursor marked as suspended.
            self.resuming = true;
            if self.is_boundary(chunk, chunk_start)? {
                self.resuming = false;
                return Ok(Some(self.offset));
            }
            self.resuming = false;
        }
    }
671
    /// Find the previous boundary before the current cursor position. Only a part
    /// of the string need be supplied. If the chunk is incomplete, then this
    /// method might return `GraphemeIncomplete::PreContext` or
    /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
    /// call `provide_context` with the requested chunk, then retry. In the
    /// latter case, the caller should provide the chunk preceding the one
    /// given, then retry.
    ///
    /// Returns `Ok(None)` when the cursor is already at the start of the string.
    /// On success the cursor is left at the returned offset.
    ///
    /// See `is_boundary` for expectations on the provided chunk.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
    /// ```
    ///
    /// And an example that uses partial strings (note the exact return is not
    /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
    ///
    /// ```rust
    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
    /// let s = "abcd";
    /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
    /// ```
    pub fn prev_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<Option<usize>, GraphemeIncomplete> {
        if self.offset == 0 {
            return Ok(None);
        }
        // Nothing before the cursor is visible in this chunk.
        if self.offset == chunk_start {
            return Err(GraphemeIncomplete::PrevChunk);
        }
        // Codepoints from the cursor back to the start of the chunk; `ch` is the
        // one directly before the cursor.
        let mut iter = chunk[..self.offset - chunk_start].chars().rev();
        let mut ch = iter.next().unwrap();
        loop {
            if self.offset == chunk_start {
                self.resuming = true;
                return Err(GraphemeIncomplete::PrevChunk);
            }
            if self.resuming {
                // Re-entered after an earlier call was suspended: the cursor was
                // already moved, so only fill in the preceding category.
                self.cat_before = Some(self.grapheme_category(ch));
            } else {
                // Step back over `ch`; what was "before" the cursor is now "after" it.
                self.offset -= ch.len_utf8();
                self.cat_after = self.cat_before.take();
                self.state = GraphemeState::Unknown;
                // Keep the count of preceding RIS in step with the move; a count
                // of zero cannot be decremented and becomes unknown.
                if let Some(ris_count) = self.ris_count {
                    self.ris_count = if ris_count > 0 {
                        Some(ris_count - 1)
                    } else {
                        None
                    };
                }
                if let Some(prev_ch) = iter.next() {
                    ch = prev_ch;
                    self.cat_before = Some(self.grapheme_category(ch));
                } else if self.offset == 0 {
                    // Start of the string is always a boundary.
                    self.decide(true);
                } else {
                    // The chunk ran out before the string did. Cache the category
                    // of the codepoint just stepped over, since the previous
                    // chunk will not contain it, then ask for that chunk.
                    self.resuming = true;
                    self.cat_after = Some(self.grapheme_category(ch));
                    return Err(GraphemeIncomplete::PrevChunk);
                }
            }
            // Set before the check so that an early return through `?` leaves the
            // cursor marked as suspended.
            self.resuming = true;
            if self.is_boundary(chunk, chunk_start)? {
                self.resuming = false;
                return Ok(Some(self.offset));
            }
            self.resuming = false;
        }
    }
755}
756
// A cursor between the second and third of three flags, given a chunk that
// starts one RIS earlier, must ask for pre-context before it can count the
// preceding regional indicators.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let s: &str = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut c: GraphemeCursor = GraphemeCursor::new(8, s.len(), true);
    assert_eq!(
        c.is_boundary(&s[4..], 4),
        Err(GraphemeIncomplete::PreContext(4))
    );
    c.provide_context(&s[..4], 0);
    assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
768
// A cursor at the very start of a chunk has no visible preceding codepoint, so
// it must request pre-context; once given the CR, CR LF is not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let s: &str = "\r\n";
    let mut c: GraphemeCursor = GraphemeCursor::new(1, s.len(), true);
    assert_eq!(
        c.is_boundary(&s[1..], 1),
        Err(GraphemeIncomplete::PreContext(1))
    );
    c.provide_context(&s[..1], 0);
    assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
780
// Moving backwards out of a chunk suspends with `PrevChunk`; the retry with the
// preceding chunk resumes and reports the chunk seam as the boundary.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let s: &str = "abcd";
    let mut c: GraphemeCursor = GraphemeCursor::new(3, s.len(), true);
    assert_eq!(
        c.prev_boundary(&s[2..], 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
791
// A cursor sitting exactly at the start of the supplied chunk cannot move
// backwards at all and asks for the preceding chunk straight away.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let s: &str = "abcd";
    let mut c: GraphemeCursor = GraphemeCursor::new(2, s.len(), true);
    assert_eq!(
        c.prev_boundary(&s[2..], 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}
802