grapheme.rs source code [crates/unicode_segmentation/src/grapheme.rs]

1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2	// file at the top-level directory of this distribution and at
3	// http://rust-lang.org/COPYRIGHT.
4	//
5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8	// option. This file may not be copied, modified, or distributed
9	// except according to those terms.
10
11	use core::cmp;
12
13	use crate::tables::grapheme::GraphemeCat;
14
15	/// External iterator for grapheme clusters and byte offsets.
16	///
17	/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18	/// trait. See its documentation for more.
19	///
20	/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22	#[derive(Debug, Clone)]
23	pub struct GraphemeIndices<'a> {
24	start_offset: usize,
25	iter: Graphemes<'a>,
26	}
27
28	impl<'a> GraphemeIndices<'a> {
29	#[inline]
30	/// View the underlying data (the part yet to be iterated) as a slice of the original string.
31	///
32	/// ```rust
33	/// # use unicode_segmentation::UnicodeSegmentation;
34	/// let mut iter = "abc".grapheme_indices(`true`);
35	/// assert_eq!(iter.as_str(), "abc");
36	/// iter.next();
37	/// assert_eq!(iter.as_str(), "bc");
38	/// iter.next();
39	/// iter.next();
40	/// assert_eq!(iter.as_str(), "");
41	/// ```
42	pub fn as_str(&self) -> &'a str {
43	self.iter.as_str()
44	}
45	}
46
47	impl<'a> Iterator for GraphemeIndices<'a> {
48	type Item = (usize, &'a str);
49
50	#[inline]
51	fn next(&mut self) -> Option<(usize, &'a str)> {
52	self.iter
53	.next()
54	.map(\|s: &'a str\| (s.as_ptr() as usize - self.start_offset, s))
55	}
56
57	#[inline]
58	fn size_hint(&self) -> (usize, Option<usize>) {
59	self.iter.size_hint()
60	}
61	}
62
63	impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
64	#[inline]
65	fn next_back(&mut self) -> Option<(usize, &'a str)> {
66	self.iter
67	.next_back()
68	.map(\|s: &'a str\| (s.as_ptr() as usize - self.start_offset, s))
69	}
70	}
71
72	/// External iterator for a string's
73	/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
74	///
75	/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
76	/// documentation for more.
77	///
78	/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
79	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
80	#[derive(Clone, Debug)]
81	pub struct Graphemes<'a> {
82	string: &'a str,
83	cursor: GraphemeCursor,
84	cursor_back: GraphemeCursor,
85	}
86
87	impl<'a> Graphemes<'a> {
88	#[inline]
89	/// View the underlying data (the part yet to be iterated) as a slice of the original string.
90	///
91	/// ```rust
92	/// # use unicode_segmentation::UnicodeSegmentation;
93	/// let mut iter = "abc".graphemes(`true`);
94	/// assert_eq!(iter.as_str(), "abc");
95	/// iter.next();
96	/// assert_eq!(iter.as_str(), "bc");
97	/// iter.next();
98	/// iter.next();
99	/// assert_eq!(iter.as_str(), "");
100	/// ```
101	pub fn as_str(&self) -> &'a str {
102	&self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
103	}
104	}
105
106	impl<'a> Iterator for Graphemes<'a> {
107	type Item = &'a str;
108
109	#[inline]
110	fn size_hint(&self) -> (usize, Option<usize>) {
111	let slen: usize = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
112	(cmp::min(v1:slen, v2:`1`), Some(slen))
113	}
114
115	#[inline]
116	fn next(&mut self) -> Option<&'a str> {
117	let start: usize = self.cursor.cur_cursor();
118	if start == self.cursor_back.cur_cursor() {
119	return None;
120	}
121	let next: usize = self.cursor.next_boundary(self.string, chunk_start:`0`).unwrap().unwrap();
122	Some(&self.string[start..next])
123	}
124	}
125
126	impl<'a> DoubleEndedIterator for Graphemes<'a> {
127	#[inline]
128	fn next_back(&mut self) -> Option<&'a str> {
129	let end: usize = self.cursor_back.cur_cursor();
130	if end == self.cursor.cur_cursor() {
131	return None;
132	}
133	let prev: usize = self
134	.cursor_back
135	.prev_boundary(self.string, chunk_start:`0`)
136	.unwrap()
137	.unwrap();
138	Some(&self.string[prev..end])
139	}
140	}
141
142	#[inline]
143	pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> {
144	let len: usize = s.len();
145	Graphemes {
146	string: s,
147	cursor: GraphemeCursor::new(offset:`0`, len, is_extended),
148	cursor_back: GraphemeCursor::new(offset:len, len, is_extended),
149	}
150	}
151
152	#[inline]
153	pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> {
154	GraphemeIndices {
155	start_offset: s.as_ptr() as usize,
156	iter: new_graphemes(s, is_extended),
157	}
158	}
159
// TODO: maybe unify with PairResult?
/// What is currently known about the potential boundary at the cursor offset.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    /// Nothing is known yet.
    Unknown,
    /// The offset is known not to be a boundary.
    NotBreak,
    /// The offset is known to be a boundary.
    Break,
    /// The codepoint after the offset has Indic_Conjunct_Break=Consonant, so
    /// the offset is a boundary unless (in extended mode) it is preceded by
    /// another InCB=Consonant followed by a sequence of one or more
    /// InCB=Linker and zero or more InCB=Extend, in any order. (GB9c)
    InCbConsonant,
    /// The codepoint after the offset is a Regional Indicator Symbol, so the
    /// offset is a boundary iff it is preceded by an even number of RIS
    /// codepoints. (GB12, GB13)
    Regional,
    /// The codepoint after the offset is Extended_Pictographic, so whether the
    /// offset is a boundary depends on the pre-context according to GB11.
    Emoji,
}
182
183	/// Cursor-based segmenter for grapheme clusters.
184	///
185	/// This allows working with ropes and other datastructures where the string is not contiguous or
186	/// fully known at initialization time.
187	#[derive(Clone, Debug)]
188	pub struct GraphemeCursor {
189	/// Current cursor position.
190	offset: usize,
191	/// Total length of the string.
192	len: usize,
193	/// A config flag indicating whether this cursor computes legacy or extended
194	/// grapheme cluster boundaries (enables GB9a and GB9b if set).
195	is_extended: bool,
196	/// Information about the potential boundary at `offset`
197	state: GraphemeState,
198	/// Category of codepoint immediately preceding cursor, if known.
199	cat_before: Option<GraphemeCat>,
200	/// Category of codepoint immediately after cursor, if known.
201	cat_after: Option<GraphemeCat>,
202	/// If set, at least one more codepoint immediately preceding this offset
203	/// is needed to resolve whether there's a boundary at `offset`.
204	pre_context_offset: Option<usize>,
205	/// The number of `InCB=Linker` codepoints preceding `offset`
206	/// (potentially intermingled with `InCB=Extend`).
207	incb_linker_count: Option<usize>,
208	/// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
209	/// is set, then counts the number of RIS between that and `offset`, otherwise
210	/// is an accurate count relative to the string.
211	ris_count: Option<usize>,
212	/// Set if a call to `prev_boundary` or `next_boundary` was suspended due
213	/// to needing more input.
214	resuming: bool,
215	/// Cached grapheme category and associated scalar value range.
216	grapheme_cat_cache: (u32, u32, GraphemeCat),
217	}
218
/// Error returned when the supplied chunk does not contain enough content to
/// answer the query, telling the caller what to supply next.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the given offset and then retry the query. This
    /// is only returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// `prev_boundary` needs to move past the beginning of the current chunk,
    /// so the chunk before it is requested. This is only returned if the
    /// `chunk_start` parameter is nonzero.
    PrevChunk,

    /// `next_boundary` needs to move past the end of the current chunk, so the
    /// chunk after it is requested. This is only returned if the chunk ends
    /// before the `len` given when the cursor was created.
    NextChunk,

    /// The given chunk does not contain the cursor position.
    InvalidOffset,
}
242
/// Outcome of looking at just the two categories on either side of a position.
#[derive(PartialEq, Eq)]
enum PairResult {
    /// Definitely not a break.
    NotBreak,
    /// Definitely a break.
    Break,
    /// A break iff the cursor is not in extended mode.
    Extended,
    /// A break unless the cursor is in extended mode and the position is
    /// preceded by a sequence of zero or more InCB=Extend and one or more
    /// InCB=Linker (in any order), itself preceded by another InCB=Consonant.
    InCbConsonant,
    /// A break iff preceded by an even number of RIS codepoints.
    Regional,
    /// A break unless the preceding ZWJ is itself preceded by an
    /// Extended_Pictographic codepoint followed by zero or more Extend.
    Emoji,
}
262
263	#[inline]
264	fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
265	use self::PairResult::*;
266	use crate::tables::grapheme::GraphemeCat::*;
267	match (before, after) {
268	(GC_CR, GC_LF) => NotBreak, // GB3
269	(GC_Control \| GC_CR \| GC_LF, _) => Break, // GB4
270	(_, GC_Control \| GC_CR \| GC_LF) => Break, // GB5
271	(GC_L, GC_L \| GC_V \| GC_LV \| GC_LVT) => NotBreak, // GB6
272	(GC_LV \| GC_V, GC_V \| GC_T) => NotBreak, // GB7
273	(GC_LVT \| GC_T, GC_T) => NotBreak, // GB8
274	(_, GC_Extend \| GC_ZWJ) => NotBreak, // GB9
275	(_, GC_SpacingMark) => Extended, // GB9a
276	(GC_Prepend, _) => Extended, // GB9b
277	(_, GC_InCB_Consonant) => InCbConsonant, // GB9c
278	(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
279	(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
280	(_, _) => Break, // GB999
281	}
282	}
283
284	impl GraphemeCursor {
285	/// Create a new cursor. The string and initial offset are given at creation
286	/// time, but the contents of the string are not. The `is_extended` parameter
287	/// controls whether extended grapheme clusters are selected.
288	///
289	/// The `offset` parameter must be on a codepoint boundary.
290	///
291	/// ```rust
292	/// # use unicode_segmentation::GraphemeCursor;
293	/// let s = "हिन्दी";
294	/// let mut legacy = GraphemeCursor::new(`0`, s.len(), `false`);
295	/// assert_eq!(legacy.next_boundary(s, `0`), Ok(Some("ह".len())));
296	/// let mut extended = GraphemeCursor::new(`0`, s.len(), `true`);
297	/// assert_eq!(extended.next_boundary(s, `0`), Ok(Some("हि".len())));
298	/// ```
299	pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
300	let state = if offset == `0` \|\| offset == len {
301	GraphemeState::Break
302	} else {
303	GraphemeState::Unknown
304	};
305	GraphemeCursor {
306	offset,
307	len,
308	state,
309	is_extended,
310	cat_before: None,
311	cat_after: None,
312	pre_context_offset: None,
313	incb_linker_count: None,
314	ris_count: None,
315	resuming: `false`,
316	grapheme_cat_cache: (`0`, `0`, GraphemeCat::GC_Control),
317	}
318	}
319
320	fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
321	use crate::tables::grapheme as gr;
322	use crate::tables::grapheme::GraphemeCat::*;
323
324	if ch <= '`\u{7e}`' {
325	// Special-case optimization for ascii, except U+007F. This
326	// improves performance even for many primarily non-ascii texts,
327	// due to use of punctuation and white space characters from the
328	// ascii range.
329	if ch >= '`\u{20}`' {
330	GC_Any
331	} else if ch == '`\n`' {
332	GC_LF
333	} else if ch == '`\r`' {
334	GC_CR
335	} else {
336	GC_Control
337	}
338	} else {
339	// If this char isn't within the cached range, update the cache to the
340	// range that includes it.
341	if (ch as u32) < self.grapheme_cat_cache.0 \|\| (ch as u32) > self.grapheme_cat_cache.1 {
342	self.grapheme_cat_cache = gr::grapheme_category(ch);
343	}
344	self.grapheme_cat_cache.2
345	}
346	}
347
348	// Not sure I'm gonna keep this, the advantage over new() seems thin.
349
350	/// Set the cursor to a new location in the same string.
351	///
352	/// ```rust
353	/// # use unicode_segmentation::GraphemeCursor;
354	/// let s = "abcd";
355	/// let mut cursor = GraphemeCursor::new(`0`, s.len(), `false`);
356	/// assert_eq!(cursor.cur_cursor(), `0`);
357	/// cursor.set_cursor(`2`);
358	/// assert_eq!(cursor.cur_cursor(), `2`);
359	/// ```
360	pub fn set_cursor(&mut self, offset: usize) {
361	if offset != self.offset {
362	self.offset = offset;
363	self.state = if offset == `0` \|\| offset == self.len {
364	GraphemeState::Break
365	} else {
366	GraphemeState::Unknown
367	};
368	// reset state derived from text around cursor
369	self.cat_before = None;
370	self.cat_after = None;
371	self.incb_linker_count = None;
372	self.ris_count = None;
373	}
374	}
375
376	#[inline]
377	/// The current offset of the cursor. Equal to the last value provided to
378	/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
379	/// `prev_boundary()`.
380	///
381	/// ```rust
382	/// # use unicode_segmentation::GraphemeCursor;
383	/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
384	/// let flags = "`\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}`";
385	/// let mut cursor = GraphemeCursor::new(`4`, flags.len(), `false`);
386	/// assert_eq!(cursor.cur_cursor(), `4`);
387	/// assert_eq!(cursor.next_boundary(flags, `0`), Ok(Some(`8`)));
388	/// assert_eq!(cursor.cur_cursor(), `8`);
389	/// ```
390	pub fn cur_cursor(&self) -> usize {
391	self.offset
392	}
393
394	/// Provide additional pre-context when it is needed to decide a boundary.
395	/// The end of the chunk must coincide with the value given in the
396	/// `GraphemeIncomplete::PreContext` request.
397	///
398	/// ```rust
399	/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
400	/// let flags = "`\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}`";
401	/// let mut cursor = GraphemeCursor::new(`8`, flags.len(), `false`);
402	/// // Not enough pre-context to decide if there's a boundary between the two flags.
403	/// assert_eq!(cursor.is_boundary(&flags[`8`..], `8`), Err(GraphemeIncomplete::PreContext(`8`)));
404	/// // Provide one more Regional Indicator Symbol of pre-context
405	/// cursor.provide_context(&flags[`4`..`8`], `4`);
406	/// // Still not enough context to decide.
407	/// assert_eq!(cursor.is_boundary(&flags[`8`..], `8`), Err(GraphemeIncomplete::PreContext(`4`)));
408	/// // Provide additional requested context.
409	/// cursor.provide_context(&flags[`0`..`4`], `0`);
410	/// // That's enough to decide (it always is when context goes to the start of the string)
411	/// assert_eq!(cursor.is_boundary(&flags[`8`..], `8`), Ok(`true`));
412	/// ```
413	pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
414	use crate::tables::grapheme as gr;
415	assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap());
416	self.pre_context_offset = None;
417	if self.is_extended && chunk_start + chunk.len() == self.offset {
418	let ch = chunk.chars().next_back().unwrap();
419	if self.grapheme_category(ch) == gr::GC_Prepend {
420	self.decide(`false`); // GB9b
421	return;
422	}
423	}
424	match self.state {
425	GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
426	GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
427	GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
428	_ => {
429	if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
430	let ch = chunk.chars().next_back().unwrap();
431	self.cat_before = Some(self.grapheme_category(ch));
432	}
433	}
434	}
435	}
436
437	#[inline]
438	fn decide(&mut self, is_break: bool) {
439	self.state = if is_break {
440	GraphemeState::Break
441	} else {
442	GraphemeState::NotBreak
443	};
444	}
445
446	#[inline]
447	fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
448	self.decide(is_break);
449	Ok(is_break)
450	}
451
452	#[inline]
453	fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
454	if self.state == GraphemeState::Break {
455	Ok(`true`)
456	} else if self.state == GraphemeState::NotBreak {
457	Ok(`false`)
458	} else if let Some(pre_context_offset) = self.pre_context_offset {
459	Err(GraphemeIncomplete::PreContext(pre_context_offset))
460	} else {
461	unreachable!("inconsistent state");
462	}
463	}
464
465	/// For handling rule GB9c:
466	///
467	/// There's an `InCB=Consonant` after this, and we need to look back
468	/// to verify whether there should be a break.
469	///
470	/// Seek backward to find an `InCB=Linker` preceded by an `InCB=Consonsnt`
471	/// (potentially separated by some number of `InCB=Linker` or `InCB=Extend`).
472	/// If we find the consonant in question, then there's no break; if we find a consonant
473	/// with no linker, or a non-linker non-extend non-consonant, or the start of text, there's a break;
474	/// otherwise we need more context
475	#[inline]
476	fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) {
477	use crate::tables::{self, grapheme as gr};
478
479	// GB9c only applies to extended grapheme clusters
480	if !self.is_extended {
481	self.decide(`true`);
482	return;
483	}
484
485	let mut incb_linker_count = self.incb_linker_count.unwrap_or(`0`);
486
487	for ch in chunk.chars().rev() {
488	if tables::is_incb_linker(ch) {
489	// We found an InCB linker
490	incb_linker_count += `1`;
491	self.incb_linker_count = Some(incb_linker_count);
492	} else if tables::derived_property::InCB_Extend(ch) {
493	// We ignore InCB extends, continue
494	} else {
495	// Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant
496	let result = !(self.incb_linker_count.unwrap_or(`0`) > `0`
497	&& self.grapheme_category(ch) == gr::GC_InCB_Consonant);
498	self.decide(result);
499	return;
500	}
501	}
502
503	if chunk_start == `0` {
504	// Start of text and we still haven't found a consonant, so break
505	self.decide(`true`);
506	} else {
507	// We need more context
508	self.pre_context_offset = Some(chunk_start);
509	self.state = GraphemeState::InCbConsonant;
510	}
511	}
512
513	#[inline]
514	fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
515	use crate::tables::grapheme as gr;
516	let mut ris_count = self.ris_count.unwrap_or(`0`);
517	for ch in chunk.chars().rev() {
518	if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
519	self.ris_count = Some(ris_count);
520	self.decide((ris_count % `2`) == `0`);
521	return;
522	}
523	ris_count += `1`;
524	}
525	self.ris_count = Some(ris_count);
526	if chunk_start == `0` {
527	self.decide((ris_count % `2`) == `0`);
528	} else {
529	self.pre_context_offset = Some(chunk_start);
530	self.state = GraphemeState::Regional;
531	}
532	}
533
534	#[inline]
535	fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
536	use crate::tables::grapheme as gr;
537	let mut iter = chunk.chars().rev();
538	if let Some(ch) = iter.next() {
539	if self.grapheme_category(ch) != gr::GC_ZWJ {
540	self.decide(`true`);
541	return;
542	}
543	}
544	for ch in iter {
545	match self.grapheme_category(ch) {
546	gr::GC_Extend => (),
547	gr::GC_Extended_Pictographic => {
548	self.decide(`false`);
549	return;
550	}
551	_ => {
552	self.decide(`true`);
553	return;
554	}
555	}
556	}
557	if chunk_start == `0` {
558	self.decide(`true`);
559	} else {
560	self.pre_context_offset = Some(chunk_start);
561	self.state = GraphemeState::Emoji;
562	}
563	}
564
565	#[inline]
566	/// Determine whether the current cursor location is a grapheme cluster boundary.
567	/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
568	/// the length of `chunk` is not equal to `len` on creation, then this method
569	/// may return `GraphemeIncomplete::PreContext`. The caller should then
570	/// call `provide_context` with the requested chunk, then retry calling this
571	/// method.
572	///
573	/// For partial chunks, if the cursor is not at the beginning or end of the
574	/// string, the chunk should contain at least the codepoint following the cursor.
575	/// If the string is nonempty, the chunk must be nonempty.
576	///
577	/// All calls should have consistent chunk contents (ie, if a chunk provides
578	/// content for a given slice, all further chunks covering that slice must have
579	/// the same content for it).
580	///
581	/// ```rust
582	/// # use unicode_segmentation::GraphemeCursor;
583	/// let flags = "`\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}`";
584	/// let mut cursor = GraphemeCursor::new(`8`, flags.len(), `false`);
585	/// assert_eq!(cursor.is_boundary(flags, `0`), Ok(`true`));
586	/// cursor.set_cursor(`12`);
587	/// assert_eq!(cursor.is_boundary(flags, `0`), Ok(`false`));
588	/// ```
589	pub fn is_boundary(
590	&mut self,
591	chunk: &str,
592	chunk_start: usize,
593	) -> Result<bool, GraphemeIncomplete> {
594	use crate::tables::grapheme as gr;
595	if self.state == GraphemeState::Break {
596	return Ok(`true`);
597	}
598	if self.state == GraphemeState::NotBreak {
599	return Ok(`false`);
600	}
601	if (self.offset < chunk_start \|\| self.offset >= chunk_start.saturating_add(chunk.len()))
602	&& (self.offset > chunk_start.saturating_add(chunk.len()) \|\| self.cat_after.is_none())
603	{
604	return Err(GraphemeIncomplete::InvalidOffset);
605	}
606	if let Some(pre_context_offset) = self.pre_context_offset {
607	return Err(GraphemeIncomplete::PreContext(pre_context_offset));
608	}
609	let offset_in_chunk = self.offset.saturating_sub(chunk_start);
610	if self.cat_after.is_none() {
611	let ch = chunk[offset_in_chunk..].chars().next().unwrap();
612	self.cat_after = Some(self.grapheme_category(ch));
613	}
614	if self.offset == chunk_start {
615	let mut need_pre_context = `true`;
616	match self.cat_after.unwrap() {
617	gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
618	gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
619	gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
620	_ => need_pre_context = self.cat_before.is_none(),
621	}
622	if need_pre_context {
623	self.pre_context_offset = Some(chunk_start);
624	return Err(GraphemeIncomplete::PreContext(chunk_start));
625	}
626	}
627	if self.cat_before.is_none() {
628	let ch = chunk[..offset_in_chunk].chars().next_back().unwrap();
629	self.cat_before = Some(self.grapheme_category(ch));
630	}
631	match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
632	PairResult::NotBreak => self.decision(`false`),
633	PairResult::Break => self.decision(`true`),
634	PairResult::Extended => {
635	let is_extended = self.is_extended;
636	self.decision(!is_extended)
637	}
638	PairResult::InCbConsonant => {
639	self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start);
640	self.is_boundary_result()
641	}
642	PairResult::Regional => {
643	if let Some(ris_count) = self.ris_count {
644	return self.decision((ris_count % `2`) == `0`);
645	}
646	self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
647	self.is_boundary_result()
648	}
649	PairResult::Emoji => {
650	self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
651	self.is_boundary_result()
652	}
653	}
654	}
655
656	#[inline]
657	/// Find the next boundary after the current cursor position. Only a part of
658	/// the string need be supplied. If the chunk is incomplete, then this
659	/// method might return `GraphemeIncomplete::PreContext` or
660	/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
661	/// call `provide_context` with the requested chunk, then retry. In the
662	/// latter case, the caller should provide the chunk following the one
663	/// given, then retry.
664	///
665	/// See `is_boundary` for expectations on the provided chunk.
666	///
667	/// ```rust
668	/// # use unicode_segmentation::GraphemeCursor;
669	/// let flags = "`\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}`";
670	/// let mut cursor = GraphemeCursor::new(`4`, flags.len(), `false`);
671	/// assert_eq!(cursor.next_boundary(flags, `0`), Ok(Some(`8`)));
672	/// assert_eq!(cursor.next_boundary(flags, `0`), Ok(Some(`16`)));
673	/// assert_eq!(cursor.next_boundary(flags, `0`), Ok(None));
674	/// ```
675	///
676	/// And an example that uses partial strings:
677	///
678	/// ```rust
679	/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
680	/// let s = "abcd";
681	/// let mut cursor = GraphemeCursor::new(`0`, s.len(), `false`);
682	/// assert_eq!(cursor.next_boundary(&s[..`2`], `0`), Ok(Some(`1`)));
683	/// assert_eq!(cursor.next_boundary(&s[..`2`], `0`), Err(GraphemeIncomplete::NextChunk));
684	/// assert_eq!(cursor.next_boundary(&s[`2`..`4`], `2`), Ok(Some(`2`)));
685	/// assert_eq!(cursor.next_boundary(&s[`2`..`4`], `2`), Ok(Some(`3`)));
686	/// assert_eq!(cursor.next_boundary(&s[`2`..`4`], `2`), Ok(Some(`4`)));
687	/// assert_eq!(cursor.next_boundary(&s[`2`..`4`], `2`), Ok(None));
688	/// ```
689	pub fn next_boundary(
690	&mut self,
691	chunk: &str,
692	chunk_start: usize,
693	) -> Result<Option<usize>, GraphemeIncomplete> {
694	if self.offset == self.len {
695	return Ok(None);
696	}
697	let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars();
698	let mut ch = match iter.next() {
699	Some(ch) => ch,
700	None => return Err(GraphemeIncomplete::NextChunk),
701	};
702	loop {
703	if self.resuming {
704	if self.cat_after.is_none() {
705	self.cat_after = Some(self.grapheme_category(ch));
706	}
707	} else {
708	self.offset = self.offset.saturating_add(ch.len_utf8());
709	self.state = GraphemeState::Unknown;
710	self.cat_before = self.cat_after.take();
711	if self.cat_before.is_none() {
712	self.cat_before = Some(self.grapheme_category(ch));
713	}
714	if crate::tables::is_incb_linker(ch) {
715	self.incb_linker_count = Some(self.incb_linker_count.map_or(`1`, \|c\| c + `1`));
716	} else if !crate::tables::derived_property::InCB_Extend(ch) {
717	self.incb_linker_count = Some(`0`);
718	}
719	if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
720	self.ris_count = self.ris_count.map(\|c\| c + `1`);
721	} else {
722	self.ris_count = Some(`0`);
723	}
724	if let Some(next_ch) = iter.next() {
725	ch = next_ch;
726	self.cat_after = Some(self.grapheme_category(ch));
727	} else if self.offset == self.len {
728	self.decide(`true`);
729	} else {
730	self.resuming = `true`;
731	return Err(GraphemeIncomplete::NextChunk);
732	}
733	}
734	self.resuming = `true`;
735	if self.is_boundary(chunk, chunk_start)? {
736	self.resuming = `false`;
737	return Ok(Some(self.offset));
738	}
739	self.resuming = `false`;
740	}
741	}
742
743	/// Find the previous boundary after the current cursor position. Only a part
744	/// of the string need be supplied. If the chunk is incomplete, then this
745	/// method might return `GraphemeIncomplete::PreContext` or
746	/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
747	/// call `provide_context` with the requested chunk, then retry. In the
748	/// latter case, the caller should provide the chunk preceding the one
749	/// given, then retry.
750	///
751	/// See `is_boundary` for expectations on the provided chunk.
752	///
753	/// ```rust
754	/// # use unicode_segmentation::GraphemeCursor;
755	/// let flags = "`\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}`";
756	/// let mut cursor = GraphemeCursor::new(`12`, flags.len(), `false`);
757	/// assert_eq!(cursor.prev_boundary(flags, `0`), Ok(Some(`8`)));
758	/// assert_eq!(cursor.prev_boundary(flags, `0`), Ok(Some(`0`)));
759	/// assert_eq!(cursor.prev_boundary(flags, `0`), Ok(None));
760	/// ```
761	///
762	/// And an example that uses partial strings (note the exact return is not
763	/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
764	///
765	/// ```rust
766	/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
767	/// let s = "abcd";
768	/// let mut cursor = GraphemeCursor::new(`4`, s.len(), `false`);
769	/// assert_eq!(cursor.prev_boundary(&s[`2`..`4`], `2`), Ok(Some(`3`)));
770	/// assert_eq!(cursor.prev_boundary(&s[`2`..`4`], `2`), Err(GraphemeIncomplete::PrevChunk));
771	/// assert_eq!(cursor.prev_boundary(&s[`0`..`2`], `0`), Ok(Some(`2`)));
772	/// assert_eq!(cursor.prev_boundary(&s[`0`..`2`], `0`), Ok(Some(`1`)));
773	/// assert_eq!(cursor.prev_boundary(&s[`0`..`2`], `0`), Ok(Some(`0`)));
774	/// assert_eq!(cursor.prev_boundary(&s[`0`..`2`], `0`), Ok(None));
775	/// ```
776	pub fn prev_boundary(
777	&mut self,
778	chunk: &str,
779	chunk_start: usize,
780	) -> Result<Option<usize>, GraphemeIncomplete> {
781	if self.offset == `0` {
782	return Ok(None);
783	}
784	if self.offset == chunk_start {
785	return Err(GraphemeIncomplete::PrevChunk);
786	}
787	let mut iter = chunk[..self.offset.saturating_sub(chunk_start)]
788	.chars()
789	.rev();
790	let mut ch = iter.next().unwrap();
791	loop {
792	if self.offset == chunk_start {
793	self.resuming = `true`;
794	return Err(GraphemeIncomplete::PrevChunk);
795	}
796	if self.resuming {
797	self.cat_before = Some(self.grapheme_category(ch));
798	} else {
799	self.offset -= ch.len_utf8();
800	self.cat_after = self.cat_before.take();
801	self.state = GraphemeState::Unknown;
802	if let Some(incb_linker_count) = self.incb_linker_count {
803	self.ris_count = if incb_linker_count > `0` && crate::tables::is_incb_linker(ch) {
804	Some(incb_linker_count - `1`)
805	} else if crate::tables::derived_property::InCB_Extend(ch) {
806	Some(incb_linker_count)
807	} else {
808	None
809	};
810	}
811	if let Some(ris_count) = self.ris_count {
812	self.ris_count = if ris_count > `0` {
813	Some(ris_count - `1`)
814	} else {
815	None
816	};
817	}
818	if let Some(prev_ch) = iter.next() {
819	ch = prev_ch;
820	self.cat_before = Some(self.grapheme_category(ch));
821	} else if self.offset == `0` {
822	self.decide(`true`);
823	} else {
824	self.resuming = `true`;
825	self.cat_after = Some(self.grapheme_category(ch));
826	return Err(GraphemeIncomplete::PrevChunk);
827	}
828	}
829	self.resuming = `true`;
830	if self.is_boundary(chunk, chunk_start)? {
831	self.resuming = `false`;
832	return Ok(Some(self.offset));
833	}
834	self.resuming = `false`;
835	}
836	}
837	}
838
/// A cursor between two flags needs the RIS before the chunk to decide, and
/// reports a boundary once that pre-context is supplied.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let text = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let (head, tail) = (&text[..4], &text[4..]);
    let mut cursor = GraphemeCursor::new(8, text.len(), true);

    let first_try = cursor.is_boundary(tail, 4);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(head, 0);
    let second_try = cursor.is_boundary(tail, 4);
    assert_eq!(second_try, Ok(true));
}
850
/// A cursor sitting exactly at the start of its chunk must ask for
/// pre-context; with the CR supplied, CR LF is not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let text = "\r\n";
    let (head, tail) = (&text[..1], &text[1..]);
    let mut cursor = GraphemeCursor::new(1, text.len(), true);

    let first_try = cursor.is_boundary(tail, 1);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(head, 0);
    let second_try = cursor.is_boundary(tail, 1);
    assert_eq!(second_try, Ok(false));
}
862
/// Moving backward from inside a chunk to its start requests the previous
/// chunk, and the retry with that chunk reports the boundary at the seam.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let first_try = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(first_try, Err(GraphemeIncomplete::PrevChunk));

    let second_try = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(second_try, Ok(Some(2)));
}
873
/// A cursor that starts at the beginning of its chunk requests the previous
/// chunk immediately, and the retry steps back by one cluster.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);

    let first_try = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(first_try, Err(GraphemeIncomplete::PrevChunk));

    let second_try = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(second_try, Ok(Some(1)));
}
884