grapheme.rs source code [crates/bstr/src/unicode/grapheme.rs]

1	use regex_automata::{dfa::Automaton, Anchored, Input};
2
3	use crate::{
4	ext_slice::ByteSlice,
5	unicode::fsm::{
6	grapheme_break_fwd::GRAPHEME_BREAK_FWD,
7	grapheme_break_rev::GRAPHEME_BREAK_REV,
8	regional_indicator_rev::REGIONAL_INDICATOR_REV,
9	},
10	utf8,
11	};
12
13	/// An iterator over grapheme clusters in a byte string.
14	///
15	/// This iterator is typically constructed by
16	/// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
17	///
18	/// Unicode defines a grapheme cluster as an approximation* to a single user*
19	/// visible character. A grapheme cluster, or just "grapheme," is made up of
20	/// one or more codepoints. For end user oriented tasks, one should generally
21	/// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
22	/// always yields one codepoint at a time.
23	///
24	/// Since graphemes are made up of one or more codepoints, this iterator yields
25	/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
26	/// are [substituted](index.html#handling-of-invalid-utf-8).
27	///
28	/// This iterator can be used in reverse. When reversed, exactly the same
29	/// set of grapheme clusters are yielded, but in reverse order.
30	///
31	/// This iterator only yields extended* grapheme clusters, in accordance with*
32	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
33	#[derive(Clone, Debug)]
34	pub struct Graphemes<'a> {
35	bs: &'a [u8],
36	}
37
38	impl<'a> Graphemes<'a> {
39	pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> {
40	Graphemes { bs }
41	}
42
43	/// View the underlying data as a subslice of the original data.
44	///
45	/// The slice returned has the same lifetime as the original slice, and so
46	/// the iterator can continue to be used while this exists.
47	///
48	/// # Examples
49	///
50	/// ```
51	/// use bstr::ByteSlice;
52	///
53	/// let mut it = b"abc".graphemes();
54	///
55	/// assert_eq!(b"abc", it.as_bytes());
56	/// it.next();
57	/// assert_eq!(b"bc", it.as_bytes());
58	/// it.next();
59	/// it.next();
60	/// assert_eq!(b"", it.as_bytes());
61	/// ```
62	#[inline]
63	pub fn as_bytes(&self) -> &'a [u8] {
64	self.bs
65	}
66	}
67
68	impl<'a> Iterator for Graphemes<'a> {
69	type Item = &'a str;
70
71	#[inline]
72	fn next(&mut self) -> Option<&'a str> {
73	let (grapheme: &str, size: usize) = decode_grapheme(self.bs);
74	if size == `0` {
75	return None;
76	}
77	self.bs = &self.bs[size..];
78	Some(grapheme)
79	}
80	}
81
82	impl<'a> DoubleEndedIterator for Graphemes<'a> {
83	#[inline]
84	fn next_back(&mut self) -> Option<&'a str> {
85	let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs);
86	if size == `0` {
87	return None;
88	}
89	self.bs = &self.bs[..self.bs.len() - size];
90	Some(grapheme)
91	}
92	}
93
94	/// An iterator over grapheme clusters in a byte string and their byte index
95	/// positions.
96	///
97	/// This iterator is typically constructed by
98	/// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices).
99	///
100	/// Unicode defines a grapheme cluster as an approximation* to a single user*
101	/// visible character. A grapheme cluster, or just "grapheme," is made up of
102	/// one or more codepoints. For end user oriented tasks, one should generally
103	/// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
104	/// always yields one codepoint at a time.
105	///
106	/// Since graphemes are made up of one or more codepoints, this iterator
107	/// yields `&str` elements (along with their start and end byte offsets).
108	/// When invalid UTF-8 is encountered, replacement codepoints are
109	/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
110	/// indices yielded by this iterator may not correspond to the length of the
111	/// grapheme cluster yielded with those indices. For example, when this
112	/// iterator encounters `\xFF` in the byte string, then it will yield a pair
113	/// of indices ranging over a single byte, but will provide an `&str`
114	/// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when
115	/// given only valid UTF-8, then all indices are in exact correspondence with
116	/// their paired grapheme cluster.
117	///
118	/// This iterator can be used in reverse. When reversed, exactly the same
119	/// set of grapheme clusters are yielded, but in reverse order.
120	///
121	/// This iterator only yields extended* grapheme clusters, in accordance with*
122	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
123	#[derive(Clone, Debug)]
124	pub struct GraphemeIndices<'a> {
125	bs: &'a [u8],
126	forward_index: usize,
127	reverse_index: usize,
128	}
129
130	impl<'a> GraphemeIndices<'a> {
131	pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
132	GraphemeIndices { bs, forward_index: `0`, reverse_index: bs.len() }
133	}
134
135	/// View the underlying data as a subslice of the original data.
136	///
137	/// The slice returned has the same lifetime as the original slice, and so
138	/// the iterator can continue to be used while this exists.
139	///
140	/// # Examples
141	///
142	/// ```
143	/// use bstr::ByteSlice;
144	///
145	/// let mut it = b"abc".grapheme_indices();
146	///
147	/// assert_eq!(b"abc", it.as_bytes());
148	/// it.next();
149	/// assert_eq!(b"bc", it.as_bytes());
150	/// it.next();
151	/// it.next();
152	/// assert_eq!(b"", it.as_bytes());
153	/// ```
154	#[inline]
155	pub fn as_bytes(&self) -> &'a [u8] {
156	self.bs
157	}
158	}
159
160	impl<'a> Iterator for GraphemeIndices<'a> {
161	type Item = (usize, usize, &'a str);
162
163	#[inline]
164	fn next(&mut self) -> Option<(usize, usize, &'a str)> {
165	let index: usize = self.forward_index;
166	let (grapheme: &str, size: usize) = decode_grapheme(self.bs);
167	if size == `0` {
168	return None;
169	}
170	self.bs = &self.bs[size..];
171	self.forward_index += size;
172	Some((index, index + size, grapheme))
173	}
174	}
175
176	impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
177	#[inline]
178	fn next_back(&mut self) -> Option<(usize, usize, &'a str)> {
179	let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs);
180	if size == `0` {
181	return None;
182	}
183	self.bs = &self.bs[..self.bs.len() - size];
184	self.reverse_index -= size;
185	Some((self.reverse_index, self.reverse_index + size, grapheme))
186	}
187	}
188
189	/// Decode a grapheme from the given byte string.
190	///
191	/// This returns the resulting grapheme (which may be a Unicode replacement
192	/// codepoint if invalid UTF-8 was found), along with the number of bytes
193	/// decoded in the byte string. The number of bytes decoded may not be the
194	/// same as the length of grapheme in the case where invalid UTF-8 is found.
195	pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
196	if bs.is_empty() {
197	("", `0`)
198	} else if bs.len() >= `2`
199	&& bs[`0`].is_ascii()
200	&& bs[`1`].is_ascii()
201	&& !bs[`0`].is_ascii_whitespace()
202	{
203	// FIXME: It is somewhat sad that we have to special case this, but it
204	// leads to a significant speed up in predominantly ASCII text. The
205	// issue here is that the DFA has a bit of overhead, and running it for
206	// every byte in mostly ASCII text results in a bit slowdown. We should
207	// re-litigate this once regex-automata 0.3 is out, but it might be
208	// hard to avoid the special case. A DFA is always going to at least
209	// require some memory access.
210
211	// Safe because all ASCII bytes are valid UTF-8.
212	let grapheme = unsafe { bs[..`1`].to_str_unchecked() };
213	(grapheme, `1`)
214	} else if let Some(hm) = {
215	let input = Input::new(bs).anchored(Anchored::Yes);
216	GRAPHEME_BREAK_FWD.try_search_fwd(&input).unwrap()
217	} {
218	// Safe because a match can only occur for valid UTF-8.
219	let grapheme = unsafe { bs[..hm.offset()].to_str_unchecked() };
220	(grapheme, grapheme.len())
221	} else {
222	const INVALID: &str = "`\u{FFFD}`";
223	// No match on non-empty bytes implies we found invalid UTF-8.
224	let (_, size) = utf8::decode_lossy(bs);
225	(INVALID, size)
226	}
227	}
228
229	fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) {
230	if bs.is_empty() {
231	("", `0`)
232	} else if let Some(hm: HalfMatch) = {
233	let input: Input<'_> = Input::new(bs).anchored(mode:Anchored::Yes);
234	GRAPHEME_BREAK_REV.try_search_rev(&input).unwrap()
235	} {
236	let start: usize = adjust_rev_for_regional_indicator(bs, i:hm.offset());
237	// Safe because a match can only occur for valid UTF-8.
238	let grapheme: &str = unsafe { bs[start..].to_str_unchecked() };
239	(grapheme, grapheme.len())
240	} else {
241	const INVALID: &str = "`\u{FFFD}`";
242	// No match on non-empty bytes implies we found invalid UTF-8.
243	let (_, size: usize) = utf8::decode_last_lossy(slice:bs);
244	(INVALID, size)
245	}
246	}
247
248	/// Return the correct offset for the next grapheme decoded at the end of the
249	/// given byte string, where `i` is the initial guess. In particular,
250	/// `&bs[i..]` represents the candidate grapheme.
251	///
252	/// `i` is returned by this function in all cases except when `&bs[i..]` is
253	/// a pair of regional indicator codepoints. In that case, if an odd number of
254	/// additional regional indicator codepoints precedes `i`, then `i` is
255	/// adjusted such that it points to only a single regional indicator.
256	///
257	/// This "fixing" is necessary to handle the requirement that a break cannot
258	/// occur between regional indicators where it would cause an odd number of
259	/// regional indicators to exist before the break from the start* of the*
260	/// string. A reverse regex cannot detect this case easily without look-around.
261	fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
262	// All regional indicators use a 4 byte encoding, and we only care about
263	// the case where we found a pair of regional indicators.
264	if bs.len() - i != `8` {
265	return i;
266	}
267	// Count all contiguous occurrences of regional indicators. If there's an
268	// even number of them, then we can accept the pair we found. Otherwise,
269	// we can only take one of them.
270	//
271	// FIXME: This is quadratic in the worst case, e.g., a string of just
272	// regional indicator codepoints. A fix probably requires refactoring this
273	// code a bit such that we don't rescan regional indicators.
274	let mut count = `0`;
275	while let Some(hm) = {
276	let input = Input::new(bs).anchored(Anchored::Yes);
277	REGIONAL_INDICATOR_REV.try_search_rev(&input).unwrap()
278	} {
279	bs = &bs[..hm.offset()];
280	count += `1`;
281	}
282	if count % `2` == `0` {
283	i
284	} else {
285	i + `4`
286	}
287	}
288
289	#[cfg(all(test, feature = "std"))]
290	mod tests {
291	use alloc::{
292	string::{String, ToString},
293	vec,
294	vec::Vec,
295	};
296
297	#[cfg(not(miri))]
298	use ucd_parse::GraphemeClusterBreakTest;
299
300	use crate::tests::LOSSY_TESTS;
301
302	use super::*;
303
304	#[test]
305	#[cfg(not(miri))]
306	fn forward_ucd() {
307	for (i, test) in ucdtests().into_iter().enumerate() {
308	let given = test.grapheme_clusters.concat();
309	let got: Vec<String> = Graphemes::new(given.as_bytes())
310	.map(\|cluster\| cluster.to_string())
311	.collect();
312	assert_eq!(
313	test.grapheme_clusters,
314	got,
315	"`\n`grapheme forward break test {} failed:`\n`\
316	given: {:?}`\n`\
317	expected: {:?}`\n`\
318	got: {:?}`\n`",
319	i,
320	uniescape(&given),
321	uniescape_vec(&test.grapheme_clusters),
322	uniescape_vec(&got),
323	);
324	}
325	}
326
327	#[test]
328	#[cfg(not(miri))]
329	fn reverse_ucd() {
330	for (i, test) in ucdtests().into_iter().enumerate() {
331	let given = test.grapheme_clusters.concat();
332	let mut got: Vec<String> = Graphemes::new(given.as_bytes())
333	.rev()
334	.map(\|cluster\| cluster.to_string())
335	.collect();
336	got.reverse();
337	assert_eq!(
338	test.grapheme_clusters,
339	got,
340	"`\n\n`grapheme reverse break test {} failed:`\n`\
341	given: {:?}`\n`\
342	expected: {:?}`\n`\
343	got: {:?}`\n`",
344	i,
345	uniescape(&given),
346	uniescape_vec(&test.grapheme_clusters),
347	uniescape_vec(&got),
348	);
349	}
350	}
351
352	#[test]
353	fn forward_lossy() {
354	for &(expected, input) in LOSSY_TESTS {
355	let got = Graphemes::new(input.as_bytes()).collect::<String>();
356	assert_eq!(expected, got);
357	}
358	}
359
360	#[test]
361	fn reverse_lossy() {
362	for &(expected, input) in LOSSY_TESTS {
363	let expected: String = expected.chars().rev().collect();
364	let got =
365	Graphemes::new(input.as_bytes()).rev().collect::<String>();
366	assert_eq!(expected, got);
367	}
368	}
369
370	#[cfg(not(miri))]
371	fn uniescape(s: &str) -> String {
372	s.chars().flat_map(\|c\| c.escape_unicode()).collect::<String>()
373	}
374
375	#[cfg(not(miri))]
376	fn uniescape_vec(strs: &[String]) -> Vec<String> {
377	strs.iter().map(\|s\| uniescape(s)).collect()
378	}
379
380	/// Return all of the UCD for grapheme breaks.
381	#[cfg(not(miri))]
382	fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
383	const TESTDATA: &str = include_str!("data/GraphemeBreakTest.txt");
384
385	let mut tests = vec![];
386	for mut line in TESTDATA.lines() {
387	line = line.trim();
388	if line.starts_with("#") \|\| line.contains("surrogate") {
389	continue;
390	}
391	tests.push(line.parse().unwrap());
392	}
393	tests
394	}
395	}
396