grapheme.rs source code [crates/bstr-1.5.0/src/unicode/grapheme.rs]

1	use regex_automata::DFA;
2
3	use crate::{
4	ext_slice::ByteSlice,
5	unicode::fsm::{
6	grapheme_break_fwd::GRAPHEME_BREAK_FWD,
7	grapheme_break_rev::GRAPHEME_BREAK_REV,
8	regional_indicator_rev::REGIONAL_INDICATOR_REV,
9	},
10	utf8,
11	};
12
13	/// An iterator over grapheme clusters in a byte string.
14	///
15	/// This iterator is typically constructed by
16	/// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
17	///
18	/// Unicode defines a grapheme cluster as an approximation* to a single user*
19	/// visible character. A grapheme cluster, or just "grapheme," is made up of
20	/// one or more codepoints. For end user oriented tasks, one should generally
21	/// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
22	/// always yields one codepoint at a time.
23	///
24	/// Since graphemes are made up of one or more codepoints, this iterator yields
25	/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
26	/// are [substituted](index.html#handling-of-invalid-utf-8).
27	///
28	/// This iterator can be used in reverse. When reversed, exactly the same
29	/// set of grapheme clusters are yielded, but in reverse order.
30	///
31	/// This iterator only yields extended* grapheme clusters, in accordance with*
32	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
33	#[derive(Clone, Debug)]
34	pub struct Graphemes<'a> {
35	bs: &'a [u8],
36	}
37
38	impl<'a> Graphemes<'a> {
39	pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> {
40	Graphemes { bs }
41	}
42
43	/// View the underlying data as a subslice of the original data.
44	///
45	/// The slice returned has the same lifetime as the original slice, and so
46	/// the iterator can continue to be used while this exists.
47	///
48	/// # Examples
49	///
50	/// ```
51	/// use bstr::ByteSlice;
52	///
53	/// let mut it = b"abc".graphemes();
54	///
55	/// assert_eq!(b"abc", it.as_bytes());
56	/// it.next();
57	/// assert_eq!(b"bc", it.as_bytes());
58	/// it.next();
59	/// it.next();
60	/// assert_eq!(b"", it.as_bytes());
61	/// ```
62	#[inline]
63	pub fn as_bytes(&self) -> &'a [u8] {
64	self.bs
65	}
66	}
67
68	impl<'a> Iterator for Graphemes<'a> {
69	type Item = &'a str;
70
71	#[inline]
72	fn next(&mut self) -> Option<&'a str> {
73	let (grapheme: &str, size: usize) = decode_grapheme(self.bs);
74	if size == `0` {
75	return None;
76	}
77	self.bs = &self.bs[size..];
78	Some(grapheme)
79	}
80	}
81
82	impl<'a> DoubleEndedIterator for Graphemes<'a> {
83	#[inline]
84	fn next_back(&mut self) -> Option<&'a str> {
85	let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs);
86	if size == `0` {
87	return None;
88	}
89	self.bs = &self.bs[..self.bs.len() - size];
90	Some(grapheme)
91	}
92	}
93
94	/// An iterator over grapheme clusters in a byte string and their byte index
95	/// positions.
96	///
97	/// This iterator is typically constructed by
98	/// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices).
99	///
100	/// Unicode defines a grapheme cluster as an approximation* to a single user*
101	/// visible character. A grapheme cluster, or just "grapheme," is made up of
102	/// one or more codepoints. For end user oriented tasks, one should generally
103	/// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
104	/// always yields one codepoint at a time.
105	///
106	/// Since graphemes are made up of one or more codepoints, this iterator
107	/// yields `&str` elements (along with their start and end byte offsets).
108	/// When invalid UTF-8 is encountered, replacement codepoints are
109	/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
110	/// indices yielded by this iterator may not correspond to the length of the
111	/// grapheme cluster yielded with those indices. For example, when this
112	/// iterator encounters `\xFF` in the byte string, then it will yield a pair
113	/// of indices ranging over a single byte, but will provide an `&str`
114	/// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when
115	/// given only valid UTF-8, then all indices are in exact correspondence with
116	/// their paired grapheme cluster.
117	///
118	/// This iterator can be used in reverse. When reversed, exactly the same
119	/// set of grapheme clusters are yielded, but in reverse order.
120	///
121	/// This iterator only yields extended* grapheme clusters, in accordance with*
122	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
123	#[derive(Clone, Debug)]
124	pub struct GraphemeIndices<'a> {
125	bs: &'a [u8],
126	forward_index: usize,
127	reverse_index: usize,
128	}
129
130	impl<'a> GraphemeIndices<'a> {
131	pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
132	GraphemeIndices { bs, forward_index: `0`, reverse_index: bs.len() }
133	}
134
135	/// View the underlying data as a subslice of the original data.
136	///
137	/// The slice returned has the same lifetime as the original slice, and so
138	/// the iterator can continue to be used while this exists.
139	///
140	/// # Examples
141	///
142	/// ```
143	/// use bstr::ByteSlice;
144	///
145	/// let mut it = b"abc".grapheme_indices();
146	///
147	/// assert_eq!(b"abc", it.as_bytes());
148	/// it.next();
149	/// assert_eq!(b"bc", it.as_bytes());
150	/// it.next();
151	/// it.next();
152	/// assert_eq!(b"", it.as_bytes());
153	/// ```
154	#[inline]
155	pub fn as_bytes(&self) -> &'a [u8] {
156	self.bs
157	}
158	}
159
160	impl<'a> Iterator for GraphemeIndices<'a> {
161	type Item = (usize, usize, &'a str);
162
163	#[inline]
164	fn next(&mut self) -> Option<(usize, usize, &'a str)> {
165	let index: usize = self.forward_index;
166	let (grapheme: &str, size: usize) = decode_grapheme(self.bs);
167	if size == `0` {
168	return None;
169	}
170	self.bs = &self.bs[size..];
171	self.forward_index += size;
172	Some((index, index + size, grapheme))
173	}
174	}
175
176	impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
177	#[inline]
178	fn next_back(&mut self) -> Option<(usize, usize, &'a str)> {
179	let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs);
180	if size == `0` {
181	return None;
182	}
183	self.bs = &self.bs[..self.bs.len() - size];
184	self.reverse_index -= size;
185	Some((self.reverse_index, self.reverse_index + size, grapheme))
186	}
187	}
188
189	/// Decode a grapheme from the given byte string.
190	///
191	/// This returns the resulting grapheme (which may be a Unicode replacement
192	/// codepoint if invalid UTF-8 was found), along with the number of bytes
193	/// decoded in the byte string. The number of bytes decoded may not be the
194	/// same as the length of grapheme in the case where invalid UTF-8 is found.
195	pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
196	if bs.is_empty() {
197	("", `0`)
198	} else if bs.len() >= `2`
199	&& bs[`0`].is_ascii()
200	&& bs[`1`].is_ascii()
201	&& !bs[`0`].is_ascii_whitespace()
202	{
203	// FIXME: It is somewhat sad that we have to special case this, but it
204	// leads to a significant speed up in predominantly ASCII text. The
205	// issue here is that the DFA has a bit of overhead, and running it for
206	// every byte in mostly ASCII text results in a bit slowdown. We should
207	// re-litigate this once regex-automata 0.3 is out, but it might be
208	// hard to avoid the special case. A DFA is always going to at least
209	// require some memory access.
210
211	// Safe because all ASCII bytes are valid UTF-8.
212	let grapheme = unsafe { bs[..`1`].to_str_unchecked() };
213	(grapheme, `1`)
214	} else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
215	// Safe because a match can only occur for valid UTF-8.
216	let grapheme = unsafe { bs[..end].to_str_unchecked() };
217	(grapheme, grapheme.len())
218	} else {
219	const INVALID: &'static str = "`\u{FFFD}`";
220	// No match on non-empty bytes implies we found invalid UTF-8.
221	let (_, size) = utf8::decode_lossy(bs);
222	(INVALID, size)
223	}
224	}
225
226	fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) {
227	if bs.is_empty() {
228	("", `0`)
229	} else if let Some(mut start: usize) = GRAPHEME_BREAK_REV.rfind(bytes:bs) {
230	start = adjust_rev_for_regional_indicator(bs, i:start);
231	// Safe because a match can only occur for valid UTF-8.
232	let grapheme: &str = unsafe { bs[start..].to_str_unchecked() };
233	(grapheme, grapheme.len())
234	} else {
235	const INVALID: &'static str = "`\u{FFFD}`";
236	// No match on non-empty bytes implies we found invalid UTF-8.
237	let (_, size: usize) = utf8::decode_last_lossy(slice:bs);
238	(INVALID, size)
239	}
240	}
241
242	/// Return the correct offset for the next grapheme decoded at the end of the
243	/// given byte string, where `i` is the initial guess. In particular,
244	/// `&bs[i..]` represents the candidate grapheme.
245	///
246	/// `i` is returned by this function in all cases except when `&bs[i..]` is
247	/// a pair of regional indicator codepoints. In that case, if an odd number of
248	/// additional regional indicator codepoints precedes `i`, then `i` is
249	/// adjusted such that it points to only a single regional indicator.
250	///
251	/// This "fixing" is necessary to handle the requirement that a break cannot
252	/// occur between regional indicators where it would cause an odd number of
253	/// regional indicators to exist before the break from the start* of the*
254	/// string. A reverse regex cannot detect this case easily without look-around.
255	fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
256	// All regional indicators use a 4 byte encoding, and we only care about
257	// the case where we found a pair of regional indicators.
258	if bs.len() - i != `8` {
259	return i;
260	}
261	// Count all contiguous occurrences of regional indicators. If there's an
262	// even number of them, then we can accept the pair we found. Otherwise,
263	// we can only take one of them.
264	//
265	// FIXME: This is quadratic in the worst case, e.g., a string of just
266	// regional indicator codepoints. A fix probably requires refactoring this
267	// code a bit such that we don't rescan regional indicators.
268	let mut count: i32 = `0`;
269	while let Some(start: usize) = REGIONAL_INDICATOR_REV.rfind(bytes:bs) {
270	bs = &bs[..start];
271	count += `1`;
272	}
273	if count % `2` == `0` {
274	i
275	} else {
276	i + `4`
277	}
278	}
279
280	#[cfg(all(test, feature = "std"))]
281	mod tests {
282	#[cfg(not(miri))]
283	use ucd_parse::GraphemeClusterBreakTest;
284
285	use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
286
287	use super::*;
288
289	#[test]
290	#[cfg(not(miri))]
291	fn forward_ucd() {
292	for (i, test) in ucdtests().into_iter().enumerate() {
293	let given = test.grapheme_clusters.concat();
294	let got: Vec<String> = Graphemes::new(given.as_bytes())
295	.map(\|cluster\| cluster.to_string())
296	.collect();
297	assert_eq!(
298	test.grapheme_clusters,
299	got,
300	"`\n`grapheme forward break test {} failed:`\n`\
301	given: {:?}`\n`\
302	expected: {:?}`\n`\
303	got: {:?}`\n`",
304	i,
305	uniescape(&given),
306	uniescape_vec(&test.grapheme_clusters),
307	uniescape_vec(&got),
308	);
309	}
310	}
311
312	#[test]
313	#[cfg(not(miri))]
314	fn reverse_ucd() {
315	for (i, test) in ucdtests().into_iter().enumerate() {
316	let given = test.grapheme_clusters.concat();
317	let mut got: Vec<String> = Graphemes::new(given.as_bytes())
318	.rev()
319	.map(\|cluster\| cluster.to_string())
320	.collect();
321	got.reverse();
322	assert_eq!(
323	test.grapheme_clusters,
324	got,
325	"`\n\n`grapheme reverse break test {} failed:`\n`\
326	given: {:?}`\n`\
327	expected: {:?}`\n`\
328	got: {:?}`\n`",
329	i,
330	uniescape(&given),
331	uniescape_vec(&test.grapheme_clusters),
332	uniescape_vec(&got),
333	);
334	}
335	}
336
337	#[test]
338	fn forward_lossy() {
339	for &(expected, input) in LOSSY_TESTS {
340	let got = Graphemes::new(input.as_bytes()).collect::<String>();
341	assert_eq!(expected, got);
342	}
343	}
344
345	#[test]
346	fn reverse_lossy() {
347	for &(expected, input) in LOSSY_TESTS {
348	let expected: String = expected.chars().rev().collect();
349	let got =
350	Graphemes::new(input.as_bytes()).rev().collect::<String>();
351	assert_eq!(expected, got);
352	}
353	}
354
355	#[cfg(not(miri))]
356	fn uniescape(s: &str) -> String {
357	s.chars().flat_map(\|c\| c.escape_unicode()).collect::<String>()
358	}
359
360	#[cfg(not(miri))]
361	fn uniescape_vec(strs: &[String]) -> Vec<String> {
362	strs.iter().map(\|s\| uniescape(s)).collect()
363	}
364
365	/// Return all of the UCD for grapheme breaks.
366	#[cfg(not(miri))]
367	fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
368	const TESTDATA: &'static str =
369	include_str!("data/GraphemeBreakTest.txt");
370
371	let mut tests = vec![];
372	for mut line in TESTDATA.lines() {
373	line = line.trim();
374	if line.starts_with("#") \|\| line.contains("surrogate") {
375	continue;
376	}
377	tests.push(line.parse().unwrap());
378	}
379	tests
380	}
381	}
382

Provided by KDAB

Definitions

Learn Rust with the experts

Find out more