1 | use regex_automata::DFA; |
2 | |
3 | use crate::{ |
4 | ext_slice::ByteSlice, |
5 | unicode::fsm::{ |
6 | grapheme_break_fwd::GRAPHEME_BREAK_FWD, |
7 | grapheme_break_rev::GRAPHEME_BREAK_REV, |
8 | regional_indicator_rev::REGIONAL_INDICATOR_REV, |
9 | }, |
10 | utf8, |
11 | }; |
12 | |
13 | /// An iterator over grapheme clusters in a byte string. |
14 | /// |
15 | /// This iterator is typically constructed by |
16 | /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes). |
17 | /// |
18 | /// Unicode defines a grapheme cluster as an *approximation* to a single user |
19 | /// visible character. A grapheme cluster, or just "grapheme," is made up of |
20 | /// one or more codepoints. For end user oriented tasks, one should generally |
21 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which |
22 | /// always yields one codepoint at a time. |
23 | /// |
24 | /// Since graphemes are made up of one or more codepoints, this iterator yields |
25 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints |
26 | /// are [substituted](index.html#handling-of-invalid-utf-8). |
27 | /// |
28 | /// This iterator can be used in reverse. When reversed, exactly the same |
29 | /// set of grapheme clusters are yielded, but in reverse order. |
30 | /// |
31 | /// This iterator only yields *extended* grapheme clusters, in accordance with |
32 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). |
33 | #[derive (Clone, Debug)] |
34 | pub struct Graphemes<'a> { |
35 | bs: &'a [u8], |
36 | } |
37 | |
38 | impl<'a> Graphemes<'a> { |
39 | pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> { |
40 | Graphemes { bs } |
41 | } |
42 | |
43 | /// View the underlying data as a subslice of the original data. |
44 | /// |
45 | /// The slice returned has the same lifetime as the original slice, and so |
46 | /// the iterator can continue to be used while this exists. |
47 | /// |
48 | /// # Examples |
49 | /// |
50 | /// ``` |
51 | /// use bstr::ByteSlice; |
52 | /// |
53 | /// let mut it = b"abc" .graphemes(); |
54 | /// |
55 | /// assert_eq!(b"abc" , it.as_bytes()); |
56 | /// it.next(); |
57 | /// assert_eq!(b"bc" , it.as_bytes()); |
58 | /// it.next(); |
59 | /// it.next(); |
60 | /// assert_eq!(b"" , it.as_bytes()); |
61 | /// ``` |
62 | #[inline ] |
63 | pub fn as_bytes(&self) -> &'a [u8] { |
64 | self.bs |
65 | } |
66 | } |
67 | |
68 | impl<'a> Iterator for Graphemes<'a> { |
69 | type Item = &'a str; |
70 | |
71 | #[inline ] |
72 | fn next(&mut self) -> Option<&'a str> { |
73 | let (grapheme: &str, size: usize) = decode_grapheme(self.bs); |
74 | if size == 0 { |
75 | return None; |
76 | } |
77 | self.bs = &self.bs[size..]; |
78 | Some(grapheme) |
79 | } |
80 | } |
81 | |
82 | impl<'a> DoubleEndedIterator for Graphemes<'a> { |
83 | #[inline ] |
84 | fn next_back(&mut self) -> Option<&'a str> { |
85 | let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs); |
86 | if size == 0 { |
87 | return None; |
88 | } |
89 | self.bs = &self.bs[..self.bs.len() - size]; |
90 | Some(grapheme) |
91 | } |
92 | } |
93 | |
94 | /// An iterator over grapheme clusters in a byte string and their byte index |
95 | /// positions. |
96 | /// |
97 | /// This iterator is typically constructed by |
98 | /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices). |
99 | /// |
100 | /// Unicode defines a grapheme cluster as an *approximation* to a single user |
101 | /// visible character. A grapheme cluster, or just "grapheme," is made up of |
102 | /// one or more codepoints. For end user oriented tasks, one should generally |
103 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which |
104 | /// always yields one codepoint at a time. |
105 | /// |
106 | /// Since graphemes are made up of one or more codepoints, this iterator |
107 | /// yields `&str` elements (along with their start and end byte offsets). |
108 | /// When invalid UTF-8 is encountered, replacement codepoints are |
109 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the |
110 | /// indices yielded by this iterator may not correspond to the length of the |
111 | /// grapheme cluster yielded with those indices. For example, when this |
112 | /// iterator encounters `\xFF` in the byte string, then it will yield a pair |
113 | /// of indices ranging over a single byte, but will provide an `&str` |
114 | /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when |
115 | /// given only valid UTF-8, then all indices are in exact correspondence with |
116 | /// their paired grapheme cluster. |
117 | /// |
118 | /// This iterator can be used in reverse. When reversed, exactly the same |
119 | /// set of grapheme clusters are yielded, but in reverse order. |
120 | /// |
121 | /// This iterator only yields *extended* grapheme clusters, in accordance with |
122 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). |
123 | #[derive (Clone, Debug)] |
124 | pub struct GraphemeIndices<'a> { |
125 | bs: &'a [u8], |
126 | forward_index: usize, |
127 | reverse_index: usize, |
128 | } |
129 | |
130 | impl<'a> GraphemeIndices<'a> { |
131 | pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { |
132 | GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } |
133 | } |
134 | |
135 | /// View the underlying data as a subslice of the original data. |
136 | /// |
137 | /// The slice returned has the same lifetime as the original slice, and so |
138 | /// the iterator can continue to be used while this exists. |
139 | /// |
140 | /// # Examples |
141 | /// |
142 | /// ``` |
143 | /// use bstr::ByteSlice; |
144 | /// |
145 | /// let mut it = b"abc" .grapheme_indices(); |
146 | /// |
147 | /// assert_eq!(b"abc" , it.as_bytes()); |
148 | /// it.next(); |
149 | /// assert_eq!(b"bc" , it.as_bytes()); |
150 | /// it.next(); |
151 | /// it.next(); |
152 | /// assert_eq!(b"" , it.as_bytes()); |
153 | /// ``` |
154 | #[inline ] |
155 | pub fn as_bytes(&self) -> &'a [u8] { |
156 | self.bs |
157 | } |
158 | } |
159 | |
160 | impl<'a> Iterator for GraphemeIndices<'a> { |
161 | type Item = (usize, usize, &'a str); |
162 | |
163 | #[inline ] |
164 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { |
165 | let index: usize = self.forward_index; |
166 | let (grapheme: &str, size: usize) = decode_grapheme(self.bs); |
167 | if size == 0 { |
168 | return None; |
169 | } |
170 | self.bs = &self.bs[size..]; |
171 | self.forward_index += size; |
172 | Some((index, index + size, grapheme)) |
173 | } |
174 | } |
175 | |
176 | impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { |
177 | #[inline ] |
178 | fn next_back(&mut self) -> Option<(usize, usize, &'a str)> { |
179 | let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs); |
180 | if size == 0 { |
181 | return None; |
182 | } |
183 | self.bs = &self.bs[..self.bs.len() - size]; |
184 | self.reverse_index -= size; |
185 | Some((self.reverse_index, self.reverse_index + size, grapheme)) |
186 | } |
187 | } |
188 | |
189 | /// Decode a grapheme from the given byte string. |
190 | /// |
191 | /// This returns the resulting grapheme (which may be a Unicode replacement |
192 | /// codepoint if invalid UTF-8 was found), along with the number of bytes |
193 | /// decoded in the byte string. The number of bytes decoded may not be the |
194 | /// same as the length of grapheme in the case where invalid UTF-8 is found. |
195 | pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { |
196 | if bs.is_empty() { |
197 | ("" , 0) |
198 | } else if bs.len() >= 2 |
199 | && bs[0].is_ascii() |
200 | && bs[1].is_ascii() |
201 | && !bs[0].is_ascii_whitespace() |
202 | { |
203 | // FIXME: It is somewhat sad that we have to special case this, but it |
204 | // leads to a significant speed up in predominantly ASCII text. The |
205 | // issue here is that the DFA has a bit of overhead, and running it for |
206 | // every byte in mostly ASCII text results in a bit slowdown. We should |
207 | // re-litigate this once regex-automata 0.3 is out, but it might be |
208 | // hard to avoid the special case. A DFA is always going to at least |
209 | // require some memory access. |
210 | |
211 | // Safe because all ASCII bytes are valid UTF-8. |
212 | let grapheme = unsafe { bs[..1].to_str_unchecked() }; |
213 | (grapheme, 1) |
214 | } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) { |
215 | // Safe because a match can only occur for valid UTF-8. |
216 | let grapheme = unsafe { bs[..end].to_str_unchecked() }; |
217 | (grapheme, grapheme.len()) |
218 | } else { |
219 | const INVALID: &'static str = " \u{FFFD}" ; |
220 | // No match on non-empty bytes implies we found invalid UTF-8. |
221 | let (_, size) = utf8::decode_lossy(bs); |
222 | (INVALID, size) |
223 | } |
224 | } |
225 | |
226 | fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) { |
227 | if bs.is_empty() { |
228 | ("" , 0) |
229 | } else if let Some(mut start: usize) = GRAPHEME_BREAK_REV.rfind(bytes:bs) { |
230 | start = adjust_rev_for_regional_indicator(bs, i:start); |
231 | // Safe because a match can only occur for valid UTF-8. |
232 | let grapheme: &str = unsafe { bs[start..].to_str_unchecked() }; |
233 | (grapheme, grapheme.len()) |
234 | } else { |
235 | const INVALID: &'static str = " \u{FFFD}" ; |
236 | // No match on non-empty bytes implies we found invalid UTF-8. |
237 | let (_, size: usize) = utf8::decode_last_lossy(slice:bs); |
238 | (INVALID, size) |
239 | } |
240 | } |
241 | |
242 | /// Return the correct offset for the next grapheme decoded at the end of the |
243 | /// given byte string, where `i` is the initial guess. In particular, |
244 | /// `&bs[i..]` represents the candidate grapheme. |
245 | /// |
246 | /// `i` is returned by this function in all cases except when `&bs[i..]` is |
247 | /// a pair of regional indicator codepoints. In that case, if an odd number of |
248 | /// additional regional indicator codepoints precedes `i`, then `i` is |
249 | /// adjusted such that it points to only a single regional indicator. |
250 | /// |
251 | /// This "fixing" is necessary to handle the requirement that a break cannot |
252 | /// occur between regional indicators where it would cause an odd number of |
253 | /// regional indicators to exist before the break from the *start* of the |
254 | /// string. A reverse regex cannot detect this case easily without look-around. |
255 | fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { |
256 | // All regional indicators use a 4 byte encoding, and we only care about |
257 | // the case where we found a pair of regional indicators. |
258 | if bs.len() - i != 8 { |
259 | return i; |
260 | } |
261 | // Count all contiguous occurrences of regional indicators. If there's an |
262 | // even number of them, then we can accept the pair we found. Otherwise, |
263 | // we can only take one of them. |
264 | // |
265 | // FIXME: This is quadratic in the worst case, e.g., a string of just |
266 | // regional indicator codepoints. A fix probably requires refactoring this |
267 | // code a bit such that we don't rescan regional indicators. |
268 | let mut count: i32 = 0; |
269 | while let Some(start: usize) = REGIONAL_INDICATOR_REV.rfind(bytes:bs) { |
270 | bs = &bs[..start]; |
271 | count += 1; |
272 | } |
273 | if count % 2 == 0 { |
274 | i |
275 | } else { |
276 | i + 4 |
277 | } |
278 | } |
279 | |
280 | #[cfg (all(test, feature = "std" ))] |
281 | mod tests { |
282 | #[cfg (not(miri))] |
283 | use ucd_parse::GraphemeClusterBreakTest; |
284 | |
285 | use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS}; |
286 | |
287 | use super::*; |
288 | |
289 | #[test ] |
290 | #[cfg (not(miri))] |
291 | fn forward_ucd() { |
292 | for (i, test) in ucdtests().into_iter().enumerate() { |
293 | let given = test .grapheme_clusters.concat(); |
294 | let got: Vec<String> = Graphemes::new(given.as_bytes()) |
295 | .map(|cluster| cluster.to_string()) |
296 | .collect(); |
297 | assert_eq!( |
298 | test .grapheme_clusters, |
299 | got, |
300 | " \ngrapheme forward break test {} failed: \n\ |
301 | given: {:?}\n\ |
302 | expected: {:?}\n\ |
303 | got: {:?}\n" , |
304 | i, |
305 | uniescape(&given), |
306 | uniescape_vec(&test .grapheme_clusters), |
307 | uniescape_vec(&got), |
308 | ); |
309 | } |
310 | } |
311 | |
312 | #[test ] |
313 | #[cfg (not(miri))] |
314 | fn reverse_ucd() { |
315 | for (i, test) in ucdtests().into_iter().enumerate() { |
316 | let given = test .grapheme_clusters.concat(); |
317 | let mut got: Vec<String> = Graphemes::new(given.as_bytes()) |
318 | .rev() |
319 | .map(|cluster| cluster.to_string()) |
320 | .collect(); |
321 | got.reverse(); |
322 | assert_eq!( |
323 | test .grapheme_clusters, |
324 | got, |
325 | " \n\ngrapheme reverse break test {} failed: \n\ |
326 | given: {:?}\n\ |
327 | expected: {:?}\n\ |
328 | got: {:?}\n" , |
329 | i, |
330 | uniescape(&given), |
331 | uniescape_vec(&test .grapheme_clusters), |
332 | uniescape_vec(&got), |
333 | ); |
334 | } |
335 | } |
336 | |
337 | #[test ] |
338 | fn forward_lossy() { |
339 | for &(expected, input) in LOSSY_TESTS { |
340 | let got = Graphemes::new(input.as_bytes()).collect::<String>(); |
341 | assert_eq!(expected, got); |
342 | } |
343 | } |
344 | |
345 | #[test ] |
346 | fn reverse_lossy() { |
347 | for &(expected, input) in LOSSY_TESTS { |
348 | let expected: String = expected.chars().rev().collect(); |
349 | let got = |
350 | Graphemes::new(input.as_bytes()).rev().collect::<String>(); |
351 | assert_eq!(expected, got); |
352 | } |
353 | } |
354 | |
355 | #[cfg (not(miri))] |
356 | fn uniescape(s: &str) -> String { |
357 | s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() |
358 | } |
359 | |
360 | #[cfg (not(miri))] |
361 | fn uniescape_vec(strs: &[String]) -> Vec<String> { |
362 | strs.iter().map(|s| uniescape(s)).collect() |
363 | } |
364 | |
365 | /// Return all of the UCD for grapheme breaks. |
366 | #[cfg (not(miri))] |
367 | fn ucdtests() -> Vec<GraphemeClusterBreakTest> { |
368 | const TESTDATA: &'static str = |
369 | include_str!("data/GraphemeBreakTest.txt" ); |
370 | |
371 | let mut tests = vec![]; |
372 | for mut line in TESTDATA.lines() { |
373 | line = line.trim(); |
374 | if line.starts_with("#" ) || line.contains("surrogate" ) { |
375 | continue; |
376 | } |
377 | tests.push(line.parse().unwrap()); |
378 | } |
379 | tests |
380 | } |
381 | } |
382 | |