| 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; |
| 2 | |
| 3 | use crate::{ |
| 4 | ext_slice::ByteSlice, |
| 5 | unicode::fsm::{ |
| 6 | grapheme_break_fwd::GRAPHEME_BREAK_FWD, |
| 7 | grapheme_break_rev::GRAPHEME_BREAK_REV, |
| 8 | regional_indicator_rev::REGIONAL_INDICATOR_REV, |
| 9 | }, |
| 10 | utf8, |
| 11 | }; |
| 12 | |
| 13 | /// An iterator over grapheme clusters in a byte string. |
| 14 | /// |
| 15 | /// This iterator is typically constructed by |
| 16 | /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes). |
| 17 | /// |
| 18 | /// Unicode defines a grapheme cluster as an *approximation* to a single user |
| 19 | /// visible character. A grapheme cluster, or just "grapheme," is made up of |
| 20 | /// one or more codepoints. For end user oriented tasks, one should generally |
| 21 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which |
| 22 | /// always yields one codepoint at a time. |
| 23 | /// |
| 24 | /// Since graphemes are made up of one or more codepoints, this iterator yields |
| 25 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints |
| 26 | /// are [substituted](index.html#handling-of-invalid-utf-8). |
| 27 | /// |
| 28 | /// This iterator can be used in reverse. When reversed, exactly the same |
| 29 | /// set of grapheme clusters are yielded, but in reverse order. |
| 30 | /// |
| 31 | /// This iterator only yields *extended* grapheme clusters, in accordance with |
| 32 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). |
| 33 | #[derive (Clone, Debug)] |
| 34 | pub struct Graphemes<'a> { |
| 35 | bs: &'a [u8], |
| 36 | } |
| 37 | |
| 38 | impl<'a> Graphemes<'a> { |
| 39 | pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> { |
| 40 | Graphemes { bs } |
| 41 | } |
| 42 | |
| 43 | /// View the underlying data as a subslice of the original data. |
| 44 | /// |
| 45 | /// The slice returned has the same lifetime as the original slice, and so |
| 46 | /// the iterator can continue to be used while this exists. |
| 47 | /// |
| 48 | /// # Examples |
| 49 | /// |
| 50 | /// ``` |
| 51 | /// use bstr::ByteSlice; |
| 52 | /// |
| 53 | /// let mut it = b"abc" .graphemes(); |
| 54 | /// |
| 55 | /// assert_eq!(b"abc" , it.as_bytes()); |
| 56 | /// it.next(); |
| 57 | /// assert_eq!(b"bc" , it.as_bytes()); |
| 58 | /// it.next(); |
| 59 | /// it.next(); |
| 60 | /// assert_eq!(b"" , it.as_bytes()); |
| 61 | /// ``` |
| 62 | #[inline ] |
| 63 | pub fn as_bytes(&self) -> &'a [u8] { |
| 64 | self.bs |
| 65 | } |
| 66 | } |
| 67 | |
| 68 | impl<'a> Iterator for Graphemes<'a> { |
| 69 | type Item = &'a str; |
| 70 | |
| 71 | #[inline ] |
| 72 | fn next(&mut self) -> Option<&'a str> { |
| 73 | let (grapheme: &str, size: usize) = decode_grapheme(self.bs); |
| 74 | if size == 0 { |
| 75 | return None; |
| 76 | } |
| 77 | self.bs = &self.bs[size..]; |
| 78 | Some(grapheme) |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | impl<'a> DoubleEndedIterator for Graphemes<'a> { |
| 83 | #[inline ] |
| 84 | fn next_back(&mut self) -> Option<&'a str> { |
| 85 | let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs); |
| 86 | if size == 0 { |
| 87 | return None; |
| 88 | } |
| 89 | self.bs = &self.bs[..self.bs.len() - size]; |
| 90 | Some(grapheme) |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | /// An iterator over grapheme clusters in a byte string and their byte index |
| 95 | /// positions. |
| 96 | /// |
| 97 | /// This iterator is typically constructed by |
| 98 | /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices). |
| 99 | /// |
| 100 | /// Unicode defines a grapheme cluster as an *approximation* to a single user |
| 101 | /// visible character. A grapheme cluster, or just "grapheme," is made up of |
| 102 | /// one or more codepoints. For end user oriented tasks, one should generally |
| 103 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which |
| 104 | /// always yields one codepoint at a time. |
| 105 | /// |
| 106 | /// Since graphemes are made up of one or more codepoints, this iterator |
| 107 | /// yields `&str` elements (along with their start and end byte offsets). |
| 108 | /// When invalid UTF-8 is encountered, replacement codepoints are |
| 109 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the |
| 110 | /// indices yielded by this iterator may not correspond to the length of the |
| 111 | /// grapheme cluster yielded with those indices. For example, when this |
| 112 | /// iterator encounters `\xFF` in the byte string, then it will yield a pair |
| 113 | /// of indices ranging over a single byte, but will provide an `&str` |
| 114 | /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when |
| 115 | /// given only valid UTF-8, then all indices are in exact correspondence with |
| 116 | /// their paired grapheme cluster. |
| 117 | /// |
| 118 | /// This iterator can be used in reverse. When reversed, exactly the same |
| 119 | /// set of grapheme clusters are yielded, but in reverse order. |
| 120 | /// |
| 121 | /// This iterator only yields *extended* grapheme clusters, in accordance with |
| 122 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). |
| 123 | #[derive (Clone, Debug)] |
| 124 | pub struct GraphemeIndices<'a> { |
| 125 | bs: &'a [u8], |
| 126 | forward_index: usize, |
| 127 | reverse_index: usize, |
| 128 | } |
| 129 | |
| 130 | impl<'a> GraphemeIndices<'a> { |
| 131 | pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { |
| 132 | GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } |
| 133 | } |
| 134 | |
| 135 | /// View the underlying data as a subslice of the original data. |
| 136 | /// |
| 137 | /// The slice returned has the same lifetime as the original slice, and so |
| 138 | /// the iterator can continue to be used while this exists. |
| 139 | /// |
| 140 | /// # Examples |
| 141 | /// |
| 142 | /// ``` |
| 143 | /// use bstr::ByteSlice; |
| 144 | /// |
| 145 | /// let mut it = b"abc" .grapheme_indices(); |
| 146 | /// |
| 147 | /// assert_eq!(b"abc" , it.as_bytes()); |
| 148 | /// it.next(); |
| 149 | /// assert_eq!(b"bc" , it.as_bytes()); |
| 150 | /// it.next(); |
| 151 | /// it.next(); |
| 152 | /// assert_eq!(b"" , it.as_bytes()); |
| 153 | /// ``` |
| 154 | #[inline ] |
| 155 | pub fn as_bytes(&self) -> &'a [u8] { |
| 156 | self.bs |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | impl<'a> Iterator for GraphemeIndices<'a> { |
| 161 | type Item = (usize, usize, &'a str); |
| 162 | |
| 163 | #[inline ] |
| 164 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { |
| 165 | let index: usize = self.forward_index; |
| 166 | let (grapheme: &str, size: usize) = decode_grapheme(self.bs); |
| 167 | if size == 0 { |
| 168 | return None; |
| 169 | } |
| 170 | self.bs = &self.bs[size..]; |
| 171 | self.forward_index += size; |
| 172 | Some((index, index + size, grapheme)) |
| 173 | } |
| 174 | } |
| 175 | |
| 176 | impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { |
| 177 | #[inline ] |
| 178 | fn next_back(&mut self) -> Option<(usize, usize, &'a str)> { |
| 179 | let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs); |
| 180 | if size == 0 { |
| 181 | return None; |
| 182 | } |
| 183 | self.bs = &self.bs[..self.bs.len() - size]; |
| 184 | self.reverse_index -= size; |
| 185 | Some((self.reverse_index, self.reverse_index + size, grapheme)) |
| 186 | } |
| 187 | } |
| 188 | |
| 189 | /// Decode a grapheme from the given byte string. |
| 190 | /// |
| 191 | /// This returns the resulting grapheme (which may be a Unicode replacement |
| 192 | /// codepoint if invalid UTF-8 was found), along with the number of bytes |
| 193 | /// decoded in the byte string. The number of bytes decoded may not be the |
| 194 | /// same as the length of grapheme in the case where invalid UTF-8 is found. |
| 195 | pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { |
| 196 | if bs.is_empty() { |
| 197 | ("" , 0) |
| 198 | } else if bs.len() >= 2 |
| 199 | && bs[0].is_ascii() |
| 200 | && bs[1].is_ascii() |
| 201 | && !bs[0].is_ascii_whitespace() |
| 202 | { |
| 203 | // FIXME: It is somewhat sad that we have to special case this, but it |
| 204 | // leads to a significant speed up in predominantly ASCII text. The |
| 205 | // issue here is that the DFA has a bit of overhead, and running it for |
| 206 | // every byte in mostly ASCII text results in a bit slowdown. We should |
| 207 | // re-litigate this once regex-automata 0.3 is out, but it might be |
| 208 | // hard to avoid the special case. A DFA is always going to at least |
| 209 | // require some memory access. |
| 210 | |
| 211 | // Safe because all ASCII bytes are valid UTF-8. |
| 212 | let grapheme = unsafe { bs[..1].to_str_unchecked() }; |
| 213 | (grapheme, 1) |
| 214 | } else if let Some(hm) = { |
| 215 | let input = Input::new(bs).anchored(Anchored::Yes); |
| 216 | GRAPHEME_BREAK_FWD.try_search_fwd(&input).unwrap() |
| 217 | } { |
| 218 | // Safe because a match can only occur for valid UTF-8. |
| 219 | let grapheme = unsafe { bs[..hm.offset()].to_str_unchecked() }; |
| 220 | (grapheme, grapheme.len()) |
| 221 | } else { |
| 222 | const INVALID: &str = " \u{FFFD}" ; |
| 223 | // No match on non-empty bytes implies we found invalid UTF-8. |
| 224 | let (_, size) = utf8::decode_lossy(bs); |
| 225 | (INVALID, size) |
| 226 | } |
| 227 | } |
| 228 | |
| 229 | fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) { |
| 230 | if bs.is_empty() { |
| 231 | ("" , 0) |
| 232 | } else if let Some(hm: HalfMatch) = { |
| 233 | let input: Input<'_> = Input::new(bs).anchored(mode:Anchored::Yes); |
| 234 | GRAPHEME_BREAK_REV.try_search_rev(&input).unwrap() |
| 235 | } { |
| 236 | let start: usize = adjust_rev_for_regional_indicator(bs, i:hm.offset()); |
| 237 | // Safe because a match can only occur for valid UTF-8. |
| 238 | let grapheme: &str = unsafe { bs[start..].to_str_unchecked() }; |
| 239 | (grapheme, grapheme.len()) |
| 240 | } else { |
| 241 | const INVALID: &str = " \u{FFFD}" ; |
| 242 | // No match on non-empty bytes implies we found invalid UTF-8. |
| 243 | let (_, size: usize) = utf8::decode_last_lossy(slice:bs); |
| 244 | (INVALID, size) |
| 245 | } |
| 246 | } |
| 247 | |
| 248 | /// Return the correct offset for the next grapheme decoded at the end of the |
| 249 | /// given byte string, where `i` is the initial guess. In particular, |
| 250 | /// `&bs[i..]` represents the candidate grapheme. |
| 251 | /// |
| 252 | /// `i` is returned by this function in all cases except when `&bs[i..]` is |
| 253 | /// a pair of regional indicator codepoints. In that case, if an odd number of |
| 254 | /// additional regional indicator codepoints precedes `i`, then `i` is |
| 255 | /// adjusted such that it points to only a single regional indicator. |
| 256 | /// |
| 257 | /// This "fixing" is necessary to handle the requirement that a break cannot |
| 258 | /// occur between regional indicators where it would cause an odd number of |
| 259 | /// regional indicators to exist before the break from the *start* of the |
| 260 | /// string. A reverse regex cannot detect this case easily without look-around. |
| 261 | fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { |
| 262 | // All regional indicators use a 4 byte encoding, and we only care about |
| 263 | // the case where we found a pair of regional indicators. |
| 264 | if bs.len() - i != 8 { |
| 265 | return i; |
| 266 | } |
| 267 | // Count all contiguous occurrences of regional indicators. If there's an |
| 268 | // even number of them, then we can accept the pair we found. Otherwise, |
| 269 | // we can only take one of them. |
| 270 | // |
| 271 | // FIXME: This is quadratic in the worst case, e.g., a string of just |
| 272 | // regional indicator codepoints. A fix probably requires refactoring this |
| 273 | // code a bit such that we don't rescan regional indicators. |
| 274 | let mut count = 0; |
| 275 | while let Some(hm) = { |
| 276 | let input = Input::new(bs).anchored(Anchored::Yes); |
| 277 | REGIONAL_INDICATOR_REV.try_search_rev(&input).unwrap() |
| 278 | } { |
| 279 | bs = &bs[..hm.offset()]; |
| 280 | count += 1; |
| 281 | } |
| 282 | if count % 2 == 0 { |
| 283 | i |
| 284 | } else { |
| 285 | i + 4 |
| 286 | } |
| 287 | } |
| 288 | |
| 289 | #[cfg (all(test, feature = "std" ))] |
| 290 | mod tests { |
| 291 | use alloc::{ |
| 292 | string::{String, ToString}, |
| 293 | vec, |
| 294 | vec::Vec, |
| 295 | }; |
| 296 | |
| 297 | #[cfg (not(miri))] |
| 298 | use ucd_parse::GraphemeClusterBreakTest; |
| 299 | |
| 300 | use crate::tests::LOSSY_TESTS; |
| 301 | |
| 302 | use super::*; |
| 303 | |
| 304 | #[test ] |
| 305 | #[cfg (not(miri))] |
| 306 | fn forward_ucd() { |
| 307 | for (i, test) in ucdtests().into_iter().enumerate() { |
| 308 | let given = test .grapheme_clusters.concat(); |
| 309 | let got: Vec<String> = Graphemes::new(given.as_bytes()) |
| 310 | .map(|cluster| cluster.to_string()) |
| 311 | .collect(); |
| 312 | assert_eq!( |
| 313 | test.grapheme_clusters, |
| 314 | got, |
| 315 | " \ngrapheme forward break test {} failed: \n\ |
| 316 | given: {:?} \n\ |
| 317 | expected: {:?} \n\ |
| 318 | got: {:?} \n" , |
| 319 | i, |
| 320 | uniescape(&given), |
| 321 | uniescape_vec(&test.grapheme_clusters), |
| 322 | uniescape_vec(&got), |
| 323 | ); |
| 324 | } |
| 325 | } |
| 326 | |
| 327 | #[test ] |
| 328 | #[cfg (not(miri))] |
| 329 | fn reverse_ucd() { |
| 330 | for (i, test) in ucdtests().into_iter().enumerate() { |
| 331 | let given = test .grapheme_clusters.concat(); |
| 332 | let mut got: Vec<String> = Graphemes::new(given.as_bytes()) |
| 333 | .rev() |
| 334 | .map(|cluster| cluster.to_string()) |
| 335 | .collect(); |
| 336 | got.reverse(); |
| 337 | assert_eq!( |
| 338 | test.grapheme_clusters, |
| 339 | got, |
| 340 | " \n\ngrapheme reverse break test {} failed: \n\ |
| 341 | given: {:?} \n\ |
| 342 | expected: {:?} \n\ |
| 343 | got: {:?} \n" , |
| 344 | i, |
| 345 | uniescape(&given), |
| 346 | uniescape_vec(&test.grapheme_clusters), |
| 347 | uniescape_vec(&got), |
| 348 | ); |
| 349 | } |
| 350 | } |
| 351 | |
| 352 | #[test ] |
| 353 | fn forward_lossy() { |
| 354 | for &(expected, input) in LOSSY_TESTS { |
| 355 | let got = Graphemes::new(input.as_bytes()).collect::<String>(); |
| 356 | assert_eq!(expected, got); |
| 357 | } |
| 358 | } |
| 359 | |
| 360 | #[test ] |
| 361 | fn reverse_lossy() { |
| 362 | for &(expected, input) in LOSSY_TESTS { |
| 363 | let expected: String = expected.chars().rev().collect(); |
| 364 | let got = |
| 365 | Graphemes::new(input.as_bytes()).rev().collect::<String>(); |
| 366 | assert_eq!(expected, got); |
| 367 | } |
| 368 | } |
| 369 | |
| 370 | #[cfg (not(miri))] |
| 371 | fn uniescape(s: &str) -> String { |
| 372 | s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() |
| 373 | } |
| 374 | |
| 375 | #[cfg (not(miri))] |
| 376 | fn uniescape_vec(strs: &[String]) -> Vec<String> { |
| 377 | strs.iter().map(|s| uniescape(s)).collect() |
| 378 | } |
| 379 | |
| 380 | /// Return all of the UCD for grapheme breaks. |
| 381 | #[cfg (not(miri))] |
| 382 | fn ucdtests() -> Vec<GraphemeClusterBreakTest> { |
| 383 | const TESTDATA: &str = include_str!("data/GraphemeBreakTest.txt" ); |
| 384 | |
| 385 | let mut tests = vec![]; |
| 386 | for mut line in TESTDATA.lines() { |
| 387 | line = line.trim(); |
| 388 | if line.starts_with("#" ) || line.contains("surrogate" ) { |
| 389 | continue; |
| 390 | } |
| 391 | tests.push(line.parse().unwrap()); |
| 392 | } |
| 393 | tests |
| 394 | } |
| 395 | } |
| 396 | |