| 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; | 
| 2 |  | 
|---|
| 3 | use crate::{ | 
|---|
| 4 | ext_slice::ByteSlice, | 
|---|
| 5 | unicode::fsm::{ | 
|---|
| 6 | grapheme_break_fwd::GRAPHEME_BREAK_FWD, | 
|---|
| 7 | grapheme_break_rev::GRAPHEME_BREAK_REV, | 
|---|
| 8 | regional_indicator_rev::REGIONAL_INDICATOR_REV, | 
|---|
| 9 | }, | 
|---|
| 10 | utf8, | 
|---|
| 11 | }; | 
|---|
| 12 |  | 
|---|
| 13 | /// An iterator over grapheme clusters in a byte string. | 
|---|
| 14 | /// | 
|---|
| 15 | /// This iterator is typically constructed by | 
|---|
| 16 | /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes). | 
|---|
| 17 | /// | 
|---|
| 18 | /// Unicode defines a grapheme cluster as an *approximation* to a single user | 
|---|
| 19 | /// visible character. A grapheme cluster, or just "grapheme," is made up of | 
|---|
| 20 | /// one or more codepoints. For end user oriented tasks, one should generally | 
|---|
| 21 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which | 
|---|
| 22 | /// always yields one codepoint at a time. | 
|---|
| 23 | /// | 
|---|
| 24 | /// Since graphemes are made up of one or more codepoints, this iterator yields | 
|---|
| 25 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints | 
|---|
| 26 | /// are [substituted](index.html#handling-of-invalid-utf-8). | 
|---|
| 27 | /// | 
|---|
| 28 | /// This iterator can be used in reverse. When reversed, exactly the same | 
|---|
| 29 | /// set of grapheme clusters are yielded, but in reverse order. | 
|---|
| 30 | /// | 
|---|
| 31 | /// This iterator only yields *extended* grapheme clusters, in accordance with | 
|---|
| 32 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). | 
|---|
| 33 | #[ derive(Clone, Debug)] | 
|---|
| 34 | pub struct Graphemes<'a> { | 
|---|
| 35 | bs: &'a [u8], | 
|---|
| 36 | } | 
|---|
| 37 |  | 
|---|
| 38 | impl<'a> Graphemes<'a> { | 
|---|
| 39 | pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> { | 
|---|
| 40 | Graphemes { bs } | 
|---|
| 41 | } | 
|---|
| 42 |  | 
|---|
| 43 | /// View the underlying data as a subslice of the original data. | 
|---|
| 44 | /// | 
|---|
| 45 | /// The slice returned has the same lifetime as the original slice, and so | 
|---|
| 46 | /// the iterator can continue to be used while this exists. | 
|---|
| 47 | /// | 
|---|
| 48 | /// # Examples | 
|---|
| 49 | /// | 
|---|
| 50 | /// ``` | 
|---|
| 51 | /// use bstr::ByteSlice; | 
|---|
| 52 | /// | 
|---|
| 53 | /// let mut it = b"abc".graphemes(); | 
|---|
| 54 | /// | 
|---|
| 55 | /// assert_eq!( b"abc", it.as_bytes()); | 
|---|
| 56 | /// it.next(); | 
|---|
| 57 | /// assert_eq!( b"bc", it.as_bytes()); | 
|---|
| 58 | /// it.next(); | 
|---|
| 59 | /// it.next(); | 
|---|
| 60 | /// assert_eq!( b"", it.as_bytes()); | 
|---|
| 61 | /// ``` | 
|---|
| 62 | #[ inline] | 
|---|
| 63 | pub fn as_bytes(&self) -> &'a [u8] { | 
|---|
| 64 | self.bs | 
|---|
| 65 | } | 
|---|
| 66 | } | 
|---|
| 67 |  | 
|---|
| 68 | impl<'a> Iterator for Graphemes<'a> { | 
|---|
| 69 | type Item = &'a str; | 
|---|
| 70 |  | 
|---|
| 71 | #[ inline] | 
|---|
| 72 | fn next(&mut self) -> Option<&'a str> { | 
|---|
| 73 | let (grapheme: &str, size: usize) = decode_grapheme(self.bs); | 
|---|
| 74 | if size == 0 { | 
|---|
| 75 | return None; | 
|---|
| 76 | } | 
|---|
| 77 | self.bs = &self.bs[size..]; | 
|---|
| 78 | Some(grapheme) | 
|---|
| 79 | } | 
|---|
| 80 | } | 
|---|
| 81 |  | 
|---|
| 82 | impl<'a> DoubleEndedIterator for Graphemes<'a> { | 
|---|
| 83 | #[ inline] | 
|---|
| 84 | fn next_back(&mut self) -> Option<&'a str> { | 
|---|
| 85 | let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs); | 
|---|
| 86 | if size == 0 { | 
|---|
| 87 | return None; | 
|---|
| 88 | } | 
|---|
| 89 | self.bs = &self.bs[..self.bs.len() - size]; | 
|---|
| 90 | Some(grapheme) | 
|---|
| 91 | } | 
|---|
| 92 | } | 
|---|
| 93 |  | 
|---|
| 94 | /// An iterator over grapheme clusters in a byte string and their byte index | 
|---|
| 95 | /// positions. | 
|---|
| 96 | /// | 
|---|
| 97 | /// This iterator is typically constructed by | 
|---|
| 98 | /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices). | 
|---|
| 99 | /// | 
|---|
| 100 | /// Unicode defines a grapheme cluster as an *approximation* to a single user | 
|---|
| 101 | /// visible character. A grapheme cluster, or just "grapheme," is made up of | 
|---|
| 102 | /// one or more codepoints. For end user oriented tasks, one should generally | 
|---|
| 103 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which | 
|---|
| 104 | /// always yields one codepoint at a time. | 
|---|
| 105 | /// | 
|---|
| 106 | /// Since graphemes are made up of one or more codepoints, this iterator | 
|---|
| 107 | /// yields `&str` elements (along with their start and end byte offsets). | 
|---|
| 108 | /// When invalid UTF-8 is encountered, replacement codepoints are | 
|---|
| 109 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the | 
|---|
| 110 | /// indices yielded by this iterator may not correspond to the length of the | 
|---|
| 111 | /// grapheme cluster yielded with those indices. For example, when this | 
|---|
| 112 | /// iterator encounters `\xFF` in the byte string, then it will yield a pair | 
|---|
| 113 | /// of indices ranging over a single byte, but will provide an `&str` | 
|---|
| 114 | /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when | 
|---|
| 115 | /// given only valid UTF-8, then all indices are in exact correspondence with | 
|---|
| 116 | /// their paired grapheme cluster. | 
|---|
| 117 | /// | 
|---|
| 118 | /// This iterator can be used in reverse. When reversed, exactly the same | 
|---|
| 119 | /// set of grapheme clusters are yielded, but in reverse order. | 
|---|
| 120 | /// | 
|---|
| 121 | /// This iterator only yields *extended* grapheme clusters, in accordance with | 
|---|
| 122 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). | 
|---|
| 123 | #[ derive(Clone, Debug)] | 
|---|
| 124 | pub struct GraphemeIndices<'a> { | 
|---|
| 125 | bs: &'a [u8], | 
|---|
| 126 | forward_index: usize, | 
|---|
| 127 | reverse_index: usize, | 
|---|
| 128 | } | 
|---|
| 129 |  | 
|---|
| 130 | impl<'a> GraphemeIndices<'a> { | 
|---|
| 131 | pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { | 
|---|
| 132 | GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } | 
|---|
| 133 | } | 
|---|
| 134 |  | 
|---|
| 135 | /// View the underlying data as a subslice of the original data. | 
|---|
| 136 | /// | 
|---|
| 137 | /// The slice returned has the same lifetime as the original slice, and so | 
|---|
| 138 | /// the iterator can continue to be used while this exists. | 
|---|
| 139 | /// | 
|---|
| 140 | /// # Examples | 
|---|
| 141 | /// | 
|---|
| 142 | /// ``` | 
|---|
| 143 | /// use bstr::ByteSlice; | 
|---|
| 144 | /// | 
|---|
| 145 | /// let mut it = b"abc".grapheme_indices(); | 
|---|
| 146 | /// | 
|---|
| 147 | /// assert_eq!( b"abc", it.as_bytes()); | 
|---|
| 148 | /// it.next(); | 
|---|
| 149 | /// assert_eq!( b"bc", it.as_bytes()); | 
|---|
| 150 | /// it.next(); | 
|---|
| 151 | /// it.next(); | 
|---|
| 152 | /// assert_eq!( b"", it.as_bytes()); | 
|---|
| 153 | /// ``` | 
|---|
| 154 | #[ inline] | 
|---|
| 155 | pub fn as_bytes(&self) -> &'a [u8] { | 
|---|
| 156 | self.bs | 
|---|
| 157 | } | 
|---|
| 158 | } | 
|---|
| 159 |  | 
|---|
| 160 | impl<'a> Iterator for GraphemeIndices<'a> { | 
|---|
| 161 | type Item = (usize, usize, &'a str); | 
|---|
| 162 |  | 
|---|
| 163 | #[ inline] | 
|---|
| 164 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { | 
|---|
| 165 | let index: usize = self.forward_index; | 
|---|
| 166 | let (grapheme: &str, size: usize) = decode_grapheme(self.bs); | 
|---|
| 167 | if size == 0 { | 
|---|
| 168 | return None; | 
|---|
| 169 | } | 
|---|
| 170 | self.bs = &self.bs[size..]; | 
|---|
| 171 | self.forward_index += size; | 
|---|
| 172 | Some((index, index + size, grapheme)) | 
|---|
| 173 | } | 
|---|
| 174 | } | 
|---|
| 175 |  | 
|---|
| 176 | impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { | 
|---|
| 177 | #[ inline] | 
|---|
| 178 | fn next_back(&mut self) -> Option<(usize, usize, &'a str)> { | 
|---|
| 179 | let (grapheme: &str, size: usize) = decode_last_grapheme(self.bs); | 
|---|
| 180 | if size == 0 { | 
|---|
| 181 | return None; | 
|---|
| 182 | } | 
|---|
| 183 | self.bs = &self.bs[..self.bs.len() - size]; | 
|---|
| 184 | self.reverse_index -= size; | 
|---|
| 185 | Some((self.reverse_index, self.reverse_index + size, grapheme)) | 
|---|
| 186 | } | 
|---|
| 187 | } | 
|---|
| 188 |  | 
|---|
| 189 | /// Decode a grapheme from the given byte string. | 
|---|
| 190 | /// | 
|---|
| 191 | /// This returns the resulting grapheme (which may be a Unicode replacement | 
|---|
| 192 | /// codepoint if invalid UTF-8 was found), along with the number of bytes | 
|---|
| 193 | /// decoded in the byte string. The number of bytes decoded may not be the | 
|---|
| 194 | /// same as the length of grapheme in the case where invalid UTF-8 is found. | 
|---|
| 195 | pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { | 
|---|
| 196 | if bs.is_empty() { | 
|---|
| 197 | ( "", 0) | 
|---|
| 198 | } else if bs.len() >= 2 | 
|---|
| 199 | && bs[0].is_ascii() | 
|---|
| 200 | && bs[1].is_ascii() | 
|---|
| 201 | && !bs[0].is_ascii_whitespace() | 
|---|
| 202 | { | 
|---|
| 203 | // FIXME: It is somewhat sad that we have to special case this, but it | 
|---|
| 204 | // leads to a significant speed up in predominantly ASCII text. The | 
|---|
| 205 | // issue here is that the DFA has a bit of overhead, and running it for | 
|---|
| 206 | // every byte in mostly ASCII text results in a bit slowdown. We should | 
|---|
| 207 | // re-litigate this once regex-automata 0.3 is out, but it might be | 
|---|
| 208 | // hard to avoid the special case. A DFA is always going to at least | 
|---|
| 209 | // require some memory access. | 
|---|
| 210 |  | 
|---|
| 211 | // Safe because all ASCII bytes are valid UTF-8. | 
|---|
| 212 | let grapheme = unsafe { bs[..1].to_str_unchecked() }; | 
|---|
| 213 | (grapheme, 1) | 
|---|
| 214 | } else if let Some(hm) = { | 
|---|
| 215 | let input = Input::new(bs).anchored(Anchored::Yes); | 
|---|
| 216 | GRAPHEME_BREAK_FWD.try_search_fwd(&input).unwrap() | 
|---|
| 217 | } { | 
|---|
| 218 | // Safe because a match can only occur for valid UTF-8. | 
|---|
| 219 | let grapheme = unsafe { bs[..hm.offset()].to_str_unchecked() }; | 
|---|
| 220 | (grapheme, grapheme.len()) | 
|---|
| 221 | } else { | 
|---|
| 222 | const INVALID: &str = "\u{FFFD} "; | 
|---|
| 223 | // No match on non-empty bytes implies we found invalid UTF-8. | 
|---|
| 224 | let (_, size) = utf8::decode_lossy(bs); | 
|---|
| 225 | (INVALID, size) | 
|---|
| 226 | } | 
|---|
| 227 | } | 
|---|
| 228 |  | 
|---|
| 229 | fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) { | 
|---|
| 230 | if bs.is_empty() { | 
|---|
| 231 | ( "", 0) | 
|---|
| 232 | } else if let Some(hm: HalfMatch) = { | 
|---|
| 233 | let input: Input<'_> = Input::new(bs).anchored(mode:Anchored::Yes); | 
|---|
| 234 | GRAPHEME_BREAK_REV.try_search_rev(&input).unwrap() | 
|---|
| 235 | } { | 
|---|
| 236 | let start: usize = adjust_rev_for_regional_indicator(bs, i:hm.offset()); | 
|---|
| 237 | // Safe because a match can only occur for valid UTF-8. | 
|---|
| 238 | let grapheme: &str = unsafe { bs[start..].to_str_unchecked() }; | 
|---|
| 239 | (grapheme, grapheme.len()) | 
|---|
| 240 | } else { | 
|---|
| 241 | const INVALID: &str = "\u{FFFD} "; | 
|---|
| 242 | // No match on non-empty bytes implies we found invalid UTF-8. | 
|---|
| 243 | let (_, size: usize) = utf8::decode_last_lossy(slice:bs); | 
|---|
| 244 | (INVALID, size) | 
|---|
| 245 | } | 
|---|
| 246 | } | 
|---|
| 247 |  | 
|---|
| 248 | /// Return the correct offset for the next grapheme decoded at the end of the | 
|---|
| 249 | /// given byte string, where `i` is the initial guess. In particular, | 
|---|
| 250 | /// `&bs[i..]` represents the candidate grapheme. | 
|---|
| 251 | /// | 
|---|
| 252 | /// `i` is returned by this function in all cases except when `&bs[i..]` is | 
|---|
| 253 | /// a pair of regional indicator codepoints. In that case, if an odd number of | 
|---|
| 254 | /// additional regional indicator codepoints precedes `i`, then `i` is | 
|---|
| 255 | /// adjusted such that it points to only a single regional indicator. | 
|---|
| 256 | /// | 
|---|
| 257 | /// This "fixing" is necessary to handle the requirement that a break cannot | 
|---|
| 258 | /// occur between regional indicators where it would cause an odd number of | 
|---|
| 259 | /// regional indicators to exist before the break from the *start* of the | 
|---|
| 260 | /// string. A reverse regex cannot detect this case easily without look-around. | 
|---|
| 261 | fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { | 
|---|
| 262 | // All regional indicators use a 4 byte encoding, and we only care about | 
|---|
| 263 | // the case where we found a pair of regional indicators. | 
|---|
| 264 | if bs.len() - i != 8 { | 
|---|
| 265 | return i; | 
|---|
| 266 | } | 
|---|
| 267 | // Count all contiguous occurrences of regional indicators. If there's an | 
|---|
| 268 | // even number of them, then we can accept the pair we found. Otherwise, | 
|---|
| 269 | // we can only take one of them. | 
|---|
| 270 | // | 
|---|
| 271 | // FIXME: This is quadratic in the worst case, e.g., a string of just | 
|---|
| 272 | // regional indicator codepoints. A fix probably requires refactoring this | 
|---|
| 273 | // code a bit such that we don't rescan regional indicators. | 
|---|
| 274 | let mut count = 0; | 
|---|
| 275 | while let Some(hm) = { | 
|---|
| 276 | let input = Input::new(bs).anchored(Anchored::Yes); | 
|---|
| 277 | REGIONAL_INDICATOR_REV.try_search_rev(&input).unwrap() | 
|---|
| 278 | } { | 
|---|
| 279 | bs = &bs[..hm.offset()]; | 
|---|
| 280 | count += 1; | 
|---|
| 281 | } | 
|---|
| 282 | if count % 2 == 0 { | 
|---|
| 283 | i | 
|---|
| 284 | } else { | 
|---|
| 285 | i + 4 | 
|---|
| 286 | } | 
|---|
| 287 | } | 
|---|
| 288 |  | 
|---|
| 289 | #[ cfg(all(test, feature = "std"))] | 
|---|
| 290 | mod tests { | 
|---|
| 291 | use alloc::{ | 
|---|
| 292 | string::{String, ToString}, | 
|---|
| 293 | vec, | 
|---|
| 294 | vec::Vec, | 
|---|
| 295 | }; | 
|---|
| 296 |  | 
|---|
| 297 | #[ cfg(not(miri))] | 
|---|
| 298 | use ucd_parse::GraphemeClusterBreakTest; | 
|---|
| 299 |  | 
|---|
| 300 | use crate::tests::LOSSY_TESTS; | 
|---|
| 301 |  | 
|---|
| 302 | use super::*; | 
|---|
| 303 |  | 
|---|
| 304 | #[ test] | 
|---|
| 305 | #[ cfg(not(miri))] | 
|---|
| 306 | fn forward_ucd() { | 
|---|
| 307 | for (i, test) in ucdtests().into_iter().enumerate() { | 
|---|
| 308 | let given = test.grapheme_clusters.concat(); | 
|---|
| 309 | let got: Vec<String> = Graphemes::new(given.as_bytes()) | 
|---|
| 310 | .map(|cluster| cluster.to_string()) | 
|---|
| 311 | .collect(); | 
|---|
| 312 | assert_eq!( | 
|---|
| 313 | test.grapheme_clusters, | 
|---|
| 314 | got, | 
|---|
| 315 | "\n grapheme forward break test {} failed:\n \ | 
|---|
| 316 |                  given:    {:?}\n \ | 
|---|
| 317 |                  expected: {:?}\n \ | 
|---|
| 318 |                  got:      {:?}\n ", | 
|---|
| 319 | i, | 
|---|
| 320 | uniescape(&given), | 
|---|
| 321 | uniescape_vec(&test.grapheme_clusters), | 
|---|
| 322 | uniescape_vec(&got), | 
|---|
| 323 | ); | 
|---|
| 324 | } | 
|---|
| 325 | } | 
|---|
| 326 |  | 
|---|
| 327 | #[ test] | 
|---|
| 328 | #[ cfg(not(miri))] | 
|---|
| 329 | fn reverse_ucd() { | 
|---|
| 330 | for (i, test) in ucdtests().into_iter().enumerate() { | 
|---|
| 331 | let given = test.grapheme_clusters.concat(); | 
|---|
| 332 | let mut got: Vec<String> = Graphemes::new(given.as_bytes()) | 
|---|
| 333 | .rev() | 
|---|
| 334 | .map(|cluster| cluster.to_string()) | 
|---|
| 335 | .collect(); | 
|---|
| 336 | got.reverse(); | 
|---|
| 337 | assert_eq!( | 
|---|
| 338 | test.grapheme_clusters, | 
|---|
| 339 | got, | 
|---|
| 340 | "\n\n grapheme reverse break test {} failed:\n \ | 
|---|
| 341 |                  given:    {:?}\n \ | 
|---|
| 342 |                  expected: {:?}\n \ | 
|---|
| 343 |                  got:      {:?}\n ", | 
|---|
| 344 | i, | 
|---|
| 345 | uniescape(&given), | 
|---|
| 346 | uniescape_vec(&test.grapheme_clusters), | 
|---|
| 347 | uniescape_vec(&got), | 
|---|
| 348 | ); | 
|---|
| 349 | } | 
|---|
| 350 | } | 
|---|
| 351 |  | 
|---|
| 352 | #[ test] | 
|---|
| 353 | fn forward_lossy() { | 
|---|
| 354 | for &(expected, input) in LOSSY_TESTS { | 
|---|
| 355 | let got = Graphemes::new(input.as_bytes()).collect::<String>(); | 
|---|
| 356 | assert_eq!(expected, got); | 
|---|
| 357 | } | 
|---|
| 358 | } | 
|---|
| 359 |  | 
|---|
| 360 | #[ test] | 
|---|
| 361 | fn reverse_lossy() { | 
|---|
| 362 | for &(expected, input) in LOSSY_TESTS { | 
|---|
| 363 | let expected: String = expected.chars().rev().collect(); | 
|---|
| 364 | let got = | 
|---|
| 365 | Graphemes::new(input.as_bytes()).rev().collect::<String>(); | 
|---|
| 366 | assert_eq!(expected, got); | 
|---|
| 367 | } | 
|---|
| 368 | } | 
|---|
| 369 |  | 
|---|
| 370 | #[ cfg(not(miri))] | 
|---|
| 371 | fn uniescape(s: &str) -> String { | 
|---|
| 372 | s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() | 
|---|
| 373 | } | 
|---|
| 374 |  | 
|---|
| 375 | #[ cfg(not(miri))] | 
|---|
| 376 | fn uniescape_vec(strs: &[String]) -> Vec<String> { | 
|---|
| 377 | strs.iter().map(|s| uniescape(s)).collect() | 
|---|
| 378 | } | 
|---|
| 379 |  | 
|---|
| 380 | /// Return all of the UCD for grapheme breaks. | 
|---|
| 381 | #[ cfg(not(miri))] | 
|---|
| 382 | fn ucdtests() -> Vec<GraphemeClusterBreakTest> { | 
|---|
| 383 | const TESTDATA: &str = include_str!( "data/GraphemeBreakTest.txt"); | 
|---|
| 384 |  | 
|---|
| 385 | let mut tests = vec![]; | 
|---|
| 386 | for mut line in TESTDATA.lines() { | 
|---|
| 387 | line = line.trim(); | 
|---|
| 388 | if line.starts_with( "#") || line.contains( "surrogate") { | 
|---|
| 389 | continue; | 
|---|
| 390 | } | 
|---|
| 391 | tests.push(line.parse().unwrap()); | 
|---|
| 392 | } | 
|---|
| 393 | tests | 
|---|
| 394 | } | 
|---|
| 395 | } | 
|---|
| 396 |  | 
|---|