| 1 | use std::borrow::Cow; |
| 2 | use std::hash::Hash; |
| 3 | use std::ops::Range; |
| 4 | |
| 5 | /// Reference to a [`DiffableStr`]. |
| 6 | /// |
| 7 | /// This type exists because while the library only really provides ways to |
| 8 | /// work with `&str` and `&[u8]` there are types that deref into those string |
| 9 | /// slices such as `String` and `Vec<u8>`. |
| 10 | /// |
| 11 | /// This trait is used in the library whenever it's nice to be able to pass |
| 12 | /// strings of different types in. |
| 13 | /// |
| 14 | /// Requires the `text` feature. |
| 15 | pub trait DiffableStrRef { |
| 16 | /// The type of the resolved [`DiffableStr`]. |
| 17 | type Output: DiffableStr + ?Sized; |
| 18 | |
| 19 | /// Resolves the reference. |
| 20 | fn as_diffable_str(&self) -> &Self::Output; |
| 21 | } |
| 22 | |
| 23 | impl<T: DiffableStr + ?Sized> DiffableStrRef for T { |
| 24 | type Output = T; |
| 25 | |
| 26 | fn as_diffable_str(&self) -> &T { |
| 27 | self |
| 28 | } |
| 29 | } |
| 30 | |
| 31 | impl DiffableStrRef for String { |
| 32 | type Output = str; |
| 33 | |
| 34 | fn as_diffable_str(&self) -> &str { |
| 35 | self.as_str() |
| 36 | } |
| 37 | } |
| 38 | |
| 39 | impl<'a, T: DiffableStr + ?Sized> DiffableStrRef for Cow<'a, T> { |
| 40 | type Output = T; |
| 41 | |
| 42 | fn as_diffable_str(&self) -> &T { |
| 43 | self |
| 44 | } |
| 45 | } |
| 46 | |
/// All supported diffable strings.
///
/// The text module can work with different types of strings depending
/// on how the crate is compiled.  Out of the box `&str` is always supported
/// but with the `bytes` feature one can also work with `[u8]` slices for
/// as long as they are ASCII compatible.
///
/// Requires the `text` feature.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
    /// Splits the value into lines, with the newline characters attached
    /// to the line they terminate.
    fn tokenize_lines(&self) -> Vec<&Self>;

    /// Splits the value into lines and newlines, with the newline
    /// characters returned as tokens of their own.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;

    /// Tokenizes into words.
    fn tokenize_words(&self) -> Vec<&Self>;

    /// Tokenizes the input into characters.
    fn tokenize_chars(&self) -> Vec<&Self>;

    /// Tokenizes into unicode words.
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self>;

    /// Tokenizes into unicode graphemes.
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self>;

    /// Borrows the value as a `&str` if that is possible.
    ///
    /// This does not perform a lossy conversion: the implementations in
    /// this module return `None` when the value is not valid UTF-8.  Use
    /// `to_string_lossy` when a string is needed regardless.
    fn as_str(&self) -> Option<&str>;

    /// Decodes the string (potentially) lossy.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Checks if the string ends in a newline.
    fn ends_with_newline(&self) -> bool;

    /// The length of the string.
    fn len(&self) -> usize;

    /// Slices the string.
    fn slice(&self, rng: Range<usize>) -> &Self;

    /// Returns the string as slice of raw bytes.
    fn as_bytes(&self) -> &[u8];

    /// Checks if the string is empty.
    ///
    /// The default implementation compares [`len`](DiffableStr::len) to zero.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
| 99 | |
| 100 | impl DiffableStr for str { |
| 101 | fn tokenize_lines(&self) -> Vec<&Self> { |
| 102 | let mut iter = self.char_indices().peekable(); |
| 103 | let mut last_pos = 0; |
| 104 | let mut lines = vec![]; |
| 105 | |
| 106 | while let Some((idx, c)) = iter.next() { |
| 107 | if c == ' \r' { |
| 108 | if iter.peek().map_or(false, |x| x.1 == ' \n' ) { |
| 109 | lines.push(&self[last_pos..=idx + 1]); |
| 110 | iter.next(); |
| 111 | last_pos = idx + 2; |
| 112 | } else { |
| 113 | lines.push(&self[last_pos..=idx]); |
| 114 | last_pos = idx + 1; |
| 115 | } |
| 116 | } else if c == ' \n' { |
| 117 | lines.push(&self[last_pos..=idx]); |
| 118 | last_pos = idx + 1; |
| 119 | } |
| 120 | } |
| 121 | |
| 122 | if last_pos < self.len() { |
| 123 | lines.push(&self[last_pos..]); |
| 124 | } |
| 125 | |
| 126 | lines |
| 127 | } |
| 128 | |
| 129 | fn tokenize_lines_and_newlines(&self) -> Vec<&Self> { |
| 130 | let mut rv = vec![]; |
| 131 | let mut iter = self.char_indices().peekable(); |
| 132 | |
| 133 | while let Some((idx, c)) = iter.next() { |
| 134 | let is_newline = c == ' \r' || c == ' \n' ; |
| 135 | let start = idx; |
| 136 | let mut end = idx + c.len_utf8(); |
| 137 | while let Some(&(_, next_char)) = iter.peek() { |
| 138 | if (next_char == ' \r' || next_char == ' \n' ) != is_newline { |
| 139 | break; |
| 140 | } |
| 141 | iter.next(); |
| 142 | end += next_char.len_utf8(); |
| 143 | } |
| 144 | rv.push(&self[start..end]); |
| 145 | } |
| 146 | |
| 147 | rv |
| 148 | } |
| 149 | |
| 150 | fn tokenize_words(&self) -> Vec<&Self> { |
| 151 | let mut iter = self.char_indices().peekable(); |
| 152 | let mut rv = vec![]; |
| 153 | |
| 154 | while let Some((idx, c)) = iter.next() { |
| 155 | let is_whitespace = c.is_whitespace(); |
| 156 | let start = idx; |
| 157 | let mut end = idx + c.len_utf8(); |
| 158 | while let Some(&(_, next_char)) = iter.peek() { |
| 159 | if next_char.is_whitespace() != is_whitespace { |
| 160 | break; |
| 161 | } |
| 162 | iter.next(); |
| 163 | end += next_char.len_utf8(); |
| 164 | } |
| 165 | rv.push(&self[start..end]); |
| 166 | } |
| 167 | |
| 168 | rv |
| 169 | } |
| 170 | |
| 171 | fn tokenize_chars(&self) -> Vec<&Self> { |
| 172 | self.char_indices() |
| 173 | .map(move |(i, c)| &self[i..i + c.len_utf8()]) |
| 174 | .collect() |
| 175 | } |
| 176 | |
| 177 | #[cfg (feature = "unicode" )] |
| 178 | fn tokenize_unicode_words(&self) -> Vec<&Self> { |
| 179 | unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect() |
| 180 | } |
| 181 | |
| 182 | #[cfg (feature = "unicode" )] |
| 183 | fn tokenize_graphemes(&self) -> Vec<&Self> { |
| 184 | unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect() |
| 185 | } |
| 186 | |
| 187 | fn as_str(&self) -> Option<&str> { |
| 188 | Some(self) |
| 189 | } |
| 190 | |
| 191 | fn to_string_lossy(&self) -> Cow<'_, str> { |
| 192 | Cow::Borrowed(self) |
| 193 | } |
| 194 | |
| 195 | fn ends_with_newline(&self) -> bool { |
| 196 | self.ends_with(&[' \r' , ' \n' ][..]) |
| 197 | } |
| 198 | |
| 199 | fn len(&self) -> usize { |
| 200 | str::len(self) |
| 201 | } |
| 202 | |
| 203 | fn slice(&self, rng: Range<usize>) -> &Self { |
| 204 | &self[rng] |
| 205 | } |
| 206 | |
| 207 | fn as_bytes(&self) -> &[u8] { |
| 208 | str::as_bytes(self) |
| 209 | } |
| 210 | } |
| 211 | |
#[cfg(feature = "bytes")]
mod bytes_support {
    use super::*;

    use bstr::ByteSlice;

    impl DiffableStrRef for Vec<u8> {
        type Output = [u8];

        /// Borrows the vector's contents as a byte slice.
        fn as_diffable_str(&self) -> &[u8] {
            self.as_slice()
        }
    }

    /// Splits `bytes` into maximal runs of consecutive characters for which
    /// `pred` gives the same answer.
    ///
    /// Characters and their byte ranges come from `ByteSlice::char_indices`,
    /// which yields `(start, end, char)` triples.
    fn tokenize_byte_runs(bytes: &[u8], pred: impl Fn(char) -> bool) -> Vec<&[u8]> {
        let mut chars = bytes.char_indices().peekable();
        let mut runs = Vec::new();

        while let Some((start, mut end, first)) = chars.next() {
            let kind = pred(first);
            // Extend the run for as long as the next char is of the same class.
            while let Some(&(_, next_end, next)) = chars.peek() {
                if pred(next) != kind {
                    break;
                }
                chars.next();
                end = next_end;
            }
            runs.push(&bytes[start..end]);
        }

        runs
    }

    /// Allows viewing ASCII compatible byte slices as strings.
    ///
    /// Requires the `bytes` feature.
    impl DiffableStr for [u8] {
        /// Splits on `\n`, `\r` and `\r\n`, keeping each terminator attached
        /// to the line it ends.
        fn tokenize_lines(&self) -> Vec<&Self> {
            let mut chars = self.char_indices().peekable();
            let mut line_start = 0;
            let mut lines = Vec::new();

            while let Some((_, end, c)) = chars.next() {
                // `end` is the exclusive end offset of the current char.
                let line_end = match c {
                    '\r' if matches!(chars.peek(), Some(&(_, _, '\n'))) => {
                        // `\r\n` counts as a single terminator; the `\n`
                        // adds one more byte.
                        chars.next();
                        end + 1
                    }
                    '\r' | '\n' => end,
                    _ => continue,
                };
                lines.push(&self[line_start..line_end]);
                line_start = line_end;
            }

            if line_start < self.len() {
                lines.push(&self[line_start..]);
            }

            lines
        }

        /// Splits into alternating runs of newline characters (`\r`, `\n`)
        /// and everything else.
        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            tokenize_byte_runs(self, |c| c == '\r' || c == '\n')
        }

        /// Splits into alternating runs of whitespace and non-whitespace.
        fn tokenize_words(&self) -> Vec<&Self> {
            tokenize_byte_runs(self, char::is_whitespace)
        }

        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            self.words_with_breaks().map(|x| x.as_bytes()).collect()
        }

        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes().map(|x| x.as_bytes()).collect()
        }

        /// Returns one slice per character reported by `char_indices`.
        fn tokenize_chars(&self) -> Vec<&Self> {
            self.char_indices()
                .map(move |(start, end, _)| &self[start..end])
                .collect()
        }

        /// Returns `None` when the bytes are not valid UTF-8.
        fn as_str(&self) -> Option<&str> {
            std::str::from_utf8(self).ok()
        }

        fn to_string_lossy(&self) -> Cow<'_, str> {
            String::from_utf8_lossy(self)
        }

        fn ends_with_newline(&self) -> bool {
            matches!(self.last_byte(), Some(b'\r') | Some(b'\n'))
        }

        fn len(&self) -> usize {
            <[u8]>::len(self)
        }

        fn slice(&self, rng: Range<usize>) -> &Self {
            &self[rng]
        }

        fn as_bytes(&self) -> &[u8] {
            self
        }
    }
}
| 337 | |
/// Line tokenization keeps every terminator (`\n`, `\r`, `\r\n`) attached
/// to the line it ends.
#[test]
fn test_split_lines() {
    // Mixed terminators plus a final line without one.
    assert_eq!(
        DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast"),
        vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
    );
    // Consecutive newlines are separate (empty) lines.
    assert_eq!(DiffableStr::tokenize_lines("\n\n"), vec!["\n", "\n"]);
    assert_eq!(DiffableStr::tokenize_lines("\n"), vec!["\n"]);
    // Empty input yields no lines at all.
    assert!(DiffableStr::tokenize_lines("").is_empty());
}
| 348 | |
/// Word tokenization alternates between non-whitespace and whitespace runs.
#[test]
fn test_split_words() {
    let tokens = DiffableStr::tokenize_words("foo bar baz \n\n aha");
    // Whitespace runs (newlines included) come back as tokens of their own.
    let expected = ["foo", " ", "bar", " ", "baz", " \n\n ", "aha"];
    assert_eq!(tokens, expected);
}
| 356 | |
/// Char tokenization yields one token per Unicode scalar value.
#[test]
fn test_split_chars() {
    // The snowflake emoji is U+2744 followed by the variation selector
    // U+FE0F; the selector is spelled as an escape so it cannot get lost,
    // and it must come back as a token of its own.
    assert_eq!(
        DiffableStr::tokenize_chars("abcfö❄\u{fe0f}"),
        vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
    );
}
| 364 | |
/// Grapheme tokenization keeps the emoji and its variation selector together.
#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    let graphemes = DiffableStr::tokenize_graphemes("abcfö❄️");
    let expected = vec!["a", "b", "c", "f", "ö", "❄️"];
    assert_eq!(graphemes, expected);
}
| 373 | |
/// Byte-slice line tokenization keeps every terminator (`\n`, `\r`, `\r\n`)
/// attached to the line it ends.
#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    // Mixed terminators plus a final line without one.
    assert_eq!(
        DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()),
        vec![
            "first\n".as_bytes(),
            "second\r".as_bytes(),
            "third\r\n".as_bytes(),
            "fourth\n".as_bytes(),
            "last".as_bytes()
        ]
    );
    // Consecutive newlines are separate (empty) lines.
    assert_eq!(
        DiffableStr::tokenize_lines("\n\n".as_bytes()),
        vec!["\n".as_bytes(), "\n".as_bytes()]
    );
    assert_eq!(
        DiffableStr::tokenize_lines("\n".as_bytes()),
        vec!["\n".as_bytes()]
    );
    // Empty input yields no lines at all.
    assert!(DiffableStr::tokenize_lines("".as_bytes()).is_empty());
}
| 397 | |
/// Byte-slice word tokenization alternates between non-whitespace and
/// whitespace runs.
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    let tokens = DiffableStr::tokenize_words("foo bar baz \n\n aha".as_bytes());
    // Whitespace runs (newlines included) come back as tokens of their own.
    let expected = [
        &b"foo"[..],
        &b" "[..],
        &b"bar"[..],
        &b" "[..],
        &b"baz"[..],
        &b" \n\n "[..],
        &b"aha"[..],
    ];
    assert_eq!(tokens, expected);
}
| 414 | |
/// Byte-slice char tokenization yields one token per decoded character.
#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    // The snowflake emoji is U+2744 followed by the variation selector
    // U+FE0F; the selector is spelled as an escape so it cannot get lost,
    // and it must come back as a token of its own.
    assert_eq!(
        DiffableStr::tokenize_chars("abcfö❄\u{fe0f}".as_bytes()),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄".as_bytes(),
            "\u{fe0f}".as_bytes()
        ]
    );
}
| 431 | |
/// Byte-slice grapheme tokenization keeps the emoji and its variation
/// selector together.
#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    let graphemes = DiffableStr::tokenize_graphemes("abcfö❄️".as_bytes());
    let expected = vec![
        &b"a"[..],
        &b"b"[..],
        &b"c"[..],
        &b"f"[..],
        "ö".as_bytes(),
        "❄️".as_bytes(),
    ];
    assert_eq!(graphemes, expected);
}
| 447 | |