1 | use std::borrow::Cow; |
2 | use std::hash::Hash; |
3 | use std::ops::Range; |
4 | |
/// Reference to a [`DiffableStr`].
///
/// This type exists because while the library only really provides ways to
/// work with `&str` and `&[u8]` there are types that deref into those string
/// slices such as `String` and `Vec<u8>`.
///
/// This trait is used in the library whenever it's nice to be able to pass
/// strings of different types in.
///
/// Requires the `text` feature.
pub trait DiffableStrRef {
    /// The type of the resolved [`DiffableStr`].
    type Output: DiffableStr + ?Sized;

    /// Resolves the reference into the borrowed [`DiffableStr`] form.
    fn as_diffable_str(&self) -> &Self::Output;
}
22 | |
23 | impl<T: DiffableStr + ?Sized> DiffableStrRef for T { |
24 | type Output = T; |
25 | |
26 | fn as_diffable_str(&self) -> &T { |
27 | self |
28 | } |
29 | } |
30 | |
31 | impl DiffableStrRef for String { |
32 | type Output = str; |
33 | |
34 | fn as_diffable_str(&self) -> &str { |
35 | self.as_str() |
36 | } |
37 | } |
38 | |
39 | impl<'a, T: DiffableStr + ?Sized> DiffableStrRef for Cow<'a, T> { |
40 | type Output = T; |
41 | |
42 | fn as_diffable_str(&self) -> &T { |
43 | self |
44 | } |
45 | } |
46 | |
/// All supported diffable strings.
///
/// The text module can work with different types of strings depending
/// on how the crate is compiled. Out of the box `&str` is always supported
/// but with the `bytes` feature one can also work with `[u8]` slices for
/// as long as they are ASCII compatible.
///
/// Requires the `text` feature.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
    /// Splits the value into lines with the line terminators attached.
    ///
    /// A line is terminated by `\n`, `\r\n` or a lone `\r`.
    fn tokenize_lines(&self) -> Vec<&Self>;

    /// Splits the value into lines, with runs of newline characters
    /// emitted as tokens separate from the line contents.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;

    /// Tokenizes into words: alternating runs of non-whitespace and
    /// whitespace, each run its own token.
    fn tokenize_words(&self) -> Vec<&Self>;

    /// Tokenizes the input into individual characters.
    fn tokenize_chars(&self) -> Vec<&Self>;

    /// Tokenizes into unicode words.
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self>;

    /// Tokenizes into unicode graphemes.
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self>;

    /// Returns the value as `&str` if it is valid UTF-8, `None` otherwise.
    fn as_str(&self) -> Option<&str>;

    /// Decodes the string (potentially) lossy.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Checks if the string ends in a newline (`\r` or `\n`).
    fn ends_with_newline(&self) -> bool;

    /// The length of the string in bytes.
    fn len(&self) -> usize;

    /// Slices the string by byte range.
    fn slice(&self, rng: Range<usize>) -> &Self;

    /// Returns the string as slice of raw bytes.
    fn as_bytes(&self) -> &[u8];

    /// Checks if the string is empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
99 | |
100 | impl DiffableStr for str { |
101 | fn tokenize_lines(&self) -> Vec<&Self> { |
102 | let mut iter = self.char_indices().peekable(); |
103 | let mut last_pos = 0; |
104 | let mut lines = vec![]; |
105 | |
106 | while let Some((idx, c)) = iter.next() { |
107 | if c == ' \r' { |
108 | if iter.peek().map_or(false, |x| x.1 == ' \n' ) { |
109 | lines.push(&self[last_pos..=idx + 1]); |
110 | iter.next(); |
111 | last_pos = idx + 2; |
112 | } else { |
113 | lines.push(&self[last_pos..=idx]); |
114 | last_pos = idx + 1; |
115 | } |
116 | } else if c == ' \n' { |
117 | lines.push(&self[last_pos..=idx]); |
118 | last_pos = idx + 1; |
119 | } |
120 | } |
121 | |
122 | if last_pos < self.len() { |
123 | lines.push(&self[last_pos..]); |
124 | } |
125 | |
126 | lines |
127 | } |
128 | |
129 | fn tokenize_lines_and_newlines(&self) -> Vec<&Self> { |
130 | let mut rv = vec![]; |
131 | let mut iter = self.char_indices().peekable(); |
132 | |
133 | while let Some((idx, c)) = iter.next() { |
134 | let is_newline = c == ' \r' || c == ' \n' ; |
135 | let start = idx; |
136 | let mut end = idx + c.len_utf8(); |
137 | while let Some(&(_, next_char)) = iter.peek() { |
138 | if (next_char == ' \r' || next_char == ' \n' ) != is_newline { |
139 | break; |
140 | } |
141 | iter.next(); |
142 | end += next_char.len_utf8(); |
143 | } |
144 | rv.push(&self[start..end]); |
145 | } |
146 | |
147 | rv |
148 | } |
149 | |
150 | fn tokenize_words(&self) -> Vec<&Self> { |
151 | let mut iter = self.char_indices().peekable(); |
152 | let mut rv = vec![]; |
153 | |
154 | while let Some((idx, c)) = iter.next() { |
155 | let is_whitespace = c.is_whitespace(); |
156 | let start = idx; |
157 | let mut end = idx + c.len_utf8(); |
158 | while let Some(&(_, next_char)) = iter.peek() { |
159 | if next_char.is_whitespace() != is_whitespace { |
160 | break; |
161 | } |
162 | iter.next(); |
163 | end += next_char.len_utf8(); |
164 | } |
165 | rv.push(&self[start..end]); |
166 | } |
167 | |
168 | rv |
169 | } |
170 | |
171 | fn tokenize_chars(&self) -> Vec<&Self> { |
172 | self.char_indices() |
173 | .map(move |(i, c)| &self[i..i + c.len_utf8()]) |
174 | .collect() |
175 | } |
176 | |
177 | #[cfg (feature = "unicode" )] |
178 | fn tokenize_unicode_words(&self) -> Vec<&Self> { |
179 | unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect() |
180 | } |
181 | |
182 | #[cfg (feature = "unicode" )] |
183 | fn tokenize_graphemes(&self) -> Vec<&Self> { |
184 | unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect() |
185 | } |
186 | |
187 | fn as_str(&self) -> Option<&str> { |
188 | Some(self) |
189 | } |
190 | |
191 | fn to_string_lossy(&self) -> Cow<'_, str> { |
192 | Cow::Borrowed(self) |
193 | } |
194 | |
195 | fn ends_with_newline(&self) -> bool { |
196 | self.ends_with(&[' \r' , ' \n' ][..]) |
197 | } |
198 | |
199 | fn len(&self) -> usize { |
200 | str::len(self) |
201 | } |
202 | |
203 | fn slice(&self, rng: Range<usize>) -> &Self { |
204 | &self[rng] |
205 | } |
206 | |
207 | fn as_bytes(&self) -> &[u8] { |
208 | str::as_bytes(self) |
209 | } |
210 | } |
211 | |
212 | #[cfg (feature = "bytes" )] |
213 | mod bytes_support { |
214 | use super::*; |
215 | |
216 | use bstr::ByteSlice; |
217 | |
218 | impl DiffableStrRef for Vec<u8> { |
219 | type Output = [u8]; |
220 | |
221 | fn as_diffable_str(&self) -> &[u8] { |
222 | self.as_slice() |
223 | } |
224 | } |
225 | |
    /// Allows viewing ASCII compatible byte slices as strings.
    ///
    /// Requires the `bytes` feature.
    impl DiffableStr for [u8] {
        fn tokenize_lines(&self) -> Vec<&Self> {
            // bstr's `char_indices` yields `(start, end, char)` tuples,
            // decoding invalid UTF-8 lossily; `\r`/`\n` are ASCII and always
            // decode as themselves, so the byte offsets used here are exact.
            let mut iter = self.char_indices().peekable();
            let mut last_pos = 0;
            let mut lines = vec![];

            while let Some((_, end, c)) = iter.next() {
                if c == '\r' {
                    if iter.peek().map_or(false, |x| x.2 == '\n') {
                        // "\r\n" pair: keep both bytes attached to the line.
                        lines.push(&self[last_pos..end + 1]);
                        iter.next();
                        last_pos = end + 1;
                    } else {
                        // Lone "\r" terminates the line by itself
                        // (`end` is the exclusive end of the '\r').
                        lines.push(&self[last_pos..end]);
                        last_pos = end;
                    }
                } else if c == '\n' {
                    lines.push(&self[last_pos..end]);
                    last_pos = end;
                }
            }

            // Trailing bytes without a final newline still form a line.
            if last_pos < self.len() {
                lines.push(&self[last_pos..]);
            }

            lines
        }

        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            // Emits alternating maximal runs of newline characters and
            // non-newline characters as separate tokens.
            let mut rv = vec![];
            let mut iter = self.char_indices().peekable();

            while let Some((start, mut end, c)) = iter.next() {
                let is_newline = c == '\r' || c == '\n';
                // Extend the token while the next char is in the same class.
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if (next_char == '\r' || next_char == '\n') != is_newline {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        fn tokenize_words(&self) -> Vec<&Self> {
            // Alternating runs of whitespace / non-whitespace; decoding via
            // bstr means `char::is_whitespace` also sees non-ASCII
            // whitespace in valid UTF-8 regions.
            let mut iter = self.char_indices().peekable();
            let mut rv = vec![];

            while let Some((start, mut end, c)) = iter.next() {
                let is_whitespace = c.is_whitespace();
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if next_char.is_whitespace() != is_whitespace {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            // bstr segments the bytes into word chunks; map each chunk
            // back to its underlying bytes.
            self.words_with_breaks().map(|x| x.as_bytes()).collect()
        }

        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes().map(|x| x.as_bytes()).collect()
        }

        fn tokenize_chars(&self) -> Vec<&Self> {
            // One subslice per decoded char (invalid sequences come back
            // as their original bytes via the start/end offsets).
            self.char_indices()
                .map(move |(start, end, _)| &self[start..end])
                .collect()
        }

        fn as_str(&self) -> Option<&str> {
            // Only `Some` when the bytes happen to be valid UTF-8.
            std::str::from_utf8(self).ok()
        }

        fn to_string_lossy(&self) -> Cow<'_, str> {
            String::from_utf8_lossy(self)
        }

        fn ends_with_newline(&self) -> bool {
            matches!(self.last_byte(), Some(b'\r') | Some(b'\n'))
        }

        fn len(&self) -> usize {
            // UFCS call to the inherent slice method avoids recursing into
            // this trait method.
            <[u8]>::len(self)
        }

        fn slice(&self, rng: Range<usize>) -> &Self {
            &self[rng]
        }

        fn as_bytes(&self) -> &[u8] {
            self
        }
    }
336 | } |
337 | |
#[test]
fn test_split_lines() {
    // Terminators stay attached; "\r\n" is kept together as one unit.
    assert_eq!(
        "first\nsecond\rthird\r\nfourth\nlast".tokenize_lines(),
        vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
    );
    assert_eq!("\n\n".tokenize_lines(), vec!["\n", "\n"]);
    assert_eq!("\n".tokenize_lines(), vec!["\n"]);
    assert!("".tokenize_lines().is_empty());
}
348 | |
#[test]
fn test_split_words() {
    // Whitespace runs are preserved as their own tokens.
    assert_eq!(
        "foo bar baz\n\n  aha".tokenize_words(),
        ["foo", " ", "bar", " ", "baz", "\n\n  ", "aha"]
    );
}
356 | |
#[test]
fn test_split_chars() {
    // Plain char tokenization splits "❄️" into snowflake + variation
    // selector (two chars).
    assert_eq!(
        "abcfö❄️".tokenize_chars(),
        vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
    );
}
364 | |
#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    // Grapheme segmentation keeps "❄️" (snowflake + variation selector)
    // as a single cluster.
    assert_eq!(
        "abcfö❄️".tokenize_graphemes(),
        vec!["a", "b", "c", "f", "ö", "❄️"]
    );
}
373 | |
#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    // Same expectations as the `str` case, expressed over byte slices.
    assert_eq!(
        b"first\nsecond\rthird\r\nfourth\nlast".tokenize_lines(),
        vec![
            &b"first\n"[..],
            &b"second\r"[..],
            &b"third\r\n"[..],
            &b"fourth\n"[..],
            &b"last"[..]
        ]
    );
    assert_eq!(b"\n\n".tokenize_lines(), vec![&b"\n"[..], &b"\n"[..]]);
    assert_eq!(b"\n".tokenize_lines(), vec![&b"\n"[..]]);
    assert!(b"".tokenize_lines().is_empty());
}
397 | |
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    assert_eq!(
        b"foo bar baz\n\n  aha".tokenize_words(),
        [
            &b"foo"[..],
            &b" "[..],
            &b"bar"[..],
            &b" "[..],
            &b"baz"[..],
            &b"\n\n  "[..],
            &b"aha"[..]
        ]
    );
}
414 | |
#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    assert_eq!(
        "abcfö❄️".as_bytes().tokenize_chars(),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄".as_bytes(),
            "\u{fe0f}".as_bytes()
        ]
    );
}
431 | |
#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    assert_eq!(
        "abcfö❄️".as_bytes().tokenize_graphemes(),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄️".as_bytes()
        ]
    );
}
447 | |