1 | use std::borrow::Cow; |
2 | use std::hash::Hash; |
3 | use std::ops::Range; |
4 | |
/// Reference to a [`DiffableStr`].
///
/// This type exists because while the library only really provides ways to
/// work with `&str` and `&[u8]` there are types that deref into those string
/// slices such as `String` and `Vec<u8>`.
///
/// This trait is used in the library whenever it's nice to be able to pass
/// strings of different types in.
///
/// Requires the `text` feature.
pub trait DiffableStrRef {
    /// The type of the resolved [`DiffableStr`].
    type Output: DiffableStr + ?Sized;

    /// Resolves the reference into the borrowed [`DiffableStr`] form.
    fn as_diffable_str(&self) -> &Self::Output;
}
22 | |
23 | impl<T: DiffableStr + ?Sized> DiffableStrRef for T { |
24 | type Output = T; |
25 | |
26 | fn as_diffable_str(&self) -> &T { |
27 | self |
28 | } |
29 | } |
30 | |
31 | impl DiffableStrRef for String { |
32 | type Output = str; |
33 | |
34 | fn as_diffable_str(&self) -> &str { |
35 | self.as_str() |
36 | } |
37 | } |
38 | |
39 | impl<'a, T: DiffableStr + ?Sized> DiffableStrRef for Cow<'a, T> { |
40 | type Output = T; |
41 | |
42 | fn as_diffable_str(&self) -> &T { |
43 | self |
44 | } |
45 | } |
46 | |
/// All supported diffable strings.
///
/// The text module can work with different types of strings depending
/// on how the crate is compiled. Out of the box `&str` is always supported
/// but with the `bytes` feature one can also work with `[u8]` slices for
/// as long as they are ASCII compatible.
///
/// Requires the `text` feature.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
    /// Splits the value into lines with the line terminators attached.
    ///
    /// A line is terminated by `\n`, `\r\n` or a lone `\r`.
    fn tokenize_lines(&self) -> Vec<&Self>;

    /// Splits the value into lines, with runs of newline characters
    /// emitted as tokens separate from the line contents.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;

    /// Tokenizes into words: alternating runs of non-whitespace and
    /// whitespace, each run its own token.
    fn tokenize_words(&self) -> Vec<&Self>;

    /// Tokenizes the input into individual characters.
    fn tokenize_chars(&self) -> Vec<&Self>;

    /// Tokenizes into unicode words.
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self>;

    /// Tokenizes into unicode graphemes.
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self>;

    /// Returns the value as `&str` if it is valid UTF-8, `None` otherwise.
    fn as_str(&self) -> Option<&str>;

    /// Decodes the string (potentially) lossy.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Checks if the string ends in a newline (`\r` or `\n`).
    fn ends_with_newline(&self) -> bool;

    /// The length of the string in bytes.
    fn len(&self) -> usize;

    /// Slices the string by byte range.
    fn slice(&self, rng: Range<usize>) -> &Self;

    /// Returns the string as slice of raw bytes.
    fn as_bytes(&self) -> &[u8];

    /// Checks if the string is empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
99 | |
100 | impl DiffableStr for str { |
101 | fn tokenize_lines(&self) -> Vec<&Self> { |
102 | let mut iter = self.char_indices().peekable(); |
103 | let mut last_pos = 0; |
104 | let mut lines = vec![]; |
105 | |
106 | while let Some((idx, c)) = iter.next() { |
107 | if c == ' \r' { |
108 | if iter.peek().map_or(false, |x| x.1 == ' \n' ) { |
109 | lines.push(&self[last_pos..=idx + 1]); |
110 | iter.next(); |
111 | last_pos = idx + 2; |
112 | } else { |
113 | lines.push(&self[last_pos..=idx]); |
114 | last_pos = idx + 1; |
115 | } |
116 | } else if c == ' \n' { |
117 | lines.push(&self[last_pos..=idx]); |
118 | last_pos = idx + 1; |
119 | } |
120 | } |
121 | |
122 | if last_pos < self.len() { |
123 | lines.push(&self[last_pos..]); |
124 | } |
125 | |
126 | lines |
127 | } |
128 | |
129 | fn tokenize_lines_and_newlines(&self) -> Vec<&Self> { |
130 | let mut rv = vec![]; |
131 | let mut iter = self.char_indices().peekable(); |
132 | |
133 | while let Some((idx, c)) = iter.next() { |
134 | let is_newline = c == ' \r' || c == ' \n' ; |
135 | let start = idx; |
136 | let mut end = idx + c.len_utf8(); |
137 | while let Some(&(_, next_char)) = iter.peek() { |
138 | if (next_char == ' \r' || next_char == ' \n' ) != is_newline { |
139 | break; |
140 | } |
141 | iter.next(); |
142 | end += next_char.len_utf8(); |
143 | } |
144 | rv.push(&self[start..end]); |
145 | } |
146 | |
147 | rv |
148 | } |
149 | |
150 | fn tokenize_words(&self) -> Vec<&Self> { |
151 | let mut iter = self.char_indices().peekable(); |
152 | let mut rv = vec![]; |
153 | |
154 | while let Some((idx, c)) = iter.next() { |
155 | let is_whitespace = c.is_whitespace(); |
156 | let start = idx; |
157 | let mut end = idx + c.len_utf8(); |
158 | while let Some(&(_, next_char)) = iter.peek() { |
159 | if next_char.is_whitespace() != is_whitespace { |
160 | break; |
161 | } |
162 | iter.next(); |
163 | end += next_char.len_utf8(); |
164 | } |
165 | rv.push(&self[start..end]); |
166 | } |
167 | |
168 | rv |
169 | } |
170 | |
171 | fn tokenize_chars(&self) -> Vec<&Self> { |
172 | self.char_indices() |
173 | .map(move |(i, c)| &self[i..i + c.len_utf8()]) |
174 | .collect() |
175 | } |
176 | |
177 | #[cfg (feature = "unicode" )] |
178 | fn tokenize_unicode_words(&self) -> Vec<&Self> { |
179 | unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect() |
180 | } |
181 | |
182 | #[cfg (feature = "unicode" )] |
183 | fn tokenize_graphemes(&self) -> Vec<&Self> { |
184 | unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect() |
185 | } |
186 | |
187 | fn as_str(&self) -> Option<&str> { |
188 | Some(self) |
189 | } |
190 | |
191 | fn to_string_lossy(&self) -> Cow<'_, str> { |
192 | Cow::Borrowed(self) |
193 | } |
194 | |
195 | fn ends_with_newline(&self) -> bool { |
196 | self.ends_with(&[' \r' , ' \n' ][..]) |
197 | } |
198 | |
199 | fn len(&self) -> usize { |
200 | str::len(self) |
201 | } |
202 | |
203 | fn slice(&self, rng: Range<usize>) -> &Self { |
204 | &self[rng] |
205 | } |
206 | |
207 | fn as_bytes(&self) -> &[u8] { |
208 | str::as_bytes(self) |
209 | } |
210 | } |
211 | |
212 | #[cfg (feature = "bytes" )] |
213 | mod bytes_support { |
214 | use super::*; |
215 | |
216 | use bstr::ByteSlice; |
217 | |
218 | impl DiffableStrRef for Vec<u8> { |
219 | type Output = [u8]; |
220 | |
221 | fn as_diffable_str(&self) -> &[u8] { |
222 | self.as_slice() |
223 | } |
224 | } |
225 | |
    /// Allows viewing ASCII compatible byte slices as strings.
    ///
    /// Requires the `bytes` feature.
    impl DiffableStr for [u8] {
        fn tokenize_lines(&self) -> Vec<&Self> {
            // bstr's `char_indices` yields `(start, end, char)` tuples,
            // decoding invalid UTF-8 lossily; `\r`/`\n` are ASCII and always
            // decode as themselves, so the byte offsets used here are exact.
            let mut iter = self.char_indices().peekable();
            let mut last_pos = 0;
            let mut lines = vec![];

            while let Some((_, end, c)) = iter.next() {
                if c == '\r' {
                    if iter.peek().map_or(false, |x| x.2 == '\n') {
                        // "\r\n" pair: keep both bytes attached to the line.
                        lines.push(&self[last_pos..end + 1]);
                        iter.next();
                        last_pos = end + 1;
                    } else {
                        // Lone "\r" terminates the line by itself
                        // (`end` is the exclusive end of the '\r').
                        lines.push(&self[last_pos..end]);
                        last_pos = end;
                    }
                } else if c == '\n' {
                    lines.push(&self[last_pos..end]);
                    last_pos = end;
                }
            }

            // Trailing bytes without a final newline still form a line.
            if last_pos < self.len() {
                lines.push(&self[last_pos..]);
            }

            lines
        }

        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            // Emits alternating maximal runs of newline characters and
            // non-newline characters as separate tokens.
            let mut rv = vec![];
            let mut iter = self.char_indices().peekable();

            while let Some((start, mut end, c)) = iter.next() {
                let is_newline = c == '\r' || c == '\n';
                // Extend the token while the next char is in the same class.
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if (next_char == '\r' || next_char == '\n') != is_newline {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        fn tokenize_words(&self) -> Vec<&Self> {
            // Alternating runs of whitespace / non-whitespace; decoding via
            // bstr means `char::is_whitespace` also sees non-ASCII
            // whitespace in valid UTF-8 regions.
            let mut iter = self.char_indices().peekable();
            let mut rv = vec![];

            while let Some((start, mut end, c)) = iter.next() {
                let is_whitespace = c.is_whitespace();
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if next_char.is_whitespace() != is_whitespace {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            // bstr segments the bytes into word chunks; map each chunk
            // back to its underlying bytes.
            self.words_with_breaks().map(|x| x.as_bytes()).collect()
        }

        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes().map(|x| x.as_bytes()).collect()
        }

        fn tokenize_chars(&self) -> Vec<&Self> {
            // One subslice per decoded char (invalid sequences come back
            // as their original bytes via the start/end offsets).
            self.char_indices()
                .map(move |(start, end, _)| &self[start..end])
                .collect()
        }

        fn as_str(&self) -> Option<&str> {
            // Only `Some` when the bytes happen to be valid UTF-8.
            std::str::from_utf8(self).ok()
        }

        fn to_string_lossy(&self) -> Cow<'_, str> {
            String::from_utf8_lossy(self)
        }

        fn ends_with_newline(&self) -> bool {
            matches!(self.last_byte(), Some(b'\r') | Some(b'\n'))
        }

        fn len(&self) -> usize {
            // UFCS call to the inherent slice method avoids recursing into
            // this trait method.
            <[u8]>::len(self)
        }

        fn slice(&self, rng: Range<usize>) -> &Self {
            &self[rng]
        }

        fn as_bytes(&self) -> &[u8] {
            self
        }
    }
336 | } |
337 | |
#[test]
fn test_split_lines() {
    // Terminators stay attached; "\r\n" is kept together as one unit.
    assert_eq!(
        "first\nsecond\rthird\r\nfourth\nlast".tokenize_lines(),
        vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
    );
    assert_eq!("\n\n".tokenize_lines(), vec!["\n", "\n"]);
    assert_eq!("\n".tokenize_lines(), vec!["\n"]);
    assert!("".tokenize_lines().is_empty());
}
348 | |
#[test]
fn test_split_words() {
    // Whitespace runs are preserved as their own tokens.
    assert_eq!(
        "foo bar baz\n\n  aha".tokenize_words(),
        ["foo", " ", "bar", " ", "baz", "\n\n  ", "aha"]
    );
}
356 | |
#[test]
fn test_split_chars() {
    // Plain char tokenization splits "❄️" into snowflake + variation
    // selector (two chars).
    assert_eq!(
        "abcfö❄️".tokenize_chars(),
        vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
    );
}
364 | |
#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    // Grapheme segmentation keeps "❄️" (snowflake + variation selector)
    // as a single cluster.
    assert_eq!(
        "abcfö❄️".tokenize_graphemes(),
        vec!["a", "b", "c", "f", "ö", "❄️"]
    );
}
373 | |
#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    // Same expectations as the `str` case, expressed over byte slices.
    assert_eq!(
        b"first\nsecond\rthird\r\nfourth\nlast".tokenize_lines(),
        vec![
            &b"first\n"[..],
            &b"second\r"[..],
            &b"third\r\n"[..],
            &b"fourth\n"[..],
            &b"last"[..]
        ]
    );
    assert_eq!(b"\n\n".tokenize_lines(), vec![&b"\n"[..], &b"\n"[..]]);
    assert_eq!(b"\n".tokenize_lines(), vec![&b"\n"[..]]);
    assert!(b"".tokenize_lines().is_empty());
}
397 | |
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    assert_eq!(
        b"foo bar baz\n\n  aha".tokenize_words(),
        [
            &b"foo"[..],
            &b" "[..],
            &b"bar"[..],
            &b" "[..],
            &b"baz"[..],
            &b"\n\n  "[..],
            &b"aha"[..]
        ]
    );
}
414 | |
#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    assert_eq!(
        "abcfö❄️".as_bytes().tokenize_chars(),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄".as_bytes(),
            "\u{fe0f}".as_bytes()
        ]
    );
}
431 | |
#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    assert_eq!(
        "abcfö❄️".as_bytes().tokenize_graphemes(),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄️".as_bytes()
        ]
    );
}
447 | |