// strip.rs source code [crates/anstream/src/adapter/strip.rs]

1	use anstyle_parse::state::state_change;
2	use anstyle_parse::state::Action;
3	use anstyle_parse::state::State;
4
5	/// Strip ANSI escapes from a `&str`, returning the printable content
6	///
7	/// This can be used to take output from a program that includes escape sequences and write it
8	/// somewhere that does not easily support them, such as a log file.
9	///
10	/// For non-contiguous data, see [`StripStr`].
11	///
12	/// # Example
13	///
14	/// ```rust
15	/// use std::io::Write as _;
16	///
17	/// let styled_text = "`\x1b`[32mfoo`\x1b`[m bar";
18	/// let plain_str = anstream::adapter::strip_str(&styled_text).to_string();
19	/// assert_eq!(plain_str, "foo bar");
20	/// ```
21	#[inline]
22	pub fn strip_str(data: &str) -> StrippedStr<'_> {
23	StrippedStr::new(data)
24	}
25
26	/// See [`strip_str`]
27	#[derive(Default, Clone, Debug, PartialEq, Eq)]
28	pub struct StrippedStr<'s> {
29	bytes: &'s [u8],
30	state: State,
31	}
32
33	impl<'s> StrippedStr<'s> {
34	#[inline]
35	fn new(data: &'s str) -> Self {
36	Self {
37	bytes: data.as_bytes(),
38	state: State::Ground,
39	}
40	}
41
42	/// Create a [`String`] of the printable content
43	#[inline]
44	#[allow(clippy::inherent_to_string_shadow_display)] // Single-allocation implementation
45	pub fn to_string(&self) -> String {
46	use std::fmt::Write as _;
47	let mut stripped: String = String::with_capacity(self.bytes.len());
48	let _ = write!(&mut stripped, "{self}");
49	stripped
50	}
51	}
52
53	impl<'s> std::fmt::Display for StrippedStr<'s> {
54	/// Note:* this does not exhaust the* [`Iterator`]
55	#[inline]
56	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57	let iter: StrippedStr<'s> = Self {
58	bytes: self.bytes,
59	state: self.state,
60	};
61	for printable: &'s str in iter {
62	printable.fmt(f)?;
63	}
64	Ok(())
65	}
66	}
67
68	impl<'s> Iterator for StrippedStr<'s> {
69	type Item = &'s str;
70
71	#[inline]
72	fn next(&mut self) -> Option<Self::Item> {
73	next_str(&mut self.bytes, &mut self.state)
74	}
75	}
76
77	/// Incrementally strip non-contiguous data
78	#[derive(Default, Clone, Debug, PartialEq, Eq)]
79	pub struct StripStr {
80	state: State,
81	}
82
83	impl StripStr {
84	/// Initial state
85	pub fn new() -> Self {
86	Default::default()
87	}
88
89	/// Strip the next segment of data
90	pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> {
91	StripStrIter {
92	bytes: data.as_bytes(),
93	state: &mut self.state,
94	}
95	}
96	}
97
98	/// See [`StripStr`]
99	#[derive(Debug, PartialEq, Eq)]
100	pub struct StripStrIter<'s> {
101	bytes: &'s [u8],
102	state: &'s mut State,
103	}
104
105	impl<'s> Iterator for StripStrIter<'s> {
106	type Item = &'s str;
107
108	#[inline]
109	fn next(&mut self) -> Option<Self::Item> {
110	next_str(&mut self.bytes, self.state)
111	}
112	}
113
114	#[inline]
115	fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> {
116	let offset = bytes.iter().copied().position(\|b\| {
117	let (next_state, action) = state_change(*state, b);
118	if next_state != State::Anywhere {
119	*state = next_state;
120	}
121	is_printable_bytes(action, b)
122	});
123	let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
124	*bytes = next;
125	*state = State::Ground;
126
127	let offset = bytes.iter().copied().position(\|b\| {
128	let (_next_state, action) = state_change(State::Ground, b);
129	!(is_printable_bytes(action, b) \|\| is_utf8_continuation(b))
130	});
131	let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
132	*bytes = next;
133	if printable.is_empty() {
134	None
135	} else {
136	let printable = unsafe {
137	from_utf8_unchecked(
138	printable,
139	"`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations",
140	)
141	};
142	Some(printable)
143	}
144	}
145
/// Convert `bytes` to `&str`, validating only in debug builds
///
/// With `debug_assertions` enabled the bytes are checked and a failure panics with
/// `safety_justification` as the message; otherwise the check is skipped entirely.
///
/// # Safety
///
/// `bytes` must be valid UTF-8. `safety_justification` should state why the caller knows that.
#[inline]
unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str {
    if cfg!(debug_assertions) {
        // Catch problems more quickly when testing
        std::str::from_utf8(bytes).expect(safety_justification)
    } else {
        // SAFETY: the caller guarantees that `bytes` is valid UTF-8
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}
157
/// Whether `b` has the bit pattern `10xxxxxx`, i.e. lies in `0x80..=0xbf`
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
    (b & 0b1100_0000) == 0b1000_0000
}
162
163	/// Strip ANSI escapes from bytes, returning the printable content
164	///
165	/// This can be used to take output from a program that includes escape sequences and write it
166	/// somewhere that does not easily support them, such as a log file.
167	///
168	/// # Example
169	///
170	/// ```rust
171	/// use std::io::Write as _;
172	///
173	/// let styled_text = "`\x1b`[32mfoo`\x1b`[m bar";
174	/// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec();
175	/// assert_eq!(plain_str.as_slice(), &b"foo bar"[..]);
176	/// ```
177	#[inline]
178	pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> {
179	StrippedBytes::new(bytes:data)
180	}
181
182	/// See [`strip_bytes`]
183	#[derive(Default, Clone, Debug, PartialEq, Eq)]
184	pub struct StrippedBytes<'s> {
185	bytes: &'s [u8],
186	state: State,
187	utf8parser: Utf8Parser,
188	}
189
190	impl<'s> StrippedBytes<'s> {
191	/// See [`strip_bytes`]
192	#[inline]
193	pub fn new(bytes: &'s [u8]) -> Self {
194	Self {
195	bytes,
196	state: State::Ground,
197	utf8parser: Default::default(),
198	}
199	}
200
201	/// Strip the next slice of bytes
202	///
203	/// Used when the content is in several non-contiguous slices
204	///
205	/// # Panic
206	///
207	/// May panic if it is not exhausted / empty
208	#[inline]
209	pub fn extend(&mut self, bytes: &'s [u8]) {
210	debug_assert!(
211	self.is_empty(),
212	"current bytes must be processed to ensure we end at the right state"
213	);
214	self.bytes = bytes;
215	}
216
217	/// Report the bytes has been exhausted
218	#[inline]
219	pub fn is_empty(&self) -> bool {
220	self.bytes.is_empty()
221	}
222
223	/// Create a [`Vec`] of the printable content
224	#[inline]
225	pub fn into_vec(self) -> Vec<u8> {
226	let mut stripped = Vec::with_capacity(self.bytes.len());
227	for printable in self {
228	stripped.extend(printable);
229	}
230	stripped
231	}
232	}
233
234	impl<'s> Iterator for StrippedBytes<'s> {
235	type Item = &'s [u8];
236
237	#[inline]
238	fn next(&mut self) -> Option<Self::Item> {
239	next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser)
240	}
241	}
242
243	/// Incrementally strip non-contiguous data
244	#[derive(Default, Clone, Debug, PartialEq, Eq)]
245	pub struct StripBytes {
246	state: State,
247	utf8parser: Utf8Parser,
248	}
249
250	impl StripBytes {
251	/// Initial state
252	pub fn new() -> Self {
253	Default::default()
254	}
255
256	/// Strip the next segment of data
257	pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> {
258	StripBytesIter {
259	bytes,
260	state: &mut self.state,
261	utf8parser: &mut self.utf8parser,
262	}
263	}
264	}
265
266	/// See [`StripBytes`]
267	#[derive(Debug, PartialEq, Eq)]
268	pub struct StripBytesIter<'s> {
269	bytes: &'s [u8],
270	state: &'s mut State,
271	utf8parser: &'s mut Utf8Parser,
272	}
273
274	impl<'s> Iterator for StripBytesIter<'s> {
275	type Item = &'s [u8];
276
277	#[inline]
278	fn next(&mut self) -> Option<Self::Item> {
279	next_bytes(&mut self.bytes, self.state, self.utf8parser)
280	}
281	}
282
283	#[inline]
284	fn next_bytes<'s>(
285	bytes: &mut &'s [u8],
286	state: &mut State,
287	utf8parser: &mut Utf8Parser,
288	) -> Option<&'s [u8]> {
289	let offset = bytes.iter().copied().position(\|b\| {
290	if *state == State::Utf8 {
291	`true`
292	} else {
293	let (next_state, action) = state_change(*state, b);
294	if next_state != State::Anywhere {
295	*state = next_state;
296	}
297	is_printable_bytes(action, b)
298	}
299	});
300	let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
301	*bytes = next;
302
303	let offset = bytes.iter().copied().position(\|b\| {
304	if *state == State::Utf8 {
305	if utf8parser.add(b) {
306	*state = State::Ground;
307	}
308	`false`
309	} else {
310	let (next_state, action) = state_change(State::Ground, b);
311	if next_state != State::Anywhere {
312	*state = next_state;
313	}
314	if *state == State::Utf8 {
315	utf8parser.add(b);
316	`false`
317	} else {
318	!is_printable_bytes(action, b)
319	}
320	}
321	});
322	let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
323	*bytes = next;
324	if printable.is_empty() {
325	None
326	} else {
327	Some(printable)
328	}
329	}
330
331	#[derive(Default, Clone, Debug, PartialEq, Eq)]
332	pub(crate) struct Utf8Parser {
333	utf8_parser: utf8parse::Parser,
334	}
335
336	impl Utf8Parser {
337	fn add(&mut self, byte: u8) -> bool {
338	let mut b: bool = `false`;
339	let mut receiver: VtUtf8Receiver<'_> = VtUtf8Receiver(&mut b);
340	self.utf8_parser.advance(&mut receiver, byte);
341	b
342	}
343	}
344
345	struct VtUtf8Receiver<'a>(&'a mut bool);
346
347	impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> {
348	fn codepoint(&mut self, _: char) {
349	*self.0 = `true`;
350	}
351
352	fn invalid_sequence(&mut self) {
353	*self.0 = `true`;
354	}
355	}
356
357	#[inline]
358	fn is_printable_bytes(action: Action, byte: u8) -> bool {
359	// VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
360	// ISO Latin-1, making it DEL and non-printable
361	const DEL: u8 = `0x7f`;
362
363	// Continuations aren't included as they may also be control codes, requiring more context
364	(action == Action::Print && byte != DEL)
365	\|\| action == Action::BeginUtf8
366	\|\| (action == Action::Execute && byte.is_ascii_whitespace())
367	}
368
#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;

    /// Model based off full parser
    fn parser_strip(bytes: &[u8]) -> String {
        #[derive(Default)]
        struct Strip(String);
        impl Strip {
            fn with_capacity(capacity: usize) -> Self {
                Self(String::with_capacity(capacity))
            }
        }
        impl anstyle_parse::Perform for Strip {
            fn print(&mut self, c: char) {
                self.0.push(c);
            }

            fn execute(&mut self, byte: u8) {
                if byte.is_ascii_whitespace() {
                    self.0.push(byte as char);
                }
            }
        }

        let mut stripped = Strip::with_capacity(bytes.len());
        let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
        for byte in bytes {
            parser.advance(&mut stripped, *byte);
        }
        stripped.0
    }

    /// Model verifying incremental parsing, feeding one `char` at a time
    fn strip_char(mut s: &str) -> String {
        let mut result = String::new();
        let mut state = StripStr::new();
        while !s.is_empty() {
            let mut indices = s.char_indices();
            indices.next(); // current
            // Start of the second char, or the end of the string if there is none
            let offset = indices.next().map_or(s.len(), |(i, _)| i);
            let (current, remainder) = s.split_at(offset);
            for printable in state.strip_next(current) {
                result.push_str(printable);
            }
            s = remainder;
        }
        result
    }

    /// Model verifying incremental parsing, feeding one byte at a time
    fn strip_byte(s: &[u8]) -> Vec<u8> {
        let mut result = Vec::new();
        let mut state = StripBytes::default();
        for current in s.chunks(1) {
            for printable in state.strip_next(current) {
                result.extend(printable);
            }
        }
        result
    }

    #[test]
    fn test_strip_bytes_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_del() {
        let input = std::str::from_utf8(&[0x7f]).unwrap();
        let expected = "";
        let actual = strip_str(input).to_string();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_del() {
        let bytes = [0x7f];
        let expected = "";
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_handles_broken_sequence() {
        // valid utf8: \xc3\xb6 then \x1b then \xf0\x9f\x98\x80
        let s = "ö\x1b😀hello😀goodbye";
        let mut it = strip_str(s);
        assert_eq!("ö", it.next().unwrap());
        assert_eq!("ello😀goodbye", it.next().unwrap());
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_str_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_str(&s).to_string();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_char_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_char(&s);
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_bytes_no_escapes(s in "\\PC*") {
            dbg!(&s);
            dbg!(s.as_bytes());
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_byte_no_escapes(s in "\\PC*") {
            dbg!(&s);
            dbg!(s.as_bytes());
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_byte(s.as_bytes())).unwrap();
            assert_eq!(expected, actual);
        }
    }
}
512