strip.rs source code [crates/anstream-0.5.0/src/adapter/strip.rs]

1	use anstyle_parse::state::state_change;
2	use anstyle_parse::state::Action;
3	use anstyle_parse::state::State;
4
5	/// Strip ANSI escapes from a `&str`, returning the printable content
6	///
7	/// This can be used to take output from a program that includes escape sequences and write it
8	/// somewhere that does not easily support them, such as a log file.
9	///
10	/// For non-contiguous data, see [`StripStr`].
11	///
12	/// # Example
13	///
14	/// ```rust
15	/// use std::io::Write as _;
16	///
17	/// let styled_text = "`\x1b`[32mfoo`\x1b`[m bar";
18	/// let plain_str = anstream::adapter::strip_str(&styled_text).to_string();
19	/// assert_eq!(plain_str, "foo bar");
20	/// ```
21	#[inline]
22	pub fn strip_str(data: &str) -> StrippedStr<'_> {
23	StrippedStr::new(data)
24	}
25
26	/// See [`strip_str`]
27	#[derive(Default, Clone, Debug, PartialEq, Eq)]
28	pub struct StrippedStr<'s> {
29	bytes: &'s [u8],
30	state: State,
31	}
32
33	impl<'s> StrippedStr<'s> {
34	#[inline]
35	fn new(data: &'s str) -> Self {
36	Self {
37	bytes: data.as_bytes(),
38	state: State::Ground,
39	}
40	}
41
42	/// Create a [`String`] of the printable content
43	#[inline]
44	#[allow(clippy::inherent_to_string_shadow_display)] // Single-allocation implementation
45	pub fn to_string(&self) -> String {
46	use std::fmt::Write as _;
47	let mut stripped: String = String::with_capacity(self.bytes.len());
48	let _ = write!(&mut stripped, "{}", self);
49	stripped
50	}
51	}
52
53	impl<'s> std::fmt::Display for StrippedStr<'s> {
54	/// Note:* this does not exhaust the* [`Iterator`]
55	#[inline]
56	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57	let iter: StrippedStr<'_> = Self {
58	bytes: self.bytes,
59	state: self.state,
60	};
61	for printable: &str in iter {
62	printable.fmt(f)?;
63	}
64	Ok(())
65	}
66	}
67
68	impl<'s> Iterator for StrippedStr<'s> {
69	type Item = &'s str;
70
71	#[inline]
72	fn next(&mut self) -> Option<Self::Item> {
73	next_str(&mut self.bytes, &mut self.state)
74	}
75	}
76
77	/// Incrementally strip non-contiguous data
78	#[derive(Default, Clone, Debug, PartialEq, Eq)]
79	pub struct StripStr {
80	state: State,
81	}
82
83	impl StripStr {
84	/// Initial state
85	pub fn new() -> Self {
86	Default::default()
87	}
88
89	/// Strip the next segment of data
90	pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> {
91	StripStrIter {
92	bytes: data.as_bytes(),
93	state: &mut self.state,
94	}
95	}
96	}
97
98	/// See [`StripStr`]
99	#[derive(Debug, PartialEq, Eq)]
100	pub struct StripStrIter<'s> {
101	bytes: &'s [u8],
102	state: &'s mut State,
103	}
104
105	impl<'s> Iterator for StripStrIter<'s> {
106	type Item = &'s str;
107
108	#[inline]
109	fn next(&mut self) -> Option<Self::Item> {
110	next_str(&mut self.bytes, self.state)
111	}
112	}
113
114	#[inline]
115	fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> {
116	let offset = bytes.iter().copied().position(\|b\| {
117	let (next_state, action) = state_change(*state, b);
118	if next_state != State::Anywhere {
119	*state = next_state;
120	}
121	is_printable_str(action, b)
122	});
123	let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
124	*bytes = next;
125	*state = State::Ground;
126
127	let offset = bytes.iter().copied().position(\|b\| {
128	let (_next_state, action) = state_change(State::Ground, b);
129	!is_printable_str(action, b)
130	});
131	let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
132	*bytes = next;
133	if printable.is_empty() {
134	None
135	} else {
136	let printable = unsafe {
137	from_utf8_unchecked(
138	printable,
139	"`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations",
140	)
141	};
142	Some(printable)
143	}
144	}
145
/// Convert `bytes` to a `&str`, skipping validation unless `debug_assertions` are enabled
///
/// With `debug_assertions` the bytes are validated and a failure panics with
/// `safety_justification` as the message, so broken invariants surface quickly in tests.
///
/// # Safety
///
/// `bytes` must be valid UTF-8.
#[inline]
unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str {
    if cfg!(debug_assertions) {
        // Catch problems more quickly when testing
        std::str::from_utf8(bytes).expect(safety_justification)
    } else {
        // SAFETY: the caller guarantees `bytes` is valid UTF-8
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}
155
156	#[inline]
157	fn is_printable_str(action: Action, byte: u8) -> bool {
158	// VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
159	// ISO Latin-1, making it DEL and non-printable
160	const DEL: u8 = `0x7f`;
161	(action == Action::Print && byte != DEL)
162	\|\| action == Action::BeginUtf8
163	// since we know the input is valid UTF-8, the only thing we can do with
164	// continuations is to print them
165	\|\| is_utf8_continuation(byte)
166	\|\| (action == Action::Execute && byte.is_ascii_whitespace())
167	}
168
/// Whether `b` is a UTF-8 continuation byte, i.e. has the bit pattern `10xx_xxxx`
/// (`0x80..=0xbf`)
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
    (b & 0b1100_0000) == 0b1000_0000
}
173
174	/// Strip ANSI escapes from bytes, returning the printable content
175	///
176	/// This can be used to take output from a program that includes escape sequences and write it
177	/// somewhere that does not easily support them, such as a log file.
178	///
179	/// # Example
180	///
181	/// ```rust
182	/// use std::io::Write as _;
183	///
184	/// let styled_text = "`\x1b`[32mfoo`\x1b`[m bar";
185	/// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec();
186	/// assert_eq!(plain_str.as_slice(), &b"foo bar"[..]);
187	/// ```
188	#[inline]
189	pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> {
190	StrippedBytes::new(bytes:data)
191	}
192
193	/// See [`strip_bytes`]
194	#[derive(Default, Clone, Debug, PartialEq, Eq)]
195	pub struct StrippedBytes<'s> {
196	bytes: &'s [u8],
197	state: State,
198	utf8parser: Utf8Parser,
199	}
200
201	impl<'s> StrippedBytes<'s> {
202	/// See [`strip_bytes`]
203	#[inline]
204	pub fn new(bytes: &'s [u8]) -> Self {
205	Self {
206	bytes,
207	state: State::Ground,
208	utf8parser: Default::default(),
209	}
210	}
211
212	/// Strip the next slice of bytes
213	///
214	/// Used when the content is in several non-contiguous slices
215	///
216	/// # Panic
217	///
218	/// May panic if it is not exhausted / empty
219	#[inline]
220	pub fn extend(&mut self, bytes: &'s [u8]) {
221	debug_assert!(
222	self.is_empty(),
223	"current bytes must be processed to ensure we end at the right state"
224	);
225	self.bytes = bytes;
226	}
227
228	/// Report the bytes has been exhausted
229	#[inline]
230	pub fn is_empty(&self) -> bool {
231	self.bytes.is_empty()
232	}
233
234	/// Create a [`Vec`] of the printable content
235	#[inline]
236	pub fn into_vec(self) -> Vec<u8> {
237	let mut stripped = Vec::with_capacity(self.bytes.len());
238	for printable in self {
239	stripped.extend(printable);
240	}
241	stripped
242	}
243	}
244
245	impl<'s> Iterator for StrippedBytes<'s> {
246	type Item = &'s [u8];
247
248	#[inline]
249	fn next(&mut self) -> Option<Self::Item> {
250	next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser)
251	}
252	}
253
254	/// Incrementally strip non-contiguous data
255	#[derive(Default, Clone, Debug, PartialEq, Eq)]
256	pub struct StripBytes {
257	state: State,
258	utf8parser: Utf8Parser,
259	}
260
261	impl StripBytes {
262	/// Initial state
263	pub fn new() -> Self {
264	Default::default()
265	}
266
267	/// Strip the next segment of data
268	pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> {
269	StripBytesIter {
270	bytes,
271	state: &mut self.state,
272	utf8parser: &mut self.utf8parser,
273	}
274	}
275	}
276
277	/// See [`StripBytes`]
278	#[derive(Debug, PartialEq, Eq)]
279	pub struct StripBytesIter<'s> {
280	bytes: &'s [u8],
281	state: &'s mut State,
282	utf8parser: &'s mut Utf8Parser,
283	}
284
285	impl<'s> Iterator for StripBytesIter<'s> {
286	type Item = &'s [u8];
287
288	#[inline]
289	fn next(&mut self) -> Option<Self::Item> {
290	next_bytes(&mut self.bytes, self.state, self.utf8parser)
291	}
292	}
293
294	#[inline]
295	fn next_bytes<'s>(
296	bytes: &mut &'s [u8],
297	state: &mut State,
298	utf8parser: &mut Utf8Parser,
299	) -> Option<&'s [u8]> {
300	let offset = bytes.iter().copied().position(\|b\| {
301	if *state == State::Utf8 {
302	`true`
303	} else {
304	let (next_state, action) = state_change(*state, b);
305	if next_state != State::Anywhere {
306	*state = next_state;
307	}
308	is_printable_bytes(action, b)
309	}
310	});
311	let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
312	*bytes = next;
313
314	let offset = bytes.iter().copied().position(\|b\| {
315	if *state == State::Utf8 {
316	if utf8parser.add(b) {
317	*state = State::Ground;
318	}
319	`false`
320	} else {
321	let (next_state, action) = state_change(State::Ground, b);
322	if next_state != State::Anywhere {
323	*state = next_state;
324	}
325	if *state == State::Utf8 {
326	utf8parser.add(b);
327	`false`
328	} else {
329	!is_printable_bytes(action, b)
330	}
331	}
332	});
333	let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
334	*bytes = next;
335	if printable.is_empty() {
336	None
337	} else {
338	Some(printable)
339	}
340	}
341
342	#[derive(Default, Clone, Debug, PartialEq, Eq)]
343	pub struct Utf8Parser {
344	utf8_parser: utf8parse::Parser,
345	}
346
347	impl Utf8Parser {
348	fn add(&mut self, byte: u8) -> bool {
349	let mut b: bool = `false`;
350	let mut receiver: VtUtf8Receiver<'_> = VtUtf8Receiver(&mut b);
351	self.utf8_parser.advance(&mut receiver, byte);
352	b
353	}
354	}
355
356	struct VtUtf8Receiver<'a>(&'a mut bool);
357
358	impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> {
359	fn codepoint(&mut self, _: char) {
360	*self.0 = `true`;
361	}
362
363	fn invalid_sequence(&mut self) {
364	*self.0 = `true`;
365	}
366	}
367
368	#[inline]
369	fn is_printable_bytes(action: Action, byte: u8) -> bool {
370	// VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
371	// ISO Latin-1, making it DEL and non-printable
372	const DEL: u8 = `0x7f`;
373
374	// Continuations aren't included as they may also be control codes, requiring more context
375	(action == Action::Print && byte != DEL)
376	\|\| action == Action::BeginUtf8
377	\|\| (action == Action::Execute && byte.is_ascii_whitespace())
378	}
379
#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;

    /// Model based off full parser
    ///
    /// Keeps every printed `char` plus executed ASCII whitespace.
    fn parser_strip(bytes: &[u8]) -> String {
        #[derive(Default)]
        struct Strip(String);
        impl Strip {
            fn with_capacity(capacity: usize) -> Self {
                Self(String::with_capacity(capacity))
            }
        }
        impl anstyle_parse::Perform for Strip {
            fn print(&mut self, c: char) {
                self.0.push(c);
            }

            fn execute(&mut self, byte: u8) {
                if byte.is_ascii_whitespace() {
                    self.0.push(byte as char);
                }
            }
        }

        let mut stripped = Strip::with_capacity(bytes.len());
        let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
        for byte in bytes {
            parser.advance(&mut stripped, *byte);
        }
        stripped.0
    }

    /// Model verifying incremental parsing
    ///
    /// Feeds the input to [`StripStr`] one `char` at a time.
    fn strip_char(mut s: &str) -> String {
        let mut result = String::new();
        let mut state = StripStr::new();
        while !s.is_empty() {
            let mut indices = s.char_indices();
            indices.next(); // current
            let offset = indices.next().map(|(i, _)| i).unwrap_or_else(|| s.len());
            let (current, remainder) = s.split_at(offset);
            for printable in state.strip_next(current) {
                result.push_str(printable);
            }
            s = remainder;
        }
        result
    }

    /// Model verifying incremental parsing
    ///
    /// Feeds the input to [`StripBytes`] one byte at a time.
    fn strip_byte(s: &[u8]) -> Vec<u8> {
        let mut result = Vec::new();
        let mut state = StripBytes::default();
        for current in s.chunks(1) {
            for printable in state.strip_next(current) {
                result.extend_from_slice(printable);
            }
        }
        result
    }

    #[test]
    fn test_strip_bytes_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_del() {
        let input = std::str::from_utf8(&[0x7f]).unwrap();
        let expected = "";
        let actual = strip_str(input).to_string();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_del() {
        let bytes = [0x7f];
        let expected = "";
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_str_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_str(&s).to_string();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_char_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_char(&s);
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_bytes_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_byte_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_byte(s.as_bytes())).unwrap();
            assert_eq!(expected, actual);
        }
    }
}
514