strip.rs source code [crates/anstream-0.6.13/src/adapter/strip.rs]

1	use anstyle_parse::state::state_change;
2	use anstyle_parse::state::Action;
3	use anstyle_parse::state::State;
4
5	/// Strip ANSI escapes from a `&str`, returning the printable content
6	///
7	/// This can be used to take output from a program that includes escape sequences and write it
8	/// somewhere that does not easily support them, such as a log file.
9	///
10	/// For non-contiguous data, see [`StripStr`].
11	///
12	/// # Example
13	///
14	/// ```rust
15	/// use std::io::Write as _;
16	///
17	/// let styled_text = "`\x1b`[32mfoo`\x1b`[m bar";
18	/// let plain_str = anstream::adapter::strip_str(&styled_text).to_string();
19	/// assert_eq!(plain_str, "foo bar");
20	/// ```
21	#[inline]
22	pub fn strip_str(data: &str) -> StrippedStr<'_> {
23	StrippedStr::new(data)
24	}
25
26	/// See [`strip_str`]
27	#[derive(Default, Clone, Debug, PartialEq, Eq)]
28	pub struct StrippedStr<'s> {
29	bytes: &'s [u8],
30	state: State,
31	}
32
33	impl<'s> StrippedStr<'s> {
34	#[inline]
35	fn new(data: &'s str) -> Self {
36	Self {
37	bytes: data.as_bytes(),
38	state: State::Ground,
39	}
40	}
41
42	/// Create a [`String`] of the printable content
43	#[inline]
44	#[allow(clippy::inherent_to_string_shadow_display)] // Single-allocation implementation
45	pub fn to_string(&self) -> String {
46	use std::fmt::Write as _;
47	let mut stripped: String = String::with_capacity(self.bytes.len());
48	let _ = write!(&mut stripped, "{}", self);
49	stripped
50	}
51	}
52
53	impl<'s> std::fmt::Display for StrippedStr<'s> {
54	/// Note:* this does not exhaust the* [`Iterator`]
55	#[inline]
56	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57	let iter: StrippedStr<'_> = Self {
58	bytes: self.bytes,
59	state: self.state,
60	};
61	for printable: &str in iter {
62	printable.fmt(f)?;
63	}
64	Ok(())
65	}
66	}
67
68	impl<'s> Iterator for StrippedStr<'s> {
69	type Item = &'s str;
70
71	#[inline]
72	fn next(&mut self) -> Option<Self::Item> {
73	next_str(&mut self.bytes, &mut self.state)
74	}
75	}
76
77	/// Incrementally strip non-contiguous data
78	#[derive(Default, Clone, Debug, PartialEq, Eq)]
79	pub struct StripStr {
80	state: State,
81	}
82
83	impl StripStr {
84	/// Initial state
85	pub fn new() -> Self {
86	Default::default()
87	}
88
89	/// Strip the next segment of data
90	pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> {
91	StripStrIter {
92	bytes: data.as_bytes(),
93	state: &mut self.state,
94	}
95	}
96	}
97
98	/// See [`StripStr`]
99	#[derive(Debug, PartialEq, Eq)]
100	pub struct StripStrIter<'s> {
101	bytes: &'s [u8],
102	state: &'s mut State,
103	}
104
105	impl<'s> Iterator for StripStrIter<'s> {
106	type Item = &'s str;
107
108	#[inline]
109	fn next(&mut self) -> Option<Self::Item> {
110	next_str(&mut self.bytes, self.state)
111	}
112	}
113
114	#[inline]
115	fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> {
116	let offset = bytes.iter().copied().position(\|b\| {
117	let (next_state, action) = state_change(*state, b);
118	if next_state != State::Anywhere {
119	*state = next_state;
120	}
121	is_printable_bytes(action, b)
122	});
123	let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
124	*bytes = next;
125	*state = State::Ground;
126
127	let offset = bytes.iter().copied().position(\|b\| {
128	let (_next_state, action) = state_change(State::Ground, b);
129	!(is_printable_bytes(action, b) \|\| is_utf8_continuation(b))
130	});
131	let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
132	*bytes = next;
133	if printable.is_empty() {
134	None
135	} else {
136	let printable = unsafe {
137	from_utf8_unchecked(
138	printable,
139	"`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations",
140	)
141	};
142	Some(printable)
143	}
144	}
145
/// Reinterpret `bytes` as a `&str`, validating only when debug assertions are enabled
///
/// With debug assertions enabled the bytes are checked and invalid UTF-8 panics with
/// `safety_justification` as the message; otherwise no check is performed.
///
/// # Safety
///
/// `bytes` must be valid UTF-8. `safety_justification` should state why the caller knows that.
#[inline]
unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str {
    if cfg!(debug_assertions) {
        // Catch problems more quickly when testing
        std::str::from_utf8(bytes).expect(safety_justification)
    } else {
        // SAFETY: the caller guarantees that `bytes` is valid UTF-8
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}
155
/// Report whether `b` lies in `0x80..=0xbf`, the range of UTF-8 continuation bytes
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
    matches!(b, 0x80..=0xbf)
}
160
161	/// Strip ANSI escapes from bytes, returning the printable content
162	///
163	/// This can be used to take output from a program that includes escape sequences and write it
164	/// somewhere that does not easily support them, such as a log file.
165	///
166	/// # Example
167	///
168	/// ```rust
169	/// use std::io::Write as _;
170	///
171	/// let styled_text = "`\x1b`[32mfoo`\x1b`[m bar";
172	/// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec();
173	/// assert_eq!(plain_str.as_slice(), &b"foo bar"[..]);
174	/// ```
175	#[inline]
176	pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> {
177	StrippedBytes::new(bytes:data)
178	}
179
180	/// See [`strip_bytes`]
181	#[derive(Default, Clone, Debug, PartialEq, Eq)]
182	pub struct StrippedBytes<'s> {
183	bytes: &'s [u8],
184	state: State,
185	utf8parser: Utf8Parser,
186	}
187
188	impl<'s> StrippedBytes<'s> {
189	/// See [`strip_bytes`]
190	#[inline]
191	pub fn new(bytes: &'s [u8]) -> Self {
192	Self {
193	bytes,
194	state: State::Ground,
195	utf8parser: Default::default(),
196	}
197	}
198
199	/// Strip the next slice of bytes
200	///
201	/// Used when the content is in several non-contiguous slices
202	///
203	/// # Panic
204	///
205	/// May panic if it is not exhausted / empty
206	#[inline]
207	pub fn extend(&mut self, bytes: &'s [u8]) {
208	debug_assert!(
209	self.is_empty(),
210	"current bytes must be processed to ensure we end at the right state"
211	);
212	self.bytes = bytes;
213	}
214
215	/// Report the bytes has been exhausted
216	#[inline]
217	pub fn is_empty(&self) -> bool {
218	self.bytes.is_empty()
219	}
220
221	/// Create a [`Vec`] of the printable content
222	#[inline]
223	pub fn into_vec(self) -> Vec<u8> {
224	let mut stripped = Vec::with_capacity(self.bytes.len());
225	for printable in self {
226	stripped.extend(printable);
227	}
228	stripped
229	}
230	}
231
232	impl<'s> Iterator for StrippedBytes<'s> {
233	type Item = &'s [u8];
234
235	#[inline]
236	fn next(&mut self) -> Option<Self::Item> {
237	next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser)
238	}
239	}
240
241	/// Incrementally strip non-contiguous data
242	#[derive(Default, Clone, Debug, PartialEq, Eq)]
243	pub struct StripBytes {
244	state: State,
245	utf8parser: Utf8Parser,
246	}
247
248	impl StripBytes {
249	/// Initial state
250	pub fn new() -> Self {
251	Default::default()
252	}
253
254	/// Strip the next segment of data
255	pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> {
256	StripBytesIter {
257	bytes,
258	state: &mut self.state,
259	utf8parser: &mut self.utf8parser,
260	}
261	}
262	}
263
264	/// See [`StripBytes`]
265	#[derive(Debug, PartialEq, Eq)]
266	pub struct StripBytesIter<'s> {
267	bytes: &'s [u8],
268	state: &'s mut State,
269	utf8parser: &'s mut Utf8Parser,
270	}
271
272	impl<'s> Iterator for StripBytesIter<'s> {
273	type Item = &'s [u8];
274
275	#[inline]
276	fn next(&mut self) -> Option<Self::Item> {
277	next_bytes(&mut self.bytes, self.state, self.utf8parser)
278	}
279	}
280
281	#[inline]
282	fn next_bytes<'s>(
283	bytes: &mut &'s [u8],
284	state: &mut State,
285	utf8parser: &mut Utf8Parser,
286	) -> Option<&'s [u8]> {
287	let offset = bytes.iter().copied().position(\|b\| {
288	if *state == State::Utf8 {
289	`true`
290	} else {
291	let (next_state, action) = state_change(*state, b);
292	if next_state != State::Anywhere {
293	*state = next_state;
294	}
295	is_printable_bytes(action, b)
296	}
297	});
298	let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
299	*bytes = next;
300
301	let offset = bytes.iter().copied().position(\|b\| {
302	if *state == State::Utf8 {
303	if utf8parser.add(b) {
304	*state = State::Ground;
305	}
306	`false`
307	} else {
308	let (next_state, action) = state_change(State::Ground, b);
309	if next_state != State::Anywhere {
310	*state = next_state;
311	}
312	if *state == State::Utf8 {
313	utf8parser.add(b);
314	`false`
315	} else {
316	!is_printable_bytes(action, b)
317	}
318	}
319	});
320	let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
321	*bytes = next;
322	if printable.is_empty() {
323	None
324	} else {
325	Some(printable)
326	}
327	}
328
329	#[derive(Default, Clone, Debug, PartialEq, Eq)]
330	pub struct Utf8Parser {
331	utf8_parser: utf8parse::Parser,
332	}
333
334	impl Utf8Parser {
335	fn add(&mut self, byte: u8) -> bool {
336	let mut b: bool = `false`;
337	let mut receiver: VtUtf8Receiver<'_> = VtUtf8Receiver(&mut b);
338	self.utf8_parser.advance(&mut receiver, byte);
339	b
340	}
341	}
342
343	struct VtUtf8Receiver<'a>(&'a mut bool);
344
345	impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> {
346	fn codepoint(&mut self, _: char) {
347	*self.0 = `true`;
348	}
349
350	fn invalid_sequence(&mut self) {
351	*self.0 = `true`;
352	}
353	}
354
355	#[inline]
356	fn is_printable_bytes(action: Action, byte: u8) -> bool {
357	// VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
358	// ISO Latin-1, making it DEL and non-printable
359	const DEL: u8 = `0x7f`;
360
361	// Continuations aren't included as they may also be control codes, requiring more context
362	(action == Action::Print && byte != DEL)
363	\|\| action == Action::BeginUtf8
364	\|\| (action == Action::Execute && byte.is_ascii_whitespace())
365	}
366
#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;

    /// Model based off full parser
    fn parser_strip(bytes: &[u8]) -> String {
        #[derive(Default)]
        struct Strip(String);
        impl Strip {
            fn with_capacity(capacity: usize) -> Self {
                Self(String::with_capacity(capacity))
            }
        }
        impl anstyle_parse::Perform for Strip {
            fn print(&mut self, c: char) {
                self.0.push(c);
            }

            fn execute(&mut self, byte: u8) {
                if byte.is_ascii_whitespace() {
                    self.0.push(byte as char);
                }
            }
        }

        let mut stripped = Strip::with_capacity(bytes.len());
        let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
        for byte in bytes {
            parser.advance(&mut stripped, *byte);
        }
        stripped.0
    }

    /// Model verifying incremental parsing, one `char` at a time
    fn strip_char(mut s: &str) -> String {
        let mut result = String::new();
        let mut state = StripStr::new();
        while !s.is_empty() {
            let mut indices = s.char_indices();
            indices.next(); // current
            let offset = indices.next().map(|(i, _)| i).unwrap_or_else(|| s.len());
            let (current, remainder) = s.split_at(offset);
            for printable in state.strip_next(current) {
                result.push_str(printable);
            }
            s = remainder;
        }
        result
    }

    /// Model verifying incremental parsing, one byte at a time
    fn strip_byte(s: &[u8]) -> Vec<u8> {
        let mut result = Vec::new();
        let mut state = StripBytes::default();
        for current in s.chunks(1) {
            for printable in state.strip_next(current) {
                result.extend(printable);
            }
        }
        result
    }

    #[test]
    fn test_strip_bytes_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_del() {
        let input = std::str::from_utf8(&[0x7f]).unwrap();
        let expected = "";
        let actual = strip_str(input).to_string();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_del() {
        let bytes = [0x7f];
        let expected = "";
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_handles_broken_sequence() {
        // valid utf8: \xc3\xb6 then \x1b then \xf0\x9f\x98\x80
        let s = "ö\x1b😀hello😀goodbye";
        let mut it = strip_str(s);
        assert_eq!("ö", it.next().unwrap());
        assert_eq!("ello😀goodbye", it.next().unwrap());
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_str_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_str(&s).to_string();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_char_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_char(&s);
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_bytes_no_escapes(s in "\\PC*") {
            dbg!(&s);
            dbg!(s.as_bytes());
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_byte_no_escapes(s in "\\PC*") {
            dbg!(&s);
            dbg!(s.as_bytes());
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_byte(s.as_bytes())).unwrap();
            assert_eq!(expected, actual);
        }
    }
}
510