1 | use anstyle_parse::state::state_change; |
2 | use anstyle_parse::state::Action; |
3 | use anstyle_parse::state::State; |
4 | |
5 | /// Strip ANSI escapes from a `&str`, returning the printable content |
6 | /// |
7 | /// This can be used to take output from a program that includes escape sequences and write it |
8 | /// somewhere that does not easily support them, such as a log file. |
9 | /// |
10 | /// For non-contiguous data, see [`StripStr`]. |
11 | /// |
12 | /// # Example |
13 | /// |
14 | /// ```rust |
15 | /// use std::io::Write as _; |
16 | /// |
17 | /// let styled_text = " \x1b[32mfoo \x1b[m bar" ; |
18 | /// let plain_str = anstream::adapter::strip_str(&styled_text).to_string(); |
19 | /// assert_eq!(plain_str, "foo bar" ); |
20 | /// ``` |
21 | #[inline ] |
22 | pub fn strip_str(data: &str) -> StrippedStr<'_> { |
23 | StrippedStr::new(data) |
24 | } |
25 | |
26 | /// See [`strip_str`] |
27 | #[derive (Default, Clone, Debug, PartialEq, Eq)] |
28 | pub struct StrippedStr<'s> { |
29 | bytes: &'s [u8], |
30 | state: State, |
31 | } |
32 | |
33 | impl<'s> StrippedStr<'s> { |
34 | #[inline ] |
35 | fn new(data: &'s str) -> Self { |
36 | Self { |
37 | bytes: data.as_bytes(), |
38 | state: State::Ground, |
39 | } |
40 | } |
41 | |
42 | /// Create a [`String`] of the printable content |
43 | #[inline ] |
44 | #[allow (clippy::inherent_to_string_shadow_display)] // Single-allocation implementation |
45 | pub fn to_string(&self) -> String { |
46 | use std::fmt::Write as _; |
47 | let mut stripped: String = String::with_capacity(self.bytes.len()); |
48 | let _ = write!(&mut stripped, " {}" , self); |
49 | stripped |
50 | } |
51 | } |
52 | |
53 | impl<'s> std::fmt::Display for StrippedStr<'s> { |
54 | /// **Note:** this does *not* exhaust the [`Iterator`] |
55 | #[inline ] |
56 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
57 | let iter: StrippedStr<'_> = Self { |
58 | bytes: self.bytes, |
59 | state: self.state, |
60 | }; |
61 | for printable: &str in iter { |
62 | printable.fmt(f)?; |
63 | } |
64 | Ok(()) |
65 | } |
66 | } |
67 | |
68 | impl<'s> Iterator for StrippedStr<'s> { |
69 | type Item = &'s str; |
70 | |
71 | #[inline ] |
72 | fn next(&mut self) -> Option<Self::Item> { |
73 | next_str(&mut self.bytes, &mut self.state) |
74 | } |
75 | } |
76 | |
77 | /// Incrementally strip non-contiguous data |
78 | #[derive (Default, Clone, Debug, PartialEq, Eq)] |
79 | pub struct StripStr { |
80 | state: State, |
81 | } |
82 | |
83 | impl StripStr { |
84 | /// Initial state |
85 | pub fn new() -> Self { |
86 | Default::default() |
87 | } |
88 | |
89 | /// Strip the next segment of data |
90 | pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> { |
91 | StripStrIter { |
92 | bytes: data.as_bytes(), |
93 | state: &mut self.state, |
94 | } |
95 | } |
96 | } |
97 | |
98 | /// See [`StripStr`] |
99 | #[derive (Debug, PartialEq, Eq)] |
100 | pub struct StripStrIter<'s> { |
101 | bytes: &'s [u8], |
102 | state: &'s mut State, |
103 | } |
104 | |
105 | impl<'s> Iterator for StripStrIter<'s> { |
106 | type Item = &'s str; |
107 | |
108 | #[inline ] |
109 | fn next(&mut self) -> Option<Self::Item> { |
110 | next_str(&mut self.bytes, self.state) |
111 | } |
112 | } |
113 | |
114 | #[inline ] |
115 | fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> { |
116 | let offset = bytes.iter().copied().position(|b| { |
117 | let (next_state, action) = state_change(*state, b); |
118 | if next_state != State::Anywhere { |
119 | *state = next_state; |
120 | } |
121 | is_printable_str(action, b) |
122 | }); |
123 | let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len())); |
124 | *bytes = next; |
125 | *state = State::Ground; |
126 | |
127 | let offset = bytes.iter().copied().position(|b| { |
128 | let (_next_state, action) = state_change(State::Ground, b); |
129 | !is_printable_str(action, b) |
130 | }); |
131 | let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len())); |
132 | *bytes = next; |
133 | if printable.is_empty() { |
134 | None |
135 | } else { |
136 | let printable = unsafe { |
137 | from_utf8_unchecked( |
138 | printable, |
139 | "`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations" , |
140 | ) |
141 | }; |
142 | Some(printable) |
143 | } |
144 | } |
145 | |
/// Convert `bytes` to a `&str`, validating it only when `debug_assertions` are enabled
///
/// With `debug_assertions` a validation failure panics with `safety_justification` as the
/// message; without them no validation takes place.
///
/// # Safety
///
/// `bytes` must be valid UTF-8.
#[inline]
unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str {
    if cfg!(debug_assertions) {
        // Catch problems more quickly when testing
        std::str::from_utf8(bytes).expect(safety_justification)
    } else {
        // SAFETY: the caller guarantees that `bytes` is valid UTF-8
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}
155 | |
156 | #[inline ] |
157 | fn is_printable_str(action: Action, byte: u8) -> bool { |
158 | // VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not |
159 | // ISO Latin-1, making it DEL and non-printable |
160 | const DEL: u8 = 0x7f; |
161 | (action == Action::Print && byte != DEL) |
162 | || action == Action::BeginUtf8 |
163 | // since we know the input is valid UTF-8, the only thing we can do with |
164 | // continuations is to print them |
165 | || is_utf8_continuation(byte) |
166 | || (action == Action::Execute && byte.is_ascii_whitespace()) |
167 | } |
168 | |
/// Whether `b` has the `10xxxxxx` bit pattern, i.e. lies in `0x80..=0xbf`
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
    b & 0b1100_0000 == 0b1000_0000
}
173 | |
174 | /// Strip ANSI escapes from bytes, returning the printable content |
175 | /// |
176 | /// This can be used to take output from a program that includes escape sequences and write it |
177 | /// somewhere that does not easily support them, such as a log file. |
178 | /// |
179 | /// # Example |
180 | /// |
181 | /// ```rust |
182 | /// use std::io::Write as _; |
183 | /// |
184 | /// let styled_text = " \x1b[32mfoo \x1b[m bar" ; |
185 | /// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec(); |
186 | /// assert_eq!(plain_str.as_slice(), &b"foo bar" [..]); |
187 | /// ``` |
188 | #[inline ] |
189 | pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> { |
190 | StrippedBytes::new(bytes:data) |
191 | } |
192 | |
193 | /// See [`strip_bytes`] |
194 | #[derive (Default, Clone, Debug, PartialEq, Eq)] |
195 | pub struct StrippedBytes<'s> { |
196 | bytes: &'s [u8], |
197 | state: State, |
198 | utf8parser: Utf8Parser, |
199 | } |
200 | |
201 | impl<'s> StrippedBytes<'s> { |
202 | /// See [`strip_bytes`] |
203 | #[inline ] |
204 | pub fn new(bytes: &'s [u8]) -> Self { |
205 | Self { |
206 | bytes, |
207 | state: State::Ground, |
208 | utf8parser: Default::default(), |
209 | } |
210 | } |
211 | |
212 | /// Strip the next slice of bytes |
213 | /// |
214 | /// Used when the content is in several non-contiguous slices |
215 | /// |
216 | /// # Panic |
217 | /// |
218 | /// May panic if it is not exhausted / empty |
219 | #[inline ] |
220 | pub fn extend(&mut self, bytes: &'s [u8]) { |
221 | debug_assert!( |
222 | self.is_empty(), |
223 | "current bytes must be processed to ensure we end at the right state" |
224 | ); |
225 | self.bytes = bytes; |
226 | } |
227 | |
228 | /// Report the bytes has been exhausted |
229 | #[inline ] |
230 | pub fn is_empty(&self) -> bool { |
231 | self.bytes.is_empty() |
232 | } |
233 | |
234 | /// Create a [`Vec`] of the printable content |
235 | #[inline ] |
236 | pub fn into_vec(self) -> Vec<u8> { |
237 | let mut stripped = Vec::with_capacity(self.bytes.len()); |
238 | for printable in self { |
239 | stripped.extend(printable); |
240 | } |
241 | stripped |
242 | } |
243 | } |
244 | |
245 | impl<'s> Iterator for StrippedBytes<'s> { |
246 | type Item = &'s [u8]; |
247 | |
248 | #[inline ] |
249 | fn next(&mut self) -> Option<Self::Item> { |
250 | next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser) |
251 | } |
252 | } |
253 | |
254 | /// Incrementally strip non-contiguous data |
255 | #[derive (Default, Clone, Debug, PartialEq, Eq)] |
256 | pub struct StripBytes { |
257 | state: State, |
258 | utf8parser: Utf8Parser, |
259 | } |
260 | |
261 | impl StripBytes { |
262 | /// Initial state |
263 | pub fn new() -> Self { |
264 | Default::default() |
265 | } |
266 | |
267 | /// Strip the next segment of data |
268 | pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> { |
269 | StripBytesIter { |
270 | bytes, |
271 | state: &mut self.state, |
272 | utf8parser: &mut self.utf8parser, |
273 | } |
274 | } |
275 | } |
276 | |
277 | /// See [`StripBytes`] |
278 | #[derive (Debug, PartialEq, Eq)] |
279 | pub struct StripBytesIter<'s> { |
280 | bytes: &'s [u8], |
281 | state: &'s mut State, |
282 | utf8parser: &'s mut Utf8Parser, |
283 | } |
284 | |
285 | impl<'s> Iterator for StripBytesIter<'s> { |
286 | type Item = &'s [u8]; |
287 | |
288 | #[inline ] |
289 | fn next(&mut self) -> Option<Self::Item> { |
290 | next_bytes(&mut self.bytes, self.state, self.utf8parser) |
291 | } |
292 | } |
293 | |
294 | #[inline ] |
295 | fn next_bytes<'s>( |
296 | bytes: &mut &'s [u8], |
297 | state: &mut State, |
298 | utf8parser: &mut Utf8Parser, |
299 | ) -> Option<&'s [u8]> { |
300 | let offset = bytes.iter().copied().position(|b| { |
301 | if *state == State::Utf8 { |
302 | true |
303 | } else { |
304 | let (next_state, action) = state_change(*state, b); |
305 | if next_state != State::Anywhere { |
306 | *state = next_state; |
307 | } |
308 | is_printable_bytes(action, b) |
309 | } |
310 | }); |
311 | let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len())); |
312 | *bytes = next; |
313 | |
314 | let offset = bytes.iter().copied().position(|b| { |
315 | if *state == State::Utf8 { |
316 | if utf8parser.add(b) { |
317 | *state = State::Ground; |
318 | } |
319 | false |
320 | } else { |
321 | let (next_state, action) = state_change(State::Ground, b); |
322 | if next_state != State::Anywhere { |
323 | *state = next_state; |
324 | } |
325 | if *state == State::Utf8 { |
326 | utf8parser.add(b); |
327 | false |
328 | } else { |
329 | !is_printable_bytes(action, b) |
330 | } |
331 | } |
332 | }); |
333 | let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len())); |
334 | *bytes = next; |
335 | if printable.is_empty() { |
336 | None |
337 | } else { |
338 | Some(printable) |
339 | } |
340 | } |
341 | |
342 | #[derive (Default, Clone, Debug, PartialEq, Eq)] |
343 | pub struct Utf8Parser { |
344 | utf8_parser: utf8parse::Parser, |
345 | } |
346 | |
347 | impl Utf8Parser { |
348 | fn add(&mut self, byte: u8) -> bool { |
349 | let mut b: bool = false; |
350 | let mut receiver: VtUtf8Receiver<'_> = VtUtf8Receiver(&mut b); |
351 | self.utf8_parser.advance(&mut receiver, byte); |
352 | b |
353 | } |
354 | } |
355 | |
356 | struct VtUtf8Receiver<'a>(&'a mut bool); |
357 | |
358 | impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> { |
359 | fn codepoint(&mut self, _: char) { |
360 | *self.0 = true; |
361 | } |
362 | |
363 | fn invalid_sequence(&mut self) { |
364 | *self.0 = true; |
365 | } |
366 | } |
367 | |
368 | #[inline ] |
369 | fn is_printable_bytes(action: Action, byte: u8) -> bool { |
370 | // VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not |
371 | // ISO Latin-1, making it DEL and non-printable |
372 | const DEL: u8 = 0x7f; |
373 | |
374 | // Continuations aren't included as they may also be control codes, requiring more context |
375 | (action == Action::Print && byte != DEL) |
376 | || action == Action::BeginUtf8 |
377 | || (action == Action::Execute && byte.is_ascii_whitespace()) |
378 | } |
379 | |
#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;

    /// Model based off full parser
    fn parser_strip(bytes: &[u8]) -> String {
        #[derive(Default)]
        struct Strip(String);
        impl Strip {
            fn with_capacity(capacity: usize) -> Self {
                Self(String::with_capacity(capacity))
            }
        }
        impl anstyle_parse::Perform for Strip {
            fn print(&mut self, c: char) {
                self.0.push(c);
            }

            fn execute(&mut self, byte: u8) {
                if byte.is_ascii_whitespace() {
                    self.0.push(byte as char);
                }
            }
        }

        let mut stripped = Strip::with_capacity(bytes.len());
        let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
        for byte in bytes {
            parser.advance(&mut stripped, *byte);
        }
        stripped.0
    }

    /// Model verifying incremental parsing: feeds the input one `char` at a time
    fn strip_char(mut s: &str) -> String {
        let mut result = String::new();
        let mut state = StripStr::new();
        while let Some(first) = s.chars().next() {
            let (current, remainder) = s.split_at(first.len_utf8());
            for printable in state.strip_next(current) {
                result.push_str(printable);
            }
            s = remainder;
        }
        result
    }

    /// Model verifying incremental parsing: feeds the input one byte at a time
    fn strip_byte(s: &[u8]) -> Vec<u8> {
        let mut result = Vec::new();
        let mut state = StripBytes::default();
        for current in s.chunks(1) {
            for printable in state.strip_next(current) {
                result.extend(printable);
            }
        }
        result
    }

    #[test]
    fn test_strip_bytes_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_del() {
        let input = std::str::from_utf8(&[0x7f]).unwrap();
        let expected = "";
        let actual = strip_str(input).to_string();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_del() {
        let bytes = [0x7f];
        let expected = "";
        let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
        assert_eq!(expected, actual);
    }

    // `\PC*` generates any sequence of non-control characters, the empty string included
    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_str_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_str(&s).to_string();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_char_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_char(&s);
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_bytes_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_byte_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_byte(s.as_bytes())).unwrap();
            assert_eq!(expected, actual);
        }
    }
}
514 | |