use std::{fmt, str};

| 3 | #[allow (unused)] |
| 4 | pub(crate) fn write_escaped_str(mut dest: impl fmt::Write, src: &str) -> fmt::Result { |
| 5 | // This implementation reads one byte after another. |
| 6 | // It's not very fast, but should work well enough until portable SIMD gets stabilized. |
| 7 | |
| 8 | let mut escaped_buf: [u8; 8] = ESCAPED_BUF_INIT; |
| 9 | let mut last: usize = 0; |
| 10 | |
| 11 | for (index: usize, byte: u8) in src.bytes().enumerate() { |
| 12 | if let Some(escaped: [u8; 2]) = get_escaped(byte) { |
| 13 | [escaped_buf[2], escaped_buf[3]] = escaped; |
| 14 | write_str_if_nonempty(&mut dest, &src[last..index])?; |
| 15 | // SAFETY: the content of `escaped_buf` is pure ASCII |
| 16 | dest.write_str(unsafe { str::from_utf8_unchecked(&escaped_buf[..ESCAPED_BUF_LEN]) })?; |
| 17 | last = index + 1; |
| 18 | } |
| 19 | } |
| 20 | write_str_if_nonempty(&mut dest, &src[last..]) |
| 21 | } |
| 22 | |
| 23 | #[allow (unused)] |
| 24 | pub(crate) fn write_escaped_char(mut dest: impl fmt::Write, c: char) -> fmt::Result { |
| 25 | if !c.is_ascii() { |
| 26 | dest.write_char(c) |
| 27 | } else if let Some(escaped: [u8; 2]) = get_escaped(byte:c as u8) { |
| 28 | let mut escaped_buf: [u8; 8] = ESCAPED_BUF_INIT; |
| 29 | [escaped_buf[2], escaped_buf[3]] = escaped; |
| 30 | // SAFETY: the content of `escaped_buf` is pure ASCII |
| 31 | dest.write_str(unsafe { str::from_utf8_unchecked(&escaped_buf[..ESCAPED_BUF_LEN]) }) |
| 32 | } else { |
| 33 | // RATIONALE: `write_char(c)` gets optimized if it is known that `c.is_ascii()` |
| 34 | dest.write_char(c) |
| 35 | } |
| 36 | } |
| 37 | |
| 38 | /// Returns the decimal representation of the codepoint if the character needs HTML escaping. |
| 39 | #[inline (always)] |
| 40 | fn get_escaped(byte: u8) -> Option<[u8; 2]> { |
| 41 | match byte { |
| 42 | MIN_CHAR..=MAX_CHAR => match TABLE.lookup[(byte - MIN_CHAR) as usize] { |
| 43 | 0 => None, |
| 44 | escaped: u16 => Some(escaped.to_ne_bytes()), |
| 45 | }, |
| 46 | _ => None, |
| 47 | } |
| 48 | } |
| 49 | |
/// Forwards `input` to `output.write_str`, unless `input` is empty.
///
/// For empty input `output` is not called at all and `Ok(())` is returned.
///
/// # Errors
///
/// Returns the error reported by `output`, if any.
#[inline(always)]
fn write_str_if_nonempty(output: &mut impl fmt::Write, input: &str) -> fmt::Result {
    if !input.is_empty() {
        output.write_str(input)
    } else {
        Ok(())
    }
}

/// List of characters that need HTML escaping, not necessarily in ordinal order.
const CHARS: &[u8] = br#""&'<>"#;

/// The character with the lowest codepoint that needs HTML escaping.
const MIN_CHAR: u8 = {
    // Iterator adaptors are not usable in const evaluation, hence the manual scan.
    let mut v = u8::MAX;
    let mut i = 0;
    while i < CHARS.len() {
        if v > CHARS[i] {
            v = CHARS[i];
        }
        i += 1;
    }
    v
};

/// The character with the highest codepoint that needs HTML escaping.
const MAX_CHAR: u8 = {
    // Iterator adaptors are not usable in const evaluation, hence the manual scan.
    let mut v = u8::MIN;
    let mut i = 0;
    while i < CHARS.len() {
        if v < CHARS[i] {
            v = CHARS[i];
        }
        i += 1;
    }
    v
};

/// Number of codepoints between the lowest and highest character that needs escaping, incl.
// Widen to `usize` before adding 1 so that the `+ 1` cannot overflow `u8`.
const CHAR_RANGE: usize = (MAX_CHAR as usize) - (MIN_CHAR as usize) + 1;

| 91 | struct Table { |
| 92 | _align: [usize; 0], |
| 93 | lookup: [u16; CHAR_RANGE], |
| 94 | } |
| 95 | |
| 96 | /// For characters that need HTML escaping, the codepoint is formatted as decimal digits, |
| 97 | /// otherwise `b"\0\0"`. Starting at [`MIN_CHAR`]. |
| 98 | const TABLE: Table = { |
| 99 | let mut table: Table = Table { |
| 100 | _align: [], |
| 101 | lookup: [0; CHAR_RANGE], |
| 102 | }; |
| 103 | let mut i: usize = 0; |
| 104 | while i < CHARS.len() { |
| 105 | let c: u8 = CHARS[i]; |
| 106 | let h: u8 = c / 10 + b'0' ; |
| 107 | let l: u8 = c % 10 + b'0' ; |
| 108 | table.lookup[(c - MIN_CHAR) as usize] = u16::from_ne_bytes([h, l]); |
| 109 | i += 1; |
| 110 | } |
| 111 | table |
| 112 | }; |
| 113 | |
/// Template for an escaped character: `&#__;` followed by NUL padding up to 8 bytes.
/// The two `_` placeholders (indices 2 and 3) are overwritten with the decimal digits.
// RATIONALE: llvm generates better code if the buffer is register sized
const ESCAPED_BUF_INIT: [u8; 8] = *b"&#__;\0\0\0";
/// Number of leading bytes of [`ESCAPED_BUF_INIT`] that are actually written out.
const ESCAPED_BUF_LEN: usize = b"&#__;".len();

/// Checks that `<` and `>` are replaced by their decimal character references (`&#60;` and
/// `&#62;`) at the start, in the middle, and around a longer unescaped run.
#[test]
fn test_simple_html_string_escaping() {
    let mut buf = String::new();
    write_escaped_str(&mut buf, "<script>").unwrap();
    assert_eq!(buf, "&#60;script&#62;");

    buf.clear();
    write_escaped_str(&mut buf, "s<crip>t").unwrap();
    assert_eq!(buf, "s&#60;crip&#62;t");

    buf.clear();
    write_escaped_str(&mut buf, "s<cripcripcripcripcripcripcripcripcripcrip>t").unwrap();
    assert_eq!(buf, "s&#60;cripcripcripcripcripcripcripcripcripcrip&#62;t");
}
