//! Processing of `data:` URLs according to the Fetch Standard:
//! <https://fetch.spec.whatwg.org/#data-urls>
//! but starting from a string rather than a parsed URL to avoid extra copies.
//!
//! ```rust
//! use data_url::{DataUrl, mime};
//!
//! let url = DataUrl::process("data:,Hello%20World!").unwrap();
//! let (body, fragment) = url.decode_to_vec().unwrap();
//!
//! assert_eq!(url.mime_type().type_, "text");
//! assert_eq!(url.mime_type().subtype, "plain");
//! assert_eq!(url.mime_type().get_parameter("charset"), Some("US-ASCII"));
//! assert_eq!(body, b"Hello World!");
//! assert!(fragment.is_none());
//! ```
| 17 | #![no_std ] |
| 18 | |
| 19 | // For forwards compatibility |
| 20 | #[cfg (feature = "std" )] |
| 21 | extern crate std; |
| 22 | |
| 23 | #[macro_use ] |
| 24 | extern crate alloc; |
| 25 | |
| 26 | #[cfg (not(feature = "alloc" ))] |
| 27 | compile_error!("the `alloc` feature must be enabled" ); |
| 28 | |
| 29 | use alloc::{string::String, vec::Vec}; |
| 30 | use core::fmt; |
| 31 | |
/// Return `None` from the enclosing function unless `$condition` evaluates to `true`.
///
/// Expands to a bare `return None;`, so it is only usable inside functions
/// whose return type accepts `None`.
macro_rules! require {
    ($condition: expr) => {
        if !$condition {
            return None;
        }
    };
}
| 39 | |
| 40 | pub mod forgiving_base64; |
| 41 | pub mod mime; |
| 42 | |
/// A `data:` URL whose header has been parsed and whose body is still encoded.
///
/// Built by [`DataUrl::process`]; it borrows from the input string for `'a`.
pub struct DataUrl<'a> {
    /// MIME type derived from the part of the URL between the colon and the first comma.
    mime_type: mime::Mime,
    /// Whether the header carried a trailing `;base64` marker.
    base64: bool,
    /// Everything after the first comma: the still-encoded body plus any `#fragment`.
    encoded_body_plus_fragment: &'a str,
}
| 48 | |
/// Reasons [`DataUrl::process`] can reject its input.
#[derive(Debug)]
pub enum DataUrlError {
    /// The input does not start with the `data:` scheme.
    NotADataUrl,
    /// No comma separates the header from the body before the fragment.
    NoComma,
}

impl fmt::Display for DataUrlError {
    /// Write the human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            DataUrlError::NotADataUrl => "not a valid data url",
            DataUrlError::NoComma => "data url is missing comma delimiting attributes and body",
        };
        f.write_str(message)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for DataUrlError {}
| 69 | |
| 70 | impl<'a> DataUrl<'a> { |
| 71 | /// <https://fetch.spec.whatwg.org/#data-url-processor> |
| 72 | /// but starting from a string rather than a parsed `Url`, to avoid extra string copies. |
| 73 | pub fn process(input: &'a str) -> Result<Self, DataUrlError> { |
| 74 | use crate::DataUrlError::*; |
| 75 | |
| 76 | let after_colon = pretend_parse_data_url(input).ok_or(NotADataUrl)?; |
| 77 | |
| 78 | let (from_colon_to_comma, encoded_body_plus_fragment) = |
| 79 | find_comma_before_fragment(after_colon).ok_or(NoComma)?; |
| 80 | |
| 81 | let (mime_type, base64) = parse_header(from_colon_to_comma); |
| 82 | |
| 83 | Ok(DataUrl { |
| 84 | mime_type, |
| 85 | base64, |
| 86 | encoded_body_plus_fragment, |
| 87 | }) |
| 88 | } |
| 89 | |
| 90 | pub fn mime_type(&self) -> &mime::Mime { |
| 91 | &self.mime_type |
| 92 | } |
| 93 | |
| 94 | /// Streaming-decode the data URL’s body to `write_body_bytes`, |
| 95 | /// and return the URL’s fragment identifier if it has one. |
| 96 | pub fn decode<F, E>( |
| 97 | &self, |
| 98 | write_body_bytes: F, |
| 99 | ) -> Result<Option<FragmentIdentifier<'a>>, forgiving_base64::DecodeError<E>> |
| 100 | where |
| 101 | F: FnMut(&[u8]) -> Result<(), E>, |
| 102 | { |
| 103 | if self.base64 { |
| 104 | decode_with_base64(self.encoded_body_plus_fragment, write_body_bytes) |
| 105 | } else { |
| 106 | decode_without_base64(self.encoded_body_plus_fragment, write_body_bytes) |
| 107 | .map_err(forgiving_base64::DecodeError::WriteError) |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | /// Return the decoded body, and the URL’s fragment identifier if it has one. |
| 112 | pub fn decode_to_vec( |
| 113 | &self, |
| 114 | ) -> Result<(Vec<u8>, Option<FragmentIdentifier<'a>>), forgiving_base64::InvalidBase64> { |
| 115 | let mut body = Vec::new(); |
| 116 | let fragment = self.decode(|bytes| { |
| 117 | body.extend_from_slice(bytes); |
| 118 | Ok(()) |
| 119 | })?; |
| 120 | Ok((body, fragment)) |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | /// The URL’s fragment identifier (after `#`) |
| 125 | pub struct FragmentIdentifier<'a>(&'a str); |
| 126 | |
| 127 | impl<'a> FragmentIdentifier<'a> { |
| 128 | /// Like in a parsed URL |
| 129 | pub fn to_percent_encoded(&self) -> String { |
| 130 | let mut string: String = String::new(); |
| 131 | for byte: u8 in self.0.bytes() { |
| 132 | match byte { |
| 133 | // Ignore ASCII tabs or newlines like the URL parser would |
| 134 | b' \t' | b' \n' | b' \r' => continue, |
| 135 | // https://url.spec.whatwg.org/#fragment-percent-encode-set |
| 136 | b' \0' ..=b' ' | b'"' | b'<' | b'>' | b'`' | b' \x7F' ..=b' \xFF' => { |
| 137 | percent_encode(byte, &mut string) |
| 138 | } |
| 139 | // Printable ASCII |
| 140 | _ => string.push(ch:byte as char), |
| 141 | } |
| 142 | } |
| 143 | string |
| 144 | } |
| 145 | } |
| 146 | |
/// Similar to <https://url.spec.whatwg.org/#concept-basic-url-parser>
/// followed by <https://url.spec.whatwg.org/#concept-url-serializer>
///
/// * `None`: not a data URL.
///
/// * `Some(s)`: sort of the result of serialization, except:
///
///   - `data:` prefix removed
///   - The fragment is included
///   - Other components are **not** UTF-8 percent-encoded
///   - ASCII tabs and newlines in the middle are **not** removed
fn pretend_parse_data_url(input: &str) -> Option<&str> {
    // Trim C0 control or space
    let left_trimmed = input.trim_start_matches(|ch: char| ch <= ' ');

    let mut bytes = left_trimmed.bytes();
    {
        // Ignore ASCII tabs or newlines like the URL parser would
        let mut iter = bytes
            .by_ref()
            .filter(|&byte| !matches!(byte, b'\t' | b'\n' | b'\r'));
        // The scheme is matched ASCII case-insensitively. Lowercasing the input
        // byte is harmless for the ':' comparison since only A-Z are remapped.
        for &expected in b"data:" {
            if iter.next()?.to_ascii_lowercase() != expected {
                return None;
            }
        }
    }
    // `bytes` was advanced through the filter, so the difference in length
    // includes any skipped tabs/newlines. The last consumed byte is ':' (ASCII),
    // so the slice below starts on a char boundary.
    let bytes_consumed = left_trimmed.len() - bytes.len();
    let after_colon = &left_trimmed[bytes_consumed..];

    // Trim C0 control or space
    Some(after_colon.trim_end_matches(|ch: char| ch <= ' '))
}
| 180 | |
/// Split `after_colon` at the first comma, provided it occurs before any `#`.
///
/// Returns `(header, body_plus_fragment)` with the comma itself removed, or
/// `None` when a `#` (start of the fragment) or the end of the string is
/// reached before a comma.
fn find_comma_before_fragment(after_colon: &str) -> Option<(&str, &str)> {
    for (i, byte) in after_colon.bytes().enumerate() {
        match byte {
            // ',' is ASCII, so both slices fall on char boundaries.
            b',' => return Some((&after_colon[..i], &after_colon[i + 1..])),
            // A comma inside the fragment does not delimit the header.
            b'#' => return None,
            _ => {}
        }
    }
    None
}
| 192 | |
| 193 | fn parse_header(from_colon_to_comma: &str) -> (mime::Mime, bool) { |
| 194 | // "Strip leading and trailing ASCII whitespace" |
| 195 | // \t, \n, and \r would have been filtered by the URL parser |
| 196 | // \f percent-encoded by the URL parser |
| 197 | // space is the only remaining ASCII whitespace |
| 198 | let trimmed = from_colon_to_comma.trim_matches(|c| matches!(c, ' ' | ' \t' | ' \n' | ' \r' )); |
| 199 | |
| 200 | let without_base64_suffix = remove_base64_suffix(trimmed); |
| 201 | let base64 = without_base64_suffix.is_some(); |
| 202 | let mime_type = without_base64_suffix.unwrap_or(trimmed); |
| 203 | |
| 204 | let mut string = String::new(); |
| 205 | if mime_type.starts_with(';' ) { |
| 206 | string.push_str("text/plain" ) |
| 207 | } |
| 208 | let mut in_query = false; |
| 209 | for byte in mime_type.bytes() { |
| 210 | match byte { |
| 211 | // Ignore ASCII tabs or newlines like the URL parser would |
| 212 | b' \t' | b' \n' | b' \r' => continue, |
| 213 | |
| 214 | // https://url.spec.whatwg.org/#c0-control-percent-encode-set |
| 215 | b' \0' ..=b' \x1F' | b' \x7F' ..=b' \xFF' => percent_encode(byte, &mut string), |
| 216 | |
| 217 | // Bytes other than the C0 percent-encode set that are percent-encoded |
| 218 | // by the URL parser in the query state. |
| 219 | // '#' is also in that list but cannot occur here |
| 220 | // since it indicates the start of the URL’s fragment. |
| 221 | b' ' | b'"' | b'<' | b'>' if in_query => percent_encode(byte, &mut string), |
| 222 | |
| 223 | b'?' => { |
| 224 | in_query = true; |
| 225 | string.push('?' ) |
| 226 | } |
| 227 | |
| 228 | // Printable ASCII |
| 229 | _ => string.push(byte as char), |
| 230 | } |
| 231 | } |
| 232 | |
| 233 | // FIXME: does Mime::from_str match the MIME Sniffing Standard’s parsing algorithm? |
| 234 | // <https://mimesniff.spec.whatwg.org/#parse-a-mime-type> |
| 235 | let mime_type = string.parse().unwrap_or_else(|_| mime::Mime { |
| 236 | type_: String::from("text" ), |
| 237 | subtype: String::from("plain" ), |
| 238 | parameters: vec![(String::from("charset" ), String::from("US-ASCII" ))], |
| 239 | }); |
| 240 | |
| 241 | (mime_type, base64) |
| 242 | } |
| 243 | |
| 244 | /// None: no base64 suffix |
| 245 | #[allow (clippy::skip_while_next)] |
| 246 | fn remove_base64_suffix(s: &str) -> Option<&str> { |
| 247 | let mut bytes: Bytes<'_> = s.bytes(); |
| 248 | { |
| 249 | // Ignore ASCII tabs or newlines like the URL parser would |
| 250 | let iter: impl Iterator = bytes&mut Bytes<'_> |
| 251 | .by_ref() |
| 252 | .filter(|&byte: u8| !matches!(byte, b' \t' | b' \n' | b' \r' )); |
| 253 | |
| 254 | // Search from the end |
| 255 | let mut iter: impl Iterator = iter.rev(); |
| 256 | |
| 257 | require!(iter.next()? == b'4' ); |
| 258 | require!(iter.next()? == b'6' ); |
| 259 | require!(iter.next()?.to_ascii_lowercase() == b'e' ); |
| 260 | require!(iter.next()?.to_ascii_lowercase() == b's' ); |
| 261 | require!(iter.next()?.to_ascii_lowercase() == b'a' ); |
| 262 | require!(iter.next()?.to_ascii_lowercase() == b'b' ); |
| 263 | require!(iter.skip_while(|&byte| byte == b' ' ).next()? == b';' ); |
| 264 | } |
| 265 | Some(&s[..bytes.len()]) |
| 266 | } |
| 267 | |
/// Append the percent-encoding of `byte` to `string`:
/// a `%` followed by two uppercase hexadecimal digits.
fn percent_encode(byte: u8, string: &mut String) {
    const HEX_UPPER: [u8; 16] = *b"0123456789ABCDEF";
    string.push('%');
    // High nibble first, then low nibble; both indices are always in 0..16.
    string.push(HEX_UPPER[(byte >> 4) as usize] as char);
    string.push(HEX_UPPER[(byte & 0x0f) as usize] as char);
}
| 274 | |
| 275 | /// This is <https://url.spec.whatwg.org/#string-percent-decode> while also: |
| 276 | /// |
| 277 | /// * Ignoring ASCII tab or newlines |
| 278 | /// * Stopping at the first '#' (which indicates the start of the fragment) |
| 279 | /// |
| 280 | /// Anything that would have been UTF-8 percent-encoded by the URL parser |
| 281 | /// would be percent-decoded here. |
| 282 | /// We skip that round-trip and pass it through unchanged. |
| 283 | fn decode_without_base64<F, E>( |
| 284 | encoded_body_plus_fragment: &str, |
| 285 | mut write_bytes: F, |
| 286 | ) -> Result<Option<FragmentIdentifier<'_>>, E> |
| 287 | where |
| 288 | F: FnMut(&[u8]) -> Result<(), E>, |
| 289 | { |
| 290 | let bytes = encoded_body_plus_fragment.as_bytes(); |
| 291 | let mut slice_start = 0; |
| 292 | for (i, &byte) in bytes.iter().enumerate() { |
| 293 | // We only need to look for 5 different "special" byte values. |
| 294 | // For everything else we make slices as large as possible, borrowing the input, |
| 295 | // in order to make fewer write_all() calls. |
| 296 | if matches!(byte, b'%' | b'#' | b' \t' | b' \n' | b' \r' ) { |
| 297 | // Write everything (if anything) "non-special" we’ve accumulated |
| 298 | // before this special byte |
| 299 | if i > slice_start { |
| 300 | write_bytes(&bytes[slice_start..i])?; |
| 301 | } |
| 302 | // Then deal with the special byte. |
| 303 | match byte { |
| 304 | b'%' => { |
| 305 | let l = bytes.get(i + 2).and_then(|&b| (b as char).to_digit(16)); |
| 306 | let h = bytes.get(i + 1).and_then(|&b| (b as char).to_digit(16)); |
| 307 | if let (Some(h), Some(l)) = (h, l) { |
| 308 | // '%' followed by two ASCII hex digits |
| 309 | let one_byte = h as u8 * 0x10 + l as u8; |
| 310 | write_bytes(&[one_byte])?; |
| 311 | slice_start = i + 3; |
| 312 | } else { |
| 313 | // Do nothing. Leave slice_start unchanged. |
| 314 | // The % sign will be part of the next slice. |
| 315 | } |
| 316 | } |
| 317 | |
| 318 | b'#' => { |
| 319 | let fragment_start = i + 1; |
| 320 | let fragment = &encoded_body_plus_fragment[fragment_start..]; |
| 321 | return Ok(Some(FragmentIdentifier(fragment))); |
| 322 | } |
| 323 | |
| 324 | // Ignore over '\t' | '\n' | '\r' |
| 325 | _ => slice_start = i + 1, |
| 326 | } |
| 327 | } |
| 328 | } |
| 329 | write_bytes(&bytes[slice_start..])?; |
| 330 | Ok(None) |
| 331 | } |
| 332 | |
| 333 | /// `decode_without_base64()` composed with |
| 334 | /// <https://infra.spec.whatwg.org/#isomorphic-decode> composed with |
| 335 | /// <https://infra.spec.whatwg.org/#forgiving-base64-decode>. |
| 336 | fn decode_with_base64<F, E>( |
| 337 | encoded_body_plus_fragment: &str, |
| 338 | write_bytes: F, |
| 339 | ) -> Result<Option<FragmentIdentifier<'_>>, forgiving_base64::DecodeError<E>> |
| 340 | where |
| 341 | F: FnMut(&[u8]) -> Result<(), E>, |
| 342 | { |
| 343 | let mut decoder: Decoder = forgiving_base64::Decoder::new(write_bytes); |
| 344 | let fragment: Option> = decode_without_base64(encoded_body_plus_fragment, |bytes: &[u8]| decoder.feed(input:bytes))?; |
| 345 | decoder.finish()?; |
| 346 | Ok(fragment) |
| 347 | } |
| 348 | |