//! Processing of `data:` URLs according to the Fetch Standard:
//! <https://fetch.spec.whatwg.org/#data-urls>
//! but starting from a string rather than a parsed URL to avoid extra copies.
//!
//! ```rust
//! use data_url::{DataUrl, mime};
//!
//! let url = DataUrl::process("data:,Hello%20World!").unwrap();
//! let (body, fragment) = url.decode_to_vec().unwrap();
//!
//! assert_eq!(url.mime_type().type_, "text");
//! assert_eq!(url.mime_type().subtype, "plain");
//! assert_eq!(url.mime_type().get_parameter("charset"), Some("US-ASCII"));
//! assert_eq!(body, b"Hello World!");
//! assert!(fragment.is_none());
//! ```
17 | #![no_std ] |
18 | |
19 | // For forwards compatibility |
20 | #[cfg (feature = "std" )] |
21 | extern crate std; |
22 | |
23 | #[macro_use ] |
24 | extern crate alloc; |
25 | |
26 | #[cfg (not(feature = "alloc" ))] |
27 | compile_error!("the `alloc` feature must be enabled" ); |
28 | |
29 | use alloc::{string::String, vec::Vec}; |
30 | use core::fmt; |
31 | |
/// Guard for functions that return `Option`: evaluates the boolean `$condition`
/// and makes the *enclosing function* return `None` when it is false.
/// Expands to nothing observable when the condition holds.
macro_rules! require {
    ($condition: expr) => {
        match $condition {
            true => {}
            false => return None,
        }
    };
}
39 | |
40 | pub mod forgiving_base64; |
41 | pub mod mime; |
42 | |
/// A `data:` URL whose header has been parsed but whose body is still encoded.
///
/// Created by `DataUrl::process`; the body (and fragment, if any) is borrowed
/// from the input string for the lifetime `'a`.
pub struct DataUrl<'a> {
    /// MIME type derived from the part between `data:` and the first comma.
    mime_type: mime::Mime,
    /// `true` when the header ended with a `;base64` marker,
    /// in which case the body is base64-decoded in addition to percent-decoding.
    base64: bool,
    /// Everything after the first comma: the encoded body,
    /// followed by `#` and the fragment when the URL has one.
    encoded_body_plus_fragment: &'a str,
}
48 | |
/// Reasons why `DataUrl::process` can reject its input.
#[derive(Debug)]
pub enum DataUrlError {
    /// The input does not start with the `data:` scheme.
    NotADataUrl,
    /// No comma was found before the fragment (or the end of the input),
    /// so the header cannot be separated from the body.
    NoComma,
}
54 | |
55 | impl fmt::Display for DataUrlError { |
56 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
57 | match self { |
58 | Self::NotADataUrl => write!(f, "not a valid data url" ), |
59 | Self::NoComma => write!( |
60 | f, |
61 | "data url is missing comma delimiting attributes and body" |
62 | ), |
63 | } |
64 | } |
65 | } |
66 | |
// The `std::error::Error` impl is only compiled when the `std` feature is enabled;
// without it the crate is `no_std` and `std` is not linked.
#[cfg(feature = "std")]
impl std::error::Error for DataUrlError {}
69 | |
70 | impl<'a> DataUrl<'a> { |
71 | /// <https://fetch.spec.whatwg.org/#data-url-processor> |
72 | /// but starting from a string rather than a parsed `Url`, to avoid extra string copies. |
73 | pub fn process(input: &'a str) -> Result<Self, DataUrlError> { |
74 | use crate::DataUrlError::*; |
75 | |
76 | let after_colon = pretend_parse_data_url(input).ok_or(NotADataUrl)?; |
77 | |
78 | let (from_colon_to_comma, encoded_body_plus_fragment) = |
79 | find_comma_before_fragment(after_colon).ok_or(NoComma)?; |
80 | |
81 | let (mime_type, base64) = parse_header(from_colon_to_comma); |
82 | |
83 | Ok(DataUrl { |
84 | mime_type, |
85 | base64, |
86 | encoded_body_plus_fragment, |
87 | }) |
88 | } |
89 | |
90 | pub fn mime_type(&self) -> &mime::Mime { |
91 | &self.mime_type |
92 | } |
93 | |
94 | /// Streaming-decode the data URL’s body to `write_body_bytes`, |
95 | /// and return the URL’s fragment identifier if it has one. |
96 | pub fn decode<F, E>( |
97 | &self, |
98 | write_body_bytes: F, |
99 | ) -> Result<Option<FragmentIdentifier<'a>>, forgiving_base64::DecodeError<E>> |
100 | where |
101 | F: FnMut(&[u8]) -> Result<(), E>, |
102 | { |
103 | if self.base64 { |
104 | decode_with_base64(self.encoded_body_plus_fragment, write_body_bytes) |
105 | } else { |
106 | decode_without_base64(self.encoded_body_plus_fragment, write_body_bytes) |
107 | .map_err(forgiving_base64::DecodeError::WriteError) |
108 | } |
109 | } |
110 | |
111 | /// Return the decoded body, and the URL’s fragment identifier if it has one. |
112 | pub fn decode_to_vec( |
113 | &self, |
114 | ) -> Result<(Vec<u8>, Option<FragmentIdentifier<'a>>), forgiving_base64::InvalidBase64> { |
115 | let mut body = Vec::new(); |
116 | let fragment = self.decode(|bytes| { |
117 | body.extend_from_slice(bytes); |
118 | Ok(()) |
119 | })?; |
120 | Ok((body, fragment)) |
121 | } |
122 | } |
123 | |
124 | /// The URL’s fragment identifier (after `#`) |
125 | pub struct FragmentIdentifier<'a>(&'a str); |
126 | |
127 | impl<'a> FragmentIdentifier<'a> { |
128 | /// Like in a parsed URL |
129 | pub fn to_percent_encoded(&self) -> String { |
130 | let mut string: String = String::new(); |
131 | for byte: u8 in self.0.bytes() { |
132 | match byte { |
133 | // Ignore ASCII tabs or newlines like the URL parser would |
134 | b' \t' | b' \n' | b' \r' => continue, |
135 | // https://url.spec.whatwg.org/#fragment-percent-encode-set |
136 | b' \0' ..=b' ' | b'"' | b'<' | b'>' | b'`' | b' \x7F' ..=b' \xFF' => { |
137 | percent_encode(byte, &mut string) |
138 | } |
139 | // Printable ASCII |
140 | _ => string.push(ch:byte as char), |
141 | } |
142 | } |
143 | string |
144 | } |
145 | } |
146 | |
/// Similar to <https://url.spec.whatwg.org/#concept-basic-url-parser>
/// followed by <https://url.spec.whatwg.org/#concept-url-serializer>
///
/// * `None`: not a data URL.
///
/// * `Some(s)`: sort of the result of serialization, except:
///
///   - `data:` prefix removed
///   - The fragment is included
///   - Other components are **not** UTF-8 percent-encoded
///   - ASCII tabs and newlines in the middle are **not** removed
fn pretend_parse_data_url(input: &str) -> Option<&str> {
    /// C0 control or space
    fn is_c0_control_or_space(ch: char) -> bool {
        ch <= ' '
    }

    let candidate = input.trim_start_matches(is_c0_control_or_space);

    let mut remaining = candidate.bytes();
    {
        // Ignore ASCII tabs or newlines like the URL parser would
        let mut significant = remaining
            .by_ref()
            .filter(|&b| !matches!(b, b'\t' | b'\n' | b'\r'));

        // The scheme is matched case-insensitively...
        for &expected in b"data" {
            if significant.next()?.to_ascii_lowercase() != expected {
                return None;
            }
        }
        // ...and must be followed by a colon.
        if significant.next()? != b':' {
            return None;
        }
    }

    // Whatever the iterator consumed (scheme, colon and any interleaved
    // tabs/newlines) is ASCII, so this offset is a valid char boundary.
    let scheme_len = candidate.len() - remaining.len();
    Some(candidate[scheme_len..].trim_end_matches(is_c0_control_or_space))
}
180 | |
/// Split `after_colon` at its first comma into `(header, body_plus_fragment)`.
///
/// Returns `None` when a `#` (start of the fragment) comes before any comma,
/// or when there is no comma at all.
fn find_comma_before_fragment(after_colon: &str) -> Option<(&str, &str)> {
    // Whichever of ',' and '#' comes first decides the outcome.
    let delimiter = after_colon.find(|ch: char| ch == ',' || ch == '#')?;
    let (header, rest) = after_colon.split_at(delimiter);
    if rest.starts_with('#') {
        return None;
    }
    // Skip the one-byte comma itself.
    Some((header, &rest[1..]))
}
192 | |
193 | fn parse_header(from_colon_to_comma: &str) -> (mime::Mime, bool) { |
194 | // "Strip leading and trailing ASCII whitespace" |
195 | // \t, \n, and \r would have been filtered by the URL parser |
196 | // \f percent-encoded by the URL parser |
197 | // space is the only remaining ASCII whitespace |
198 | let trimmed = from_colon_to_comma.trim_matches(|c| matches!(c, ' ' | ' \t' | ' \n' | ' \r' )); |
199 | |
200 | let without_base64_suffix = remove_base64_suffix(trimmed); |
201 | let base64 = without_base64_suffix.is_some(); |
202 | let mime_type = without_base64_suffix.unwrap_or(trimmed); |
203 | |
204 | let mut string = String::new(); |
205 | if mime_type.starts_with(';' ) { |
206 | string.push_str("text/plain" ) |
207 | } |
208 | let mut in_query = false; |
209 | for byte in mime_type.bytes() { |
210 | match byte { |
211 | // Ignore ASCII tabs or newlines like the URL parser would |
212 | b' \t' | b' \n' | b' \r' => continue, |
213 | |
214 | // https://url.spec.whatwg.org/#c0-control-percent-encode-set |
215 | b' \0' ..=b' \x1F' | b' \x7F' ..=b' \xFF' => percent_encode(byte, &mut string), |
216 | |
217 | // Bytes other than the C0 percent-encode set that are percent-encoded |
218 | // by the URL parser in the query state. |
219 | // '#' is also in that list but cannot occur here |
220 | // since it indicates the start of the URL’s fragment. |
221 | b' ' | b'"' | b'<' | b'>' if in_query => percent_encode(byte, &mut string), |
222 | |
223 | b'?' => { |
224 | in_query = true; |
225 | string.push('?' ) |
226 | } |
227 | |
228 | // Printable ASCII |
229 | _ => string.push(byte as char), |
230 | } |
231 | } |
232 | |
233 | // FIXME: does Mime::from_str match the MIME Sniffing Standard’s parsing algorithm? |
234 | // <https://mimesniff.spec.whatwg.org/#parse-a-mime-type> |
235 | let mime_type = string.parse().unwrap_or_else(|_| mime::Mime { |
236 | type_: String::from("text" ), |
237 | subtype: String::from("plain" ), |
238 | parameters: vec![(String::from("charset" ), String::from("US-ASCII" ))], |
239 | }); |
240 | |
241 | (mime_type, base64) |
242 | } |
243 | |
/// Strip a trailing `;base64` marker from `s`.
///
/// The marker is matched from the end, case-insensitively, ignoring ASCII tabs and
/// newlines; spaces are allowed between the `;` and `base64`.
///
/// * `Some(prefix)`: the marker was present; `prefix` is everything before its `;`.
/// * `None`: no base64 suffix
fn remove_base64_suffix(s: &str) -> Option<&str> {
    let mut remaining = s.bytes();
    {
        // Ignore ASCII tabs or newlines like the URL parser would,
        // and search from the end.
        let mut from_end = remaining
            .by_ref()
            .filter(|&b| !matches!(b, b'\t' | b'\n' | b'\r'))
            .rev();

        // "base64" read backwards. Lowercasing leaves the digits untouched,
        // so only the letters end up being compared case-insensitively.
        for &expected in b"46esab" {
            if from_end.next()?.to_ascii_lowercase() != expected {
                return None;
            }
        }

        // Skip any spaces, then the very next byte must be the `;` separator.
        if from_end.find(|&b| b != b' ')? != b';' {
            return None;
        }
    }
    // Only the tail was consumed, so what is left in `remaining` is the prefix length.
    Some(&s[..remaining.len()])
}
267 | |
/// Append `byte` to `string` as `%XX`, using two uppercase hexadecimal digits.
fn percent_encode(byte: u8, string: &mut String) {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";
    let high_nibble = usize::from(byte >> 4);
    let low_nibble = usize::from(byte & 0x0F);
    string.push('%');
    string.push(char::from(HEX_UPPER[high_nibble]));
    string.push(char::from(HEX_UPPER[low_nibble]));
}
274 | |
275 | /// This is <https://url.spec.whatwg.org/#string-percent-decode> while also: |
276 | /// |
277 | /// * Ignoring ASCII tab or newlines |
278 | /// * Stopping at the first '#' (which indicates the start of the fragment) |
279 | /// |
280 | /// Anything that would have been UTF-8 percent-encoded by the URL parser |
281 | /// would be percent-decoded here. |
282 | /// We skip that round-trip and pass it through unchanged. |
283 | fn decode_without_base64<F, E>( |
284 | encoded_body_plus_fragment: &str, |
285 | mut write_bytes: F, |
286 | ) -> Result<Option<FragmentIdentifier<'_>>, E> |
287 | where |
288 | F: FnMut(&[u8]) -> Result<(), E>, |
289 | { |
290 | let bytes = encoded_body_plus_fragment.as_bytes(); |
291 | let mut slice_start = 0; |
292 | for (i, &byte) in bytes.iter().enumerate() { |
293 | // We only need to look for 5 different "special" byte values. |
294 | // For everything else we make slices as large as possible, borrowing the input, |
295 | // in order to make fewer write_all() calls. |
296 | if matches!(byte, b'%' | b'#' | b' \t' | b' \n' | b' \r' ) { |
297 | // Write everything (if anything) "non-special" we’ve accumulated |
298 | // before this special byte |
299 | if i > slice_start { |
300 | write_bytes(&bytes[slice_start..i])?; |
301 | } |
302 | // Then deal with the special byte. |
303 | match byte { |
304 | b'%' => { |
305 | let l = bytes.get(i + 2).and_then(|&b| (b as char).to_digit(16)); |
306 | let h = bytes.get(i + 1).and_then(|&b| (b as char).to_digit(16)); |
307 | if let (Some(h), Some(l)) = (h, l) { |
308 | // '%' followed by two ASCII hex digits |
309 | let one_byte = h as u8 * 0x10 + l as u8; |
310 | write_bytes(&[one_byte])?; |
311 | slice_start = i + 3; |
312 | } else { |
313 | // Do nothing. Leave slice_start unchanged. |
314 | // The % sign will be part of the next slice. |
315 | } |
316 | } |
317 | |
318 | b'#' => { |
319 | let fragment_start = i + 1; |
320 | let fragment = &encoded_body_plus_fragment[fragment_start..]; |
321 | return Ok(Some(FragmentIdentifier(fragment))); |
322 | } |
323 | |
324 | // Ignore over '\t' | '\n' | '\r' |
325 | _ => slice_start = i + 1, |
326 | } |
327 | } |
328 | } |
329 | write_bytes(&bytes[slice_start..])?; |
330 | Ok(None) |
331 | } |
332 | |
333 | /// `decode_without_base64()` composed with |
334 | /// <https://infra.spec.whatwg.org/#isomorphic-decode> composed with |
335 | /// <https://infra.spec.whatwg.org/#forgiving-base64-decode>. |
336 | fn decode_with_base64<F, E>( |
337 | encoded_body_plus_fragment: &str, |
338 | write_bytes: F, |
339 | ) -> Result<Option<FragmentIdentifier<'_>>, forgiving_base64::DecodeError<E>> |
340 | where |
341 | F: FnMut(&[u8]) -> Result<(), E>, |
342 | { |
343 | let mut decoder: Decoder = forgiving_base64::Decoder::new(write_bytes); |
344 | let fragment: Option> = decode_without_base64(encoded_body_plus_fragment, |bytes: &[u8]| decoder.feed(input:bytes))?; |
345 | decoder.finish()?; |
346 | Ok(fragment) |
347 | } |
348 | |