1//! Processing of `data:` URLs according to the Fetch Standard:
2//! <https://fetch.spec.whatwg.org/#data-urls>
3//! but starting from a string rather than a parsed URL to avoid extra copies.
4//!
5//! ```rust
6//! use data_url::{DataUrl, mime};
7//!
8//! let url = DataUrl::process("data:,Hello%20World!").unwrap();
9//! let (body, fragment) = url.decode_to_vec().unwrap();
10//!
11//! assert_eq!(url.mime_type().type_, "text");
12//! assert_eq!(url.mime_type().subtype, "plain");
13//! assert_eq!(url.mime_type().get_parameter("charset"), Some("US-ASCII"));
14//! assert_eq!(body, b"Hello World!");
15//! assert!(fragment.is_none());
16//! ```
17#![no_std]
18
19// For forwards compatibility
20#[cfg(feature = "std")]
21extern crate std;
22
23#[macro_use]
24extern crate alloc;
25
26#[cfg(not(feature = "alloc"))]
27compile_error!("the `alloc` feature must be enabled");
28
29use alloc::{string::String, vec::Vec};
30use core::fmt;
31
/// Leave the enclosing function with `None` unless the given boolean
/// expression evaluates to `true`.
///
/// Only usable inside functions that return `Option<_>`.
macro_rules! require {
    ($check:expr) => {{
        let holds: bool = $check;
        if !holds {
            return None;
        }
    }};
}
39
40pub mod forgiving_base64;
41pub mod mime;
42
/// A `data:` URL whose header (MIME type and optional `;base64` marker)
/// has been parsed by [`DataUrl::process`].
///
/// The body is kept as a borrowed, still-encoded slice of the input and is
/// only decoded when `decode` or `decode_to_vec` is called.
pub struct DataUrl<'a> {
    /// MIME type produced by `parse_header` from the text between `data:` and the comma.
    mime_type: mime::Mime,
    /// Whether the header ended with a `;base64` suffix; selects the decoder used by `decode`.
    base64: bool,
    /// Everything after the first comma, still encoded, including any `#fragment`.
    encoded_body_plus_fragment: &'a str,
}
48
/// Reasons why [`DataUrl::process`] can reject its input.
#[derive(Debug)]
pub enum DataUrlError {
    /// The input does not start with the `data:` scheme.
    NotADataUrl,
    /// No `,` separates the header from the body before any `#` fragment.
    NoComma,
}

impl fmt::Display for DataUrlError {
    /// Write a short, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Self::NotADataUrl => "not a valid data url",
            Self::NoComma => "data url is missing comma delimiting attributes and body",
        };
        f.write_str(message)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for DataUrlError {}
69
70impl<'a> DataUrl<'a> {
71 /// <https://fetch.spec.whatwg.org/#data-url-processor>
72 /// but starting from a string rather than a parsed `Url`, to avoid extra string copies.
73 pub fn process(input: &'a str) -> Result<Self, DataUrlError> {
74 use crate::DataUrlError::*;
75
76 let after_colon = pretend_parse_data_url(input).ok_or(NotADataUrl)?;
77
78 let (from_colon_to_comma, encoded_body_plus_fragment) =
79 find_comma_before_fragment(after_colon).ok_or(NoComma)?;
80
81 let (mime_type, base64) = parse_header(from_colon_to_comma);
82
83 Ok(DataUrl {
84 mime_type,
85 base64,
86 encoded_body_plus_fragment,
87 })
88 }
89
90 pub fn mime_type(&self) -> &mime::Mime {
91 &self.mime_type
92 }
93
94 /// Streaming-decode the data URL’s body to `write_body_bytes`,
95 /// and return the URL’s fragment identifier if it has one.
96 pub fn decode<F, E>(
97 &self,
98 write_body_bytes: F,
99 ) -> Result<Option<FragmentIdentifier<'a>>, forgiving_base64::DecodeError<E>>
100 where
101 F: FnMut(&[u8]) -> Result<(), E>,
102 {
103 if self.base64 {
104 decode_with_base64(self.encoded_body_plus_fragment, write_body_bytes)
105 } else {
106 decode_without_base64(self.encoded_body_plus_fragment, write_body_bytes)
107 .map_err(forgiving_base64::DecodeError::WriteError)
108 }
109 }
110
111 /// Return the decoded body, and the URL’s fragment identifier if it has one.
112 pub fn decode_to_vec(
113 &self,
114 ) -> Result<(Vec<u8>, Option<FragmentIdentifier<'a>>), forgiving_base64::InvalidBase64> {
115 let mut body = Vec::new();
116 let fragment = self.decode(|bytes| {
117 body.extend_from_slice(bytes);
118 Ok(())
119 })?;
120 Ok((body, fragment))
121 }
122}
123
124/// The URL’s fragment identifier (after `#`)
125pub struct FragmentIdentifier<'a>(&'a str);
126
127impl<'a> FragmentIdentifier<'a> {
128 /// Like in a parsed URL
129 pub fn to_percent_encoded(&self) -> String {
130 let mut string: String = String::new();
131 for byte: u8 in self.0.bytes() {
132 match byte {
133 // Ignore ASCII tabs or newlines like the URL parser would
134 b'\t' | b'\n' | b'\r' => continue,
135 // https://url.spec.whatwg.org/#fragment-percent-encode-set
136 b'\0'..=b' ' | b'"' | b'<' | b'>' | b'`' | b'\x7F'..=b'\xFF' => {
137 percent_encode(byte, &mut string)
138 }
139 // Printable ASCII
140 _ => string.push(ch:byte as char),
141 }
142 }
143 string
144 }
145}
146
/// Similar to <https://url.spec.whatwg.org/#concept-basic-url-parser>
/// followed by <https://url.spec.whatwg.org/#concept-url-serializer>
///
/// * `None`: not a data URL.
///
/// * `Some(s)`: sort of the result of serialization, except:
///
///   - `data:` prefix removed
///   - The fragment is included
///   - Other components are **not** UTF-8 percent-encoded
///   - ASCII tabs and newlines in the middle are **not** removed
fn pretend_parse_data_url(input: &str) -> Option<&str> {
    // Trim C0 control or space
    let left_trimmed = input.trim_start_matches(|ch: char| ch <= ' ');

    let mut bytes = left_trimmed.bytes();
    {
        // Ignore ASCII tabs or newlines like the URL parser would
        let mut iter = bytes
            .by_ref()
            .filter(|&byte| !matches!(byte, b'\t' | b'\n' | b'\r'));

        // The scheme is matched ASCII case-insensitively...
        for &expected in b"data" {
            if iter.next()?.to_ascii_lowercase() != expected {
                return None;
            }
        }
        // ...and must be followed by a literal colon.
        if iter.next()? != b':' {
            return None;
        }
    }
    // `bytes` was advanced through `by_ref()`, so what it has left is exactly
    // the input after the colon. Every consumed byte is ASCII, which keeps
    // the slice below on a character boundary.
    let bytes_consumed = left_trimmed.len() - bytes.len();
    let after_colon = &left_trimmed[bytes_consumed..];

    // Trim C0 control or space
    Some(after_colon.trim_end_matches(|ch: char| ch <= ' '))
}
180
/// Split `after_colon` at its first `,`, returning `(header, body_plus_fragment)`
/// with the comma itself dropped.
///
/// Returns `None` when there is no comma, or when a `#` (the start of the
/// fragment) comes before the first comma.
fn find_comma_before_fragment(after_colon: &str) -> Option<(&str, &str)> {
    for (i, byte) in after_colon.bytes().enumerate() {
        match byte {
            // `,` is ASCII, so slicing around it stays on character boundaries.
            b',' => return Some((&after_colon[..i], &after_colon[i + 1..])),
            // A comma inside the fragment does not count.
            b'#' => break,
            _ => {}
        }
    }
    None
}
192
193fn parse_header(from_colon_to_comma: &str) -> (mime::Mime, bool) {
194 // "Strip leading and trailing ASCII whitespace"
195 // \t, \n, and \r would have been filtered by the URL parser
196 // \f percent-encoded by the URL parser
197 // space is the only remaining ASCII whitespace
198 let trimmed = from_colon_to_comma.trim_matches(|c| matches!(c, ' ' | '\t' | '\n' | '\r'));
199
200 let without_base64_suffix = remove_base64_suffix(trimmed);
201 let base64 = without_base64_suffix.is_some();
202 let mime_type = without_base64_suffix.unwrap_or(trimmed);
203
204 let mut string = String::new();
205 if mime_type.starts_with(';') {
206 string.push_str("text/plain")
207 }
208 let mut in_query = false;
209 for byte in mime_type.bytes() {
210 match byte {
211 // Ignore ASCII tabs or newlines like the URL parser would
212 b'\t' | b'\n' | b'\r' => continue,
213
214 // https://url.spec.whatwg.org/#c0-control-percent-encode-set
215 b'\0'..=b'\x1F' | b'\x7F'..=b'\xFF' => percent_encode(byte, &mut string),
216
217 // Bytes other than the C0 percent-encode set that are percent-encoded
218 // by the URL parser in the query state.
219 // '#' is also in that list but cannot occur here
220 // since it indicates the start of the URL’s fragment.
221 b' ' | b'"' | b'<' | b'>' if in_query => percent_encode(byte, &mut string),
222
223 b'?' => {
224 in_query = true;
225 string.push('?')
226 }
227
228 // Printable ASCII
229 _ => string.push(byte as char),
230 }
231 }
232
233 // FIXME: does Mime::from_str match the MIME Sniffing Standard’s parsing algorithm?
234 // <https://mimesniff.spec.whatwg.org/#parse-a-mime-type>
235 let mime_type = string.parse().unwrap_or_else(|_| mime::Mime {
236 type_: String::from("text"),
237 subtype: String::from("plain"),
238 parameters: vec![(String::from("charset"), String::from("US-ASCII"))],
239 });
240
241 (mime_type, base64)
242}
243
244/// None: no base64 suffix
245#[allow(clippy::skip_while_next)]
246fn remove_base64_suffix(s: &str) -> Option<&str> {
247 let mut bytes: Bytes<'_> = s.bytes();
248 {
249 // Ignore ASCII tabs or newlines like the URL parser would
250 let iter: impl Iterator = bytes&mut Bytes<'_>
251 .by_ref()
252 .filter(|&byte: u8| !matches!(byte, b'\t' | b'\n' | b'\r'));
253
254 // Search from the end
255 let mut iter: impl Iterator = iter.rev();
256
257 require!(iter.next()? == b'4');
258 require!(iter.next()? == b'6');
259 require!(iter.next()?.to_ascii_lowercase() == b'e');
260 require!(iter.next()?.to_ascii_lowercase() == b's');
261 require!(iter.next()?.to_ascii_lowercase() == b'a');
262 require!(iter.next()?.to_ascii_lowercase() == b'b');
263 require!(iter.skip_while(|&byte| byte == b' ').next()? == b';');
264 }
265 Some(&s[..bytes.len()])
266}
267
/// Append the percent-encoded form of `byte` (`%` followed by two uppercase
/// hexadecimal digits) to `string`.
fn percent_encode(byte: u8, string: &mut String) {
    const HEX_UPPER: [u8; 16] = *b"0123456789ABCDEF";
    string.push('%');
    // High nibble first, then low nibble; both indices are always below 16.
    string.push(HEX_UPPER[(byte >> 4) as usize] as char);
    string.push(HEX_UPPER[(byte & 0x0f) as usize] as char);
}
274
275/// This is <https://url.spec.whatwg.org/#string-percent-decode> while also:
276///
277/// * Ignoring ASCII tab or newlines
278/// * Stopping at the first '#' (which indicates the start of the fragment)
279///
280/// Anything that would have been UTF-8 percent-encoded by the URL parser
281/// would be percent-decoded here.
282/// We skip that round-trip and pass it through unchanged.
283fn decode_without_base64<F, E>(
284 encoded_body_plus_fragment: &str,
285 mut write_bytes: F,
286) -> Result<Option<FragmentIdentifier<'_>>, E>
287where
288 F: FnMut(&[u8]) -> Result<(), E>,
289{
290 let bytes = encoded_body_plus_fragment.as_bytes();
291 let mut slice_start = 0;
292 for (i, &byte) in bytes.iter().enumerate() {
293 // We only need to look for 5 different "special" byte values.
294 // For everything else we make slices as large as possible, borrowing the input,
295 // in order to make fewer write_all() calls.
296 if matches!(byte, b'%' | b'#' | b'\t' | b'\n' | b'\r') {
297 // Write everything (if anything) "non-special" we’ve accumulated
298 // before this special byte
299 if i > slice_start {
300 write_bytes(&bytes[slice_start..i])?;
301 }
302 // Then deal with the special byte.
303 match byte {
304 b'%' => {
305 let l = bytes.get(i + 2).and_then(|&b| (b as char).to_digit(16));
306 let h = bytes.get(i + 1).and_then(|&b| (b as char).to_digit(16));
307 if let (Some(h), Some(l)) = (h, l) {
308 // '%' followed by two ASCII hex digits
309 let one_byte = h as u8 * 0x10 + l as u8;
310 write_bytes(&[one_byte])?;
311 slice_start = i + 3;
312 } else {
313 // Do nothing. Leave slice_start unchanged.
314 // The % sign will be part of the next slice.
315 }
316 }
317
318 b'#' => {
319 let fragment_start = i + 1;
320 let fragment = &encoded_body_plus_fragment[fragment_start..];
321 return Ok(Some(FragmentIdentifier(fragment)));
322 }
323
324 // Ignore over '\t' | '\n' | '\r'
325 _ => slice_start = i + 1,
326 }
327 }
328 }
329 write_bytes(&bytes[slice_start..])?;
330 Ok(None)
331}
332
333/// `decode_without_base64()` composed with
334/// <https://infra.spec.whatwg.org/#isomorphic-decode> composed with
335/// <https://infra.spec.whatwg.org/#forgiving-base64-decode>.
336fn decode_with_base64<F, E>(
337 encoded_body_plus_fragment: &str,
338 write_bytes: F,
339) -> Result<Option<FragmentIdentifier<'_>>, forgiving_base64::DecodeError<E>>
340where
341 F: FnMut(&[u8]) -> Result<(), E>,
342{
343 let mut decoder: Decoder = forgiving_base64::Decoder::new(write_bytes);
344 let fragment: Option> = decode_without_base64(encoded_body_plus_fragment, |bytes: &[u8]| decoder.feed(input:bytes))?;
345 decoder.finish()?;
346 Ok(fragment)
347}
348