lib.rs source code [crates/data_url/src/lib.rs]

1	//! Processing of `data:` URLs according to the Fetch Standard:
2	//! <https://fetch.spec.whatwg.org/#data-urls>
3	//! but starting from a string rather than a parsed URL to avoid extra copies.
4	//!
5	//! ```rust
6	//! use data_url::{DataUrl, mime};
7	//!
8	//! let url = DataUrl::process("data:,Hello%20World!").unwrap();
9	//! let (body, fragment) = url.decode_to_vec().unwrap();
10	//!
11	//! assert_eq!(url.mime_type().type_, "text");
12	//! assert_eq!(url.mime_type().subtype, "plain");
13	//! assert_eq!(url.mime_type().get_parameter("charset"), Some("US-ASCII"));
14	//! assert_eq!(body, b"Hello World!");
15	//! assert!(fragment.is_none());
16	//! ```
17	#![no_std]
18
19	// For forwards compatibility
20	#[cfg(feature = "std")]
21	extern crate std;
22
23	#[macro_use]
24	extern crate alloc;
25
26	#[cfg(not(feature = "alloc"))]
27	compile_error!("the `alloc` feature must be enabled");
28
29	use alloc::{string::String, vec::Vec};
30	use core::fmt;
31
/// Guard for functions returning `Option`: evaluates `$condition` and makes the
/// enclosing function return `None` when it is false.
macro_rules! require {
    ($condition:expr) => {
        if $condition {
            // Condition holds: keep going in the calling function.
        } else {
            return None;
        }
    };
}
39
40	pub mod forgiving_base64;
41	pub mod mime;
42
/// A `data:` URL split into its header information and its still-encoded body.
///
/// Values are built by [`DataUrl::process`]; the body is borrowed from the
/// input string rather than copied.
pub struct DataUrl<'a> {
    /// MIME type derived from the text between `data:` and the first comma.
    mime_type: mime::Mime,
    /// Whether that header text ended with a `;base64` suffix,
    /// which selects the base64 path in `decode`.
    base64: bool,
    /// Everything after the first comma: the encoded body,
    /// followed by `#` and the fragment if there is one.
    encoded_body_plus_fragment: &'a str,
}
48
/// Reasons for which [`DataUrl::process`] rejects its input.
#[derive(Debug)]
pub enum DataUrlError {
    /// The input does not start with the `data:` scheme.
    NotADataUrl,
    /// No `,` separating the header from the body was found
    /// before the fragment (`#`) or the end of the input.
    NoComma,
}
54
55	impl fmt::Display for DataUrlError {
56	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
57	match self {
58	Self::NotADataUrl => write!(f, "not a valid data url"),
59	Self::NoComma => write!(
60	f,
61	"data url is missing comma delimiting attributes and body"
62	),
63	}
64	}
65	}
66
// The crate is `no_std`, so the `std::error::Error` impl is only compiled
// when the `std` feature is enabled.
#[cfg(feature = "std")]
impl std::error::Error for DataUrlError {}
69
70	impl<'a> DataUrl<'a> {
71	/// <https://fetch.spec.whatwg.org/#data-url-processor>
72	/// but starting from a string rather than a parsed `Url`, to avoid extra string copies.
73	pub fn process(input: &'a str) -> Result<Self, DataUrlError> {
74	use crate::DataUrlError::*;
75
76	let after_colon = pretend_parse_data_url(input).ok_or(NotADataUrl)?;
77
78	let (from_colon_to_comma, encoded_body_plus_fragment) =
79	find_comma_before_fragment(after_colon).ok_or(NoComma)?;
80
81	let (mime_type, base64) = parse_header(from_colon_to_comma);
82
83	Ok(DataUrl {
84	mime_type,
85	base64,
86	encoded_body_plus_fragment,
87	})
88	}
89
90	pub fn mime_type(&self) -> &mime::Mime {
91	&self.mime_type
92	}
93
94	/// Streaming-decode the data URL’s body to `write_body_bytes`,
95	/// and return the URL’s fragment identifier if it has one.
96	pub fn decode<F, E>(
97	&self,
98	write_body_bytes: F,
99	) -> Result<Option<FragmentIdentifier<'a>>, forgiving_base64::DecodeError<E>>
100	where
101	F: FnMut(&[u8]) -> Result<(), E>,
102	{
103	if self.base64 {
104	decode_with_base64(self.encoded_body_plus_fragment, write_body_bytes)
105	} else {
106	decode_without_base64(self.encoded_body_plus_fragment, write_body_bytes)
107	.map_err(forgiving_base64::DecodeError::WriteError)
108	}
109	}
110
111	/// Return the decoded body, and the URL’s fragment identifier if it has one.
112	pub fn decode_to_vec(
113	&self,
114	) -> Result<(Vec<u8>, Option<FragmentIdentifier<'a>>), forgiving_base64::InvalidBase64> {
115	let mut body = Vec::new();
116	let fragment = self.decode(\|bytes\| {
117	body.extend_from_slice(bytes);
118	Ok(())
119	})?;
120	Ok((body, fragment))
121	}
122	}
123
/// The URL’s fragment identifier (after `#`).
///
/// Wraps the raw text borrowed from the input, exactly as it appeared there.
pub struct FragmentIdentifier<'a>(&'a str);
126
127	impl<'a> FragmentIdentifier<'a> {
128	/// Like in a parsed URL
129	pub fn to_percent_encoded(&self) -> String {
130	let mut string: String = String::new();
131	for byte: u8 in self.0.bytes() {
132	match byte {
133	// Ignore ASCII tabs or newlines like the URL parser would
134	b'`\t`' \| b'`\n`' \| b'`\r`' => continue,
135	// https://url.spec.whatwg.org/#fragment-percent-encode-set
136	b'`\0`'..=b' ' \| b'"' \| b'<' \| b'>' \| b'`' \| b'`\x7F`'..=b'`\xFF`' => {
137	percent_encode(byte, &mut string)
138	}
139	// Printable ASCII
140	_ => string.push(ch:byte as char),
141	}
142	}
143	string
144	}
145	}
146
/// Similar to <https://url.spec.whatwg.org/#concept-basic-url-parser>
/// followed by <https://url.spec.whatwg.org/#concept-url-serializer>
///
/// * `None`: not a data URL.
///
/// * `Some(s)`: sort of the result of serialization, except:
///
///   - `data:` prefix removed
///   - The fragment is included
///   - Other components are **not** UTF-8 percent-encoded
///   - ASCII tabs and newlines in the middle are **not** removed
fn pretend_parse_data_url(input: &str) -> Option<&str> {
    // "C0 control or space" is everything up to and including U+0020.
    let is_c0_control_or_space = |ch: char| ch <= ' ';

    let start = input.trim_start_matches(is_c0_control_or_space);

    let mut remaining = start.bytes();
    {
        // Ignore ASCII tabs or newlines like the URL parser would
        let mut significant = remaining
            .by_ref()
            .filter(|&byte| !matches!(byte, b'\t' | b'\n' | b'\r'));

        // The scheme is matched ASCII case-insensitively ...
        for &expected in b"data" {
            if significant.next()?.to_ascii_lowercase() != expected {
                return None;
            }
        }
        // ... and must be followed by a colon.
        if significant.next()? != b':' {
            return None;
        }
    }

    // Whatever the iterator did not consume is the text after the colon.
    let consumed = start.len() - remaining.len();
    let after_colon = &start[consumed..];

    Some(after_colon.trim_end_matches(is_c0_control_or_space))
}
180
/// Split `after_colon` at its first comma, provided that comma comes before any `#`.
///
/// Returns `(text before the comma, text after the comma)`, or `None` when the
/// input has no comma or a `#` (start of the fragment) appears first.
fn find_comma_before_fragment(after_colon: &str) -> Option<(&str, &str)> {
    // Whichever of ',' and '#' comes first decides the outcome.
    let delimiter = after_colon.find(|ch: char| ch == ',' || ch == '#')?;
    if after_colon.as_bytes()[delimiter] != b',' {
        return None;
    }

    let (header, comma_and_rest) = after_colon.split_at(delimiter);
    // Drop the one-byte comma itself.
    Some((header, &comma_and_rest[1..]))
}
192
193	fn parse_header(from_colon_to_comma: &str) -> (mime::Mime, bool) {
194	// "Strip leading and trailing ASCII whitespace"
195	// \t, \n, and \r would have been filtered by the URL parser
196	// \f percent-encoded by the URL parser
197	// space is the only remaining ASCII whitespace
198	let trimmed = from_colon_to_comma.trim_matches(\|c\| matches!(c, ' ' \| '`\t`' \| '`\n`' \| '`\r`'));
199
200	let without_base64_suffix = remove_base64_suffix(trimmed);
201	let base64 = without_base64_suffix.is_some();
202	let mime_type = without_base64_suffix.unwrap_or(trimmed);
203
204	let mut string = String::new();
205	if mime_type.starts_with(';') {
206	string.push_str("text/plain")
207	}
208	let mut in_query = `false`;
209	for byte in mime_type.bytes() {
210	match byte {
211	// Ignore ASCII tabs or newlines like the URL parser would
212	b'`\t`' \| b'`\n`' \| b'`\r`' => continue,
213
214	// https://url.spec.whatwg.org/#c0-control-percent-encode-set
215	b'`\0`'..=b'`\x1F`' \| b'`\x7F`'..=b'`\xFF`' => percent_encode(byte, &mut string),
216
217	// Bytes other than the C0 percent-encode set that are percent-encoded
218	// by the URL parser in the query state.
219	// '#' is also in that list but cannot occur here
220	// since it indicates the start of the URL’s fragment.
221	b' ' \| b'"' \| b'<' \| b'>' if in_query => percent_encode(byte, &mut string),
222
223	b'?' => {
224	in_query = `true`;
225	string.push('?')
226	}
227
228	// Printable ASCII
229	_ => string.push(byte as char),
230	}
231	}
232
233	// FIXME: does Mime::from_str match the MIME Sniffing Standard’s parsing algorithm?
234	// <https://mimesniff.spec.whatwg.org/#parse-a-mime-type>
235	let mime_type = string.parse().unwrap_or_else(\|_\| mime::Mime {
236	type_: String::from("text"),
237	subtype: String::from("plain"),
238	parameters: vec![(String::from("charset"), String::from("US-ASCII"))],
239	});
240
241	(mime_type, base64)
242	}
243
/// Strip a trailing `;base64` marker from `s`.
///
/// The word `base` is matched ASCII case-insensitively, spaces are allowed between
/// the `;` and `base64`, and ASCII tabs and newlines are skipped wherever they occur.
///
/// Returns the text before the `;`, or `None` when there is no base64 suffix.
fn remove_base64_suffix(s: &str) -> Option<&str> {
    let mut remaining = s.bytes();
    {
        // Walk backwards from the end, ignoring ASCII tabs or newlines
        // like the URL parser would.
        let mut from_end = remaining
            .by_ref()
            .filter(|&byte| !matches!(byte, b'\t' | b'\n' | b'\r'))
            .rev();

        // "64", read right to left.
        for &expected in b"46" {
            if from_end.next()? != expected {
                return None;
            }
        }
        // "base" in any letter case, read right to left.
        for &expected in b"esab" {
            if from_end.next()?.to_ascii_lowercase() != expected {
                return None;
            }
        }
        // Skip any spaces; the first other byte has to be the semicolon.
        if from_end.find(|&byte| byte != b' ')? != b';' {
            return None;
        }
    }
    // Bytes the iterator did not consume are the part in front of the `;`.
    Some(&s[..remaining.len()])
}
267
/// Append `byte` to `string` as `%` followed by its two uppercase hexadecimal digits.
fn percent_encode(byte: u8, string: &mut String) {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let high = HEX_DIGITS[usize::from(byte >> 4)];
    let low = HEX_DIGITS[usize::from(byte & 0x0F)];

    string.extend(['%', char::from(high), char::from(low)].iter().copied());
}
274
275	/// This is <https://url.spec.whatwg.org/#string-percent-decode> while also:
276	///
277	/// Ignoring ASCII tab or newlines*
278	/// Stopping at the first '#' (which indicates the start of the fragment)*
279	///
280	/// Anything that would have been UTF-8 percent-encoded by the URL parser
281	/// would be percent-decoded here.
282	/// We skip that round-trip and pass it through unchanged.
283	fn decode_without_base64<F, E>(
284	encoded_body_plus_fragment: &str,
285	mut write_bytes: F,
286	) -> Result<Option<FragmentIdentifier<'_>>, E>
287	where
288	F: FnMut(&[u8]) -> Result<(), E>,
289	{
290	let bytes = encoded_body_plus_fragment.as_bytes();
291	let mut slice_start = `0`;
292	for (i, &byte) in bytes.iter().enumerate() {
293	// We only need to look for 5 different "special" byte values.
294	// For everything else we make slices as large as possible, borrowing the input,
295	// in order to make fewer write_all() calls.
296	if matches!(byte, b'%' \| b'#' \| b'`\t`' \| b'`\n`' \| b'`\r`') {
297	// Write everything (if anything) "non-special" we’ve accumulated
298	// before this special byte
299	if i > slice_start {
300	write_bytes(&bytes[slice_start..i])?;
301	}
302	// Then deal with the special byte.
303	match byte {
304	b'%' => {
305	let l = bytes.get(i + `2`).and_then(\|&b\| (b as char).to_digit(`16`));
306	let h = bytes.get(i + `1`).and_then(\|&b\| (b as char).to_digit(`16`));
307	if let (Some(h), Some(l)) = (h, l) {
308	// '%' followed by two ASCII hex digits
309	let one_byte = h as u8 * `0x10` + l as u8;
310	write_bytes(&[one_byte])?;
311	slice_start = i + `3`;
312	} else {
313	// Do nothing. Leave slice_start unchanged.
314	// The % sign will be part of the next slice.
315	}
316	}
317
318	b'#' => {
319	let fragment_start = i + `1`;
320	let fragment = &encoded_body_plus_fragment[fragment_start..];
321	return Ok(Some(FragmentIdentifier(fragment)));
322	}
323
324	// Ignore over '\t' \| '\n' \| '\r'
325	_ => slice_start = i + `1`,
326	}
327	}
328	}
329	write_bytes(&bytes[slice_start..])?;
330	Ok(None)
331	}
332
333	/// `decode_without_base64()` composed with
334	/// <https://infra.spec.whatwg.org/#isomorphic-decode> composed with
335	/// <https://infra.spec.whatwg.org/#forgiving-base64-decode>.
336	fn decode_with_base64<F, E>(
337	encoded_body_plus_fragment: &str,
338	write_bytes: F,
339	) -> Result<Option<FragmentIdentifier<'_>>, forgiving_base64::DecodeError<E>>
340	where
341	F: FnMut(&[u8]) -> Result<(), E>,
342	{
343	let mut decoder: Decoder = forgiving_base64::Decoder::new(write_bytes);
344	let fragment: Option> = decode_without_base64(encoded_body_plus_fragment, \|bytes: &[u8]\| decoder.feed(input:bytes))?;
345	decoder.finish()?;
346	Ok(fragment)
347	}
348