lib.rs source code [crates/percent_encoding/src/lib.rs]

1	// Copyright 2013-2016 The rust-url developers.
2	//
3	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6	// option. This file may not be copied, modified, or distributed
7	// except according to those terms.
8
9	//! URLs use special characters to indicate the parts of the request.
10	//! For example, a `?` question mark marks the end of a path and the start of a query string.
11	//! In order for that character to exist inside a path, it needs to be encoded differently.
12	//!
13	//! Percent encoding replaces reserved characters with the `%` escape character
14	//! followed by a byte value as two hexadecimal digits.
15	//! For example, an ASCII space is replaced with `%20`.
16	//!
17	//! When encoding, the set of characters that can (and should, for readability) be left alone
18	//! depends on the context.
19	//! The `?` question mark mentioned above is not a separator when used literally
20	//! inside of a query string, and therefore does not need to be encoded.
21	//! The [`AsciiSet`] parameter of [`percent_encode`] and [`utf8_percent_encode`]
22	//! lets callers configure this.
23	//!
24	//! This crate deliberately does not provide many different sets.
25	//! Users should consider in what context the encoded string will be used,
26	//! read relevant specifications, and define their own set.
27	//! This is done by using the `add` method of an existing set.
28	//!
29	//! # Examples
30	//!
31	//! ```
32	//! use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
33	//!
34	//! /// https://url.spec.whatwg.org/#fragment-percent-encode-set
35	//! const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');
36	//!
37	//! assert_eq!(utf8_percent_encode("foo <bar>", FRAGMENT).to_string(), "foo%20%3Cbar%3E");
38	//! ```
39	#![no_std]
40
41	// For forwards compatibility
42	#[cfg(feature = "std")]
43	extern crate std as _;
44
45	#[cfg(feature = "alloc")]
46	extern crate alloc;
47
48	#[cfg(feature = "alloc")]
49	use alloc::{
50	borrow::{Cow, ToOwned},
51	string::String,
52	vec::Vec,
53	};
54	use core::{fmt, mem, slice, str};
55
/// Represents a set of characters or bytes in the ASCII range.
///
/// This is used in [`percent_encode`] and [`utf8_percent_encode`].
/// This is similar to [percent-encode sets](https://url.spec.whatwg.org/#percent-encoded-bytes).
///
/// Use the `add` method of an existing set to define a new set. For example:
///
/// ```
/// use percent_encoding::{AsciiSet, CONTROLS};
///
/// /// https://url.spec.whatwg.org/#fragment-percent-encode-set
/// const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');
/// ```
#[derive(Debug, PartialEq, Eq)]
pub struct AsciiSet {
    /// Bitmap with one bit per ASCII byte: bit `b % BITS_PER_CHUNK` of
    /// chunk `b / BITS_PER_CHUNK` is set when byte `b` belongs to the set.
    mask: [Chunk; ASCII_RANGE_LEN / BITS_PER_CHUNK],
}

/// Integer type used for each element of the [`AsciiSet`] bitmap.
type Chunk = u32;

/// Number of byte values in the ASCII range (0x00 to 0x7F inclusive).
const ASCII_RANGE_LEN: usize = 0x80;

/// Number of bits stored in one [`Chunk`].
const BITS_PER_CHUNK: usize = 8 * mem::size_of::<Chunk>();

impl AsciiSet {
    /// Returns whether `byte` is a member of this set.
    ///
    /// Called with UTF-8 bytes rather than code points.
    /// Not used for non-ASCII bytes: the bitmap only covers the ASCII range,
    /// so indexing with a byte of 0x80 or above is out of bounds.
    const fn contains(&self, byte: u8) -> bool {
        let chunk = self.mask[byte as usize / BITS_PER_CHUNK];
        let mask = 1 << (byte as usize % BITS_PER_CHUNK);
        (chunk & mask) != 0
    }

    /// Returns whether `byte` must be percent-encoded under this set.
    ///
    /// Every non-ASCII byte is encoded; ASCII bytes are encoded only when
    /// they are members of the set. The ASCII check runs first, so
    /// `contains` is never reached with an out-of-range byte.
    fn should_percent_encode(&self, byte: u8) -> bool {
        !byte.is_ascii() || self.contains(byte)
    }

    /// Returns a copy of this set with `byte` added.
    ///
    /// `self` is left untouched, which allows chaining in `const` contexts.
    ///
    /// # Panics
    ///
    /// Indexing is out of bounds if `byte` is not ASCII (0x80 or above);
    /// in a `const` context this surfaces as a compile-time error.
    pub const fn add(&self, byte: u8) -> Self {
        let mut mask = self.mask;
        mask[byte as usize / BITS_PER_CHUNK] |= 1 << (byte as usize % BITS_PER_CHUNK);
        AsciiSet { mask }
    }

    /// Returns a copy of this set with `byte` removed.
    ///
    /// `self` is left untouched, which allows chaining in `const` contexts.
    ///
    /// # Panics
    ///
    /// Indexing is out of bounds if `byte` is not ASCII (0x80 or above);
    /// in a `const` context this surfaces as a compile-time error.
    pub const fn remove(&self, byte: u8) -> Self {
        let mut mask = self.mask;
        mask[byte as usize / BITS_PER_CHUNK] &= !(1 << (byte as usize % BITS_PER_CHUNK));
        AsciiSet { mask }
    }
}
104
105	/// The set of 0x00 to 0x1F (C0 controls), and 0x7F (DEL).
106	///
107	/// Note that this includes the newline and tab characters, but not the space 0x20.
108	///
109	/// <https://url.spec.whatwg.org/#c0-control-percent-encode-set>
110	pub const CONTROLS: &AsciiSet = &AsciiSet {
111	mask: [
112	!`0_u32`, // C0: 0x00 to 0x1F (32 bits set)
113	`0`,
114	`0`,
115	`1` << (`0x7F_u32` % `32`), // DEL: 0x7F (one bit set)
116	],
117	};
118
// Compile-time assertion helper.
//
// Each condition is cast to `usize` and used as an array length, and
// `mem::transmute` is named (never called) with `[u8; N]` as the source type
// and `u8` as the target. The two types only have the same size when
// N == 1, i.e. when the condition is `true`; a `false` condition makes the
// transmute instantiation fail to compile.
macro_rules! static_assert {
    ($( $bool: expr, )+) => {
        fn _static_assert() {
            $(
                let _ = mem::transmute::<[u8; $bool as usize], u8>;
            )+
        }
    }
}
128
129	static_assert! {
130	CONTROLS.contains(`0x00`),
131	CONTROLS.contains(`0x1F`),
132	!CONTROLS.contains(`0x20`),
133	!CONTROLS.contains(`0x7E`),
134	CONTROLS.contains(`0x7F`),
135	}
136
137	/// Everything that is not an ASCII letter or digit.
138	///
139	/// This is probably more eager than necessary in any context.
140	pub const NON_ALPHANUMERIC: &AsciiSet = &CONTROLS
141	.add(b' ')
142	.add(b'!')
143	.add(b'"')
144	.add(b'#')
145	.add(b'$')
146	.add(b'%')
147	.add(b'&')
148	.add(b'`\'`')
149	.add(b'(')
150	.add(b')')
151	.add(b'*')
152	.add(b'+')
153	.add(b',')
154	.add(b'-')
155	.add(b'.')
156	.add(b'/')
157	.add(b':')
158	.add(b';')
159	.add(b'<')
160	.add(b'=')
161	.add(b'>')
162	.add(b'?')
163	.add(b'@')
164	.add(b'[')
165	.add(b'`\\`')
166	.add(b']')
167	.add(b'^')
168	.add(b'_')
169	.add(b'`')
170	.add(b'{')
171	.add(b'\|')
172	.add(b'}')
173	.add(byte:b'~');
174
/// Return the percent-encoding of the given byte.
///
/// This is unconditional, unlike `percent_encode()` which has an `AsciiSet` parameter.
///
/// The result is always three ASCII characters: `%` followed by the byte
/// value as two uppercase hexadecimal digits.
///
/// # Examples
///
/// ```
/// use percent_encoding::percent_encode_byte;
///
/// assert_eq!("foo bar".bytes().map(percent_encode_byte).collect::<String>(),
///            "%66%6F%6F%20%62%61%72");
/// ```
#[inline]
pub fn percent_encode_byte(byte: u8) -> &'static str {
    // 256 entries of 3 bytes each; the entry for byte `b` starts at offset `3 * b`.
    static ENC_TABLE: &[u8; 768] = b"\
        %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F\
        %10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F\
        %20%21%22%23%24%25%26%27%28%29%2A%2B%2C%2D%2E%2F\
        %30%31%32%33%34%35%36%37%38%39%3A%3B%3C%3D%3E%3F\
        %40%41%42%43%44%45%46%47%48%49%4A%4B%4C%4D%4E%4F\
        %50%51%52%53%54%55%56%57%58%59%5A%5B%5C%5D%5E%5F\
        %60%61%62%63%64%65%66%67%68%69%6A%6B%6C%6D%6E%6F\
        %70%71%72%73%74%75%76%77%78%79%7A%7B%7C%7D%7E%7F\
        %80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F\
        %90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F\
        %A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF\
        %B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF\
        %C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF\
        %D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF\
        %E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF\
        %F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF\
        ";

    let index = usize::from(byte) * 3;
    // SAFETY: ENC_TABLE is ascii-only, so any subset of it should be
    // ascii-only too, which is valid utf8.
    unsafe { str::from_utf8_unchecked(&ENC_TABLE[index..index + 3]) }
}
213
214	/// Percent-encode the given bytes with the given set.
215	///
216	/// Non-ASCII bytes and bytes in `ascii_set` are encoded.
217	///
218	/// The return type:
219	///
220	/// Implements `Iterator<Item = &str>` and therefore has a `.collect::<String>()` method,*
221	/// Implements `Display` and therefore has a `.to_string()` method,*
222	/// Implements `Into<Cow<str>>` borrowing `input` when none of its bytes are encoded.*
223	///
224	/// # Examples
225	///
226	/// ```
227	/// use percent_encoding::{percent_encode, NON_ALPHANUMERIC};
228	///
229	/// assert_eq!(percent_encode(b"foo bar?", NON_ALPHANUMERIC).to_string(), "foo%20bar%3F");
230	/// ```
231	#[inline]
232	pub fn percent_encode<'a>(input: &'a [u8], ascii_set: &'static AsciiSet) -> PercentEncode<'a> {
233	PercentEncode {
234	bytes: input,
235	ascii_set,
236	}
237	}
238
239	/// Percent-encode the UTF-8 encoding of the given string.
240	///
241	/// See [`percent_encode`] regarding the return type.
242	///
243	/// # Examples
244	///
245	/// ```
246	/// use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
247	///
248	/// assert_eq!(utf8_percent_encode("foo bar?", NON_ALPHANUMERIC).to_string(), "foo%20bar%3F");
249	/// ```
250	#[inline]
251	pub fn utf8_percent_encode<'a>(input: &'a str, ascii_set: &'static AsciiSet) -> PercentEncode<'a> {
252	percent_encode(input.as_bytes(), ascii_set)
253	}
254
255	/// The return type of [`percent_encode`] and [`utf8_percent_encode`].
256	#[derive(Clone)]
257	pub struct PercentEncode<'a> {
258	bytes: &'a [u8],
259	ascii_set: &'static AsciiSet,
260	}
261
262	impl<'a> Iterator for PercentEncode<'a> {
263	type Item = &'a str;
264
265	fn next(&mut self) -> Option<&'a str> {
266	if let Some((&first_byte, remaining)) = self.bytes.split_first() {
267	if self.ascii_set.should_percent_encode(first_byte) {
268	self.bytes = remaining;
269	Some(percent_encode_byte(first_byte))
270	} else {
271	// The unsafe blocks here are appropriate because the bytes are
272	// confirmed as a subset of UTF-8 in should_percent_encode.
273	for (i, &byte) in remaining.iter().enumerate() {
274	if self.ascii_set.should_percent_encode(byte) {
275	// 1 for first_byte + i for previous iterations of this loop
276	let (unchanged_slice, remaining) = self.bytes.split_at(`1` + i);
277	self.bytes = remaining;
278	return Some(unsafe { str::from_utf8_unchecked(unchanged_slice) });
279	}
280	}
281	let unchanged_slice = self.bytes;
282	self.bytes = &[][..];
283	Some(unsafe { str::from_utf8_unchecked(unchanged_slice) })
284	}
285	} else {
286	None
287	}
288	}
289
290	fn size_hint(&self) -> (usize, Option<usize>) {
291	if self.bytes.is_empty() {
292	(`0`, Some(`0`))
293	} else {
294	(`1`, Some(self.bytes.len()))
295	}
296	}
297	}
298
299	impl<'a> fmt::Display for PercentEncode<'a> {
300	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
301	for c: &'a str in (*self).clone() {
302	formatter.write_str(data:c)?
303	}
304	Ok(())
305	}
306	}
307
#[cfg(feature = "alloc")]
impl<'a> From<PercentEncode<'a>> for Cow<'a, str> {
    /// Collects the encoded output, borrowing from the input when possible.
    ///
    /// Zero chunks yield an empty borrowed string and exactly one chunk is
    /// returned as-is without allocating. A `String` is only built once a
    /// second chunk shows up.
    fn from(mut iter: PercentEncode<'a>) -> Self {
        match iter.next() {
            None => "".into(),
            Some(first) => match iter.next() {
                None => first.into(),
                Some(second) => {
                    let mut string = first.to_owned();
                    string.push_str(second);
                    string.extend(iter);
                    string.into()
                }
            },
        }
    }
}
325
326	/// Percent-decode the given string.
327	///
328	/// <https://url.spec.whatwg.org/#string-percent-decode>
329	///
330	/// See [`percent_decode`] regarding the return type.
331	#[inline]
332	pub fn percent_decode_str(input: &str) -> PercentDecode<'_> {
333	percent_decode(input.as_bytes())
334	}
335
336	/// Percent-decode the given bytes.
337	///
338	/// <https://url.spec.whatwg.org/#percent-decode>
339	///
340	/// Any sequence of `%` followed by two hexadecimal digits is decoded.
341	/// The return type:
342	///
343	/// Implements `Into<Cow<u8>>` borrowing `input` when it contains no percent-encoded sequence,*
344	/// Implements `Iterator<Item = u8>` and therefore has a `.collect::<Vec<u8>>()` method,*
345	/// Has `decode_utf8()` and `decode_utf8_lossy()` methods.*
346	///
347	/// # Examples
348	///
349	/// ```
350	/// use percent_encoding::percent_decode;
351	///
352	/// assert_eq!(percent_decode(b"foo%20bar%3f").decode_utf8().unwrap(), "foo bar?");
353	/// ```
354	#[inline]
355	pub fn percent_decode(input: &[u8]) -> PercentDecode<'_> {
356	PercentDecode {
357	bytes: input.iter(),
358	}
359	}
360
/// The return type of [`percent_decode`].
#[derive(Clone, Debug)]
pub struct PercentDecode<'a> {
    /// Iterator over the input bytes that have not been decoded yet.
    bytes: slice::Iter<'a, u8>,
}
366
/// Tries to read the two hexadecimal digits that follow a `%` sign.
///
/// On success, returns the decoded byte and advances `iter` past both digits.
/// If fewer than two bytes remain or either one is not a hexadecimal digit,
/// returns `None` and leaves `iter` untouched, because all reads go through
/// a clone that is only written back on success.
fn after_percent_sign(iter: &mut slice::Iter<'_, u8>) -> Option<u8> {
    let mut cloned_iter = iter.clone();
    let h = char::from(*cloned_iter.next()?).to_digit(16)?;
    let l = char::from(*cloned_iter.next()?).to_digit(16)?;
    *iter = cloned_iter;
    // Both digits are at most 15, so the result is at most 0xFF and the
    // casts and arithmetic cannot overflow.
    Some(h as u8 * 0x10 + l as u8)
}
374
375	impl<'a> Iterator for PercentDecode<'a> {
376	type Item = u8;
377
378	fn next(&mut self) -> Option<u8> {
379	self.bytes.next().map(\|&byte: u8\| {
380	if byte == b'%' {
381	after_percent_sign(&mut self.bytes).unwrap_or(default:byte)
382	} else {
383	byte
384	}
385	})
386	}
387
388	fn size_hint(&self) -> (usize, Option<usize>) {
389	let bytes: usize = self.bytes.len();
390	((bytes + `2`) / `3`, Some(bytes))
391	}
392	}
393
#[cfg(feature = "alloc")]
impl<'a> From<PercentDecode<'a>> for Cow<'a, [u8]> {
    /// Decodes the remaining input, borrowing it unchanged when it contains
    /// no valid percent-encoded sequence and allocating a new vector otherwise.
    fn from(iter: PercentDecode<'a>) -> Self {
        match iter.if_any() {
            Some(vec) => Cow::Owned(vec),
            None => Cow::Borrowed(iter.bytes.as_slice()),
        }
    }
}
403
404	impl<'a> PercentDecode<'a> {
405	/// If the percent-decoding is different from the input, return it as a new bytes vector.
406	#[cfg(feature = "alloc")]
407	fn if_any(&self) -> Option<Vec<u8>> {
408	let mut bytes_iter = self.bytes.clone();
409	while bytes_iter.any(\|&b\| b == b'%') {
410	if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter) {
411	let initial_bytes = self.bytes.as_slice();
412	let unchanged_bytes_len = initial_bytes.len() - bytes_iter.len() - `3`;
413	let mut decoded = initial_bytes[..unchanged_bytes_len].to_owned();
414	decoded.push(decoded_byte);
415	decoded.extend(PercentDecode { bytes: bytes_iter });
416	return Some(decoded);
417	}
418	}
419	// Nothing to decode
420	None
421	}
422
423	/// Decode the result of percent-decoding as UTF-8.
424	///
425	/// This is return `Err` when the percent-decoded bytes are not well-formed in UTF-8.
426	#[cfg(feature = "alloc")]
427	pub fn decode_utf8(self) -> Result<Cow<'a, str>, str::Utf8Error> {
428	match self.clone().into() {
429	Cow::Borrowed(bytes) => match str::from_utf8(bytes) {
430	Ok(s) => Ok(s.into()),
431	Err(e) => Err(e),
432	},
433	Cow::Owned(bytes) => match String::from_utf8(bytes) {
434	Ok(s) => Ok(s.into()),
435	Err(e) => Err(e.utf8_error()),
436	},
437	}
438	}
439
440	/// Decode the result of percent-decoding as UTF-8, lossily.
441	///
442	/// Invalid UTF-8 percent-encoded byte sequences will be replaced � U+FFFD,
443	/// the replacement character.
444	#[cfg(feature = "alloc")]
445	pub fn decode_utf8_lossy(self) -> Cow<'a, str> {
446	decode_utf8_lossy(self.clone().into())
447	}
448	}
449
450	#[cfg(feature = "alloc")]
/// Converts possibly-invalid UTF-8 bytes into a string, replacing invalid
/// sequences with U+FFFD.
///
/// Borrowed input stays borrowed when it is already valid UTF-8. Owned input
/// always produces an owned string, reusing the original allocation when the
/// bytes are already valid UTF-8.
fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    // Note: This function is duplicated in `form_urlencoded/src/query_encoding.rs`.
    match input {
        Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
        Cow::Owned(bytes) => {
            match String::from_utf8_lossy(&bytes) {
                Cow::Borrowed(utf8) => {
                    // If from_utf8_lossy returns a Cow::Borrowed, then we can
                    // be sure our original bytes were valid UTF-8. This is because
                    // if the bytes were invalid UTF-8 from_utf8_lossy would have
                    // to allocate a new owned string to back the Cow so it could
                    // replace invalid bytes with a placeholder.

                    // First we do a debug_assert to confirm our description above.
                    let raw_utf8: *const [u8] = utf8.as_bytes();
                    debug_assert!(raw_utf8 == &*bytes as *const [u8]);

                    // SAFETY: given we know the original input bytes are valid UTF-8,
                    // and we have ownership of those bytes, we re-use them and
                    // return a Cow::Owned here.
                    Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
                }
                Cow::Owned(s) => Cow::Owned(s),
            }
        }
    }
}
478