validations.rs source code [crates/core/src/str/validations.rs]

1	//! Operations related to UTF-8 validation.
2
3	use super::Utf8Error;
4	use crate::intrinsics::const_eval_select;
5
6	/// Returns the initial codepoint accumulator for the first byte.
7	/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
8	/// for width 3, and 3 bits for width 4.
9	#[inline]
10	const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
11	(byte & (`0x7F` >> width)) as u32
12	}
13
14	/// Returns the value of `ch` updated with continuation byte `byte`.
15	#[inline]
16	const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
17	(ch << `6`) \| (byte & CONT_MASK) as u32
18	}
19
20	/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
21	/// bits `10`).
22	#[inline]
23	pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
24	(byte as i8) < `-64`
25	}
26
27	/// Reads the next code point out of a byte iterator (assuming a
28	/// UTF-8-like encoding).
29	///
30	/// # Safety
31	///
32	/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
33	#[unstable(feature = "str_internals", issue = "none")]
34	#[inline]
35	pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
36	// Decode UTF-8
37	let x = *bytes.next()?;
38	if x < `128` {
39	return Some(x as u32);
40	}
41
42	// Multibyte case follows
43	// Decode from a byte combination out of: [[[x y] z] w]
44	// NOTE: Performance is sensitive to the exact formulation here
45	let init = utf8_first_byte(x, `2`);
46	// SAFETY: `bytes` produces an UTF-8-like string,
47	// so the iterator must produce a value here.
48	let y = unsafe { *bytes.next().unwrap_unchecked() };
49	let mut ch = utf8_acc_cont_byte(init, y);
50	if x >= `0xE0` {
51	// [[x y z] w] case
52	// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53	// SAFETY: `bytes` produces an UTF-8-like string,
54	// so the iterator must produce a value here.
55	let z = unsafe { *bytes.next().unwrap_unchecked() };
56	let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
57	ch = init << `12` \| y_z;
58	if x >= `0xF0` {
59	// [x y z w] case
60	// use only the lower 3 bits of `init`
61	// SAFETY: `bytes` produces an UTF-8-like string,
62	// so the iterator must produce a value here.
63	let w = unsafe { *bytes.next().unwrap_unchecked() };
64	ch = (init & `7`) << `18` \| utf8_acc_cont_byte(y_z, w);
65	}
66	}
67
68	Some(ch)
69	}
70
71	/// Reads the last code point out of a byte iterator (assuming a
72	/// UTF-8-like encoding).
73	///
74	/// # Safety
75	///
76	/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
77	#[inline]
78	pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
79	where
80	I: DoubleEndedIterator<Item = &'a u8>,
81	{
82	// Decode UTF-8
83	let w = match *bytes.next_back()? {
84	next_byte if next_byte < `128` => return Some(next_byte as u32),
85	back_byte => back_byte,
86	};
87
88	// Multibyte case follows
89	// Decode from a byte combination out of: [x [y [z w]]]
90	let mut ch;
91	// SAFETY: `bytes` produces an UTF-8-like string,
92	// so the iterator must produce a value here.
93	let z = unsafe { *bytes.next_back().unwrap_unchecked() };
94	ch = utf8_first_byte(z, `2`);
95	if utf8_is_cont_byte(z) {
96	// SAFETY: `bytes` produces an UTF-8-like string,
97	// so the iterator must produce a value here.
98	let y = unsafe { *bytes.next_back().unwrap_unchecked() };
99	ch = utf8_first_byte(y, `3`);
100	if utf8_is_cont_byte(y) {
101	// SAFETY: `bytes` produces an UTF-8-like string,
102	// so the iterator must produce a value here.
103	let x = unsafe { *bytes.next_back().unwrap_unchecked() };
104	ch = utf8_first_byte(x, `4`);
105	ch = utf8_acc_cont_byte(ch, y);
106	}
107	ch = utf8_acc_cont_byte(ch, z);
108	}
109	ch = utf8_acc_cont_byte(ch, w);
110
111	Some(ch)
112	}
113
114	const NONASCII_MASK: usize = usize::repeat_u8(`0x80`);
115
116	/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
117	#[inline]
118	const fn contains_nonascii(x: usize) -> bool {
119	(x & NONASCII_MASK) != `0`
120	}
121
122	/// Walks through `v` checking that it's a valid UTF-8 sequence,
123	/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
124	#[inline(always)]
125	#[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
126	pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
127	let mut index = `0`;
128	let len = v.len();
129
130	const USIZE_BYTES: usize = size_of::<usize>();
131
132	let ascii_block_size = `2` * USIZE_BYTES;
133	let blocks_end = if len >= ascii_block_size { len - ascii_block_size + `1` } else { `0` };
134	// Below, we safely fall back to a slower codepath if the offset is `usize::MAX`,
135	// so the end-to-end behavior is the same at compiletime and runtime.
136	let align = const_eval_select!(
137	@capture { v: &[u8] } -> usize:
138	if const {
139	usize::MAX
140	} else {
141	v.as_ptr().align_offset(USIZE_BYTES)
142	}
143	);
144
145	while index < len {
146	let old_offset = index;
147	macro_rules! err {
148	($error_len: expr) => {
149	return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
150	};
151	}
152
153	macro_rules! next {
154	() => {{
155	index += `1`;
156	// we needed data, but there was none: error!
157	if index >= len {
158	err!(None)
159	}
160	v[index]
161	}};
162	}
163
164	let first = v[index];
165	if first >= `128` {
166	let w = utf8_char_width(first);
167	// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
168	// first C2 80 last DF BF
169	// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
170	// first E0 A0 80 last EF BF BF
171	// excluding surrogates codepoints \u{d800} to \u{dfff}
172	// ED A0 80 to ED BF BF
173	// 4-byte encoding is for codepoints \u{10000} to \u{10ffff}
174	// first F0 90 80 80 last F4 8F BF BF
175	//
176	// Use the UTF-8 syntax from the RFC
177	//
178	// https://tools.ietf.org/html/rfc3629
179	// UTF8-1 = %x00-7F
180	// UTF8-2 = %xC2-DF UTF8-tail
181	// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
182	// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
183	// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
184	// %xF4 %x80-8F 2( UTF8-tail )
185	match w {
186	`2` => {
187	if next!() as i8 >= `-64` {
188	err!(Some(`1`))
189	}
190	}
191	`3` => {
192	match (first, next!()) {
193	(`0xE0`, `0xA0`..=`0xBF`)
194	\| (`0xE1`..=`0xEC`, `0x80`..=`0xBF`)
195	\| (`0xED`, `0x80`..=`0x9F`)
196	\| (`0xEE`..=`0xEF`, `0x80`..=`0xBF`) => {}
197	_ => err!(Some(`1`)),
198	}
199	if next!() as i8 >= `-64` {
200	err!(Some(`2`))
201	}
202	}
203	`4` => {
204	match (first, next!()) {
205	(`0xF0`, `0x90`..=`0xBF`) \| (`0xF1`..=`0xF3`, `0x80`..=`0xBF`) \| (`0xF4`, `0x80`..=`0x8F`) => {}
206	_ => err!(Some(`1`)),
207	}
208	if next!() as i8 >= `-64` {
209	err!(Some(`2`))
210	}
211	if next!() as i8 >= `-64` {
212	err!(Some(`3`))
213	}
214	}
215	_ => err!(Some(`1`)),
216	}
217	index += `1`;
218	} else {
219	// Ascii case, try to skip forward quickly.
220	// When the pointer is aligned, read 2 words of data per iteration
221	// until we find a word containing a non-ascii byte.
222	if align != usize::MAX && align.wrapping_sub(index).is_multiple_of(USIZE_BYTES) {
223	let ptr = v.as_ptr();
224	while index < blocks_end {
225	// SAFETY: since `align - index` and `ascii_block_size` are
226	// multiples of `USIZE_BYTES`, `block = ptr.add(index)` is
227	// always aligned with a `usize` so it's safe to dereference
228	// both `block` and `block.add(1)`.
229	unsafe {
230	let block = ptr.add(index) as *const usize;
231	// break if there is a nonascii byte
232	let zu = contains_nonascii(*block);
233	let zv = contains_nonascii(*block.add(`1`));
234	if zu \|\| zv {
235	break;
236	}
237	}
238	index += ascii_block_size;
239	}
240	// step from the point where the wordwise loop stopped
241	while index < len && v[index] < `128` {
242	index += `1`;
243	}
244	} else {
245	index += `1`;
246	}
247	}
248	}
249
250	Ok(())
251	}
252
253	// https://tools.ietf.org/html/rfc3629
254	const UTF8_CHAR_WIDTH: &[u8; `256`] = &[
255	// 1 2 3 4 5 6 7 8 9 A B C D E F
256	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 0
257	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 1
258	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 2
259	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 3
260	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 4
261	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 5
262	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 6
263	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, // 7
264	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, // 8
265	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, // 9
266	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, // A
267	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, // B
268	`0`, `0`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, // C
269	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, // D
270	`3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, // E
271	`4`, `4`, `4`, `4`, `4`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, // F
272	];
273
274	/// Given a first byte, determines how many bytes are in this UTF-8 character.
275	#[unstable(feature = "str_internals", issue = "none")]
276	#[must_use]
277	#[inline]
278	pub const fn utf8_char_width(b: u8) -> usize {
279	UTF8_CHAR_WIDTH[b as usize] as usize
280	}
281
282	/// Mask of the value bits of a continuation byte.
283	const CONT_MASK: u8 = `0b0011_1111`;
284