1 | //! Operations related to UTF-8 validation. |
2 | |
3 | use super::Utf8Error; |
4 | use crate::intrinsics::const_eval_select; |
5 | |
6 | /// Returns the initial codepoint accumulator for the first byte. |
7 | /// The first byte is special, only want bottom 5 bits for width 2, 4 bits |
8 | /// for width 3, and 3 bits for width 4. |
9 | #[inline ] |
10 | const fn utf8_first_byte(byte: u8, width: u32) -> u32 { |
11 | (byte & (0x7F >> width)) as u32 |
12 | } |
13 | |
14 | /// Returns the value of `ch` updated with continuation byte `byte`. |
15 | #[inline ] |
16 | const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { |
17 | (ch << 6) | (byte & CONT_MASK) as u32 |
18 | } |
19 | |
20 | /// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the |
21 | /// bits `10`). |
22 | #[inline ] |
23 | pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { |
24 | (byte as i8) < -64 |
25 | } |
26 | |
27 | /// Reads the next code point out of a byte iterator (assuming a |
28 | /// UTF-8-like encoding). |
29 | /// |
30 | /// # Safety |
31 | /// |
32 | /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string |
33 | #[unstable (feature = "str_internals" , issue = "none" )] |
34 | #[inline ] |
35 | pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> { |
36 | // Decode UTF-8 |
37 | let x = *bytes.next()?; |
38 | if x < 128 { |
39 | return Some(x as u32); |
40 | } |
41 | |
42 | // Multibyte case follows |
43 | // Decode from a byte combination out of: [[[x y] z] w] |
44 | // NOTE: Performance is sensitive to the exact formulation here |
45 | let init = utf8_first_byte(x, 2); |
46 | // SAFETY: `bytes` produces an UTF-8-like string, |
47 | // so the iterator must produce a value here. |
48 | let y = unsafe { *bytes.next().unwrap_unchecked() }; |
49 | let mut ch = utf8_acc_cont_byte(init, y); |
50 | if x >= 0xE0 { |
51 | // [[x y z] w] case |
52 | // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid |
53 | // SAFETY: `bytes` produces an UTF-8-like string, |
54 | // so the iterator must produce a value here. |
55 | let z = unsafe { *bytes.next().unwrap_unchecked() }; |
56 | let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); |
57 | ch = init << 12 | y_z; |
58 | if x >= 0xF0 { |
59 | // [x y z w] case |
60 | // use only the lower 3 bits of `init` |
61 | // SAFETY: `bytes` produces an UTF-8-like string, |
62 | // so the iterator must produce a value here. |
63 | let w = unsafe { *bytes.next().unwrap_unchecked() }; |
64 | ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); |
65 | } |
66 | } |
67 | |
68 | Some(ch) |
69 | } |
70 | |
71 | /// Reads the last code point out of a byte iterator (assuming a |
72 | /// UTF-8-like encoding). |
73 | /// |
74 | /// # Safety |
75 | /// |
76 | /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string |
77 | #[inline ] |
78 | pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32> |
79 | where |
80 | I: DoubleEndedIterator<Item = &'a u8>, |
81 | { |
82 | // Decode UTF-8 |
83 | let w = match *bytes.next_back()? { |
84 | next_byte if next_byte < 128 => return Some(next_byte as u32), |
85 | back_byte => back_byte, |
86 | }; |
87 | |
88 | // Multibyte case follows |
89 | // Decode from a byte combination out of: [x [y [z w]]] |
90 | let mut ch; |
91 | // SAFETY: `bytes` produces an UTF-8-like string, |
92 | // so the iterator must produce a value here. |
93 | let z = unsafe { *bytes.next_back().unwrap_unchecked() }; |
94 | ch = utf8_first_byte(z, 2); |
95 | if utf8_is_cont_byte(z) { |
96 | // SAFETY: `bytes` produces an UTF-8-like string, |
97 | // so the iterator must produce a value here. |
98 | let y = unsafe { *bytes.next_back().unwrap_unchecked() }; |
99 | ch = utf8_first_byte(y, 3); |
100 | if utf8_is_cont_byte(y) { |
101 | // SAFETY: `bytes` produces an UTF-8-like string, |
102 | // so the iterator must produce a value here. |
103 | let x = unsafe { *bytes.next_back().unwrap_unchecked() }; |
104 | ch = utf8_first_byte(x, 4); |
105 | ch = utf8_acc_cont_byte(ch, y); |
106 | } |
107 | ch = utf8_acc_cont_byte(ch, z); |
108 | } |
109 | ch = utf8_acc_cont_byte(ch, w); |
110 | |
111 | Some(ch) |
112 | } |
113 | |
114 | const NONASCII_MASK: usize = usize::repeat_u8(0x80); |
115 | |
116 | /// Returns `true` if any byte in the word `x` is nonascii (>= 128). |
117 | #[inline ] |
118 | const fn contains_nonascii(x: usize) -> bool { |
119 | (x & NONASCII_MASK) != 0 |
120 | } |
121 | |
122 | /// Walks through `v` checking that it's a valid UTF-8 sequence, |
123 | /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. |
124 | #[inline (always)] |
125 | #[rustc_allow_const_fn_unstable (const_eval_select)] // fallback impl has same behavior |
126 | pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { |
127 | let mut index = 0; |
128 | let len = v.len(); |
129 | |
130 | const USIZE_BYTES: usize = size_of::<usize>(); |
131 | |
132 | let ascii_block_size = 2 * USIZE_BYTES; |
133 | let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; |
134 | // Below, we safely fall back to a slower codepath if the offset is `usize::MAX`, |
135 | // so the end-to-end behavior is the same at compiletime and runtime. |
136 | let align = const_eval_select!( |
137 | @capture { v: &[u8] } -> usize: |
138 | if const { |
139 | usize::MAX |
140 | } else { |
141 | v.as_ptr().align_offset(USIZE_BYTES) |
142 | } |
143 | ); |
144 | |
145 | while index < len { |
146 | let old_offset = index; |
147 | macro_rules! err { |
148 | ($error_len: expr) => { |
149 | return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }) |
150 | }; |
151 | } |
152 | |
153 | macro_rules! next { |
154 | () => {{ |
155 | index += 1; |
156 | // we needed data, but there was none: error! |
157 | if index >= len { |
158 | err!(None) |
159 | } |
160 | v[index] |
161 | }}; |
162 | } |
163 | |
164 | let first = v[index]; |
165 | if first >= 128 { |
166 | let w = utf8_char_width(first); |
167 | // 2-byte encoding is for codepoints \u{0080} to \u{07ff} |
168 | // first C2 80 last DF BF |
169 | // 3-byte encoding is for codepoints \u{0800} to \u{ffff} |
170 | // first E0 A0 80 last EF BF BF |
171 | // excluding surrogates codepoints \u{d800} to \u{dfff} |
172 | // ED A0 80 to ED BF BF |
173 | // 4-byte encoding is for codepoints \u{10000} to \u{10ffff} |
174 | // first F0 90 80 80 last F4 8F BF BF |
175 | // |
176 | // Use the UTF-8 syntax from the RFC |
177 | // |
178 | // https://tools.ietf.org/html/rfc3629 |
179 | // UTF8-1 = %x00-7F |
180 | // UTF8-2 = %xC2-DF UTF8-tail |
181 | // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / |
182 | // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) |
183 | // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / |
184 | // %xF4 %x80-8F 2( UTF8-tail ) |
185 | match w { |
186 | 2 => { |
187 | if next!() as i8 >= -64 { |
188 | err!(Some(1)) |
189 | } |
190 | } |
191 | 3 => { |
192 | match (first, next!()) { |
193 | (0xE0, 0xA0..=0xBF) |
194 | | (0xE1..=0xEC, 0x80..=0xBF) |
195 | | (0xED, 0x80..=0x9F) |
196 | | (0xEE..=0xEF, 0x80..=0xBF) => {} |
197 | _ => err!(Some(1)), |
198 | } |
199 | if next!() as i8 >= -64 { |
200 | err!(Some(2)) |
201 | } |
202 | } |
203 | 4 => { |
204 | match (first, next!()) { |
205 | (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} |
206 | _ => err!(Some(1)), |
207 | } |
208 | if next!() as i8 >= -64 { |
209 | err!(Some(2)) |
210 | } |
211 | if next!() as i8 >= -64 { |
212 | err!(Some(3)) |
213 | } |
214 | } |
215 | _ => err!(Some(1)), |
216 | } |
217 | index += 1; |
218 | } else { |
219 | // Ascii case, try to skip forward quickly. |
220 | // When the pointer is aligned, read 2 words of data per iteration |
221 | // until we find a word containing a non-ascii byte. |
222 | if align != usize::MAX && align.wrapping_sub(index) % USIZE_BYTES == 0 { |
223 | let ptr = v.as_ptr(); |
224 | while index < blocks_end { |
225 | // SAFETY: since `align - index` and `ascii_block_size` are |
226 | // multiples of `USIZE_BYTES`, `block = ptr.add(index)` is |
227 | // always aligned with a `usize` so it's safe to dereference |
228 | // both `block` and `block.add(1)`. |
229 | unsafe { |
230 | let block = ptr.add(index) as *const usize; |
231 | // break if there is a nonascii byte |
232 | let zu = contains_nonascii(*block); |
233 | let zv = contains_nonascii(*block.add(1)); |
234 | if zu || zv { |
235 | break; |
236 | } |
237 | } |
238 | index += ascii_block_size; |
239 | } |
240 | // step from the point where the wordwise loop stopped |
241 | while index < len && v[index] < 128 { |
242 | index += 1; |
243 | } |
244 | } else { |
245 | index += 1; |
246 | } |
247 | } |
248 | } |
249 | |
250 | Ok(()) |
251 | } |
252 | |
253 | // https://tools.ietf.org/html/rfc3629 |
254 | const UTF8_CHAR_WIDTH: &[u8; 256] = &[ |
255 | // 1 2 3 4 5 6 7 8 9 A B C D E F |
256 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 |
257 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 |
258 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 |
259 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 |
260 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 |
261 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 |
262 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 |
263 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 |
264 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 |
265 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 |
266 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A |
267 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B |
268 | 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C |
269 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D |
270 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E |
271 | 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F |
272 | ]; |
273 | |
274 | /// Given a first byte, determines how many bytes are in this UTF-8 character. |
275 | #[unstable (feature = "str_internals" , issue = "none" )] |
276 | #[must_use ] |
277 | #[inline ] |
278 | pub const fn utf8_char_width(b: u8) -> usize { |
279 | UTF8_CHAR_WIDTH[b as usize] as usize |
280 | } |
281 | |
282 | /// Mask of the value bits of a continuation byte. |
283 | const CONT_MASK: u8 = 0b0011_1111; |
284 | |