utf8.rs source code [crates/regex-1.8.4/src/utf8.rs]

1	/// A few elementary UTF-8 encoding and decoding functions used by the matching
2	/// engines.
3	///
4	/// In an ideal world, the matching engines operate on `&str` and we can just
5	/// lean on the standard library for all our UTF-8 needs. However, to support
6	/// byte based regexes (that can match on arbitrary bytes which may contain
7	/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8	/// The standard library doesn't really recognize this use case, so we have
9	/// to build it out ourselves.
10	///
11	/// Should this be factored out into a separate crate? It seems independently
12	/// useful. There are other crates that already exist (e.g., `utf-8`) that have
13	/// overlapping use cases. Not sure what to do.
14	use std::char;
15
16	const TAG_CONT: u8 = `0b1000_0000`;
17	const TAG_TWO: u8 = `0b1100_0000`;
18	const TAG_THREE: u8 = `0b1110_0000`;
19	const TAG_FOUR: u8 = `0b1111_0000`;
20
21	/// Returns the smallest possible index of the next valid UTF-8 sequence
22	/// starting after `i`.
23	pub fn next_utf8(text: &[u8], i: usize) -> usize {
24	let b: u8 = match text.get(index:i) {
25	None => return i + `1`,
26	Some(&b: u8) => b,
27	};
28	let inc: usize = if b <= `0x7F` {
29	`1`
30	} else if b <= `0b110_11111` {
31	`2`
32	} else if b <= `0b1110_1111` {
33	`3`
34	} else {
35	`4`
36	};
37	i + inc
38	}
39
40	/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
41	///
42	/// If no valid UTF-8 sequence could be found, then `None` is returned.
43	/// Otherwise, the decoded codepoint and the number of bytes read is returned.
44	/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
45	/// 1, 2, 3 or 4.
46	///
47	/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
48	/// codepoint that is out of range (surrogate codepoints are out of range) or
49	/// is not the shortest possible UTF-8 sequence for that codepoint.
50	#[inline]
51	pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52	let b0 = match src.get(`0`) {
53	None => return None,
54	Some(&b) if b <= `0x7F` => return Some((b as char, `1`)),
55	Some(&b) => b,
56	};
57	match b0 {
58	`0b110_00000`..=`0b110_11111` => {
59	if src.len() < `2` {
60	return None;
61	}
62	let b1 = src[`1`];
63	if `0b11_000000` & b1 != TAG_CONT {
64	return None;
65	}
66	let cp = ((b0 & !TAG_TWO) as u32) << `6` \| ((b1 & !TAG_CONT) as u32);
67	match cp {
68	`0x80`..=`0x7FF` => char::from_u32(cp).map(\|cp\| (cp, `2`)),
69	_ => None,
70	}
71	}
72	`0b1110_0000`..=`0b1110_1111` => {
73	if src.len() < `3` {
74	return None;
75	}
76	let (b1, b2) = (src[`1`], src[`2`]);
77	if `0b11_000000` & b1 != TAG_CONT {
78	return None;
79	}
80	if `0b11_000000` & b2 != TAG_CONT {
81	return None;
82	}
83	let cp = ((b0 & !TAG_THREE) as u32) << `12`
84	\| ((b1 & !TAG_CONT) as u32) << `6`
85	\| ((b2 & !TAG_CONT) as u32);
86	match cp {
87	// char::from_u32 will disallow surrogate codepoints.
88	`0x800`..=`0xFFFF` => char::from_u32(cp).map(\|cp\| (cp, `3`)),
89	_ => None,
90	}
91	}
92	`0b11110_000`..=`0b11110_111` => {
93	if src.len() < `4` {
94	return None;
95	}
96	let (b1, b2, b3) = (src[`1`], src[`2`], src[`3`]);
97	if `0b11_000000` & b1 != TAG_CONT {
98	return None;
99	}
100	if `0b11_000000` & b2 != TAG_CONT {
101	return None;
102	}
103	if `0b11_000000` & b3 != TAG_CONT {
104	return None;
105	}
106	let cp = ((b0 & !TAG_FOUR) as u32) << `18`
107	\| ((b1 & !TAG_CONT) as u32) << `12`
108	\| ((b2 & !TAG_CONT) as u32) << `6`
109	\| ((b3 & !TAG_CONT) as u32);
110	match cp {
111	`0x10000`..=`0x0010_FFFF` => char::from_u32(cp).map(\|cp\| (cp, `4`)),
112	_ => None,
113	}
114	}
115	_ => None,
116	}
117	}
118
119	/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
120	/// of the first.
121	pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
122	if src.is_empty() {
123	return None;
124	}
125	let mut start: usize = src.len() - `1`;
126	if src[start] <= `0x7F` {
127	return Some((src[start] as char, `1`));
128	}
129	while start > src.len().saturating_sub(`4`) {
130	start -= `1`;
131	if is_start_byte(src[start]) {
132	break;
133	}
134	}
135	match decode_utf8(&src[start..]) {
136	None => None,
137	Some((_, n: usize)) if n < src.len() - start => None,
138	Some((cp: char, n: usize)) => Some((cp, n)),
139	}
140	}
141
142	fn is_start_byte(b: u8) -> bool {
143	b & `0b11_000000` != `0b1_0000000`
144	}
145
146	#[cfg(test)]
147	mod tests {
148	use std::str;
149
150	use quickcheck::quickcheck;
151
152	use super::{
153	decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
154	};
155
156	#[test]
157	fn prop_roundtrip() {
158	fn p(given_cp: char) -> bool {
159	let mut tmp = [`0`; `4`];
160	let encoded_len = given_cp.encode_utf8(&mut tmp).len();
161	let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
162	encoded_len == got_len && given_cp == got_cp
163	}
164	quickcheck(p as fn(char) -> bool)
165	}
166
167	#[test]
168	fn prop_roundtrip_last() {
169	fn p(given_cp: char) -> bool {
170	let mut tmp = [`0`; `4`];
171	let encoded_len = given_cp.encode_utf8(&mut tmp).len();
172	let (got_cp, got_len) =
173	decode_last_utf8(&tmp[..encoded_len]).unwrap();
174	encoded_len == got_len && given_cp == got_cp
175	}
176	quickcheck(p as fn(char) -> bool)
177	}
178
179	#[test]
180	fn prop_encode_matches_std() {
181	fn p(cp: char) -> bool {
182	let mut got = [`0`; `4`];
183	let n = cp.encode_utf8(&mut got).len();
184	let expected = cp.to_string();
185	&got[..n] == expected.as_bytes()
186	}
187	quickcheck(p as fn(char) -> bool)
188	}
189
190	#[test]
191	fn prop_decode_matches_std() {
192	fn p(given_cp: char) -> bool {
193	let mut tmp = [`0`; `4`];
194	let n = given_cp.encode_utf8(&mut tmp).len();
195	let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
196	let expected_cp =
197	str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
198	got_cp == expected_cp
199	}
200	quickcheck(p as fn(char) -> bool)
201	}
202
203	#[test]
204	fn prop_decode_last_matches_std() {
205	fn p(given_cp: char) -> bool {
206	let mut tmp = [`0`; `4`];
207	let n = given_cp.encode_utf8(&mut tmp).len();
208	let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
209	let expected_cp = str::from_utf8(&tmp[..n])
210	.unwrap()
211	.chars()
212	.rev()
213	.next()
214	.unwrap();
215	got_cp == expected_cp
216	}
217	quickcheck(p as fn(char) -> bool)
218	}
219
220	#[test]
221	fn reject_invalid() {
222	// Invalid start byte
223	assert_eq!(decode_utf8(&[`0xFF`]), None);
224	// Surrogate pair
225	assert_eq!(decode_utf8(&[`0xED`, `0xA0`, `0x81`]), None);
226	// Invalid continuation byte.
227	assert_eq!(decode_utf8(&[`0xD4`, `0xC2`]), None);
228	// Bad lengths
229	assert_eq!(decode_utf8(&[`0xC3`]), None); // 2 bytes
230	assert_eq!(decode_utf8(&[`0xEF`, `0xBF`]), None); // 3 bytes
231	assert_eq!(decode_utf8(&[`0xF4`, `0x8F`, `0xBF`]), None); // 4 bytes
232	// Not a minimal UTF-8 sequence
233	assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT \| b'a']), None);
234	assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT \| b'a']), None);
235	assert_eq!(
236	decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT \| b'a',]),
237	None
238	);
239	}
240
241	#[test]
242	fn reject_invalid_last() {
243	// Invalid start byte
244	assert_eq!(decode_last_utf8(&[`0xFF`]), None);
245	// Surrogate pair
246	assert_eq!(decode_last_utf8(&[`0xED`, `0xA0`, `0x81`]), None);
247	// Bad lengths
248	assert_eq!(decode_last_utf8(&[`0xC3`]), None); // 2 bytes
249	assert_eq!(decode_last_utf8(&[`0xEF`, `0xBF`]), None); // 3 bytes
250	assert_eq!(decode_last_utf8(&[`0xF4`, `0x8F`, `0xBF`]), None); // 4 bytes
251	// Not a minimal UTF-8 sequence
252	assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT \| b'a']), None);
253	assert_eq!(
254	decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT \| b'a',]),
255	None
256	);
257	assert_eq!(
258	decode_last_utf8(
259	&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT \| b'a',]
260	),
261	None
262	);
263	}
264	}
265