utf8.rs source code [crates/regex-syntax/src/utf8.rs]

1	/!*
2	Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4	This is sub-module is useful for constructing byte based automatons that need
5	to embed UTF-8 decoding. The most common use of this module is in conjunction
6	with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type.
7
8	See the documentation on the `Utf8Sequences` iterator for more details and
9	an example.
10
11	# Wait, what is this?
12
13	This is simplest to explain with an example. Let's say you wanted to test
14	whether a particular byte sequence was a Cyrillic character. One possible
15	scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16	range can be expressed as a sequence of byte ranges:
17
18	```text
19	[D0-D3][80-BF]
20	```
21
22	This is simple enough: simply encode the boundaries, `0400` encodes to
23	`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24	corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26	However, what if you wanted to add the Cyrillic Supplementary characters to
27	your range? Your range might then become `[0400-052F]`. The same procedure
28	as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29	you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30	this isn't quite correct because this range doesn't capture many characters,
31	for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33	Instead, you need multiple sequences of byte ranges:
34
35	```text
36	[D0-D3][80-BF] # matches codepoints 0400-04FF
37	[D4][80-AF] # matches codepoints 0500-052F
38	```
39
40	This gets even more complicated if you want bigger ranges, particularly if
41	they naively contain surrogate codepoints. For example, the sequence of byte
42	ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44	```text
45	[0-7F]
46	[C2-DF][80-BF]
47	[E0][A0-BF][80-BF]
48	[E1-EC][80-BF][80-BF]
49	[ED][80-9F][80-BF]
50	[EE-EF][80-BF][80-BF]
51	```
52
53	Note that the byte ranges above will not* match any erroneous encoding of*
54	UTF-8, including encodings of surrogate codepoints.
55
56	And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58	```text
59	[0-7F]
60	[C2-DF][80-BF]
61	[E0][A0-BF][80-BF]
62	[E1-EC][80-BF][80-BF]
63	[ED][80-9F][80-BF]
64	[EE-EF][80-BF][80-BF]
65	[F0][90-BF][80-BF][80-BF]
66	[F1-F3][80-BF][80-BF][80-BF]
67	[F4][80-8F][80-BF][80-BF]
68	```
69
70	This module automates the process of creating these byte ranges from ranges of
71	Unicode scalar values.
72
73	# Lineage
74
75	I got the idea and general implementation strategy from Russ Cox in his
76	[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77	Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78	I also got the idea from
79	[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80	which uses it for executing automata on their term index.
81	*/
82
83	use core::{char, fmt, iter::FusedIterator, slice};
84
85	use alloc::{vec, vec::Vec};
86
87	const MAX_UTF8_BYTES: usize = `4`;
88
89	/// Utf8Sequence represents a sequence of byte ranges.
90	///
91	/// To match a Utf8Sequence, a candidate byte sequence must match each
92	/// successive range.
93	///
94	/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
95	/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
96	#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
97	pub enum Utf8Sequence {
98	/// One byte range.
99	One(Utf8Range),
100	/// Two successive byte ranges.
101	Two([Utf8Range; `2`]),
102	/// Three successive byte ranges.
103	Three([Utf8Range; `3`]),
104	/// Four successive byte ranges.
105	Four([Utf8Range; `4`]),
106	}
107
108	impl Utf8Sequence {
109	/// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
110	/// range.
111	///
112	/// This assumes that `start` and `end` have the same length.
113	fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
114	assert_eq!(start.len(), end.len());
115	match start.len() {
116	`2` => Utf8Sequence::Two([
117	Utf8Range::new(start[`0`], end[`0`]),
118	Utf8Range::new(start[`1`], end[`1`]),
119	]),
120	`3` => Utf8Sequence::Three([
121	Utf8Range::new(start[`0`], end[`0`]),
122	Utf8Range::new(start[`1`], end[`1`]),
123	Utf8Range::new(start[`2`], end[`2`]),
124	]),
125	`4` => Utf8Sequence::Four([
126	Utf8Range::new(start[`0`], end[`0`]),
127	Utf8Range::new(start[`1`], end[`1`]),
128	Utf8Range::new(start[`2`], end[`2`]),
129	Utf8Range::new(start[`3`], end[`3`]),
130	]),
131	n => unreachable!("invalid encoded length: {}", n),
132	}
133	}
134
135	/// Returns the underlying sequence of byte ranges as a slice.
136	pub fn as_slice(&self) -> &[Utf8Range] {
137	use self::Utf8Sequence::*;
138	match *self {
139	One(ref r) => slice::from_ref(r),
140	Two(ref r) => &r[..],
141	Three(ref r) => &r[..],
142	Four(ref r) => &r[..],
143	}
144	}
145
146	/// Returns the number of byte ranges in this sequence.
147	///
148	/// The length is guaranteed to be in the closed interval `[1, 4]`.
149	pub fn len(&self) -> usize {
150	self.as_slice().len()
151	}
152
153	/// Reverses the ranges in this sequence.
154	///
155	/// For example, if this corresponds to the following sequence:
156	///
157	/// ```text
158	/// [D0-D3][80-BF]
159	/// ```
160	///
161	/// Then after reversal, it will be
162	///
163	/// ```text
164	/// [80-BF][D0-D3]
165	/// ```
166	///
167	/// This is useful when one is constructing a UTF-8 automaton to match
168	/// character classes in reverse.
169	pub fn reverse(&mut self) {
170	match *self {
171	Utf8Sequence::One(_) => {}
172	Utf8Sequence::Two(ref mut x) => x.reverse(),
173	Utf8Sequence::Three(ref mut x) => x.reverse(),
174	Utf8Sequence::Four(ref mut x) => x.reverse(),
175	}
176	}
177
178	/// Returns true if and only if a prefix of `bytes` matches this sequence
179	/// of byte ranges.
180	pub fn matches(&self, bytes: &[u8]) -> bool {
181	if bytes.len() < self.len() {
182	return `false`;
183	}
184	for (&b, r) in bytes.iter().zip(self) {
185	if !r.matches(b) {
186	return `false`;
187	}
188	}
189	`true`
190	}
191	}
192
193	impl<'a> IntoIterator for &'a Utf8Sequence {
194	type IntoIter = slice::Iter<'a, Utf8Range>;
195	type Item = &'a Utf8Range;
196
197	fn into_iter(self) -> Self::IntoIter {
198	self.as_slice().iter()
199	}
200	}
201
202	impl fmt::Debug for Utf8Sequence {
203	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204	use self::Utf8Sequence::*;
205	match *self {
206	One(ref r: &Utf8Range) => write!(f, "{:?}", r),
207	Two(ref r: &[Utf8Range; 2]) => write!(f, "{:?}{:?}", r[`0`], r[`1`]),
208	Three(ref r: &[Utf8Range; 3]) => write!(f, "{:?}{:?}{:?}", r[`0`], r[`1`], r[`2`]),
209	Four(ref r: &[Utf8Range; 4]) => {
210	write!(f, "{:?}{:?}{:?}{:?}", r[`0`], r[`1`], r[`2`], r[`3`])
211	}
212	}
213	}
214	}
215
216	/// A single inclusive range of UTF-8 bytes.
217	#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
218	pub struct Utf8Range {
219	/// Start of byte range (inclusive).
220	pub start: u8,
221	/// End of byte range (inclusive).
222	pub end: u8,
223	}
224
225	impl Utf8Range {
226	fn new(start: u8, end: u8) -> Self {
227	Utf8Range { start, end }
228	}
229
230	/// Returns true if and only if the given byte is in this range.
231	pub fn matches(&self, b: u8) -> bool {
232	self.start <= b && b <= self.end
233	}
234	}
235
236	impl fmt::Debug for Utf8Range {
237	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
238	if self.start == self.end {
239	write!(f, "[{:X}]", self.start)
240	} else {
241	write!(f, "[{:X}-{:X}]", self.start, self.end)
242	}
243	}
244	}
245
246	/// An iterator over ranges of matching UTF-8 byte sequences.
247	///
248	/// The iteration represents an alternation of comprehensive byte sequences
249	/// that match precisely the set of UTF-8 encoded scalar values.
250	///
251	/// A byte sequence corresponds to one of the scalar values in the range given
252	/// if and only if it completely matches exactly one of the sequences of byte
253	/// ranges produced by this iterator.
254	///
255	/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
256	/// sequences will match the same bytes.
257	///
258	/// # Example
259	///
260	/// This shows how to match an arbitrary byte sequence against a range of
261	/// scalar values.
262	///
263	/// ```rust
264	/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
265	///
266	/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
267	/// for range in seqs {
268	/// if range.matches(bytes) {
269	/// return `true`;
270	/// }
271	/// }
272	/// `false`
273	/// }
274	///
275	/// // Test the basic multilingual plane.
276	/// let seqs: Vec<_> = Utf8Sequences::new('`\u{0}`', '`\u{FFFF}`').collect();
277	///
278	/// // UTF-8 encoding of 'a'.
279	/// assert!(matches(&seqs, &[`0x61`]));
280	/// // UTF-8 encoding of '☃' (`\u{2603}`).
281	/// assert!(matches(&seqs, &[`0xE2`, `0x98`, `0x83`]));
282	/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
283	/// assert!(!matches(&seqs, &[`0xF0`, `0x90`, `0x8D`, `0x88`]));
284	/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
285	/// // which is invalid UTF-8, and therefore fails, despite the fact that
286	/// // the corresponding codepoint (0xD800) falls in the range given.
287	/// assert!(!matches(&seqs, &[`0xED`, `0xA0`, `0x80`]));
288	/// // And fails against plain old invalid UTF-8.
289	/// assert!(!matches(&seqs, &[`0xFF`, `0xFF`]));
290	/// ```
291	///
292	/// If this example seems circuitous, that's because it is! It's meant to be
293	/// illustrative. In practice, you could just try to decode your byte sequence
294	/// and compare it with the scalar value range directly. However, this is not
295	/// always possible (for example, in a byte based automaton).
296	#[derive(Debug)]
297	pub struct Utf8Sequences {
298	range_stack: Vec<ScalarRange>,
299	}
300
301	impl Utf8Sequences {
302	/// Create a new iterator over UTF-8 byte ranges for the scalar value range
303	/// given.
304	pub fn new(start: char, end: char) -> Self {
305	let mut it: Utf8Sequences = Utf8Sequences { range_stack: vec![] };
306	it.push(start:u32::from(start), end:u32::from(end));
307	it
308	}
309
310	/// reset resets the scalar value range.
311	/// Any existing state is cleared, but resources may be reused.
312	///
313	/// N.B. Benchmarks say that this method is dubious.
314	#[doc(hidden)]
315	pub fn reset(&mut self, start: char, end: char) {
316	self.range_stack.clear();
317	self.push(start:u32::from(start), end:u32::from(end));
318	}
319
320	fn push(&mut self, start: u32, end: u32) {
321	self.range_stack.push(ScalarRange { start, end });
322	}
323	}
324
325	struct ScalarRange {
326	start: u32,
327	end: u32,
328	}
329
330	impl fmt::Debug for ScalarRange {
331	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332	write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
333	}
334	}
335
336	impl Iterator for Utf8Sequences {
337	type Item = Utf8Sequence;
338
339	fn next(&mut self) -> Option<Self::Item> {
340	'TOP: while let Some(mut r) = self.range_stack.pop() {
341	'INNER: loop {
342	if let Some((r1, r2)) = r.split() {
343	self.push(r2.start, r2.end);
344	r.start = r1.start;
345	r.end = r1.end;
346	continue 'INNER;
347	}
348	if !r.is_valid() {
349	continue 'TOP;
350	}
351	for i in `1`..MAX_UTF8_BYTES {
352	let max = max_scalar_value(i);
353	if r.start <= max && max < r.end {
354	self.push(max + `1`, r.end);
355	r.end = max;
356	continue 'INNER;
357	}
358	}
359	if let Some(ascii_range) = r.as_ascii() {
360	return Some(Utf8Sequence::One(ascii_range));
361	}
362	for i in `1`..MAX_UTF8_BYTES {
363	let m = (`1` << (`6` * i)) - `1`;
364	if (r.start & !m) != (r.end & !m) {
365	if (r.start & m) != `0` {
366	self.push((r.start \| m) + `1`, r.end);
367	r.end = r.start \| m;
368	continue 'INNER;
369	}
370	if (r.end & m) != m {
371	self.push(r.end & !m, r.end);
372	r.end = (r.end & !m) - `1`;
373	continue 'INNER;
374	}
375	}
376	}
377	let mut start = [`0`; MAX_UTF8_BYTES];
378	let mut end = [`0`; MAX_UTF8_BYTES];
379	let n = r.encode(&mut start, &mut end);
380	return Some(Utf8Sequence::from_encoded_range(
381	&start[`0`..n],
382	&end[`0`..n],
383	));
384	}
385	}
386	None
387	}
388	}
389
390	impl FusedIterator for Utf8Sequences {}
391
392	impl ScalarRange {
393	/// split splits this range if it overlaps with a surrogate codepoint.
394	///
395	/// Either or both ranges may be invalid.
396	fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
397	if self.start < `0xE000` && self.end > `0xD7FF` {
398	Some((
399	ScalarRange { start: self.start, end: `0xD7FF` },
400	ScalarRange { start: `0xE000`, end: self.end },
401	))
402	} else {
403	None
404	}
405	}
406
407	/// is_valid returns true if and only if start <= end.
408	fn is_valid(&self) -> bool {
409	self.start <= self.end
410	}
411
412	/// as_ascii returns this range as a Utf8Range if and only if all scalar
413	/// values in this range can be encoded as a single byte.
414	fn as_ascii(&self) -> Option<Utf8Range> {
415	if self.is_ascii() {
416	let start = u8::try_from(self.start).unwrap();
417	let end = u8::try_from(self.end).unwrap();
418	Some(Utf8Range::new(start, end))
419	} else {
420	None
421	}
422	}
423
424	/// is_ascii returns true if the range is ASCII only (i.e., takes a single
425	/// byte to encode any scalar value).
426	fn is_ascii(&self) -> bool {
427	self.is_valid() && self.end <= `0x7f`
428	}
429
430	/// encode writes the UTF-8 encoding of the start and end of this range
431	/// to the corresponding destination slices, and returns the number of
432	/// bytes written.
433	///
434	/// The slices should have room for at least `MAX_UTF8_BYTES`.
435	fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
436	let cs = char::from_u32(self.start).unwrap();
437	let ce = char::from_u32(self.end).unwrap();
438	let ss = cs.encode_utf8(start);
439	let se = ce.encode_utf8(end);
440	assert_eq!(ss.len(), se.len());
441	ss.len()
442	}
443	}
444
445	fn max_scalar_value(nbytes: usize) -> u32 {
446	match nbytes {
447	`1` => `0x007F`,
448	`2` => `0x07FF`,
449	`3` => `0xFFFF`,
450	`4` => `0x0010_FFFF`,
451	_ => unreachable!("invalid UTF-8 byte sequence size"),
452	}
453	}
454
455	#[cfg(test)]
456	mod tests {
457	use core::char;
458
459	use alloc::{vec, vec::Vec};
460
461	use crate::utf8::{Utf8Range, Utf8Sequences};
462
463	fn rutf8(s: u8, e: u8) -> Utf8Range {
464	Utf8Range::new(s, e)
465	}
466
467	fn never_accepts_surrogate_codepoints(start: char, end: char) {
468	for cp in `0xD800`..`0xE000` {
469	let buf = encode_surrogate(cp);
470	for r in Utf8Sequences::new(start, end) {
471	if r.matches(&buf) {
472	panic!(
473	"Sequence ({:X}, {:X}) contains range {:?}, \
474	which matches surrogate code point {:X} \
475	with encoded bytes {:?}",
476	u32::from(start),
477	u32::from(end),
478	r,
479	cp,
480	buf,
481	);
482	}
483	}
484	}
485	}
486
487	#[test]
488	fn codepoints_no_surrogates() {
489	never_accepts_surrogate_codepoints('`\u{0}`', '`\u{FFFF}`');
490	never_accepts_surrogate_codepoints('`\u{0}`', '`\u{10FFFF}`');
491	never_accepts_surrogate_codepoints('`\u{0}`', '`\u{10FFFE}`');
492	never_accepts_surrogate_codepoints('`\u{80}`', '`\u{10FFFF}`');
493	never_accepts_surrogate_codepoints('`\u{D7FF}`', '`\u{E000}`');
494	}
495
496	#[test]
497	fn single_codepoint_one_sequence() {
498	// Tests that every range of scalar values that contains a single
499	// scalar value is recognized by one sequence of byte ranges.
500	for i in `0x0`..=`0x0010_FFFF` {
501	let c = match char::from_u32(i) {
502	None => continue,
503	Some(c) => c,
504	};
505	let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
506	assert_eq!(seqs.len(), `1`);
507	}
508	}
509
510	#[test]
511	fn bmp() {
512	use crate::utf8::Utf8Sequence::*;
513
514	let seqs = Utf8Sequences::new('`\u{0}`', '`\u{FFFF}`').collect::<Vec<_>>();
515	assert_eq!(
516	seqs,
517	vec![
518	One(rutf8(`0x0`, `0x7F`)),
519	Two([rutf8(`0xC2`, `0xDF`), rutf8(`0x80`, `0xBF`)]),
520	Three([
521	rutf8(`0xE0`, `0xE0`),
522	rutf8(`0xA0`, `0xBF`),
523	rutf8(`0x80`, `0xBF`)
524	]),
525	Three([
526	rutf8(`0xE1`, `0xEC`),
527	rutf8(`0x80`, `0xBF`),
528	rutf8(`0x80`, `0xBF`)
529	]),
530	Three([
531	rutf8(`0xED`, `0xED`),
532	rutf8(`0x80`, `0x9F`),
533	rutf8(`0x80`, `0xBF`)
534	]),
535	Three([
536	rutf8(`0xEE`, `0xEF`),
537	rutf8(`0x80`, `0xBF`),
538	rutf8(`0x80`, `0xBF`)
539	]),
540	]
541	);
542	}
543
544	#[test]
545	fn reverse() {
546	use crate::utf8::Utf8Sequence::*;
547
548	let mut s = One(rutf8(`0xA`, `0xB`));
549	s.reverse();
550	assert_eq!(s.as_slice(), &[rutf8(`0xA`, `0xB`)]);
551
552	let mut s = Two([rutf8(`0xA`, `0xB`), rutf8(`0xB`, `0xC`)]);
553	s.reverse();
554	assert_eq!(s.as_slice(), &[rutf8(`0xB`, `0xC`), rutf8(`0xA`, `0xB`)]);
555
556	let mut s = Three([rutf8(`0xA`, `0xB`), rutf8(`0xB`, `0xC`), rutf8(`0xC`, `0xD`)]);
557	s.reverse();
558	assert_eq!(
559	s.as_slice(),
560	&[rutf8(`0xC`, `0xD`), rutf8(`0xB`, `0xC`), rutf8(`0xA`, `0xB`)]
561	);
562
563	let mut s = Four([
564	rutf8(`0xA`, `0xB`),
565	rutf8(`0xB`, `0xC`),
566	rutf8(`0xC`, `0xD`),
567	rutf8(`0xD`, `0xE`),
568	]);
569	s.reverse();
570	assert_eq!(
571	s.as_slice(),
572	&[
573	rutf8(`0xD`, `0xE`),
574	rutf8(`0xC`, `0xD`),
575	rutf8(`0xB`, `0xC`),
576	rutf8(`0xA`, `0xB`)
577	]
578	);
579	}
580
581	fn encode_surrogate(cp: u32) -> [u8; `3`] {
582	const TAG_CONT: u8 = `0b1000_0000`;
583	const TAG_THREE_B: u8 = `0b1110_0000`;
584
585	assert!(`0xD800` <= cp && cp < `0xE000`);
586	let mut dst = [`0`; `3`];
587	dst[`0`] = u8::try_from(cp >> `12` & `0x0F`).unwrap() \| TAG_THREE_B;
588	dst[`1`] = u8::try_from(cp >> `6` & `0x3F`).unwrap() \| TAG_CONT;
589	dst[`2`] = u8::try_from(cp & `0x3F`).unwrap() \| TAG_CONT;
590	dst
591	}
592	}
593