utf8.rs source code [crates/regex-syntax-0.6.29/src/utf8.rs]

1	/!*
2	Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4	This is sub-module is useful for constructing byte based automatons that need
5	to embed UTF-8 decoding. The most common use of this module is in conjunction
6	with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.
7
8	See the documentation on the `Utf8Sequences` iterator for more details and
9	an example.
10
11	# Wait, what is this?
12
13	This is simplest to explain with an example. Let's say you wanted to test
14	whether a particular byte sequence was a Cyrillic character. One possible
15	scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16	range can be expressed as a sequence of byte ranges:
17
18	```text
19	[D0-D3][80-BF]
20	```
21
22	This is simple enough: simply encode the boundaries, `0400` encodes to
23	`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24	corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26	However, what if you wanted to add the Cyrillic Supplementary characters to
27	your range? Your range might then become `[0400-052F]`. The same procedure
28	as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29	you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30	this isn't quite correct because this range doesn't capture many characters,
31	for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33	Instead, you need multiple sequences of byte ranges:
34
35	```text
36	[D0-D3][80-BF] # matches codepoints 0400-04FF
37	[D4][80-AF] # matches codepoints 0500-052F
38	```
39
40	This gets even more complicated if you want bigger ranges, particularly if
41	they naively contain surrogate codepoints. For example, the sequence of byte
42	ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44	```text
45	[0-7F]
46	[C2-DF][80-BF]
47	[E0][A0-BF][80-BF]
48	[E1-EC][80-BF][80-BF]
49	[ED][80-9F][80-BF]
50	[EE-EF][80-BF][80-BF]
51	```
52
53	Note that the byte ranges above will not* match any erroneous encoding of*
54	UTF-8, including encodings of surrogate codepoints.
55
56	And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58	```text
59	[0-7F]
60	[C2-DF][80-BF]
61	[E0][A0-BF][80-BF]
62	[E1-EC][80-BF][80-BF]
63	[ED][80-9F][80-BF]
64	[EE-EF][80-BF][80-BF]
65	[F0][90-BF][80-BF][80-BF]
66	[F1-F3][80-BF][80-BF][80-BF]
67	[F4][80-8F][80-BF][80-BF]
68	```
69
70	This module automates the process of creating these byte ranges from ranges of
71	Unicode scalar values.
72
73	# Lineage
74
75	I got the idea and general implementation strategy from Russ Cox in his
76	[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77	Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78	I also got the idea from
79	[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80	which uses it for executing automata on their term index.
81	*/
82
83	#![deny(missing_docs)]
84
85	use std::char;
86	use std::fmt;
87	use std::iter::FusedIterator;
88	use std::slice;
89
90	const MAX_UTF8_BYTES: usize = `4`;
91
92	/// Utf8Sequence represents a sequence of byte ranges.
93	///
94	/// To match a Utf8Sequence, a candidate byte sequence must match each
95	/// successive range.
96	///
97	/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
98	/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
99	#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
100	pub enum Utf8Sequence {
101	/// One byte range.
102	One(Utf8Range),
103	/// Two successive byte ranges.
104	Two([Utf8Range; `2`]),
105	/// Three successive byte ranges.
106	Three([Utf8Range; `3`]),
107	/// Four successive byte ranges.
108	Four([Utf8Range; `4`]),
109	}
110
111	impl Utf8Sequence {
112	/// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
113	/// range.
114	///
115	/// This assumes that `start` and `end` have the same length.
116	fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
117	assert_eq!(start.len(), end.len());
118	match start.len() {
119	`2` => Utf8Sequence::Two([
120	Utf8Range::new(start[`0`], end[`0`]),
121	Utf8Range::new(start[`1`], end[`1`]),
122	]),
123	`3` => Utf8Sequence::Three([
124	Utf8Range::new(start[`0`], end[`0`]),
125	Utf8Range::new(start[`1`], end[`1`]),
126	Utf8Range::new(start[`2`], end[`2`]),
127	]),
128	`4` => Utf8Sequence::Four([
129	Utf8Range::new(start[`0`], end[`0`]),
130	Utf8Range::new(start[`1`], end[`1`]),
131	Utf8Range::new(start[`2`], end[`2`]),
132	Utf8Range::new(start[`3`], end[`3`]),
133	]),
134	n => unreachable!("invalid encoded length: {}", n),
135	}
136	}
137
138	/// Returns the underlying sequence of byte ranges as a slice.
139	pub fn as_slice(&self) -> &[Utf8Range] {
140	use self::Utf8Sequence::*;
141	match *self {
142	One(ref r) => slice::from_ref(r),
143	Two(ref r) => &r[..],
144	Three(ref r) => &r[..],
145	Four(ref r) => &r[..],
146	}
147	}
148
149	/// Returns the number of byte ranges in this sequence.
150	///
151	/// The length is guaranteed to be in the closed interval `[1, 4]`.
152	pub fn len(&self) -> usize {
153	self.as_slice().len()
154	}
155
156	/// Reverses the ranges in this sequence.
157	///
158	/// For example, if this corresponds to the following sequence:
159	///
160	/// ```text
161	/// [D0-D3][80-BF]
162	/// ```
163	///
164	/// Then after reversal, it will be
165	///
166	/// ```text
167	/// [80-BF][D0-D3]
168	/// ```
169	///
170	/// This is useful when one is constructing a UTF-8 automaton to match
171	/// character classes in reverse.
172	pub fn reverse(&mut self) {
173	match *self {
174	Utf8Sequence::One(_) => {}
175	Utf8Sequence::Two(ref mut x) => x.reverse(),
176	Utf8Sequence::Three(ref mut x) => x.reverse(),
177	Utf8Sequence::Four(ref mut x) => x.reverse(),
178	}
179	}
180
181	/// Returns true if and only if a prefix of `bytes` matches this sequence
182	/// of byte ranges.
183	pub fn matches(&self, bytes: &[u8]) -> bool {
184	if bytes.len() < self.len() {
185	return `false`;
186	}
187	for (&b, r) in bytes.iter().zip(self) {
188	if !r.matches(b) {
189	return `false`;
190	}
191	}
192	`true`
193	}
194	}
195
196	impl<'a> IntoIterator for &'a Utf8Sequence {
197	type IntoIter = slice::Iter<'a, Utf8Range>;
198	type Item = &'a Utf8Range;
199
200	fn into_iter(self) -> Self::IntoIter {
201	self.as_slice().iter()
202	}
203	}
204
205	impl fmt::Debug for Utf8Sequence {
206	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
207	use self::Utf8Sequence::*;
208	match *self {
209	One(ref r: &Utf8Range) => write!(f, "{:?}", r),
210	Two(ref r: &[Utf8Range; 2]) => write!(f, "{:?}{:?}", r[`0`], r[`1`]),
211	Three(ref r: &[Utf8Range; 3]) => write!(f, "{:?}{:?}{:?}", r[`0`], r[`1`], r[`2`]),
212	Four(ref r: &[Utf8Range; 4]) => {
213	write!(f, "{:?}{:?}{:?}{:?}", r[`0`], r[`1`], r[`2`], r[`3`])
214	}
215	}
216	}
217	}
218
219	/// A single inclusive range of UTF-8 bytes.
220	#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
221	pub struct Utf8Range {
222	/// Start of byte range (inclusive).
223	pub start: u8,
224	/// End of byte range (inclusive).
225	pub end: u8,
226	}
227
228	impl Utf8Range {
229	fn new(start: u8, end: u8) -> Self {
230	Utf8Range { start, end }
231	}
232
233	/// Returns true if and only if the given byte is in this range.
234	pub fn matches(&self, b: u8) -> bool {
235	self.start <= b && b <= self.end
236	}
237	}
238
239	impl fmt::Debug for Utf8Range {
240	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
241	if self.start == self.end {
242	write!(f, "[{:X}]", self.start)
243	} else {
244	write!(f, "[{:X}-{:X}]", self.start, self.end)
245	}
246	}
247	}
248
249	/// An iterator over ranges of matching UTF-8 byte sequences.
250	///
251	/// The iteration represents an alternation of comprehensive byte sequences
252	/// that match precisely the set of UTF-8 encoded scalar values.
253	///
254	/// A byte sequence corresponds to one of the scalar values in the range given
255	/// if and only if it completely matches exactly one of the sequences of byte
256	/// ranges produced by this iterator.
257	///
258	/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
259	/// sequences will match the same bytes.
260	///
261	/// # Example
262	///
263	/// This shows how to match an arbitrary byte sequence against a range of
264	/// scalar values.
265	///
266	/// ```rust
267	/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
268	///
269	/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
270	/// for range in seqs {
271	/// if range.matches(bytes) {
272	/// return `true`;
273	/// }
274	/// }
275	/// `false`
276	/// }
277	///
278	/// // Test the basic multilingual plane.
279	/// let seqs: Vec<_> = Utf8Sequences::new('`\u{0}`', '`\u{FFFF}`').collect();
280	///
281	/// // UTF-8 encoding of 'a'.
282	/// assert!(matches(&seqs, &[`0x61`]));
283	/// // UTF-8 encoding of '☃' (`\u{2603}`).
284	/// assert!(matches(&seqs, &[`0xE2`, `0x98`, `0x83`]));
285	/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
286	/// assert!(!matches(&seqs, &[`0xF0`, `0x90`, `0x8D`, `0x88`]));
287	/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
288	/// // which is invalid UTF-8, and therefore fails, despite the fact that
289	/// // the corresponding codepoint (0xD800) falls in the range given.
290	/// assert!(!matches(&seqs, &[`0xED`, `0xA0`, `0x80`]));
291	/// // And fails against plain old invalid UTF-8.
292	/// assert!(!matches(&seqs, &[`0xFF`, `0xFF`]));
293	/// ```
294	///
295	/// If this example seems circuitous, that's because it is! It's meant to be
296	/// illustrative. In practice, you could just try to decode your byte sequence
297	/// and compare it with the scalar value range directly. However, this is not
298	/// always possible (for example, in a byte based automaton).
299	#[derive(Debug)]
300	pub struct Utf8Sequences {
301	range_stack: Vec<ScalarRange>,
302	}
303
304	impl Utf8Sequences {
305	/// Create a new iterator over UTF-8 byte ranges for the scalar value range
306	/// given.
307	pub fn new(start: char, end: char) -> Self {
308	let mut it: Utf8Sequences = Utf8Sequences { range_stack: vec![] };
309	it.push(start as u32, end as u32);
310	it
311	}
312
313	/// reset resets the scalar value range.
314	/// Any existing state is cleared, but resources may be reused.
315	///
316	/// N.B. Benchmarks say that this method is dubious.
317	#[doc(hidden)]
318	pub fn reset(&mut self, start: char, end: char) {
319	self.range_stack.clear();
320	self.push(start as u32, end as u32);
321	}
322
323	fn push(&mut self, start: u32, end: u32) {
324	self.range_stack.push(ScalarRange { start, end });
325	}
326	}
327
328	struct ScalarRange {
329	start: u32,
330	end: u32,
331	}
332
333	impl fmt::Debug for ScalarRange {
334	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
335	write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
336	}
337	}
338
339	impl Iterator for Utf8Sequences {
340	type Item = Utf8Sequence;
341
342	fn next(&mut self) -> Option<Self::Item> {
343	'TOP: while let Some(mut r) = self.range_stack.pop() {
344	'INNER: loop {
345	if let Some((r1, r2)) = r.split() {
346	self.push(r2.start, r2.end);
347	r.start = r1.start;
348	r.end = r1.end;
349	continue 'INNER;
350	}
351	if !r.is_valid() {
352	continue 'TOP;
353	}
354	for i in `1`..MAX_UTF8_BYTES {
355	let max = max_scalar_value(i);
356	if r.start <= max && max < r.end {
357	self.push(max + `1`, r.end);
358	r.end = max;
359	continue 'INNER;
360	}
361	}
362	if let Some(ascii_range) = r.as_ascii() {
363	return Some(Utf8Sequence::One(ascii_range));
364	}
365	for i in `1`..MAX_UTF8_BYTES {
366	let m = (`1` << (`6` * i)) - `1`;
367	if (r.start & !m) != (r.end & !m) {
368	if (r.start & m) != `0` {
369	self.push((r.start \| m) + `1`, r.end);
370	r.end = r.start \| m;
371	continue 'INNER;
372	}
373	if (r.end & m) != m {
374	self.push(r.end & !m, r.end);
375	r.end = (r.end & !m) - `1`;
376	continue 'INNER;
377	}
378	}
379	}
380	let mut start = [`0`; MAX_UTF8_BYTES];
381	let mut end = [`0`; MAX_UTF8_BYTES];
382	let n = r.encode(&mut start, &mut end);
383	return Some(Utf8Sequence::from_encoded_range(
384	&start[`0`..n],
385	&end[`0`..n],
386	));
387	}
388	}
389	None
390	}
391	}
392
393	impl FusedIterator for Utf8Sequences {}
394
395	impl ScalarRange {
396	/// split splits this range if it overlaps with a surrogate codepoint.
397	///
398	/// Either or both ranges may be invalid.
399	fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
400	if self.start < `0xE000` && self.end > `0xD7FF` {
401	Some((
402	ScalarRange { start: self.start, end: `0xD7FF` },
403	ScalarRange { start: `0xE000`, end: self.end },
404	))
405	} else {
406	None
407	}
408	}
409
410	/// is_valid returns true if and only if start <= end.
411	fn is_valid(&self) -> bool {
412	self.start <= self.end
413	}
414
415	/// as_ascii returns this range as a Utf8Range if and only if all scalar
416	/// values in this range can be encoded as a single byte.
417	fn as_ascii(&self) -> Option<Utf8Range> {
418	if self.is_ascii() {
419	Some(Utf8Range::new(self.start as u8, self.end as u8))
420	} else {
421	None
422	}
423	}
424
425	/// is_ascii returns true if the range is ASCII only (i.e., takes a single
426	/// byte to encode any scalar value).
427	fn is_ascii(&self) -> bool {
428	self.is_valid() && self.end <= `0x7f`
429	}
430
431	/// encode writes the UTF-8 encoding of the start and end of this range
432	/// to the corresponding destination slices, and returns the number of
433	/// bytes written.
434	///
435	/// The slices should have room for at least `MAX_UTF8_BYTES`.
436	fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
437	let cs = char::from_u32(self.start).unwrap();
438	let ce = char::from_u32(self.end).unwrap();
439	let ss = cs.encode_utf8(start);
440	let se = ce.encode_utf8(end);
441	assert_eq!(ss.len(), se.len());
442	ss.len()
443	}
444	}
445
446	fn max_scalar_value(nbytes: usize) -> u32 {
447	match nbytes {
448	`1` => `0x007F`,
449	`2` => `0x07FF`,
450	`3` => `0xFFFF`,
451	`4` => `0x0010_FFFF`,
452	_ => unreachable!("invalid UTF-8 byte sequence size"),
453	}
454	}
455
456	#[cfg(test)]
457	mod tests {
458	use std::char;
459
460	use crate::utf8::{Utf8Range, Utf8Sequences};
461
462	fn rutf8(s: u8, e: u8) -> Utf8Range {
463	Utf8Range::new(s, e)
464	}
465
466	fn never_accepts_surrogate_codepoints(start: char, end: char) {
467	for cp in `0xD800`..`0xE000` {
468	let buf = encode_surrogate(cp);
469	for r in Utf8Sequences::new(start, end) {
470	if r.matches(&buf) {
471	panic!(
472	"Sequence ({:X}, {:X}) contains range {:?}, \
473	which matches surrogate code point {:X} \
474	with encoded bytes {:?}",
475	start as u32, end as u32, r, cp, buf,
476	);
477	}
478	}
479	}
480	}
481
482	#[test]
483	fn codepoints_no_surrogates() {
484	never_accepts_surrogate_codepoints('`\u{0}`', '`\u{FFFF}`');
485	never_accepts_surrogate_codepoints('`\u{0}`', '`\u{10FFFF}`');
486	never_accepts_surrogate_codepoints('`\u{0}`', '`\u{10FFFE}`');
487	never_accepts_surrogate_codepoints('`\u{80}`', '`\u{10FFFF}`');
488	never_accepts_surrogate_codepoints('`\u{D7FF}`', '`\u{E000}`');
489	}
490
491	#[test]
492	fn single_codepoint_one_sequence() {
493	// Tests that every range of scalar values that contains a single
494	// scalar value is recognized by one sequence of byte ranges.
495	for i in `0x0`..=`0x0010_FFFF` {
496	let c = match char::from_u32(i) {
497	None => continue,
498	Some(c) => c,
499	};
500	let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
501	assert_eq!(seqs.len(), `1`);
502	}
503	}
504
505	#[test]
506	fn bmp() {
507	use crate::utf8::Utf8Sequence::*;
508
509	let seqs = Utf8Sequences::new('`\u{0}`', '`\u{FFFF}`').collect::<Vec<_>>();
510	assert_eq!(
511	seqs,
512	vec![
513	One(rutf8(`0x0`, `0x7F`)),
514	Two([rutf8(`0xC2`, `0xDF`), rutf8(`0x80`, `0xBF`)]),
515	Three([
516	rutf8(`0xE0`, `0xE0`),
517	rutf8(`0xA0`, `0xBF`),
518	rutf8(`0x80`, `0xBF`)
519	]),
520	Three([
521	rutf8(`0xE1`, `0xEC`),
522	rutf8(`0x80`, `0xBF`),
523	rutf8(`0x80`, `0xBF`)
524	]),
525	Three([
526	rutf8(`0xED`, `0xED`),
527	rutf8(`0x80`, `0x9F`),
528	rutf8(`0x80`, `0xBF`)
529	]),
530	Three([
531	rutf8(`0xEE`, `0xEF`),
532	rutf8(`0x80`, `0xBF`),
533	rutf8(`0x80`, `0xBF`)
534	]),
535	]
536	);
537	}
538
539	#[test]
540	fn reverse() {
541	use crate::utf8::Utf8Sequence::*;
542
543	let mut s = One(rutf8(`0xA`, `0xB`));
544	s.reverse();
545	assert_eq!(s.as_slice(), &[rutf8(`0xA`, `0xB`)]);
546
547	let mut s = Two([rutf8(`0xA`, `0xB`), rutf8(`0xB`, `0xC`)]);
548	s.reverse();
549	assert_eq!(s.as_slice(), &[rutf8(`0xB`, `0xC`), rutf8(`0xA`, `0xB`)]);
550
551	let mut s = Three([rutf8(`0xA`, `0xB`), rutf8(`0xB`, `0xC`), rutf8(`0xC`, `0xD`)]);
552	s.reverse();
553	assert_eq!(
554	s.as_slice(),
555	&[rutf8(`0xC`, `0xD`), rutf8(`0xB`, `0xC`), rutf8(`0xA`, `0xB`)]
556	);
557
558	let mut s = Four([
559	rutf8(`0xA`, `0xB`),
560	rutf8(`0xB`, `0xC`),
561	rutf8(`0xC`, `0xD`),
562	rutf8(`0xD`, `0xE`),
563	]);
564	s.reverse();
565	assert_eq!(
566	s.as_slice(),
567	&[
568	rutf8(`0xD`, `0xE`),
569	rutf8(`0xC`, `0xD`),
570	rutf8(`0xB`, `0xC`),
571	rutf8(`0xA`, `0xB`)
572	]
573	);
574	}
575
576	fn encode_surrogate(cp: u32) -> [u8; `3`] {
577	const TAG_CONT: u8 = `0b1000_0000`;
578	const TAG_THREE_B: u8 = `0b1110_0000`;
579
580	assert!(`0xD800` <= cp && cp < `0xE000`);
581	let mut dst = [`0`; `3`];
582	dst[`0`] = (cp >> `12` & `0x0F`) as u8 \| TAG_THREE_B;
583	dst[`1`] = (cp >> `6` & `0x3F`) as u8 \| TAG_CONT;
584	dst[`2`] = (cp & `0x3F`) as u8 \| TAG_CONT;
585	dst
586	}
587	}
588