| 1 | /*! | 
| 2 | Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. | 
|---|
| 3 |  | 
|---|
| 4 | This is sub-module is useful for constructing byte based automatons that need | 
|---|
| 5 | to embed UTF-8 decoding. The most common use of this module is in conjunction | 
|---|
| 6 | with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type. | 
|---|
| 7 |  | 
|---|
| 8 | See the documentation on the `Utf8Sequences` iterator for more details and | 
|---|
| 9 | an example. | 
|---|
| 10 |  | 
|---|
| 11 | # Wait, what is this? | 
|---|
| 12 |  | 
|---|
| 13 | This is simplest to explain with an example. Let's say you wanted to test | 
|---|
| 14 | whether a particular byte sequence was a Cyrillic character. One possible | 
|---|
| 15 | scalar value range is `[0400-04FF]`. The set of allowed bytes for this | 
|---|
| 16 | range can be expressed as a sequence of byte ranges: | 
|---|
| 17 |  | 
|---|
| 18 | ```text | 
|---|
| 19 | [D0-D3][80-BF] | 
|---|
| 20 | ``` | 
|---|
| 21 |  | 
|---|
| 22 | This is simple enough: simply encode the boundaries, `0400` encodes to | 
|---|
| 23 | `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each | 
|---|
| 24 | corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. | 
|---|
| 25 |  | 
|---|
| 26 | However, what if you wanted to add the Cyrillic Supplementary characters to | 
|---|
| 27 | your range? Your range might then become `[0400-052F]`. The same procedure | 
|---|
| 28 | as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges | 
|---|
| 29 | you'd get from the previous transformation would be `[D0-D4][80-AF]`. However, | 
|---|
| 30 | this isn't quite correct because this range doesn't capture many characters, | 
|---|
| 31 | for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). | 
|---|
| 32 |  | 
|---|
| 33 | Instead, you need multiple sequences of byte ranges: | 
|---|
| 34 |  | 
|---|
| 35 | ```text | 
|---|
| 36 | [D0-D3][80-BF]  # matches codepoints 0400-04FF | 
|---|
| 37 | [D4][80-AF]     # matches codepoints 0500-052F | 
|---|
| 38 | ``` | 
|---|
| 39 |  | 
|---|
| 40 | This gets even more complicated if you want bigger ranges, particularly if | 
|---|
| 41 | they naively contain surrogate codepoints. For example, the sequence of byte | 
|---|
| 42 | ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: | 
|---|
| 43 |  | 
|---|
| 44 | ```text | 
|---|
| 45 | [0-7F] | 
|---|
| 46 | [C2-DF][80-BF] | 
|---|
| 47 | [E0][A0-BF][80-BF] | 
|---|
| 48 | [E1-EC][80-BF][80-BF] | 
|---|
| 49 | [ED][80-9F][80-BF] | 
|---|
| 50 | [EE-EF][80-BF][80-BF] | 
|---|
| 51 | ``` | 
|---|
| 52 |  | 
|---|
| 53 | Note that the byte ranges above will *not* match any erroneous encoding of | 
|---|
| 54 | UTF-8, including encodings of surrogate codepoints. | 
|---|
| 55 |  | 
|---|
| 56 | And, of course, for all of Unicode (`[000000-10FFFF]`): | 
|---|
| 57 |  | 
|---|
| 58 | ```text | 
|---|
| 59 | [0-7F] | 
|---|
| 60 | [C2-DF][80-BF] | 
|---|
| 61 | [E0][A0-BF][80-BF] | 
|---|
| 62 | [E1-EC][80-BF][80-BF] | 
|---|
| 63 | [ED][80-9F][80-BF] | 
|---|
| 64 | [EE-EF][80-BF][80-BF] | 
|---|
| 65 | [F0][90-BF][80-BF][80-BF] | 
|---|
| 66 | [F1-F3][80-BF][80-BF][80-BF] | 
|---|
| 67 | [F4][80-8F][80-BF][80-BF] | 
|---|
| 68 | ``` | 
|---|
| 69 |  | 
|---|
| 70 | This module automates the process of creating these byte ranges from ranges of | 
|---|
| 71 | Unicode scalar values. | 
|---|
| 72 |  | 
|---|
| 73 | # Lineage | 
|---|
| 74 |  | 
|---|
| 75 | I got the idea and general implementation strategy from Russ Cox in his | 
|---|
| 76 | [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. | 
|---|
| 77 | Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). | 
|---|
| 78 | I also got the idea from | 
|---|
| 79 | [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), | 
|---|
| 80 | which uses it for executing automata on their term index. | 
|---|
| 81 | */ | 
|---|
| 82 |  | 
|---|
| 83 | use core::{char, fmt, iter::FusedIterator, slice}; | 
|---|
| 84 |  | 
|---|
| 85 | use alloc::{vec, vec::Vec}; | 
|---|
| 86 |  | 
|---|
| 87 | const MAX_UTF8_BYTES: usize = 4; | 
|---|
| 88 |  | 
|---|
| 89 | /// Utf8Sequence represents a sequence of byte ranges. | 
|---|
| 90 | /// | 
|---|
| 91 | /// To match a Utf8Sequence, a candidate byte sequence must match each | 
|---|
| 92 | /// successive range. | 
|---|
| 93 | /// | 
|---|
| 94 | /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte | 
|---|
| 95 | /// sequence `\xDD\x61` would not match because `0x61 < 0x80`. | 
|---|
| 96 | #[ derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)] | 
|---|
| 97 | pub enum Utf8Sequence { | 
|---|
| 98 | /// One byte range. | 
|---|
| 99 | One(Utf8Range), | 
|---|
| 100 | /// Two successive byte ranges. | 
|---|
| 101 | Two([Utf8Range; 2]), | 
|---|
| 102 | /// Three successive byte ranges. | 
|---|
| 103 | Three([Utf8Range; 3]), | 
|---|
| 104 | /// Four successive byte ranges. | 
|---|
| 105 | Four([Utf8Range; 4]), | 
|---|
| 106 | } | 
|---|
| 107 |  | 
|---|
| 108 | impl Utf8Sequence { | 
|---|
| 109 | /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value | 
|---|
| 110 | /// range. | 
|---|
| 111 | /// | 
|---|
| 112 | /// This assumes that `start` and `end` have the same length. | 
|---|
| 113 | fn from_encoded_range(start: &[u8], end: &[u8]) -> Self { | 
|---|
| 114 | assert_eq!(start.len(), end.len()); | 
|---|
| 115 | match start.len() { | 
|---|
| 116 | 2 => Utf8Sequence::Two([ | 
|---|
| 117 | Utf8Range::new(start[0], end[0]), | 
|---|
| 118 | Utf8Range::new(start[1], end[1]), | 
|---|
| 119 | ]), | 
|---|
| 120 | 3 => Utf8Sequence::Three([ | 
|---|
| 121 | Utf8Range::new(start[0], end[0]), | 
|---|
| 122 | Utf8Range::new(start[1], end[1]), | 
|---|
| 123 | Utf8Range::new(start[2], end[2]), | 
|---|
| 124 | ]), | 
|---|
| 125 | 4 => Utf8Sequence::Four([ | 
|---|
| 126 | Utf8Range::new(start[0], end[0]), | 
|---|
| 127 | Utf8Range::new(start[1], end[1]), | 
|---|
| 128 | Utf8Range::new(start[2], end[2]), | 
|---|
| 129 | Utf8Range::new(start[3], end[3]), | 
|---|
| 130 | ]), | 
|---|
| 131 | n => unreachable!( "invalid encoded length: {} ", n), | 
|---|
| 132 | } | 
|---|
| 133 | } | 
|---|
| 134 |  | 
|---|
| 135 | /// Returns the underlying sequence of byte ranges as a slice. | 
|---|
| 136 | pub fn as_slice(&self) -> &[Utf8Range] { | 
|---|
| 137 | use self::Utf8Sequence::*; | 
|---|
| 138 | match *self { | 
|---|
| 139 | One(ref r) => slice::from_ref(r), | 
|---|
| 140 | Two(ref r) => &r[..], | 
|---|
| 141 | Three(ref r) => &r[..], | 
|---|
| 142 | Four(ref r) => &r[..], | 
|---|
| 143 | } | 
|---|
| 144 | } | 
|---|
| 145 |  | 
|---|
| 146 | /// Returns the number of byte ranges in this sequence. | 
|---|
| 147 | /// | 
|---|
| 148 | /// The length is guaranteed to be in the closed interval `[1, 4]`. | 
|---|
| 149 | pub fn len(&self) -> usize { | 
|---|
| 150 | self.as_slice().len() | 
|---|
| 151 | } | 
|---|
| 152 |  | 
|---|
| 153 | /// Reverses the ranges in this sequence. | 
|---|
| 154 | /// | 
|---|
| 155 | /// For example, if this corresponds to the following sequence: | 
|---|
| 156 | /// | 
|---|
| 157 | /// ```text | 
|---|
| 158 | /// [D0-D3][80-BF] | 
|---|
| 159 | /// ``` | 
|---|
| 160 | /// | 
|---|
| 161 | /// Then after reversal, it will be | 
|---|
| 162 | /// | 
|---|
| 163 | /// ```text | 
|---|
| 164 | /// [80-BF][D0-D3] | 
|---|
| 165 | /// ``` | 
|---|
| 166 | /// | 
|---|
| 167 | /// This is useful when one is constructing a UTF-8 automaton to match | 
|---|
| 168 | /// character classes in reverse. | 
|---|
| 169 | pub fn reverse(&mut self) { | 
|---|
| 170 | match *self { | 
|---|
| 171 | Utf8Sequence::One(_) => {} | 
|---|
| 172 | Utf8Sequence::Two(ref mut x) => x.reverse(), | 
|---|
| 173 | Utf8Sequence::Three(ref mut x) => x.reverse(), | 
|---|
| 174 | Utf8Sequence::Four(ref mut x) => x.reverse(), | 
|---|
| 175 | } | 
|---|
| 176 | } | 
|---|
| 177 |  | 
|---|
| 178 | /// Returns true if and only if a prefix of `bytes` matches this sequence | 
|---|
| 179 | /// of byte ranges. | 
|---|
| 180 | pub fn matches(&self, bytes: &[u8]) -> bool { | 
|---|
| 181 | if bytes.len() < self.len() { | 
|---|
| 182 | return false; | 
|---|
| 183 | } | 
|---|
| 184 | for (&b, r) in bytes.iter().zip(self) { | 
|---|
| 185 | if !r.matches(b) { | 
|---|
| 186 | return false; | 
|---|
| 187 | } | 
|---|
| 188 | } | 
|---|
| 189 | true | 
|---|
| 190 | } | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | impl<'a> IntoIterator for &'a Utf8Sequence { | 
|---|
| 194 | type IntoIter = slice::Iter<'a, Utf8Range>; | 
|---|
| 195 | type Item = &'a Utf8Range; | 
|---|
| 196 |  | 
|---|
| 197 | fn into_iter(self) -> Self::IntoIter { | 
|---|
| 198 | self.as_slice().iter() | 
|---|
| 199 | } | 
|---|
| 200 | } | 
|---|
| 201 |  | 
|---|
| 202 | impl fmt::Debug for Utf8Sequence { | 
|---|
| 203 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | 
|---|
| 204 | use self::Utf8Sequence::*; | 
|---|
| 205 | match *self { | 
|---|
| 206 | One(ref r: &Utf8Range) => write!(f, "{:?} ", r), | 
|---|
| 207 | Two(ref r: &[Utf8Range; 2]) => write!(f, "{:?}{:?} ", r[0], r[1]), | 
|---|
| 208 | Three(ref r: &[Utf8Range; 3]) => write!(f, "{:?}{:?}{:?} ", r[0], r[1], r[2]), | 
|---|
| 209 | Four(ref r: &[Utf8Range; 4]) => { | 
|---|
| 210 | write!(f, "{:?}{:?}{:?}{:?} ", r[0], r[1], r[2], r[3]) | 
|---|
| 211 | } | 
|---|
| 212 | } | 
|---|
| 213 | } | 
|---|
| 214 | } | 
|---|
| 215 |  | 
|---|
| 216 | /// A single inclusive range of UTF-8 bytes. | 
|---|
| 217 | #[ derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] | 
|---|
| 218 | pub struct Utf8Range { | 
|---|
| 219 | /// Start of byte range (inclusive). | 
|---|
| 220 | pub start: u8, | 
|---|
| 221 | /// End of byte range (inclusive). | 
|---|
| 222 | pub end: u8, | 
|---|
| 223 | } | 
|---|
| 224 |  | 
|---|
| 225 | impl Utf8Range { | 
|---|
| 226 | fn new(start: u8, end: u8) -> Self { | 
|---|
| 227 | Utf8Range { start, end } | 
|---|
| 228 | } | 
|---|
| 229 |  | 
|---|
| 230 | /// Returns true if and only if the given byte is in this range. | 
|---|
| 231 | pub fn matches(&self, b: u8) -> bool { | 
|---|
| 232 | self.start <= b && b <= self.end | 
|---|
| 233 | } | 
|---|
| 234 | } | 
|---|
| 235 |  | 
|---|
| 236 | impl fmt::Debug for Utf8Range { | 
|---|
| 237 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | 
|---|
| 238 | if self.start == self.end { | 
|---|
| 239 | write!(f, "[{:X} ]", self.start) | 
|---|
| 240 | } else { | 
|---|
| 241 | write!(f, "[{:X} -{:X} ]", self.start, self.end) | 
|---|
| 242 | } | 
|---|
| 243 | } | 
|---|
| 244 | } | 
|---|
| 245 |  | 
|---|
| 246 | /// An iterator over ranges of matching UTF-8 byte sequences. | 
|---|
| 247 | /// | 
|---|
| 248 | /// The iteration represents an alternation of comprehensive byte sequences | 
|---|
| 249 | /// that match precisely the set of UTF-8 encoded scalar values. | 
|---|
| 250 | /// | 
|---|
| 251 | /// A byte sequence corresponds to one of the scalar values in the range given | 
|---|
| 252 | /// if and only if it completely matches exactly one of the sequences of byte | 
|---|
| 253 | /// ranges produced by this iterator. | 
|---|
| 254 | /// | 
|---|
| 255 | /// Each sequence of byte ranges matches a unique set of bytes. That is, no two | 
|---|
| 256 | /// sequences will match the same bytes. | 
|---|
| 257 | /// | 
|---|
| 258 | /// # Example | 
|---|
| 259 | /// | 
|---|
| 260 | /// This shows how to match an arbitrary byte sequence against a range of | 
|---|
| 261 | /// scalar values. | 
|---|
| 262 | /// | 
|---|
| 263 | /// ```rust | 
|---|
| 264 | /// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence}; | 
|---|
| 265 | /// | 
|---|
| 266 | /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool { | 
|---|
| 267 | ///     for range in seqs { | 
|---|
| 268 | ///         if range.matches(bytes) { | 
|---|
| 269 | ///             return true; | 
|---|
| 270 | ///         } | 
|---|
| 271 | ///     } | 
|---|
| 272 | ///     false | 
|---|
| 273 | /// } | 
|---|
| 274 | /// | 
|---|
| 275 | /// // Test the basic multilingual plane. | 
|---|
| 276 | /// let seqs: Vec<_> = Utf8Sequences::new( '\u{0} ', '\u{FFFF} ').collect(); | 
|---|
| 277 | /// | 
|---|
| 278 | /// // UTF-8 encoding of 'a'. | 
|---|
| 279 | /// assert!(matches(&seqs, &[0x61])); | 
|---|
| 280 | /// // UTF-8 encoding of '☃' (`\u{2603}`). | 
|---|
| 281 | /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83])); | 
|---|
| 282 | /// // UTF-8 encoding of `\u{10348}` (outside the BMP). | 
|---|
| 283 | /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88])); | 
|---|
| 284 | /// // Tries to match against a UTF-8 encoding of a surrogate codepoint, | 
|---|
| 285 | /// // which is invalid UTF-8, and therefore fails, despite the fact that | 
|---|
| 286 | /// // the corresponding codepoint (0xD800) falls in the range given. | 
|---|
| 287 | /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80])); | 
|---|
| 288 | /// // And fails against plain old invalid UTF-8. | 
|---|
| 289 | /// assert!(!matches(&seqs, &[0xFF, 0xFF])); | 
|---|
| 290 | /// ``` | 
|---|
| 291 | /// | 
|---|
| 292 | /// If this example seems circuitous, that's because it is! It's meant to be | 
|---|
| 293 | /// illustrative. In practice, you could just try to decode your byte sequence | 
|---|
| 294 | /// and compare it with the scalar value range directly. However, this is not | 
|---|
| 295 | /// always possible (for example, in a byte based automaton). | 
|---|
| 296 | #[ derive(Debug)] | 
|---|
| 297 | pub struct Utf8Sequences { | 
|---|
| 298 | range_stack: Vec<ScalarRange>, | 
|---|
| 299 | } | 
|---|
| 300 |  | 
|---|
| 301 | impl Utf8Sequences { | 
|---|
| 302 | /// Create a new iterator over UTF-8 byte ranges for the scalar value range | 
|---|
| 303 | /// given. | 
|---|
| 304 | pub fn new(start: char, end: char) -> Self { | 
|---|
| 305 | let range: ScalarRange = | 
|---|
| 306 | ScalarRange { start: u32::from(start), end: u32::from(end) }; | 
|---|
| 307 | Utf8Sequences { range_stack: vec![range] } | 
|---|
| 308 | } | 
|---|
| 309 |  | 
|---|
| 310 | /// reset resets the scalar value range. | 
|---|
| 311 | /// Any existing state is cleared, but resources may be reused. | 
|---|
| 312 | /// | 
|---|
| 313 | /// N.B. Benchmarks say that this method is dubious. | 
|---|
| 314 | #[ doc(hidden)] | 
|---|
| 315 | pub fn reset(&mut self, start: char, end: char) { | 
|---|
| 316 | self.range_stack.clear(); | 
|---|
| 317 | self.push(start:u32::from(start), end:u32::from(end)); | 
|---|
| 318 | } | 
|---|
| 319 |  | 
|---|
| 320 | fn push(&mut self, start: u32, end: u32) { | 
|---|
| 321 | self.range_stack.push(ScalarRange { start, end }); | 
|---|
| 322 | } | 
|---|
| 323 | } | 
|---|
| 324 |  | 
|---|
| 325 | struct ScalarRange { | 
|---|
| 326 | start: u32, | 
|---|
| 327 | end: u32, | 
|---|
| 328 | } | 
|---|
| 329 |  | 
|---|
| 330 | impl fmt::Debug for ScalarRange { | 
|---|
| 331 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | 
|---|
| 332 | write!(f, "ScalarRange({:X} , {:X} )", self.start, self.end) | 
|---|
| 333 | } | 
|---|
| 334 | } | 
|---|
| 335 |  | 
|---|
| 336 | impl Iterator for Utf8Sequences { | 
|---|
| 337 | type Item = Utf8Sequence; | 
|---|
| 338 |  | 
|---|
| 339 | fn next(&mut self) -> Option<Self::Item> { | 
|---|
| 340 | 'TOP: while let Some(mut r) = self.range_stack.pop() { | 
|---|
| 341 | 'INNER: loop { | 
|---|
| 342 | if let Some((r1, r2)) = r.split() { | 
|---|
| 343 | self.push(r2.start, r2.end); | 
|---|
| 344 | r.start = r1.start; | 
|---|
| 345 | r.end = r1.end; | 
|---|
| 346 | continue 'INNER; | 
|---|
| 347 | } | 
|---|
| 348 | if !r.is_valid() { | 
|---|
| 349 | continue 'TOP; | 
|---|
| 350 | } | 
|---|
| 351 | for i in 1..MAX_UTF8_BYTES { | 
|---|
| 352 | let max = max_scalar_value(i); | 
|---|
| 353 | if r.start <= max && max < r.end { | 
|---|
| 354 | self.push(max + 1, r.end); | 
|---|
| 355 | r.end = max; | 
|---|
| 356 | continue 'INNER; | 
|---|
| 357 | } | 
|---|
| 358 | } | 
|---|
| 359 | if let Some(ascii_range) = r.as_ascii() { | 
|---|
| 360 | return Some(Utf8Sequence::One(ascii_range)); | 
|---|
| 361 | } | 
|---|
| 362 | for i in 1..MAX_UTF8_BYTES { | 
|---|
| 363 | let m = (1 << (6 * i)) - 1; | 
|---|
| 364 | if (r.start & !m) != (r.end & !m) { | 
|---|
| 365 | if (r.start & m) != 0 { | 
|---|
| 366 | self.push((r.start | m) + 1, r.end); | 
|---|
| 367 | r.end = r.start | m; | 
|---|
| 368 | continue 'INNER; | 
|---|
| 369 | } | 
|---|
| 370 | if (r.end & m) != m { | 
|---|
| 371 | self.push(r.end & !m, r.end); | 
|---|
| 372 | r.end = (r.end & !m) - 1; | 
|---|
| 373 | continue 'INNER; | 
|---|
| 374 | } | 
|---|
| 375 | } | 
|---|
| 376 | } | 
|---|
| 377 | let mut start = [0; MAX_UTF8_BYTES]; | 
|---|
| 378 | let mut end = [0; MAX_UTF8_BYTES]; | 
|---|
| 379 | let n = r.encode(&mut start, &mut end); | 
|---|
| 380 | return Some(Utf8Sequence::from_encoded_range( | 
|---|
| 381 | &start[0..n], | 
|---|
| 382 | &end[0..n], | 
|---|
| 383 | )); | 
|---|
| 384 | } | 
|---|
| 385 | } | 
|---|
| 386 | None | 
|---|
| 387 | } | 
|---|
| 388 | } | 
|---|
| 389 |  | 
|---|
| 390 | impl FusedIterator for Utf8Sequences {} | 
|---|
| 391 |  | 
|---|
| 392 | impl ScalarRange { | 
|---|
| 393 | /// split splits this range if it overlaps with a surrogate codepoint. | 
|---|
| 394 | /// | 
|---|
| 395 | /// Either or both ranges may be invalid. | 
|---|
| 396 | fn split(&self) -> Option<(ScalarRange, ScalarRange)> { | 
|---|
| 397 | if self.start < 0xE000 && self.end > 0xD7FF { | 
|---|
| 398 | Some(( | 
|---|
| 399 | ScalarRange { start: self.start, end: 0xD7FF }, | 
|---|
| 400 | ScalarRange { start: 0xE000, end: self.end }, | 
|---|
| 401 | )) | 
|---|
| 402 | } else { | 
|---|
| 403 | None | 
|---|
| 404 | } | 
|---|
| 405 | } | 
|---|
| 406 |  | 
|---|
| 407 | /// is_valid returns true if and only if start <= end. | 
|---|
| 408 | fn is_valid(&self) -> bool { | 
|---|
| 409 | self.start <= self.end | 
|---|
| 410 | } | 
|---|
| 411 |  | 
|---|
| 412 | /// as_ascii returns this range as a Utf8Range if and only if all scalar | 
|---|
| 413 | /// values in this range can be encoded as a single byte. | 
|---|
| 414 | fn as_ascii(&self) -> Option<Utf8Range> { | 
|---|
| 415 | if self.is_ascii() { | 
|---|
| 416 | let start = u8::try_from(self.start).unwrap(); | 
|---|
| 417 | let end = u8::try_from(self.end).unwrap(); | 
|---|
| 418 | Some(Utf8Range::new(start, end)) | 
|---|
| 419 | } else { | 
|---|
| 420 | None | 
|---|
| 421 | } | 
|---|
| 422 | } | 
|---|
| 423 |  | 
|---|
| 424 | /// is_ascii returns true if the range is ASCII only (i.e., takes a single | 
|---|
| 425 | /// byte to encode any scalar value). | 
|---|
| 426 | fn is_ascii(&self) -> bool { | 
|---|
| 427 | self.is_valid() && self.end <= 0x7f | 
|---|
| 428 | } | 
|---|
| 429 |  | 
|---|
| 430 | /// encode writes the UTF-8 encoding of the start and end of this range | 
|---|
| 431 | /// to the corresponding destination slices, and returns the number of | 
|---|
| 432 | /// bytes written. | 
|---|
| 433 | /// | 
|---|
| 434 | /// The slices should have room for at least `MAX_UTF8_BYTES`. | 
|---|
| 435 | fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize { | 
|---|
| 436 | let cs = char::from_u32(self.start).unwrap(); | 
|---|
| 437 | let ce = char::from_u32(self.end).unwrap(); | 
|---|
| 438 | let ss = cs.encode_utf8(start); | 
|---|
| 439 | let se = ce.encode_utf8(end); | 
|---|
| 440 | assert_eq!(ss.len(), se.len()); | 
|---|
| 441 | ss.len() | 
|---|
| 442 | } | 
|---|
| 443 | } | 
|---|
| 444 |  | 
|---|
| 445 | fn max_scalar_value(nbytes: usize) -> u32 { | 
|---|
| 446 | match nbytes { | 
|---|
| 447 | 1 => 0x007F, | 
|---|
| 448 | 2 => 0x07FF, | 
|---|
| 449 | 3 => 0xFFFF, | 
|---|
| 450 | 4 => 0x0010_FFFF, | 
|---|
| 451 | _ => unreachable!( "invalid UTF-8 byte sequence size"), | 
|---|
| 452 | } | 
|---|
| 453 | } | 
|---|
| 454 |  | 
|---|
| 455 | #[ cfg(test)] | 
|---|
| 456 | mod tests { | 
|---|
| 457 | use core::char; | 
|---|
| 458 |  | 
|---|
| 459 | use alloc::{vec, vec::Vec}; | 
|---|
| 460 |  | 
|---|
| 461 | use crate::utf8::{Utf8Range, Utf8Sequences}; | 
|---|
| 462 |  | 
|---|
| 463 | fn rutf8(s: u8, e: u8) -> Utf8Range { | 
|---|
| 464 | Utf8Range::new(s, e) | 
|---|
| 465 | } | 
|---|
| 466 |  | 
|---|
| 467 | fn never_accepts_surrogate_codepoints(start: char, end: char) { | 
|---|
| 468 | for cp in 0xD800..0xE000 { | 
|---|
| 469 | let buf = encode_surrogate(cp); | 
|---|
| 470 | for r in Utf8Sequences::new(start, end) { | 
|---|
| 471 | if r.matches(&buf) { | 
|---|
| 472 | panic!( | 
|---|
| 473 | "Sequence ({:X}, {:X}) contains range {:?}, \ | 
|---|
| 474 |                          which matches surrogate code point {:X} \ | 
|---|
| 475 |                          with encoded bytes {:?}", | 
|---|
| 476 | u32::from(start), | 
|---|
| 477 | u32::from(end), | 
|---|
| 478 | r, | 
|---|
| 479 | cp, | 
|---|
| 480 | buf, | 
|---|
| 481 | ); | 
|---|
| 482 | } | 
|---|
| 483 | } | 
|---|
| 484 | } | 
|---|
| 485 | } | 
|---|
| 486 |  | 
|---|
| 487 | #[ test] | 
|---|
| 488 | fn codepoints_no_surrogates() { | 
|---|
| 489 | never_accepts_surrogate_codepoints( '\u{0} ', '\u{FFFF} '); | 
|---|
| 490 | never_accepts_surrogate_codepoints( '\u{0} ', '\u{10FFFF} '); | 
|---|
| 491 | never_accepts_surrogate_codepoints( '\u{0} ', '\u{10FFFE} '); | 
|---|
| 492 | never_accepts_surrogate_codepoints( '\u{80} ', '\u{10FFFF} '); | 
|---|
| 493 | never_accepts_surrogate_codepoints( '\u{D7FF} ', '\u{E000} '); | 
|---|
| 494 | } | 
|---|
| 495 |  | 
|---|
| 496 | #[ test] | 
|---|
| 497 | fn single_codepoint_one_sequence() { | 
|---|
| 498 | // Tests that every range of scalar values that contains a single | 
|---|
| 499 | // scalar value is recognized by one sequence of byte ranges. | 
|---|
| 500 | for i in 0x0..=0x0010_FFFF { | 
|---|
| 501 | let c = match char::from_u32(i) { | 
|---|
| 502 | None => continue, | 
|---|
| 503 | Some(c) => c, | 
|---|
| 504 | }; | 
|---|
| 505 | let seqs: Vec<_> = Utf8Sequences::new(c, c).collect(); | 
|---|
| 506 | assert_eq!(seqs.len(), 1); | 
|---|
| 507 | } | 
|---|
| 508 | } | 
|---|
| 509 |  | 
|---|
| 510 | #[ test] | 
|---|
| 511 | fn bmp() { | 
|---|
| 512 | use crate::utf8::Utf8Sequence::*; | 
|---|
| 513 |  | 
|---|
| 514 | let seqs = Utf8Sequences::new( '\u{0} ', '\u{FFFF} ').collect::<Vec<_>>(); | 
|---|
| 515 | assert_eq!( | 
|---|
| 516 | seqs, | 
|---|
| 517 | vec![ | 
|---|
| 518 | One(rutf8(0x0, 0x7F)), | 
|---|
| 519 | Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]), | 
|---|
| 520 | Three([ | 
|---|
| 521 | rutf8(0xE0, 0xE0), | 
|---|
| 522 | rutf8(0xA0, 0xBF), | 
|---|
| 523 | rutf8(0x80, 0xBF) | 
|---|
| 524 | ]), | 
|---|
| 525 | Three([ | 
|---|
| 526 | rutf8(0xE1, 0xEC), | 
|---|
| 527 | rutf8(0x80, 0xBF), | 
|---|
| 528 | rutf8(0x80, 0xBF) | 
|---|
| 529 | ]), | 
|---|
| 530 | Three([ | 
|---|
| 531 | rutf8(0xED, 0xED), | 
|---|
| 532 | rutf8(0x80, 0x9F), | 
|---|
| 533 | rutf8(0x80, 0xBF) | 
|---|
| 534 | ]), | 
|---|
| 535 | Three([ | 
|---|
| 536 | rutf8(0xEE, 0xEF), | 
|---|
| 537 | rutf8(0x80, 0xBF), | 
|---|
| 538 | rutf8(0x80, 0xBF) | 
|---|
| 539 | ]), | 
|---|
| 540 | ] | 
|---|
| 541 | ); | 
|---|
| 542 | } | 
|---|
| 543 |  | 
|---|
| 544 | #[ test] | 
|---|
| 545 | fn reverse() { | 
|---|
| 546 | use crate::utf8::Utf8Sequence::*; | 
|---|
| 547 |  | 
|---|
| 548 | let mut s = One(rutf8(0xA, 0xB)); | 
|---|
| 549 | s.reverse(); | 
|---|
| 550 | assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]); | 
|---|
| 551 |  | 
|---|
| 552 | let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]); | 
|---|
| 553 | s.reverse(); | 
|---|
| 554 | assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]); | 
|---|
| 555 |  | 
|---|
| 556 | let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]); | 
|---|
| 557 | s.reverse(); | 
|---|
| 558 | assert_eq!( | 
|---|
| 559 | s.as_slice(), | 
|---|
| 560 | &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)] | 
|---|
| 561 | ); | 
|---|
| 562 |  | 
|---|
| 563 | let mut s = Four([ | 
|---|
| 564 | rutf8(0xA, 0xB), | 
|---|
| 565 | rutf8(0xB, 0xC), | 
|---|
| 566 | rutf8(0xC, 0xD), | 
|---|
| 567 | rutf8(0xD, 0xE), | 
|---|
| 568 | ]); | 
|---|
| 569 | s.reverse(); | 
|---|
| 570 | assert_eq!( | 
|---|
| 571 | s.as_slice(), | 
|---|
| 572 | &[ | 
|---|
| 573 | rutf8(0xD, 0xE), | 
|---|
| 574 | rutf8(0xC, 0xD), | 
|---|
| 575 | rutf8(0xB, 0xC), | 
|---|
| 576 | rutf8(0xA, 0xB) | 
|---|
| 577 | ] | 
|---|
| 578 | ); | 
|---|
| 579 | } | 
|---|
| 580 |  | 
|---|
| 581 | fn encode_surrogate(cp: u32) -> [u8; 3] { | 
|---|
| 582 | const TAG_CONT: u8 = 0b1000_0000; | 
|---|
| 583 | const TAG_THREE_B: u8 = 0b1110_0000; | 
|---|
| 584 |  | 
|---|
| 585 | assert!(0xD800 <= cp && cp < 0xE000); | 
|---|
| 586 | let mut dst = [0; 3]; | 
|---|
| 587 | dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B; | 
|---|
| 588 | dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT; | 
|---|
| 589 | dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT; | 
|---|
| 590 | dst | 
|---|
| 591 | } | 
|---|
| 592 | } | 
|---|
| 593 |  | 
|---|