| 1 | /// Returns true if and only if the given byte is considered a word character. |
| 2 | /// This only applies to ASCII. |
| 3 | pub(crate) fn is_word_byte(b: u8) -> bool { |
| 4 | const fn mkwordset() -> [bool; 256] { |
| 5 | // FIXME: Use as_usize() once const functions in traits are stable. |
| 6 | let mut set = [false; 256]; |
| 7 | set[b'_' as usize] = true; |
| 8 | |
| 9 | let mut byte = b'0' ; |
| 10 | while byte <= b'9' { |
| 11 | set[byte as usize] = true; |
| 12 | byte += 1; |
| 13 | } |
| 14 | byte = b'A' ; |
| 15 | while byte <= b'Z' { |
| 16 | set[byte as usize] = true; |
| 17 | byte += 1; |
| 18 | } |
| 19 | byte = b'a' ; |
| 20 | while byte <= b'z' { |
| 21 | set[byte as usize] = true; |
| 22 | byte += 1; |
| 23 | } |
| 24 | set |
| 25 | } |
| 26 | const WORD: [bool; 256] = mkwordset(); |
| 27 | WORD[b as usize] |
| 28 | } |
| 29 | |
| 30 | /// The accept state index. When we enter this state, we know we've found a |
| 31 | /// valid Unicode scalar value. |
| 32 | const ACCEPT: usize = 12; |
| 33 | /// The reject state index. When we enter this state, we know that we've found |
| 34 | /// invalid UTF-8. |
| 35 | const REJECT: usize = 0; |
| 36 | |
| 37 | /// Like `decode`, but automatically converts the `None` case to the |
| 38 | /// replacement codepoint. |
| 39 | pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { |
| 40 | match decode(slice) { |
| 41 | (Some(ch: char), size: usize) => (ch, size), |
| 42 | (None, size: usize) => (' \u{FFFD}' , size), |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | /// UTF-8 decode a single Unicode scalar value from the beginning of a slice. |
| 47 | /// |
| 48 | /// When successful, the corresponding Unicode scalar value is returned along |
| 49 | /// with the number of bytes it was encoded with. The number of bytes consumed |
| 50 | /// for a successful decode is always between 1 and 4, inclusive. |
| 51 | /// |
| 52 | /// When unsuccessful, `None` is returned along with the number of bytes that |
| 53 | /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, |
| 54 | /// the number of bytes consumed is always between 0 and 3, inclusive, where |
| 55 | /// 0 is only returned when `slice` is empty. |
| 56 | pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { |
| 57 | let slice = slice.as_ref(); |
| 58 | match slice.get(0) { |
| 59 | None => return (None, 0), |
| 60 | Some(&b) if b <= 0x7F => return (Some(b as char), 1), |
| 61 | _ => {} |
| 62 | } |
| 63 | |
| 64 | let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); |
| 65 | while i < slice.len() { |
| 66 | decode_step(&mut state, &mut cp, slice[i]); |
| 67 | i += 1; |
| 68 | |
| 69 | if state == ACCEPT { |
| 70 | // OK since `decode_step` guarantees that `cp` is a valid Unicode |
| 71 | // scalar value in an ACCEPT state. |
| 72 | // |
| 73 | // We don't have to use safe code here, but do so because perf |
| 74 | // isn't our primary objective in regex-lite. |
| 75 | let ch = char::from_u32(cp).unwrap(); |
| 76 | return (Some(ch), i); |
| 77 | } else if state == REJECT { |
| 78 | // At this point, we always want to advance at least one byte. |
| 79 | return (None, core::cmp::max(1, i.saturating_sub(1))); |
| 80 | } |
| 81 | } |
| 82 | (None, i) |
| 83 | } |
| 84 | |
| 85 | /// Transitions to the next state and updates `cp` while it does. |
| 86 | fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { |
| 87 | // Splits the space of all bytes into equivalence classes, such that |
| 88 | // any byte in the same class can never discriminate between whether a |
| 89 | // particular sequence is valid UTF-8 or not. |
| 90 | #[rustfmt::skip] |
| 91 | const CLASSES: [u8; 256] = [ |
| 92 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 93 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 94 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 95 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 96 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, |
| 97 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
| 98 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| 99 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, |
| 100 | ]; |
| 101 | |
| 102 | // A state machine taken from `bstr` which was in turn adapted from: |
| 103 | // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
| 104 | #[rustfmt::skip] |
| 105 | const STATES_FORWARD: &'static [u8] = &[ |
| 106 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 107 | 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, |
| 108 | 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, |
| 109 | 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, |
| 110 | 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, |
| 111 | 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, |
| 112 | 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
| 113 | 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
| 114 | 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 115 | ]; |
| 116 | |
| 117 | let class = CLASSES[usize::from(b)]; |
| 118 | if *state == ACCEPT { |
| 119 | *cp = (0xFF >> class) & (b as u32); |
| 120 | } else { |
| 121 | *cp = (b as u32 & 0b111111) | (*cp << 6); |
| 122 | } |
| 123 | *state = usize::from(STATES_FORWARD[*state + usize::from(class)]); |
| 124 | } |
| 125 | |
| 126 | #[cfg (test)] |
| 127 | mod tests { |
| 128 | use alloc::{vec, vec::Vec}; |
| 129 | |
| 130 | use super::*; |
| 131 | |
| 132 | #[test ] |
| 133 | fn decode_valid() { |
| 134 | fn d(mut s: &str) -> Vec<char> { |
| 135 | let mut chars = vec![]; |
| 136 | while !s.is_empty() { |
| 137 | let (ch, size) = decode(s.as_bytes()); |
| 138 | s = &s[size..]; |
| 139 | chars.push(ch.unwrap()); |
| 140 | } |
| 141 | chars |
| 142 | } |
| 143 | |
| 144 | assert_eq!(vec!['☃' ], d("☃" )); |
| 145 | assert_eq!(vec!['☃' , '☃' ], d("☃☃" )); |
| 146 | assert_eq!(vec!['α' , 'β' , 'γ' , 'δ' , 'ε' ], d("αβγδε" )); |
| 147 | assert_eq!(vec!['☃' , '⛄' , '⛇' ], d("☃⛄⛇" )); |
| 148 | assert_eq!(vec!['𝗮' , '𝗯' , '𝗰' , '𝗱' , '𝗲' ], d("𝗮𝗯𝗰𝗱𝗲" )); |
| 149 | } |
| 150 | |
| 151 | #[test ] |
| 152 | fn decode_invalid() { |
| 153 | let (ch, size) = decode(b"" ); |
| 154 | assert_eq!(None, ch); |
| 155 | assert_eq!(0, size); |
| 156 | |
| 157 | let (ch, size) = decode(b" \xFF" ); |
| 158 | assert_eq!(None, ch); |
| 159 | assert_eq!(1, size); |
| 160 | |
| 161 | let (ch, size) = decode(b" \xCE\xF0" ); |
| 162 | assert_eq!(None, ch); |
| 163 | assert_eq!(1, size); |
| 164 | |
| 165 | let (ch, size) = decode(b" \xE2\x98\xF0" ); |
| 166 | assert_eq!(None, ch); |
| 167 | assert_eq!(2, size); |
| 168 | |
| 169 | let (ch, size) = decode(b" \xF0\x9D\x9D" ); |
| 170 | assert_eq!(None, ch); |
| 171 | assert_eq!(3, size); |
| 172 | |
| 173 | let (ch, size) = decode(b" \xF0\x9D\x9D\xF0" ); |
| 174 | assert_eq!(None, ch); |
| 175 | assert_eq!(3, size); |
| 176 | |
| 177 | let (ch, size) = decode(b" \xF0\x82\x82\xAC" ); |
| 178 | assert_eq!(None, ch); |
| 179 | assert_eq!(1, size); |
| 180 | |
| 181 | let (ch, size) = decode(b" \xED\xA0\x80" ); |
| 182 | assert_eq!(None, ch); |
| 183 | assert_eq!(1, size); |
| 184 | |
| 185 | let (ch, size) = decode(b" \xCEa" ); |
| 186 | assert_eq!(None, ch); |
| 187 | assert_eq!(1, size); |
| 188 | |
| 189 | let (ch, size) = decode(b" \xE2\x98a" ); |
| 190 | assert_eq!(None, ch); |
| 191 | assert_eq!(2, size); |
| 192 | |
| 193 | let (ch, size) = decode(b" \xF0\x9D\x9Ca" ); |
| 194 | assert_eq!(None, ch); |
| 195 | assert_eq!(3, size); |
| 196 | } |
| 197 | |
| 198 | #[test ] |
| 199 | fn decode_lossily() { |
| 200 | let (ch, size) = decode_lossy(b"" ); |
| 201 | assert_eq!(' \u{FFFD}' , ch); |
| 202 | assert_eq!(0, size); |
| 203 | |
| 204 | let (ch, size) = decode_lossy(b" \xFF" ); |
| 205 | assert_eq!(' \u{FFFD}' , ch); |
| 206 | assert_eq!(1, size); |
| 207 | |
| 208 | let (ch, size) = decode_lossy(b" \xCE\xF0" ); |
| 209 | assert_eq!(' \u{FFFD}' , ch); |
| 210 | assert_eq!(1, size); |
| 211 | |
| 212 | let (ch, size) = decode_lossy(b" \xE2\x98\xF0" ); |
| 213 | assert_eq!(' \u{FFFD}' , ch); |
| 214 | assert_eq!(2, size); |
| 215 | |
| 216 | let (ch, size) = decode_lossy(b" \xF0\x9D\x9D\xF0" ); |
| 217 | assert_eq!(' \u{FFFD}' , ch); |
| 218 | assert_eq!(3, size); |
| 219 | |
| 220 | let (ch, size) = decode_lossy(b" \xF0\x82\x82\xAC" ); |
| 221 | assert_eq!(' \u{FFFD}' , ch); |
| 222 | assert_eq!(1, size); |
| 223 | |
| 224 | let (ch, size) = decode_lossy(b" \xED\xA0\x80" ); |
| 225 | assert_eq!(' \u{FFFD}' , ch); |
| 226 | assert_eq!(1, size); |
| 227 | |
| 228 | let (ch, size) = decode_lossy(b" \xCEa" ); |
| 229 | assert_eq!(' \u{FFFD}' , ch); |
| 230 | assert_eq!(1, size); |
| 231 | |
| 232 | let (ch, size) = decode_lossy(b" \xE2\x98a" ); |
| 233 | assert_eq!(' \u{FFFD}' , ch); |
| 234 | assert_eq!(2, size); |
| 235 | |
| 236 | let (ch, size) = decode_lossy(b" \xF0\x9D\x9Ca" ); |
| 237 | assert_eq!(' \u{FFFD}' , ch); |
| 238 | assert_eq!(3, size); |
| 239 | } |
| 240 | } |
| 241 | |