1 | /// Returns true if and only if the given byte is considered a word character. |
2 | /// This only applies to ASCII. |
3 | pub(crate) fn is_word_byte(b: u8) -> bool { |
4 | const fn mkwordset() -> [bool; 256] { |
5 | // FIXME: Use as_usize() once const functions in traits are stable. |
6 | let mut set = [false; 256]; |
7 | set[b'_' as usize] = true; |
8 | |
9 | let mut byte = b'0' ; |
10 | while byte <= b'9' { |
11 | set[byte as usize] = true; |
12 | byte += 1; |
13 | } |
14 | byte = b'A' ; |
15 | while byte <= b'Z' { |
16 | set[byte as usize] = true; |
17 | byte += 1; |
18 | } |
19 | byte = b'a' ; |
20 | while byte <= b'z' { |
21 | set[byte as usize] = true; |
22 | byte += 1; |
23 | } |
24 | set |
25 | } |
26 | const WORD: [bool; 256] = mkwordset(); |
27 | WORD[b as usize] |
28 | } |
29 | |
30 | /// The accept state index. When we enter this state, we know we've found a |
31 | /// valid Unicode scalar value. |
32 | const ACCEPT: usize = 12; |
33 | /// The reject state index. When we enter this state, we know that we've found |
34 | /// invalid UTF-8. |
35 | const REJECT: usize = 0; |
36 | |
37 | /// Like `decode`, but automatically converts the `None` case to the |
38 | /// replacement codepoint. |
39 | pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { |
40 | match decode(slice) { |
41 | (Some(ch: char), size: usize) => (ch, size), |
42 | (None, size: usize) => (' \u{FFFD}' , size), |
43 | } |
44 | } |
45 | |
46 | /// UTF-8 decode a single Unicode scalar value from the beginning of a slice. |
47 | /// |
48 | /// When successful, the corresponding Unicode scalar value is returned along |
49 | /// with the number of bytes it was encoded with. The number of bytes consumed |
50 | /// for a successful decode is always between 1 and 4, inclusive. |
51 | /// |
52 | /// When unsuccessful, `None` is returned along with the number of bytes that |
53 | /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, |
54 | /// the number of bytes consumed is always between 0 and 3, inclusive, where |
55 | /// 0 is only returned when `slice` is empty. |
56 | pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { |
57 | let slice = slice.as_ref(); |
58 | match slice.get(0) { |
59 | None => return (None, 0), |
60 | Some(&b) if b <= 0x7F => return (Some(b as char), 1), |
61 | _ => {} |
62 | } |
63 | |
64 | let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); |
65 | while i < slice.len() { |
66 | decode_step(&mut state, &mut cp, slice[i]); |
67 | i += 1; |
68 | |
69 | if state == ACCEPT { |
70 | // OK since `decode_step` guarantees that `cp` is a valid Unicode |
71 | // scalar value in an ACCEPT state. |
72 | // |
73 | // We don't have to use safe code here, but do so because perf |
74 | // isn't our primary objective in regex-lite. |
75 | let ch = char::from_u32(cp).unwrap(); |
76 | return (Some(ch), i); |
77 | } else if state == REJECT { |
78 | // At this point, we always want to advance at least one byte. |
79 | return (None, core::cmp::max(1, i.saturating_sub(1))); |
80 | } |
81 | } |
82 | (None, i) |
83 | } |
84 | |
85 | /// Transitions to the next state and updates `cp` while it does. |
86 | fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { |
87 | // Splits the space of all bytes into equivalence classes, such that |
88 | // any byte in the same class can never discriminate between whether a |
89 | // particular sequence is valid UTF-8 or not. |
90 | #[rustfmt::skip] |
91 | const CLASSES: [u8; 256] = [ |
92 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
93 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
94 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
95 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
96 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, |
97 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
98 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
99 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, |
100 | ]; |
101 | |
102 | // A state machine taken from `bstr` which was in turn adapted from: |
103 | // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
104 | #[rustfmt::skip] |
105 | const STATES_FORWARD: &'static [u8] = &[ |
106 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
107 | 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, |
108 | 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, |
109 | 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, |
110 | 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, |
111 | 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, |
112 | 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
113 | 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
114 | 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
115 | ]; |
116 | |
117 | let class = CLASSES[usize::from(b)]; |
118 | if *state == ACCEPT { |
119 | *cp = (0xFF >> class) & (b as u32); |
120 | } else { |
121 | *cp = (b as u32 & 0b111111) | (*cp << 6); |
122 | } |
123 | *state = usize::from(STATES_FORWARD[*state + usize::from(class)]); |
124 | } |
125 | |
126 | #[cfg (test)] |
127 | mod tests { |
128 | use alloc::{vec, vec::Vec}; |
129 | |
130 | use super::*; |
131 | |
132 | #[test ] |
133 | fn decode_valid() { |
134 | fn d(mut s: &str) -> Vec<char> { |
135 | let mut chars = vec![]; |
136 | while !s.is_empty() { |
137 | let (ch, size) = decode(s.as_bytes()); |
138 | s = &s[size..]; |
139 | chars.push(ch.unwrap()); |
140 | } |
141 | chars |
142 | } |
143 | |
144 | assert_eq!(vec!['☃' ], d("☃" )); |
145 | assert_eq!(vec!['☃' , '☃' ], d("☃☃" )); |
146 | assert_eq!(vec!['α' , 'β' , 'γ' , 'δ' , 'ε' ], d("αβγδε" )); |
147 | assert_eq!(vec!['☃' , '⛄' , '⛇' ], d("☃⛄⛇" )); |
148 | assert_eq!(vec!['𝗮' , '𝗯' , '𝗰' , '𝗱' , '𝗲' ], d("𝗮𝗯𝗰𝗱𝗲" )); |
149 | } |
150 | |
151 | #[test ] |
152 | fn decode_invalid() { |
153 | let (ch, size) = decode(b"" ); |
154 | assert_eq!(None, ch); |
155 | assert_eq!(0, size); |
156 | |
157 | let (ch, size) = decode(b" \xFF" ); |
158 | assert_eq!(None, ch); |
159 | assert_eq!(1, size); |
160 | |
161 | let (ch, size) = decode(b" \xCE\xF0" ); |
162 | assert_eq!(None, ch); |
163 | assert_eq!(1, size); |
164 | |
165 | let (ch, size) = decode(b" \xE2\x98\xF0" ); |
166 | assert_eq!(None, ch); |
167 | assert_eq!(2, size); |
168 | |
169 | let (ch, size) = decode(b" \xF0\x9D\x9D" ); |
170 | assert_eq!(None, ch); |
171 | assert_eq!(3, size); |
172 | |
173 | let (ch, size) = decode(b" \xF0\x9D\x9D\xF0" ); |
174 | assert_eq!(None, ch); |
175 | assert_eq!(3, size); |
176 | |
177 | let (ch, size) = decode(b" \xF0\x82\x82\xAC" ); |
178 | assert_eq!(None, ch); |
179 | assert_eq!(1, size); |
180 | |
181 | let (ch, size) = decode(b" \xED\xA0\x80" ); |
182 | assert_eq!(None, ch); |
183 | assert_eq!(1, size); |
184 | |
185 | let (ch, size) = decode(b" \xCEa" ); |
186 | assert_eq!(None, ch); |
187 | assert_eq!(1, size); |
188 | |
189 | let (ch, size) = decode(b" \xE2\x98a" ); |
190 | assert_eq!(None, ch); |
191 | assert_eq!(2, size); |
192 | |
193 | let (ch, size) = decode(b" \xF0\x9D\x9Ca" ); |
194 | assert_eq!(None, ch); |
195 | assert_eq!(3, size); |
196 | } |
197 | |
198 | #[test ] |
199 | fn decode_lossily() { |
200 | let (ch, size) = decode_lossy(b"" ); |
201 | assert_eq!(' \u{FFFD}' , ch); |
202 | assert_eq!(0, size); |
203 | |
204 | let (ch, size) = decode_lossy(b" \xFF" ); |
205 | assert_eq!(' \u{FFFD}' , ch); |
206 | assert_eq!(1, size); |
207 | |
208 | let (ch, size) = decode_lossy(b" \xCE\xF0" ); |
209 | assert_eq!(' \u{FFFD}' , ch); |
210 | assert_eq!(1, size); |
211 | |
212 | let (ch, size) = decode_lossy(b" \xE2\x98\xF0" ); |
213 | assert_eq!(' \u{FFFD}' , ch); |
214 | assert_eq!(2, size); |
215 | |
216 | let (ch, size) = decode_lossy(b" \xF0\x9D\x9D\xF0" ); |
217 | assert_eq!(' \u{FFFD}' , ch); |
218 | assert_eq!(3, size); |
219 | |
220 | let (ch, size) = decode_lossy(b" \xF0\x82\x82\xAC" ); |
221 | assert_eq!(' \u{FFFD}' , ch); |
222 | assert_eq!(1, size); |
223 | |
224 | let (ch, size) = decode_lossy(b" \xED\xA0\x80" ); |
225 | assert_eq!(' \u{FFFD}' , ch); |
226 | assert_eq!(1, size); |
227 | |
228 | let (ch, size) = decode_lossy(b" \xCEa" ); |
229 | assert_eq!(' \u{FFFD}' , ch); |
230 | assert_eq!(1, size); |
231 | |
232 | let (ch, size) = decode_lossy(b" \xE2\x98a" ); |
233 | assert_eq!(' \u{FFFD}' , ch); |
234 | assert_eq!(2, size); |
235 | |
236 | let (ch, size) = decode_lossy(b" \xF0\x9D\x9Ca" ); |
237 | assert_eq!(' \u{FFFD}' , ch); |
238 | assert_eq!(3, size); |
239 | } |
240 | } |
241 | |