1 | /// A few elementary UTF-8 encoding and decoding functions used by the matching |
2 | /// engines. |
3 | /// |
4 | /// In an ideal world, the matching engines operate on `&str` and we can just |
5 | /// lean on the standard library for all our UTF-8 needs. However, to support |
6 | /// byte based regexes (that can match on arbitrary bytes which may contain |
7 | /// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. |
8 | /// The standard library doesn't really recognize this use case, so we have |
9 | /// to build it out ourselves. |
10 | /// |
11 | /// Should this be factored out into a separate crate? It seems independently |
12 | /// useful. There are other crates that already exist (e.g., `utf-8`) that have |
13 | /// overlapping use cases. Not sure what to do. |
14 | use std::char; |
15 | |
16 | const TAG_CONT: u8 = 0b1000_0000; |
17 | const TAG_TWO: u8 = 0b1100_0000; |
18 | const TAG_THREE: u8 = 0b1110_0000; |
19 | const TAG_FOUR: u8 = 0b1111_0000; |
20 | |
21 | /// Returns the smallest possible index of the next valid UTF-8 sequence |
22 | /// starting after `i`. |
23 | pub fn next_utf8(text: &[u8], i: usize) -> usize { |
24 | let b: u8 = match text.get(index:i) { |
25 | None => return i + 1, |
26 | Some(&b: u8) => b, |
27 | }; |
28 | let inc: usize = if b <= 0x7F { |
29 | 1 |
30 | } else if b <= 0b110_11111 { |
31 | 2 |
32 | } else if b <= 0b1110_1111 { |
33 | 3 |
34 | } else { |
35 | 4 |
36 | }; |
37 | i + inc |
38 | } |
39 | |
40 | /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. |
41 | /// |
42 | /// If no valid UTF-8 sequence could be found, then `None` is returned. |
43 | /// Otherwise, the decoded codepoint and the number of bytes read is returned. |
44 | /// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be |
45 | /// 1, 2, 3 or 4. |
46 | /// |
47 | /// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a |
48 | /// codepoint that is out of range (surrogate codepoints are out of range) or |
49 | /// is not the shortest possible UTF-8 sequence for that codepoint. |
50 | #[inline ] |
51 | pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { |
52 | let b0 = match src.get(0) { |
53 | None => return None, |
54 | Some(&b) if b <= 0x7F => return Some((b as char, 1)), |
55 | Some(&b) => b, |
56 | }; |
57 | match b0 { |
58 | 0b110_00000..=0b110_11111 => { |
59 | if src.len() < 2 { |
60 | return None; |
61 | } |
62 | let b1 = src[1]; |
63 | if 0b11_000000 & b1 != TAG_CONT { |
64 | return None; |
65 | } |
66 | let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); |
67 | match cp { |
68 | 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), |
69 | _ => None, |
70 | } |
71 | } |
72 | 0b1110_0000..=0b1110_1111 => { |
73 | if src.len() < 3 { |
74 | return None; |
75 | } |
76 | let (b1, b2) = (src[1], src[2]); |
77 | if 0b11_000000 & b1 != TAG_CONT { |
78 | return None; |
79 | } |
80 | if 0b11_000000 & b2 != TAG_CONT { |
81 | return None; |
82 | } |
83 | let cp = ((b0 & !TAG_THREE) as u32) << 12 |
84 | | ((b1 & !TAG_CONT) as u32) << 6 |
85 | | ((b2 & !TAG_CONT) as u32); |
86 | match cp { |
87 | // char::from_u32 will disallow surrogate codepoints. |
88 | 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), |
89 | _ => None, |
90 | } |
91 | } |
92 | 0b11110_000..=0b11110_111 => { |
93 | if src.len() < 4 { |
94 | return None; |
95 | } |
96 | let (b1, b2, b3) = (src[1], src[2], src[3]); |
97 | if 0b11_000000 & b1 != TAG_CONT { |
98 | return None; |
99 | } |
100 | if 0b11_000000 & b2 != TAG_CONT { |
101 | return None; |
102 | } |
103 | if 0b11_000000 & b3 != TAG_CONT { |
104 | return None; |
105 | } |
106 | let cp = ((b0 & !TAG_FOUR) as u32) << 18 |
107 | | ((b1 & !TAG_CONT) as u32) << 12 |
108 | | ((b2 & !TAG_CONT) as u32) << 6 |
109 | | ((b3 & !TAG_CONT) as u32); |
110 | match cp { |
111 | 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)), |
112 | _ => None, |
113 | } |
114 | } |
115 | _ => None, |
116 | } |
117 | } |
118 | |
119 | /// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead |
120 | /// of the first. |
121 | pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { |
122 | if src.is_empty() { |
123 | return None; |
124 | } |
125 | let mut start: usize = src.len() - 1; |
126 | if src[start] <= 0x7F { |
127 | return Some((src[start] as char, 1)); |
128 | } |
129 | while start > src.len().saturating_sub(4) { |
130 | start -= 1; |
131 | if is_start_byte(src[start]) { |
132 | break; |
133 | } |
134 | } |
135 | match decode_utf8(&src[start..]) { |
136 | None => None, |
137 | Some((_, n: usize)) if n < src.len() - start => None, |
138 | Some((cp: char, n: usize)) => Some((cp, n)), |
139 | } |
140 | } |
141 | |
142 | fn is_start_byte(b: u8) -> bool { |
143 | b & 0b11_000000 != 0b1_0000000 |
144 | } |
145 | |
146 | #[cfg (test)] |
147 | mod tests { |
148 | use std::str; |
149 | |
150 | use quickcheck::quickcheck; |
151 | |
152 | use super::{ |
153 | decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO, |
154 | }; |
155 | |
156 | #[test ] |
157 | fn prop_roundtrip() { |
158 | fn p(given_cp: char) -> bool { |
159 | let mut tmp = [0; 4]; |
160 | let encoded_len = given_cp.encode_utf8(&mut tmp).len(); |
161 | let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); |
162 | encoded_len == got_len && given_cp == got_cp |
163 | } |
164 | quickcheck(p as fn(char) -> bool) |
165 | } |
166 | |
167 | #[test ] |
168 | fn prop_roundtrip_last() { |
169 | fn p(given_cp: char) -> bool { |
170 | let mut tmp = [0; 4]; |
171 | let encoded_len = given_cp.encode_utf8(&mut tmp).len(); |
172 | let (got_cp, got_len) = |
173 | decode_last_utf8(&tmp[..encoded_len]).unwrap(); |
174 | encoded_len == got_len && given_cp == got_cp |
175 | } |
176 | quickcheck(p as fn(char) -> bool) |
177 | } |
178 | |
179 | #[test ] |
180 | fn prop_encode_matches_std() { |
181 | fn p(cp: char) -> bool { |
182 | let mut got = [0; 4]; |
183 | let n = cp.encode_utf8(&mut got).len(); |
184 | let expected = cp.to_string(); |
185 | &got[..n] == expected.as_bytes() |
186 | } |
187 | quickcheck(p as fn(char) -> bool) |
188 | } |
189 | |
190 | #[test ] |
191 | fn prop_decode_matches_std() { |
192 | fn p(given_cp: char) -> bool { |
193 | let mut tmp = [0; 4]; |
194 | let n = given_cp.encode_utf8(&mut tmp).len(); |
195 | let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); |
196 | let expected_cp = |
197 | str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); |
198 | got_cp == expected_cp |
199 | } |
200 | quickcheck(p as fn(char) -> bool) |
201 | } |
202 | |
203 | #[test ] |
204 | fn prop_decode_last_matches_std() { |
205 | fn p(given_cp: char) -> bool { |
206 | let mut tmp = [0; 4]; |
207 | let n = given_cp.encode_utf8(&mut tmp).len(); |
208 | let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); |
209 | let expected_cp = str::from_utf8(&tmp[..n]) |
210 | .unwrap() |
211 | .chars() |
212 | .rev() |
213 | .next() |
214 | .unwrap(); |
215 | got_cp == expected_cp |
216 | } |
217 | quickcheck(p as fn(char) -> bool) |
218 | } |
219 | |
220 | #[test ] |
221 | fn reject_invalid() { |
222 | // Invalid start byte |
223 | assert_eq!(decode_utf8(&[0xFF]), None); |
224 | // Surrogate pair |
225 | assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); |
226 | // Invalid continuation byte. |
227 | assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); |
228 | // Bad lengths |
229 | assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes |
230 | assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes |
231 | assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes |
232 | // Not a minimal UTF-8 sequence |
233 | assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a' ]), None); |
234 | assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a' ]), None); |
235 | assert_eq!( |
236 | decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a' ,]), |
237 | None |
238 | ); |
239 | } |
240 | |
241 | #[test ] |
242 | fn reject_invalid_last() { |
243 | // Invalid start byte |
244 | assert_eq!(decode_last_utf8(&[0xFF]), None); |
245 | // Surrogate pair |
246 | assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); |
247 | // Bad lengths |
248 | assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes |
249 | assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes |
250 | assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes |
251 | // Not a minimal UTF-8 sequence |
252 | assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a' ]), None); |
253 | assert_eq!( |
254 | decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a' ,]), |
255 | None |
256 | ); |
257 | assert_eq!( |
258 | decode_last_utf8( |
259 | &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a' ,] |
260 | ), |
261 | None |
262 | ); |
263 | } |
264 | } |
265 | |