1 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
2 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
3 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
4 | // option. This file may not be copied, modified, or distributed |
5 | // except according to those terms. |
6 | |
7 | #![cfg_attr (test, feature(test))] |
8 | |
9 | #[macro_use ] |
10 | extern crate debug_unreachable; |
11 | |
12 | #[macro_use ] |
13 | extern crate mac; |
14 | |
15 | #[cfg (test)] |
16 | extern crate test as std_test; |
17 | |
18 | use std::{slice, char}; |
19 | |
/// What a run of bytes means when read as (part of) a UTF-8 codepoint.
///
/// Validation is lazy: a `Prefix` or `Suffix` is reported from the bytes
/// that are present, so it may turn out to have no valid completion.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum Meaning {
    /// A complete, valid codepoint.
    Whole(char),

    /// Not a valid Unicode codepoint, but the value a UTF-16 leading
    /// surrogate code unit would have, i.e. something in the range
    /// `U+D800` - `U+DBFF`.
    ///
    /// The payload is the 10-bit offset of the code unit from `U+D800`.
    ///
    /// UTF-8 variants such as CESU-8 and WTF-8 contain these.
    LeadSurrogate(u16),

    /// Not a valid Unicode codepoint, but the value a UTF-16 trailing
    /// surrogate code unit would have, i.e. something in the range
    /// `U+DC00` - `U+DFFF`.
    ///
    /// The payload is the 10-bit offset of the code unit from `U+DC00`.
    ///
    /// UTF-8 variants such as CESU-8 and WTF-8 contain these.
    TrailSurrogate(u16),

    /// The buffer ended part-way through a codepoint.
    ///
    /// The payload is how many more bytes are needed to complete it.
    Prefix(usize),

    /// Only the tail of a codepoint is present: scanning backwards ran off
    /// the start of the buffer before a start byte was found.
    ///
    /// Up to 3 more bytes may be needed.
    Suffix,
}
58 | |
59 | /// Represents a complete or partial UTF-8 codepoint. |
60 | #[derive (Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] |
61 | pub struct Codepoint<'a> { |
62 | /// The bytes that make up the partial or full codepoint. |
63 | /// |
64 | /// For a `Suffix` this depends on `idx`. We don't scan forward |
65 | /// for additional continuation bytes after the reverse scan |
66 | /// failed to locate a multibyte sequence start. |
67 | pub bytes: &'a [u8], |
68 | |
69 | /// Start of the codepoint in the buffer, expressed as an offset |
70 | /// back from `idx`. |
71 | pub rewind: usize, |
72 | |
73 | /// Meaning of the partial or full codepoint. |
74 | pub meaning: Meaning, |
75 | } |
76 | |
/// Structural role of a single byte within a UTF-8 stream.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// `0xxxxxxx`: a one-byte (ASCII) codepoint.
    Ascii,
    /// First byte of a multi-byte sequence; the payload is the total length
    /// (2, 3 or 4) that the byte announces.
    Start(usize),
    /// `10xxxxxx`: a continuation byte.
    Cont,
}

impl Byte {
    /// Classifies `x` purely by its leading bits.
    ///
    /// Returns `None` for `0xF8..=0xFF`, which match no UTF-8 byte pattern.
    /// No further validation happens here: `0xC0`/`0xC1` (always overlong)
    /// and `0xF5..=0xF7` (beyond U+10FFFF) are still reported as start bytes.
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        match x {
            0x00..=0x7F => Some(Byte::Ascii),    // 0xxxxxxx
            0x80..=0xBF => Some(Byte::Cont),     // 10xxxxxx
            0xC0..=0xDF => Some(Byte::Start(2)), // 110xxxxx
            0xE0..=0xEF => Some(Byte::Start(3)), // 1110xxxx
            0xF0..=0xF7 => Some(Byte::Start(4)), // 11110xxx
            0xF8..=0xFF => None,                 // 11111xxx
        }
    }
}
99 | |
/// Returns `true` when every byte of `buf` is a UTF-8 continuation byte
/// (`10xxxxxx`). An empty slice is vacuously all-continuation.
#[inline(always)]
fn all_cont(buf: &[u8]) -> bool {
    // Same test `Byte::classify` uses to report `Byte::Cont`: top two bits are `10`.
    buf.iter().all(|&b| b & 0xC0 == 0x80)
}
104 | |
105 | // NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence: |
106 | // a starting byte followed by the correct number of continuation bytes. |
107 | #[inline (always)] |
108 | unsafe fn decode(buf: &[u8]) -> Option<Meaning> { |
109 | debug_assert!(buf.len() >= 2); |
110 | debug_assert!(buf.len() <= 4); |
111 | let n; |
112 | match buf.len() { |
113 | 2 => { |
114 | n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6 |
115 | | ((*buf.get_unchecked(1) & 0x3F) as u32); |
116 | if n < 0x80 { return None } // Overlong |
117 | } |
118 | 3 => { |
119 | n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 |
120 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 |
121 | | ((*buf.get_unchecked(2) & 0x3F) as u32); |
122 | match n { |
123 | 0x0000 ... 0x07FF => return None, // Overlong |
124 | 0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), |
125 | 0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), |
126 | _ => {} |
127 | } |
128 | } |
129 | 4 => { |
130 | n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 |
131 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 |
132 | | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6 |
133 | | ((*buf.get_unchecked(3) & 0x3F) as u32); |
134 | if n < 0x1_0000 { return None } // Overlong |
135 | } |
136 | _ => debug_unreachable!(), |
137 | } |
138 | |
139 | char::from_u32(n).map(Meaning::Whole) |
140 | } |
141 | |
/// Returns `buf[start .. start + new_len]` without bounds checks in release
/// builds.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Both conditions are verified with
/// `debug_assert!` only.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    // `add` takes the offset as `usize`, so no signed cast is needed.
    slice::from_raw_parts(buf.as_ptr().add(start), new_len)
}
148 | |
/// Shorthand for `unwrap_or_return!(expr, None)`.
///
/// `unwrap_or_return!` is not defined in this file (presumably it comes from
/// the `mac` crate imported with `#[macro_use]`); as its name suggests, it is
/// expected to yield the payload of a `Some` and otherwise return the second
/// argument — here `None` — from the enclosing function.
macro_rules! otry {
    ($opt:expr) => {
        unwrap_or_return!($opt, None)
    };
}
152 | |
153 | /// Describes the UTF-8 codepoint containing the byte at index `idx` within |
154 | /// `buf`. |
155 | /// |
156 | /// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8 |
157 | /// in the vicinity of `idx`. |
158 | #[inline ] |
159 | pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> { |
160 | if idx >= buf.len() { |
161 | return None; |
162 | } |
163 | |
164 | unsafe { |
165 | let x = *buf.get_unchecked(idx); |
166 | match otry!(Byte::classify(x)) { |
167 | Byte::Ascii => Some(Codepoint { |
168 | bytes: unsafe_slice(buf, idx, 1), |
169 | rewind: 0, |
170 | meaning: Meaning::Whole(x as char), |
171 | }), |
172 | Byte::Start(n) => { |
173 | let avail = buf.len() - idx; |
174 | if avail >= n { |
175 | let bytes = unsafe_slice(buf, idx, n); |
176 | if !all_cont(unsafe_slice(bytes, 1, n-1)) { |
177 | return None; |
178 | } |
179 | let meaning = otry!(decode(bytes)); |
180 | Some(Codepoint { |
181 | bytes: bytes, |
182 | rewind: 0, |
183 | meaning: meaning, |
184 | }) |
185 | } else { |
186 | Some(Codepoint { |
187 | bytes: unsafe_slice(buf, idx, avail), |
188 | rewind: 0, |
189 | meaning: Meaning::Prefix(n - avail), |
190 | }) |
191 | } |
192 | }, |
193 | Byte::Cont => { |
194 | let mut start = idx; |
195 | let mut checked = 0; |
196 | loop { |
197 | if start == 0 { |
198 | // Whoops, fell off the beginning. |
199 | return Some(Codepoint { |
200 | bytes: unsafe_slice(buf, 0, idx + 1), |
201 | rewind: idx, |
202 | meaning: Meaning::Suffix, |
203 | }); |
204 | } |
205 | |
206 | start -= 1; |
207 | checked += 1; |
208 | match otry!(Byte::classify(*buf.get_unchecked(start))) { |
209 | Byte::Cont => (), |
210 | Byte::Start(n) => { |
211 | let avail = buf.len() - start; |
212 | if avail >= n { |
213 | let bytes = unsafe_slice(buf, start, n); |
214 | if checked < n { |
215 | if !all_cont(unsafe_slice(bytes, checked, n-checked)) { |
216 | return None; |
217 | } |
218 | } |
219 | let meaning = otry!(decode(bytes)); |
220 | return Some(Codepoint { |
221 | bytes: bytes, |
222 | rewind: idx - start, |
223 | meaning: meaning, |
224 | }); |
225 | } else { |
226 | return Some(Codepoint { |
227 | bytes: unsafe_slice(buf, start, avail), |
228 | rewind: idx - start, |
229 | meaning: Meaning::Prefix(n - avail), |
230 | }); |
231 | } |
232 | } |
233 | _ => return None, |
234 | } |
235 | |
236 | if idx - start >= 3 { |
237 | // We looked at 3 bytes before a continuation byte |
238 | // and didn't find a start byte. |
239 | return None; |
240 | } |
241 | } |
242 | } |
243 | } |
244 | } |
245 | } |
246 | |
247 | #[cfg (test)] |
248 | mod test; |
249 | |