| 1 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 2 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 3 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 4 | // option. This file may not be copied, modified, or distributed |
| 5 | // except according to those terms. |
| 6 | |
| 7 | #![cfg_attr (test, feature(test))] |
| 8 | |
| 9 | #[macro_use ] |
| 10 | extern crate debug_unreachable; |
| 11 | |
| 12 | #[macro_use ] |
| 13 | extern crate mac; |
| 14 | |
| 15 | #[cfg (test)] |
| 16 | extern crate test as std_test; |
| 17 | |
| 18 | use std::{slice, char}; |
| 19 | |
/// What a complete or partial UTF-8 sequence turned out to encode.
///
/// Validation is lazy: a `Prefix` or `Suffix` only reports that the bytes
/// seen so far *could* belong to a codepoint. It is not a promise that a
/// valid completion exists.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Meaning {
    /// A complete codepoint was found.
    Whole(char),

    /// The bytes do not encode a valid Unicode codepoint, but they *would*
    /// correspond to a UTF-16 leading surrogate code unit, i.e. a value in
    /// the range `U+D800` - `U+DBFF`.
    ///
    /// The payload is the code unit's 10-bit index within that range.
    ///
    /// Such sequences occur in UTF-8 variants such as CESU-8 and WTF-8.
    LeadSurrogate(u16),

    /// The bytes do not encode a valid Unicode codepoint, but they *would*
    /// correspond to a UTF-16 trailing surrogate code unit, i.e. a value in
    /// the range `U+DC00` - `U+DFFF`.
    ///
    /// The payload is the code unit's 10-bit index within that range.
    ///
    /// Such sequences occur in UTF-8 variants such as CESU-8 and WTF-8.
    TrailSurrogate(u16),

    /// The buffer ended after only the beginning of a codepoint.
    ///
    /// The payload is the number of additional bytes still required.
    Prefix(usize),

    /// Only the tail of a codepoint was found before running off the start
    /// of the buffer.
    ///
    /// Up to 3 more bytes may be needed.
    Suffix,
}
| 58 | |
| 59 | /// Represents a complete or partial UTF-8 codepoint. |
| 60 | #[derive (Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] |
| 61 | pub struct Codepoint<'a> { |
| 62 | /// The bytes that make up the partial or full codepoint. |
| 63 | /// |
| 64 | /// For a `Suffix` this depends on `idx`. We don't scan forward |
| 65 | /// for additional continuation bytes after the reverse scan |
| 66 | /// failed to locate a multibyte sequence start. |
| 67 | pub bytes: &'a [u8], |
| 68 | |
| 69 | /// Start of the codepoint in the buffer, expressed as an offset |
| 70 | /// back from `idx`. |
| 71 | pub rewind: usize, |
| 72 | |
| 73 | /// Meaning of the partial or full codepoint. |
| 74 | pub meaning: Meaning, |
| 75 | } |
| 76 | |
/// Coarse classification of a single byte of UTF-8 input, based purely on
/// its high bits.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// `0xxxxxxx`: a one-byte (ASCII) codepoint.
    Ascii,
    /// Leading byte of a multi-byte sequence. The payload is the total
    /// length of the sequence in bytes (2, 3 or 4), leading byte included.
    Start(usize),
    /// `10xxxxxx`: a continuation byte.
    Cont,
}

impl Byte {
    /// Classifies `x` by its bit pattern.
    ///
    /// Returns `None` for `0xF8..=0xFF`, which match no UTF-8 byte pattern.
    ///
    /// Only the bit pattern is inspected. Leading bytes that can never
    /// begin a well-formed sequence (e.g. `0xC0`, `0xF5`) are still
    /// reported as `Start`; rejecting the values they produce is left to
    /// the decoding step.
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        // An exhaustive range match lets the compiler verify that every
        // byte value is covered exactly once.
        match x {
            0x00..=0x7F => Some(Byte::Ascii),    // 0xxxxxxx
            0x80..=0xBF => Some(Byte::Cont),     // 10xxxxxx
            0xC0..=0xDF => Some(Byte::Start(2)), // 110xxxxx
            0xE0..=0xEF => Some(Byte::Start(3)), // 1110xxxx
            0xF0..=0xF7 => Some(Byte::Start(4)), // 11110xxx
            0xF8..=0xFF => None,                 // 11111xxx
        }
    }
}
| 99 | |
| 100 | #[inline (always)] |
| 101 | fn all_cont(buf: &[u8]) -> bool { |
| 102 | buf.iter().all(|&b: u8| matches!(Byte::classify(b), Some(Byte::Cont))) |
| 103 | } |
| 104 | |
| 105 | // NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence: |
| 106 | // a starting byte followed by the correct number of continuation bytes. |
| 107 | #[inline (always)] |
| 108 | unsafe fn decode(buf: &[u8]) -> Option<Meaning> { |
| 109 | debug_assert!(buf.len() >= 2); |
| 110 | debug_assert!(buf.len() <= 4); |
| 111 | let n; |
| 112 | match buf.len() { |
| 113 | 2 => { |
| 114 | n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6 |
| 115 | | ((*buf.get_unchecked(1) & 0x3F) as u32); |
| 116 | if n < 0x80 { return None } // Overlong |
| 117 | } |
| 118 | 3 => { |
| 119 | n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 |
| 120 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 |
| 121 | | ((*buf.get_unchecked(2) & 0x3F) as u32); |
| 122 | match n { |
| 123 | 0x0000 ... 0x07FF => return None, // Overlong |
| 124 | 0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), |
| 125 | 0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), |
| 126 | _ => {} |
| 127 | } |
| 128 | } |
| 129 | 4 => { |
| 130 | n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 |
| 131 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 |
| 132 | | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6 |
| 133 | | ((*buf.get_unchecked(3) & 0x3F) as u32); |
| 134 | if n < 0x1_0000 { return None } // Overlong |
| 135 | } |
| 136 | _ => debug_unreachable!(), |
| 137 | } |
| 138 | |
| 139 | char::from_u32(n).map(Meaning::Whole) |
| 140 | } |
| 141 | |
/// Returns the `new_len`-byte subslice of `buf` that begins at offset
/// `start`. Bounds are only checked by `debug_assert!`.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`; otherwise the returned slice refers to
/// memory outside `buf`.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    // SAFETY: per the contract above, `start..start + new_len` lies within
    // `buf`, so the pointer and length describe memory borrowed for `'a`.
    slice::from_raw_parts(buf.as_ptr().add(start), new_len)
}
| 148 | |
// `Option` early-return helper: forwards to `unwrap_or_return!` with `None`
// as the value the enclosing function returns on failure.
macro_rules! otry {
    ($opt:expr) => {
        unwrap_or_return!($opt, None)
    };
}
| 152 | |
| 153 | /// Describes the UTF-8 codepoint containing the byte at index `idx` within |
| 154 | /// `buf`. |
| 155 | /// |
| 156 | /// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8 |
| 157 | /// in the vicinity of `idx`. |
| 158 | #[inline ] |
| 159 | pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> { |
| 160 | if idx >= buf.len() { |
| 161 | return None; |
| 162 | } |
| 163 | |
| 164 | unsafe { |
| 165 | let x = *buf.get_unchecked(idx); |
| 166 | match otry!(Byte::classify(x)) { |
| 167 | Byte::Ascii => Some(Codepoint { |
| 168 | bytes: unsafe_slice(buf, idx, 1), |
| 169 | rewind: 0, |
| 170 | meaning: Meaning::Whole(x as char), |
| 171 | }), |
| 172 | Byte::Start(n) => { |
| 173 | let avail = buf.len() - idx; |
| 174 | if avail >= n { |
| 175 | let bytes = unsafe_slice(buf, idx, n); |
| 176 | if !all_cont(unsafe_slice(bytes, 1, n-1)) { |
| 177 | return None; |
| 178 | } |
| 179 | let meaning = otry!(decode(bytes)); |
| 180 | Some(Codepoint { |
| 181 | bytes: bytes, |
| 182 | rewind: 0, |
| 183 | meaning: meaning, |
| 184 | }) |
| 185 | } else { |
| 186 | Some(Codepoint { |
| 187 | bytes: unsafe_slice(buf, idx, avail), |
| 188 | rewind: 0, |
| 189 | meaning: Meaning::Prefix(n - avail), |
| 190 | }) |
| 191 | } |
| 192 | }, |
| 193 | Byte::Cont => { |
| 194 | let mut start = idx; |
| 195 | let mut checked = 0; |
| 196 | loop { |
| 197 | if start == 0 { |
| 198 | // Whoops, fell off the beginning. |
| 199 | return Some(Codepoint { |
| 200 | bytes: unsafe_slice(buf, 0, idx + 1), |
| 201 | rewind: idx, |
| 202 | meaning: Meaning::Suffix, |
| 203 | }); |
| 204 | } |
| 205 | |
| 206 | start -= 1; |
| 207 | checked += 1; |
| 208 | match otry!(Byte::classify(*buf.get_unchecked(start))) { |
| 209 | Byte::Cont => (), |
| 210 | Byte::Start(n) => { |
| 211 | let avail = buf.len() - start; |
| 212 | if avail >= n { |
| 213 | let bytes = unsafe_slice(buf, start, n); |
| 214 | if checked < n { |
| 215 | if !all_cont(unsafe_slice(bytes, checked, n-checked)) { |
| 216 | return None; |
| 217 | } |
| 218 | } |
| 219 | let meaning = otry!(decode(bytes)); |
| 220 | return Some(Codepoint { |
| 221 | bytes: bytes, |
| 222 | rewind: idx - start, |
| 223 | meaning: meaning, |
| 224 | }); |
| 225 | } else { |
| 226 | return Some(Codepoint { |
| 227 | bytes: unsafe_slice(buf, start, avail), |
| 228 | rewind: idx - start, |
| 229 | meaning: Meaning::Prefix(n - avail), |
| 230 | }); |
| 231 | } |
| 232 | } |
| 233 | _ => return None, |
| 234 | } |
| 235 | |
| 236 | if idx - start >= 3 { |
| 237 | // We looked at 3 bytes before a continuation byte |
| 238 | // and didn't find a start byte. |
| 239 | return None; |
| 240 | } |
| 241 | } |
| 242 | } |
| 243 | } |
| 244 | } |
| 245 | } |
| 246 | |
| 247 | #[cfg (test)] |
| 248 | mod test; |
| 249 | |