lib.rs source code [crates/futf/src/lib.rs]

1	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
4	// option. This file may not be copied, modified, or distributed
5	// except according to those terms.
6
7	#![cfg_attr(test, feature(test))]
8
9	#[macro_use]
10	extern crate debug_unreachable;
11
12	#[macro_use]
13	extern crate mac;
14
15	#[cfg(test)]
16	extern crate test as std_test;
17
18	use std::{slice, char};
19
/// What a complete or partial UTF-8 codepoint stands for.
///
/// Validation is lazy: a `Prefix` or `Suffix` is reported as soon as the
/// buffer runs out, so it may turn out to have no valid completion at all.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Meaning {
    /// A complete, valid Unicode scalar value.
    Whole(char),

    /// Not a valid Unicode codepoint, but a value that *would* correspond
    /// to a UTF-16 leading surrogate code unit, i.e. one in the range
    /// `U+D800` - `U+DBFF`.
    ///
    /// The payload is the code unit's 10-bit index within that range.
    ///
    /// Such sequences occur in UTF-8 variants like CESU-8 and WTF-8.
    LeadSurrogate(u16),

    /// Not a valid Unicode codepoint, but a value that *would* correspond
    /// to a UTF-16 trailing surrogate code unit, i.e. one in the range
    /// `U+DC00` - `U+DFFF`.
    ///
    /// The payload is the code unit's 10-bit index within that range.
    ///
    /// Such sequences occur in UTF-8 variants like CESU-8 and WTF-8.
    TrailSurrogate(u16),

    /// The buffer ended after only the beginning of a codepoint.
    ///
    /// The payload is the number of additional bytes still needed.
    Prefix(usize),

    /// Only the tail of a codepoint was found before running off the
    /// start of the buffer.
    ///
    /// Up to 3 more bytes may be needed.
    Suffix,
}
58
59	/// Represents a complete or partial UTF-8 codepoint.
60	#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
61	pub struct Codepoint<'a> {
62	/// The bytes that make up the partial or full codepoint.
63	///
64	/// For a `Suffix` this depends on `idx`. We don't scan forward
65	/// for additional continuation bytes after the reverse scan
66	/// failed to locate a multibyte sequence start.
67	pub bytes: &'a [u8],
68
69	/// Start of the codepoint in the buffer, expressed as an offset
70	/// back from `idx`.
71	pub rewind: usize,
72
73	/// Meaning of the partial or full codepoint.
74	pub meaning: Meaning,
75	}
76
/// Role a single byte can play inside a UTF-8 stream.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// A one-byte (ASCII) codepoint, `0x00..=0x7F`.
    Ascii,
    /// The first byte of a multibyte sequence; the payload is the total
    /// length of that sequence in bytes (2, 3 or 4).
    Start(usize),
    /// A continuation byte, `0x80..=0xBF`.
    Cont,
}

impl Byte {
    /// Classifies `x` by its high bits alone.
    ///
    /// Returns `None` for `0xF8..=0xFF`, which is neither ASCII, a
    /// continuation byte, nor the start of a 2-, 3- or 4-byte sequence.
    /// Start bytes that can only introduce overlong or out-of-range
    /// sequences (such as `0xC0`) are still reported as `Start`.
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        match x {
            0x00..=0x7F => Some(Byte::Ascii),    // 0xxx_xxxx
            0x80..=0xBF => Some(Byte::Cont),     // 10xx_xxxx
            0xC0..=0xDF => Some(Byte::Start(2)), // 110x_xxxx
            0xE0..=0xEF => Some(Byte::Start(3)), // 1110_xxxx
            0xF0..=0xF7 => Some(Byte::Start(4)), // 1111_0xxx
            0xF8..=0xFF => None,                 // 1111_1xxx
        }
    }
}
99
100	#[inline(always)]
101	fn all_cont(buf: &[u8]) -> bool {
102	buf.iter().all(\|&b: u8\| matches!(Byte::classify(b), Some(Byte::Cont)))
103	}
104
105	// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
106	// a starting byte followed by the correct number of continuation bytes.
107	#[inline(always)]
108	unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
109	debug_assert!(buf.len() >= `2`);
110	debug_assert!(buf.len() <= `4`);
111	let n;
112	match buf.len() {
113	`2` => {
114	n = ((buf.get_unchecked(`0`) & `0b11111`) as u32*) << `6`
115	\| ((buf.get_unchecked(`1`) & `0x3F`) as u32*);
116	if n < `0x80` { return None } // Overlong
117	}
118	`3` => {
119	n = ((buf.get_unchecked(`0`) & `0b1111`) as u32*) << `12`
120	\| ((buf.get_unchecked(`1`) & `0x3F`) as u32*) << `6`
121	\| ((buf.get_unchecked(`2`) & `0x3F`) as u32*);
122	match n {
123	`0x0000` ... `0x07FF` => return None, // Overlong
124	`0xD800` ... `0xDBFF` => return Some(Meaning::LeadSurrogate(n as u16 - `0xD800`)),
125	`0xDC00` ... `0xDFFF` => return Some(Meaning::TrailSurrogate(n as u16 - `0xDC00`)),
126	_ => {}
127	}
128	}
129	`4` => {
130	n = ((buf.get_unchecked(`0`) & `0b111`) as u32*) << `18`
131	\| ((buf.get_unchecked(`1`) & `0x3F`) as u32*) << `12`
132	\| ((buf.get_unchecked(`2`) & `0x3F`) as u32*) << `6`
133	\| ((buf.get_unchecked(`3`) & `0x3F`) as u32*);
134	if n < `0x1_0000` { return None } // Overlong
135	}
136	_ => debug_unreachable!(),
137	}
138
139	char::from_u32(n).map(Meaning::Whole)
140	}
141
/// Returns the sub-slice of `buf` that starts at `start` and is `new_len`
/// bytes long, without bounds checks in release builds.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Both conditions are only verified by
/// `debug_assert!`; violating them in a release build is undefined
/// behaviour.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    // `add` takes the unsigned offset directly, avoiding a usize -> isize cast.
    slice::from_raw_parts(buf.as_ptr().add(start), new_len)
}
148
// Unwraps an `Option`, or makes the enclosing function return `None`.
macro_rules! otry {
    ($x:expr) => {
        match $x {
            Some(value) => value,
            None => return None,
        }
    };
}
152
153	/// Describes the UTF-8 codepoint containing the byte at index `idx` within
154	/// `buf`.
155	///
156	/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
157	/// in the vicinity of `idx`.
158	#[inline]
159	pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
160	if idx >= buf.len() {
161	return None;
162	}
163
164	unsafe {
165	let x = *buf.get_unchecked(idx);
166	match otry!(Byte::classify(x)) {
167	Byte::Ascii => Some(Codepoint {
168	bytes: unsafe_slice(buf, idx, `1`),
169	rewind: `0`,
170	meaning: Meaning::Whole(x as char),
171	}),
172	Byte::Start(n) => {
173	let avail = buf.len() - idx;
174	if avail >= n {
175	let bytes = unsafe_slice(buf, idx, n);
176	if !all_cont(unsafe_slice(bytes, `1`, n-`1`)) {
177	return None;
178	}
179	let meaning = otry!(decode(bytes));
180	Some(Codepoint {
181	bytes: bytes,
182	rewind: `0`,
183	meaning: meaning,
184	})
185	} else {
186	Some(Codepoint {
187	bytes: unsafe_slice(buf, idx, avail),
188	rewind: `0`,
189	meaning: Meaning::Prefix(n - avail),
190	})
191	}
192	},
193	Byte::Cont => {
194	let mut start = idx;
195	let mut checked = `0`;
196	loop {
197	if start == `0` {
198	// Whoops, fell off the beginning.
199	return Some(Codepoint {
200	bytes: unsafe_slice(buf, `0`, idx + `1`),
201	rewind: idx,
202	meaning: Meaning::Suffix,
203	});
204	}
205
206	start -= `1`;
207	checked += `1`;
208	match otry!(Byte::classify(*buf.get_unchecked(start))) {
209	Byte::Cont => (),
210	Byte::Start(n) => {
211	let avail = buf.len() - start;
212	if avail >= n {
213	let bytes = unsafe_slice(buf, start, n);
214	if checked < n {
215	if !all_cont(unsafe_slice(bytes, checked, n-checked)) {
216	return None;
217	}
218	}
219	let meaning = otry!(decode(bytes));
220	return Some(Codepoint {
221	bytes: bytes,
222	rewind: idx - start,
223	meaning: meaning,
224	});
225	} else {
226	return Some(Codepoint {
227	bytes: unsafe_slice(buf, start, avail),
228	rewind: idx - start,
229	meaning: Meaning::Prefix(n - avail),
230	});
231	}
232	}
233	_ => return None,
234	}
235
236	if idx - start >= `3` {
237	// We looked at 3 bytes before a continuation byte
238	// and didn't find a start byte.
239	return None;
240	}
241	}
242	}
243	}
244	}
245	}
246
247	#[cfg(test)]
248	mod test;
249

Provided by KDAB

Definitions

Learn Rust with the experts

Find out more