lib.rs source code [crates/utf16_iter/src/lib.rs]

1	// Copyright Mozilla Foundation
2	//
3	// Licensed under the Apache License (Version 2.0), or the MIT license,
4	// (the "Licenses") at your option. You may not use this file except in
5	// compliance with one of the Licenses. You may obtain copies of the
6	// Licenses at:
7	//
8	// https://www.apache.org/licenses/LICENSE-2.0
9	// https://opensource.org/licenses/MIT
10	//
11	// Unless required by applicable law or agreed to in writing, software
12	// distributed under the Licenses is distributed on an "AS IS" BASIS,
13	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14	// See the Licenses for the specific language governing permissions and
15	// limitations under the Licenses.
16
17	#![no_std]
18
//! Provides iteration by `char` over `&[u16]` containing potentially-invalid
//! UTF-16 such that errors are replaced with the REPLACEMENT CHARACTER.
//!
//! The trait `Utf16CharsEx` provides the convenience method `chars()` on
//! `u16` slices themselves instead of having to use the more verbose
//! `Utf16Chars::new(slice)`.
25
26	mod indices;
27	mod report;
28
29	pub use crate::indices::Utf16CharIndices;
30	pub use crate::report::ErrorReportingUtf16Chars;
31	pub use crate::report::Utf16CharsError;
32	use core::iter::FusedIterator;
33
/// Returns `true` if `start <= i && i <= end`.
///
/// The check is done with a single comparison: `i.wrapping_sub(start)`
/// maps values below `start` to large numbers, so anything outside the
/// range ends up greater than the span `end - start`.
///
/// Callers must pass `start <= end`; otherwise `end - start` overflows.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    i.wrapping_sub(start) <= (end - start)
}
38
/// Iterator by `char` over `&[u16]` that contains
/// potentially-invalid UTF-16. See the crate documentation.
#[derive(Debug, Clone)]
pub struct Utf16Chars<'a> {
    /// The code units that have not been consumed from either end yet.
    remaining: &'a [u16],
}
45
46	impl<'a> Utf16Chars<'a> {
47	#[inline(always)]
48	/// Creates the iterator from a `u16` slice.
49	pub fn new(code_units: &'a [u16]) -> Self {
50	Utf16Chars::<'a> {
51	remaining: code_units,
52	}
53	}
54
55	/// Views the current remaining data in the iterator as a subslice
56	/// of the original slice.
57	#[inline(always)]
58	pub fn as_slice(&self) -> &'a [u16] {
59	self.remaining
60	}
61
62	#[inline(never)]
63	fn surrogate_next(&mut self, surrogate_base: u16, first: u16) -> char {
64	if surrogate_base <= (`0xDBFF` - `0xD800`) {
65	if let Some((&low, tail_tail)) = self.remaining.split_first() {
66	if in_inclusive_range16(low, `0xDC00`, `0xDFFF`) {
67	self.remaining = tail_tail;
68	return unsafe {
69	char::from_u32_unchecked(
70	(u32::from(first) << `10`) + u32::from(low)
71	- (((`0xD800u32` << `10`) - `0x10000u32`) + `0xDC00u32`),
72	)
73	};
74	}
75	}
76	}
77	'`\u{FFFD}`'
78	}
79
80	#[inline(never)]
81	fn surrogate_next_back(&mut self, last: u16) -> char {
82	if in_inclusive_range16(last, `0xDC00`, `0xDFFF`) {
83	if let Some((&high, head_head)) = self.remaining.split_last() {
84	if in_inclusive_range16(high, `0xD800`, `0xDBFF`) {
85	self.remaining = head_head;
86	return unsafe {
87	char::from_u32_unchecked(
88	(u32::from(high) << `10`) + u32::from(last)
89	- (((`0xD800u32` << `10`) - `0x10000u32`) + `0xDC00u32`),
90	)
91	};
92	}
93	}
94	}
95	'`\u{FFFD}`'
96	}
97	}
98
99	impl<'a> Iterator for Utf16Chars<'a> {
100	type Item = char;
101
102	#[inline(always)]
103	fn next(&mut self) -> Option<char> {
104	// It might be OK to delegate to `ErrorReportingUtf16Chars`, but since
105	// the methods are rather small, copypaste is probably clearer. Also,
106	// copypaste would _not_ be equivalent if any part of this was delegated
107	// to an `inline(never)` helper. However, previous experimentation indicated
108	// that such a helper didn't help performance here.
109	let (&first: u16, tail: &[u16]) = self.remaining.split_first()?;
110	self.remaining = tail;
111	let surrogate_base: u16 = first.wrapping_sub(`0xD800`);
112	if surrogate_base > (`0xDFFF` - `0xD800`) {
113	return Some(unsafe { char::from_u32_unchecked(u32::from(first)) });
114	}
115	Some(self.surrogate_next(surrogate_base, first))
116	}
117	}
118
119	impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
120	#[inline(always)]
121	fn next_back(&mut self) -> Option<char> {
122	let (&last: u16, head: &[u16]) = self.remaining.split_last()?;
123	self.remaining = head;
124	if !in_inclusive_range16(i:last, start:`0xD800`, end:`0xDFFF`) {
125	return Some(unsafe { char::from_u32_unchecked(u32::from(last)) });
126	}
127	Some(self.surrogate_next_back(last))
128	}
129	}
130
131	impl FusedIterator for Utf16Chars<'_> {}
132
133	/// Convenience trait that adds `chars()` and `char_indices()` methods
134	/// similar to the ones on string slices to `u16` slices.
135	pub trait Utf16CharsEx {
136	fn chars(&self) -> Utf16Chars<'_>;
137	fn char_indices(&self) -> Utf16CharIndices<'_>;
138	}
139
140	impl Utf16CharsEx for [u16] {
141	/// Convenience method for creating an UTF-16 iterator
142	/// for the slice.
143	#[inline]
144	fn chars(&self) -> Utf16Chars<'_> {
145	Utf16Chars::new(self)
146	}
147	/// Convenience method for creating a code unit index and
148	/// UTF-16 iterator for the slice.
149	#[inline]
150	fn char_indices(&self) -> Utf16CharIndices<'_> {
151	Utf16CharIndices::new(self)
152	}
153	}
154
#[cfg(test)]
mod tests {
    use crate::Utf16CharsEx;

    /// Code units adjacent to the surrogate range decode as themselves;
    /// lone code units at either end of the range become U+FFFD.
    #[test]
    fn test_boundaries() {
        assert!([0xD7FFu16]
            .as_slice()
            .chars()
            .eq(core::iter::once('\u{D7FF}')));
        assert!([0xE000u16]
            .as_slice()
            .chars()
            .eq(core::iter::once('\u{E000}')));
        assert!([0xD800u16]
            .as_slice()
            .chars()
            .eq(core::iter::once('\u{FFFD}')));
        assert!([0xDFFFu16]
            .as_slice()
            .chars()
            .eq(core::iter::once('\u{FFFD}')));
    }

    /// An unpaired surrogate followed by a non-surrogate yields U+FFFD
    /// and then the non-surrogate.
    #[test]
    fn test_unpaired() {
        assert!([0xD800u16, 0x0061u16]
            .as_slice()
            .chars()
            .eq([0xFFFDu16, 0x0061u16].as_slice().chars()));
        assert!([0xDFFFu16, 0x0061u16]
            .as_slice()
            .chars()
            .eq([0xFFFDu16, 0x0061u16].as_slice().chars()));
    }

    /// Same as `test_unpaired`, but iterating from the back.
    #[test]
    fn test_unpaired_rev() {
        assert!([0xD800u16, 0x0061u16]
            .as_slice()
            .chars()
            .rev()
            .eq([0xFFFDu16, 0x0061u16].as_slice().chars().rev()));
        assert!([0xDFFFu16, 0x0061u16]
            .as_slice()
            .chars()
            .rev()
            .eq([0xFFFDu16, 0x0061u16].as_slice().chars().rev()));
    }

    /// A valid surrogate pair decodes to a single supplementary-plane char.
    #[test]
    fn test_paired() {
        assert!([0xD83Eu16, 0xDD73u16]
            .as_slice()
            .chars()
            .eq(core::iter::once('🥳')));
    }

    /// Same as `test_paired`, but iterating from the back.
    #[test]
    fn test_paired_rev() {
        assert!([0xD83Eu16, 0xDD73u16]
            .as_slice()
            .chars()
            .rev()
            .eq(core::iter::once('🥳')));
    }

    /// `as_slice()` tracks the unconsumed part of the input as the
    /// iterator advances.
    #[test]
    fn test_as_slice() {
        let mut iter = [0x0061u16, 0x0062u16].as_slice().chars();
        let at_start = iter.as_slice();
        assert_eq!(iter.next(), Some('a'));
        let in_middle = iter.as_slice();
        assert_eq!(iter.next(), Some('b'));
        let at_end = iter.as_slice();
        assert_eq!(at_start.len(), 2);
        assert_eq!(in_middle.len(), 1);
        assert_eq!(at_end.len(), 0);
        assert_eq!(at_start[0], 0x0061u16);
        assert_eq!(at_start[1], 0x0062u16);
        assert_eq!(in_middle[0], 0x0062u16);
    }
}
238