wtf8.rs source code [crates/core/src/wtf8.rs]

1	//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
2	//!
3	//! This library uses Rust’s type system to maintain
4	//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
5	//! like the `String` and `&str` types do for UTF-8.
6	//!
7	//! Since [WTF-8 must not be used
8	//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
9	//! this library deliberately does not provide access to the underlying bytes
10	//! of WTF-8 strings,
11	//! nor can it decode WTF-8 from arbitrary bytes.
12	//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
13	#![unstable(
14	feature = "wtf8_internals",
15	issue = "none",
16	reason = "this is internal code for representing OsStr on some platforms and not a public API"
17	)]
18	// rustdoc bug: doc(hidden) on the module won't stop types in the module from showing up in trait
19	// implementations, so, we'll have to add more doc(hidden)s anyway
20	#![doc(hidden)]
21
22	use crate::char::{EscapeDebugExtArgs, encode_utf16_raw};
23	use crate::clone::CloneToUninit;
24	use crate::fmt::{self, Write};
25	use crate::hash::{Hash, Hasher};
26	use crate::iter::FusedIterator;
27	use crate::num::niche_types::CodePointInner;
28	use crate::str::next_code_point;
29	use crate::{ops, slice, str};
30
31	/// A Unicode code point: from U+0000 to U+10FFFF.
32	///
33	/// Compares with the `char` type,
34	/// which represents a Unicode scalar value:
35	/// a code point that is not a surrogate (U+D800 to U+DFFF).
36	#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
37	#[doc(hidden)]
38	pub struct CodePoint(CodePointInner);
39
40	/// Format the code point as `U+` followed by four to six hexadecimal digits.
41	/// Example: `U+1F4A9`
42	impl fmt::Debug for CodePoint {
43	#[inline]
44	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
45	write!(formatter, "U+{:`04`X}", self.0.as_inner())
46	}
47	}
48
49	impl CodePoint {
50	/// Unsafely creates a new `CodePoint` without checking the value.
51	///
52	/// Only use when `value` is known to be less than or equal to 0x10FFFF.
53	#[inline]
54	pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
55	// SAFETY: Guaranteed by caller.
56	CodePoint(unsafe { CodePointInner::new_unchecked(value) })
57	}
58
59	/// Creates a new `CodePoint` if the value is a valid code point.
60	///
61	/// Returns `None` if `value` is above 0x10FFFF.
62	#[inline]
63	pub fn from_u32(value: u32) -> Option<CodePoint> {
64	Some(CodePoint(CodePointInner::new(value)?))
65	}
66
67	/// Creates a new `CodePoint` from a `char`.
68	///
69	/// Since all Unicode scalar values are code points, this always succeeds.
70	#[inline]
71	pub fn from_char(value: char) -> CodePoint {
72	// SAFETY: All char are valid for this type.
73	unsafe { CodePoint::from_u32_unchecked(value as u32) }
74	}
75
76	/// Returns the numeric value of the code point.
77	#[inline]
78	pub fn to_u32(&self) -> u32 {
79	self.0.as_inner()
80	}
81
82	/// Returns the numeric value of the code point if it is a leading surrogate.
83	#[inline]
84	pub fn to_lead_surrogate(&self) -> Option<u16> {
85	match self.to_u32() {
86	lead @ `0xD800`..=`0xDBFF` => Some(lead as u16),
87	_ => None,
88	}
89	}
90
91	/// Returns the numeric value of the code point if it is a trailing surrogate.
92	#[inline]
93	pub fn to_trail_surrogate(&self) -> Option<u16> {
94	match self.to_u32() {
95	trail @ `0xDC00`..=`0xDFFF` => Some(trail as u16),
96	_ => None,
97	}
98	}
99
100	/// Optionally returns a Unicode scalar value for the code point.
101	///
102	/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
103	#[inline]
104	pub fn to_char(&self) -> Option<char> {
105	match self.to_u32() {
106	`0xD800`..=`0xDFFF` => None,
107	// SAFETY: We explicitly check that the char is valid.
108	valid => Some(unsafe { char::from_u32_unchecked(valid) }),
109	}
110	}
111
112	/// Returns a Unicode scalar value for the code point.
113	///
114	/// Returns `'\u{FFFD}'` (the replacement character “�”)
115	/// if the code point is a surrogate (from U+D800 to U+DFFF).
116	#[inline]
117	pub fn to_char_lossy(&self) -> char {
118	self.to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
119	}
120	}
121
122	/// A borrowed slice of well-formed WTF-8 data.
123	///
124	/// Similar to `&str`, but can additionally contain surrogate code points
125	/// if they’re not in a surrogate pair.
126	#[derive(Eq, Ord, PartialEq, PartialOrd)]
127	#[repr(transparent)]
128	#[rustc_has_incoherent_inherent_impls]
129	#[doc(hidden)]
130	pub struct Wtf8 {
131	bytes: [u8],
132	}
133
134	impl AsRef<[u8]> for Wtf8 {
135	#[inline]
136	fn as_ref(&self) -> &[u8] {
137	&self.bytes
138	}
139	}
140
141	/// Formats the string in double quotes, with characters escaped according to
142	/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
143	/// where each `x` is a hexadecimal digit.
144	impl fmt::Debug for Wtf8 {
145	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
146	fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
147	use crate::fmt::Write as _;
148	for c in s.chars().flat_map(\|c\| {
149	c.escape_debug_ext(EscapeDebugExtArgs {
150	escape_grapheme_extended: `true`,
151	escape_single_quote: `false`,
152	escape_double_quote: `true`,
153	})
154	}) {
155	f.write_char(c)?
156	}
157	Ok(())
158	}
159
160	formatter.write_char('"')?;
161	let mut pos = `0`;
162	while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
163	// SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes.
164	write_str_escaped(formatter, unsafe {
165	str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
166	})?;
167	write!(formatter, "`\\`u`{{`{:x}`}}`", surrogate)?;
168	pos = surrogate_pos + `3`;
169	}
170
171	// SAFETY: after next_surrogate returns None, the remainder is valid UTF-8.
172	write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
173	formatter.write_char('"')
174	}
175	}
176
177	/// Formats the string with unpaired surrogates substituted with the replacement
178	/// character, U+FFFD.
179	impl fmt::Display for Wtf8 {
180	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
181	let wtf8_bytes: &[u8] = &self.bytes;
182	let mut pos: usize = `0`;
183	loop {
184	match self.next_surrogate(pos) {
185	Some((surrogate_pos: usize, _)) => {
186	// SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes.
187	formatter.write_str(data:unsafe {
188	str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
189	})?;
190	formatter.write_char(char::REPLACEMENT_CHARACTER)?;
191	pos = surrogate_pos + `3`;
192	}
193	None => {
194	// SAFETY: after next_surrogate returns None, the remainder is valid UTF-8.
195	let s: &str = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
196	if pos == `0` { return s.fmt(formatter) } else { return formatter.write_str(data:s) }
197	}
198	}
199	}
200	}
201	}
202
203	impl Wtf8 {
204	/// Creates a WTF-8 slice from a UTF-8 `&str` slice.
205	#[inline]
206	pub fn from_str(value: &str) -> &Wtf8 {
207	// SAFETY: Since WTF-8 is a superset of UTF-8, this always is valid.
208	unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
209	}
210
211	/// Creates a WTF-8 slice from a WTF-8 byte slice.
212	///
213	/// Since the byte slice is not checked for valid WTF-8, this functions is
214	/// marked unsafe.
215	#[inline]
216	pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
217	// SAFETY: start with &[u8], end with fancy &[u8]
218	unsafe { &(value as const [u8] as *const Wtf8) }
219	}
220
221	/// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
222	///
223	/// Since the byte slice is not checked for valid WTF-8, this functions is
224	/// marked unsafe.
225	#[inline]
226	pub unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
227	// SAFETY: start with &mut [u8], end with fancy &mut [u8]
228	unsafe { &mut (value as mut [u8] as *mut Wtf8) }
229	}
230
231	/// Returns the length, in WTF-8 bytes.
232	#[inline]
233	pub fn len(&self) -> usize {
234	self.bytes.len()
235	}
236
237	#[inline]
238	pub fn is_empty(&self) -> bool {
239	self.bytes.is_empty()
240	}
241
242	/// Returns the code point at `position` if it is in the ASCII range,
243	/// or `b'\xFF'` otherwise.
244	///
245	/// # Panics
246	///
247	/// Panics if `position` is beyond the end of the string.
248	#[inline]
249	pub fn ascii_byte_at(&self, position: usize) -> u8 {
250	match self.bytes[position] {
251	ascii_byte @ `0x00`..=`0x7F` => ascii_byte,
252	_ => `0xFF`,
253	}
254	}
255
256	/// Returns an iterator for the string’s code points.
257	#[inline]
258	pub fn code_points(&self) -> Wtf8CodePoints<'_> {
259	Wtf8CodePoints { bytes: self.bytes.iter() }
260	}
261
262	/// Access raw bytes of WTF-8 data
263	#[inline]
264	pub fn as_bytes(&self) -> &[u8] {
265	&self.bytes
266	}
267
268	/// Tries to convert the string to UTF-8 and return a `&str` slice.
269	///
270	/// Returns `None` if the string contains surrogates.
271	///
272	/// This does not copy the data.
273	#[inline]
274	pub fn as_str(&self) -> Result<&str, str::Utf8Error> {
275	str::from_utf8(&self.bytes)
276	}
277
278	/// Converts the WTF-8 string to potentially ill-formed UTF-16
279	/// and return an iterator of 16-bit code units.
280	///
281	/// This is lossless:
282	/// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
283	/// would always return the original WTF-8 string.
284	#[inline]
285	pub fn encode_wide(&self) -> EncodeWide<'_> {
286	EncodeWide { code_points: self.code_points(), extra: `0` }
287	}
288
289	#[inline]
290	pub fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
291	let mut iter = self.bytes[pos..].iter();
292	loop {
293	let b = *iter.next()?;
294	if b < `0x80` {
295	pos += `1`;
296	} else if b < `0xE0` {
297	iter.next();
298	pos += `2`;
299	} else if b == `0xED` {
300	match (iter.next(), iter.next()) {
301	(Some(&b2), Some(&b3)) if b2 >= `0xA0` => {
302	return Some((pos, decode_surrogate(b2, b3)));
303	}
304	_ => pos += `3`,
305	}
306	} else if b < `0xF0` {
307	iter.next();
308	iter.next();
309	pos += `3`;
310	} else {
311	iter.next();
312	iter.next();
313	iter.next();
314	pos += `4`;
315	}
316	}
317	}
318
319	#[inline]
320	pub fn final_lead_surrogate(&self) -> Option<u16> {
321	match self.bytes {
322	[.., `0xED`, b2 @ `0xA0`..=`0xAF`, b3] => Some(decode_surrogate(b2, b3)),
323	_ => None,
324	}
325	}
326
327	#[inline]
328	pub fn initial_trail_surrogate(&self) -> Option<u16> {
329	match self.bytes {
330	[`0xED`, b2 @ `0xB0`..=`0xBF`, b3, ..] => Some(decode_surrogate(b2, b3)),
331	_ => None,
332	}
333	}
334
335	#[inline]
336	pub fn make_ascii_lowercase(&mut self) {
337	self.bytes.make_ascii_lowercase()
338	}
339
340	#[inline]
341	pub fn make_ascii_uppercase(&mut self) {
342	self.bytes.make_ascii_uppercase()
343	}
344
345	#[inline]
346	pub fn is_ascii(&self) -> bool {
347	self.bytes.is_ascii()
348	}
349
350	#[inline]
351	pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
352	self.bytes.eq_ignore_ascii_case(&other.bytes)
353	}
354	}
355
356	/// Returns a slice of the given string for the byte range \[`begin`..`end`).
357	///
358	/// # Panics
359	///
360	/// Panics when `begin` and `end` do not point to code point boundaries,
361	/// or point beyond the end of the string.
362	impl ops::Index<ops::Range<usize>> for Wtf8 {
363	type Output = Wtf8;
364
365	#[inline]
366	fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
367	if range.start <= range.end
368	&& self.is_code_point_boundary(index:range.start)
369	&& self.is_code_point_boundary(index:range.end)
370	{
371	// SAFETY: is_code_point_boundary checks that the index is valid
372	unsafe { slice_unchecked(self, begin:range.start, range.end) }
373	} else {
374	slice_error_fail(self, begin:range.start, range.end)
375	}
376	}
377	}
378
379	/// Returns a slice of the given string from byte `begin` to its end.
380	///
381	/// # Panics
382	///
383	/// Panics when `begin` is not at a code point boundary,
384	/// or is beyond the end of the string.
385	impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
386	type Output = Wtf8;
387
388	#[inline]
389	fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
390	if self.is_code_point_boundary(index:range.start) {
391	// SAFETY: is_code_point_boundary checks that the index is valid
392	unsafe { slice_unchecked(self, begin:range.start, self.len()) }
393	} else {
394	slice_error_fail(self, begin:range.start, self.len())
395	}
396	}
397	}
398
399	/// Returns a slice of the given string from its beginning to byte `end`.
400	///
401	/// # Panics
402	///
403	/// Panics when `end` is not at a code point boundary,
404	/// or is beyond the end of the string.
405	impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
406	type Output = Wtf8;
407
408	#[inline]
409	fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
410	if self.is_code_point_boundary(index:range.end) {
411	// SAFETY: is_code_point_boundary checks that the index is valid
412	unsafe { slice_unchecked(self, begin:`0`, range.end) }
413	} else {
414	slice_error_fail(self, begin:`0`, range.end)
415	}
416	}
417	}
418
419	impl ops::Index<ops::RangeFull> for Wtf8 {
420	type Output = Wtf8;
421
422	#[inline]
423	fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
424	self
425	}
426	}
427
/// Decodes the 16-bit surrogate value from the second and third bytes of a
/// three-byte WTF-8 sequence.
///
/// The first byte of the sequence is assumed to be 0xED. Only the low six
/// bits of each argument contribute to the result.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // The first byte is assumed to be 0xED
    0xD800 | ((second_byte as u16 & 0x3F) << 6) | (third_byte as u16 & 0x3F)
}
433
434	impl Wtf8 {
435	/// Copied from str::is_char_boundary
436	#[inline]
437	pub fn is_code_point_boundary(&self, index: usize) -> bool {
438	if index == `0` {
439	return `true`;
440	}
441	match self.bytes.get(index) {
442	None => index == self.len(),
443	Some(&b) => (b as i8) >= `-0x40`,
444	}
445	}
446
447	/// Verify that `index` is at the edge of either a valid UTF-8 codepoint
448	/// (i.e. a codepoint that's not a surrogate) or of the whole string.
449	///
450	/// These are the cases currently permitted by `OsStr::self_encoded_bytes`.
451	/// Splitting between surrogates is valid as far as WTF-8 is concerned, but
452	/// we do not permit it in the public API because WTF-8 is considered an
453	/// implementation detail.
454	#[track_caller]
455	#[inline]
456	pub fn check_utf8_boundary(&self, index: usize) {
457	if index == `0` {
458	return;
459	}
460	match self.bytes.get(index) {
461	Some(`0xED`) => (), // Might be a surrogate
462	Some(&b) if (b as i8) >= `-0x40` => return,
463	Some(_) => panic!("byte index {index} is not a codepoint boundary"),
464	None if index == self.len() => return,
465	None => panic!("byte index {index} is out of bounds"),
466	}
467	if self.bytes[index + `1`] >= `0xA0` {
468	// There's a surrogate after index. Now check before index.
469	if index >= `3` && self.bytes[index - `3`] == `0xED` && self.bytes[index - `2`] >= `0xA0` {
470	panic!("byte index {index} lies between surrogate codepoints");
471	}
472	}
473	}
474	}
475
476	/// Copied from core::str::raw::slice_unchecked
477	#[inline]
478	unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
479	// SAFETY: memory layout of a &[u8] and &Wtf8 are the same
480	unsafe {
481	let len: usize = end - begin;
482	let start: *const u8 = s.as_bytes().as_ptr().add(count:begin);
483	Wtf8::from_bytes_unchecked(slice::from_raw_parts(data:start, len))
484	}
485	}
486
487	/// Copied from core::str::raw::slice_error_fail
488	#[inline(never)]
489	fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
490	assert!(begin <= end);
491	panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
492	}
493
/// An iterator over the code points of a WTF-8 string.
///
/// Obtained by calling the `.code_points()` method.
#[doc(hidden)]
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    bytes: slice::Iter<'a, u8>,
}
502
503	impl Iterator for Wtf8CodePoints<'_> {
504	type Item = CodePoint;
505
506	#[inline]
507	fn next(&mut self) -> Option<CodePoint> {
508	// SAFETY: `self.bytes` has been created from a WTF-8 string
509	unsafe { next_code_point(&mut self.bytes).map(\|c: u32\| CodePoint::from_u32_unchecked(c)) }
510	}
511
512	#[inline]
513	fn size_hint(&self) -> (usize, Option<usize>) {
514	let len: usize = self.bytes.len();
515	(len.saturating_add(`3`) / `4`, Some(len))
516	}
517	}
518
519	impl fmt::Debug for Wtf8CodePoints<'_> {
520	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
521	f&mut DebugTuple<'_, '_>.debug_tuple(name:"Wtf8CodePoints")
522	// SAFETY: We always leave the string in a valid state after each iteration.
523	.field(&unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) })
524	.finish()
525	}
526	}
527
528	/// Generates a wide character sequence for potentially ill-formed UTF-16.
529	#[stable(feature = "rust1", since = "1.0.0")]
530	#[derive(Clone)]
531	#[doc(hidden)]
532	pub struct EncodeWide<'a> {
533	code_points: Wtf8CodePoints<'a>,
534	extra: u16,
535	}
536
537	// Copied from libunicode/u_str.rs
538	#[stable(feature = "rust1", since = "1.0.0")]
539	impl Iterator for EncodeWide<'_> {
540	type Item = u16;
541
542	#[inline]
543	fn next(&mut self) -> Option<u16> {
544	if self.extra != `0` {
545	let tmp = self.extra;
546	self.extra = `0`;
547	return Some(tmp);
548	}
549
550	let mut buf = [`0`; char::MAX_LEN_UTF16];
551	self.code_points.next().map(\|code_point\| {
552	let n = encode_utf16_raw(code_point.to_u32(), &mut buf).len();
553	if n == `2` {
554	self.extra = buf[`1`];
555	}
556	buf[`0`]
557	})
558	}
559
560	#[inline]
561	fn size_hint(&self) -> (usize, Option<usize>) {
562	let (low, high) = self.code_points.size_hint();
563	let ext = (self.extra != `0`) as usize;
564	// every code point gets either one u16 or two u16,
565	// so this iterator is between 1 or 2 times as
566	// long as the underlying iterator.
567	(low + ext, high.and_then(\|n\| n.checked_mul(`2`)).and_then(\|n\| n.checked_add(ext)))
568	}
569	}
570
571	#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")]
572	impl FusedIterator for EncodeWide<'_> {}
573
574	#[stable(feature = "encode_wide_debug", since = "1.92.0")]
575	impl fmt::Debug for EncodeWide<'_> {
576	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
577	struct CodeUnit(u16);
578	impl fmt::Debug for CodeUnit {
579	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
580	// This output attempts to balance readability with precision.
581	// Render characters which take only one WTF-16 code unit using
582	// `char` syntax and everything else as code units with hex
583	// integer syntax (including paired and unpaired surrogate
584	// halves). Since Rust has no `char`-like type for WTF-16, this
585	// isn't perfect, so if this output isn't suitable, it is open
586	// to being changed (see #140153).
587	match char::from_u32(self.0 as u32) {
588	Some(c) => write!(f, "{c:?}"),
589	None => write!(f, "0x{:`04`X}", self.0),
590	}
591	}
592	}
593
594	write!(f, "EncodeWide(")?;
595	f.debug_list().entries(self.clone().map(CodeUnit)).finish()?;
596	write!(f, ")")?;
597	Ok(())
598	}
599	}
600
601	impl Hash for CodePoint {
602	#[inline]
603	fn hash<H: Hasher>(&self, state: &mut H) {
604	self.0.hash(state)
605	}
606	}
607
608	impl Hash for Wtf8 {
609	#[inline]
610	fn hash<H: Hasher>(&self, state: &mut H) {
611	state.write(&self.bytes);
612	`0xfeu8`.hash(state)
613	}
614	}
615
616	#[unstable(feature = "clone_to_uninit", issue = "126799")]
617	unsafe impl CloneToUninit for Wtf8 {
618	#[inline]
619	#[cfg_attr(debug_assertions, track_caller)]
620	unsafe fn clone_to_uninit(&self, dst: *mut u8) {
621	// SAFETY: we're just a transparent wrapper around [u8]
622	unsafe { self.bytes.clone_to_uninit(dest:dst) }
623	}
624	}
625