// wtf8.rs source code [crates/std/src/sys_common/wtf8.rs]

1	//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
2	//!
3	//! This library uses Rust’s type system to maintain
4	//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
5	//! like the `String` and `&str` types do for UTF-8.
6	//!
7	//! Since [WTF-8 must not be used
8	//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
9	//! this library deliberately does not provide access to the underlying bytes
10	//! of WTF-8 strings,
11	//! nor can it decode WTF-8 from arbitrary bytes.
12	//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
13
14	// this module is imported from @SimonSapin's repo and has tons of dead code on
15	// unix (it's mostly used on windows), so don't worry about dead code here.
16	#![allow(dead_code)]
17
18	#[cfg(test)]
19	mod tests;
20
21	use core::char::{encode_utf16_raw, encode_utf8_raw};
22	use core::str::next_code_point;
23
24	use crate::borrow::Cow;
25	use crate::collections::TryReserveError;
26	use crate::fmt;
27	use crate::hash::{Hash, Hasher};
28	use crate::iter::FusedIterator;
29	use crate::mem;
30	use crate::ops;
31	use crate::rc::Rc;
32	use crate::slice;
33	use crate::str;
34	use crate::sync::Arc;
35	use crate::sys_common::AsInner;
36
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, substituted for surrogates
/// in the lossy conversions.
///
/// It is three bytes long, the same length as a WTF-8-encoded surrogate, which
/// is what allows `into_string_lossy` to overwrite surrogates in place.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
38
/// A Unicode code point, i.e. any value from U+0000 to U+10FFFF.
///
/// Contrast this with the `char` type, which holds a Unicode scalar value:
/// a code point that is not a surrogate (U+D800 to U+DFFF).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct CodePoint {
    /// Numeric value of the code point.
    value: u32,
}
48
49	/// Format the code point as `U+` followed by four to six hexadecimal digits.
50	/// Example: `U+1F4A9`
51	impl fmt::Debug for CodePoint {
52	#[inline]
53	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
54	write!(formatter, "U+{:`04`X}", self.value)
55	}
56	}
57
58	impl CodePoint {
59	/// Unsafely creates a new `CodePoint` without checking the value.
60	///
61	/// Only use when `value` is known to be less than or equal to 0x10FFFF.
62	#[inline]
63	pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
64	CodePoint { value }
65	}
66
67	/// Creates a new `CodePoint` if the value is a valid code point.
68	///
69	/// Returns `None` if `value` is above 0x10FFFF.
70	#[inline]
71	pub fn from_u32(value: u32) -> Option<CodePoint> {
72	match value {
73	`0`..=`0x10FFFF` => Some(CodePoint { value }),
74	_ => None,
75	}
76	}
77
78	/// Creates a new `CodePoint` from a `char`.
79	///
80	/// Since all Unicode scalar values are code points, this always succeeds.
81	#[inline]
82	pub fn from_char(value: char) -> CodePoint {
83	CodePoint { value: value as u32 }
84	}
85
86	/// Returns the numeric value of the code point.
87	#[inline]
88	pub fn to_u32(&self) -> u32 {
89	self.value
90	}
91
92	/// Returns the numeric value of the code point if it is a leading surrogate.
93	#[inline]
94	pub fn to_lead_surrogate(&self) -> Option<u16> {
95	match self.value {
96	lead @ `0xD800`..=`0xDBFF` => Some(lead as u16),
97	_ => None,
98	}
99	}
100
101	/// Returns the numeric value of the code point if it is a trailing surrogate.
102	#[inline]
103	pub fn to_trail_surrogate(&self) -> Option<u16> {
104	match self.value {
105	trail @ `0xDC00`..=`0xDFFF` => Some(trail as u16),
106	_ => None,
107	}
108	}
109
110	/// Optionally returns a Unicode scalar value for the code point.
111	///
112	/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
113	#[inline]
114	pub fn to_char(&self) -> Option<char> {
115	match self.value {
116	`0xD800`..=`0xDFFF` => None,
117	_ => Some(unsafe { char::from_u32_unchecked(self.value) }),
118	}
119	}
120
121	/// Returns a Unicode scalar value for the code point.
122	///
123	/// Returns `'\u{FFFD}'` (the replacement character “�”)
124	/// if the code point is a surrogate (from U+D800 to U+DFFF).
125	#[inline]
126	pub fn to_char_lossy(&self) -> char {
127	self.to_char().unwrap_or('`\u{FFFD}`')
128	}
129	}
130
/// An owned, growable buffer holding well-formed WTF-8 data.
///
/// It plays the role `String` plays for UTF-8, except that it may also hold
/// surrogate code points as long as they do not form a surrogate pair.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8Buf {
    /// The WTF-8 encoded contents.
    bytes: Vec<u8>,

    /// Whether `bytes` is known to be valid UTF-8. This is cheap to know when
    /// the buffer was built from a `String` or a `&str`.
    ///
    /// The flag may be unset even though `bytes` happens to be valid UTF-8:
    /// for instance, concatenating `&Wtf8`s can pair up surrogates, and the
    /// whole string is not rescanned when that happens.
    is_known_utf8: bool,
}
147
148	impl ops::Deref for Wtf8Buf {
149	type Target = Wtf8;
150
151	fn deref(&self) -> &Wtf8 {
152	self.as_slice()
153	}
154	}
155
156	impl ops::DerefMut for Wtf8Buf {
157	fn deref_mut(&mut self) -> &mut Wtf8 {
158	self.as_mut_slice()
159	}
160	}
161
162	/// Format the string with double quotes,
163	/// and surrogates as `\u` followed by four hexadecimal digits.
164	/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
165	impl fmt::Debug for Wtf8Buf {
166	#[inline]
167	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
168	fmt::Debug::fmt(&**self, f:formatter)
169	}
170	}
171
172	impl Wtf8Buf {
173	/// Creates a new, empty WTF-8 string.
174	#[inline]
175	pub fn new() -> Wtf8Buf {
176	Wtf8Buf { bytes: Vec::new(), is_known_utf8: `true` }
177	}
178
179	/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
180	#[inline]
181	pub fn with_capacity(capacity: usize) -> Wtf8Buf {
182	Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: `true` }
183	}
184
185	/// Creates a WTF-8 string from a WTF-8 byte vec.
186	///
187	/// Since the byte vec is not checked for valid WTF-8, this functions is
188	/// marked unsafe.
189	#[inline]
190	pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
191	Wtf8Buf { bytes: value, is_known_utf8: `false` }
192	}
193
194	/// Creates a WTF-8 string from a UTF-8 `String`.
195	///
196	/// This takes ownership of the `String` and does not copy.
197	///
198	/// Since WTF-8 is a superset of UTF-8, this always succeeds.
199	#[inline]
200	pub fn from_string(string: String) -> Wtf8Buf {
201	Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: `true` }
202	}
203
204	/// Creates a WTF-8 string from a UTF-8 `&str` slice.
205	///
206	/// This copies the content of the slice.
207	///
208	/// Since WTF-8 is a superset of UTF-8, this always succeeds.
209	#[inline]
210	pub fn from_str(str: &str) -> Wtf8Buf {
211	Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: `true` }
212	}
213
214	pub fn clear(&mut self) {
215	self.bytes.clear();
216	self.is_known_utf8 = `true`;
217	}
218
219	/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
220	///
221	/// This is lossless: calling `.encode_wide()` on the resulting string
222	/// will always return the original code units.
223	pub fn from_wide(v: &[u16]) -> Wtf8Buf {
224	let mut string = Wtf8Buf::with_capacity(v.len());
225	for item in char::decode_utf16(v.iter().cloned()) {
226	match item {
227	Ok(ch) => string.push_char(ch),
228	Err(surrogate) => {
229	let surrogate = surrogate.unpaired_surrogate();
230	// Surrogates are known to be in the code point range.
231	let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
232	// The string will now contain an unpaired surrogate.
233	string.is_known_utf8 = `false`;
234	// Skip the WTF-8 concatenation check,
235	// surrogate pairs are already decoded by decode_utf16
236	string.push_code_point_unchecked(code_point);
237	}
238	}
239	}
240	string
241	}
242
243	/// Copied from String::push
244	/// This does not* include the WTF-8 concatenation check or `is_known_utf8` check.*
245	fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
246	let mut bytes = [`0`; `4`];
247	let bytes = encode_utf8_raw(code_point.value, &mut bytes);
248	self.bytes.extend_from_slice(bytes)
249	}
250
251	#[inline]
252	pub fn as_slice(&self) -> &Wtf8 {
253	unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
254	}
255
256	#[inline]
257	pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
258	// Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
259	// cause them to change from well-formed UTF-8 to ill-formed UTF-8,
260	// which would break the assumptions of the `is_known_utf8` field.
261	unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
262	}
263
264	/// Reserves capacity for at least `additional` more bytes to be inserted
265	/// in the given `Wtf8Buf`.
266	/// The collection may reserve more space to avoid frequent reallocations.
267	///
268	/// # Panics
269	///
270	/// Panics if the new capacity overflows `usize`.
271	#[inline]
272	pub fn reserve(&mut self, additional: usize) {
273	self.bytes.reserve(additional)
274	}
275
276	/// Tries to reserve capacity for at least `additional` more length units
277	/// in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to avoid
278	/// frequent reallocations. After calling `try_reserve`, capacity will be
279	/// greater than or equal to `self.len() + additional`. Does nothing if
280	/// capacity is already sufficient. This method preserves the contents even
281	/// if an error occurs.
282	///
283	/// # Errors
284	///
285	/// If the capacity overflows, or the allocator reports a failure, then an error
286	/// is returned.
287	#[inline]
288	pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
289	self.bytes.try_reserve(additional)
290	}
291
292	#[inline]
293	pub fn reserve_exact(&mut self, additional: usize) {
294	self.bytes.reserve_exact(additional)
295	}
296
297	/// Tries to reserve the minimum capacity for exactly `additional`
298	/// length units in the given `Wtf8Buf`. After calling
299	/// `try_reserve_exact`, capacity will be greater than or equal to
300	/// `self.len() + additional` if it returns `Ok(())`.
301	/// Does nothing if the capacity is already sufficient.
302	///
303	/// Note that the allocator may give the `Wtf8Buf` more space than it
304	/// requests. Therefore, capacity can not be relied upon to be precisely
305	/// minimal. Prefer [`try_reserve`] if future insertions are expected.
306	///
307	/// [`try_reserve`]: Wtf8Buf::try_reserve
308	///
309	/// # Errors
310	///
311	/// If the capacity overflows, or the allocator reports a failure, then an error
312	/// is returned.
313	#[inline]
314	pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
315	self.bytes.try_reserve_exact(additional)
316	}
317
318	#[inline]
319	pub fn shrink_to_fit(&mut self) {
320	self.bytes.shrink_to_fit()
321	}
322
323	#[inline]
324	pub fn shrink_to(&mut self, min_capacity: usize) {
325	self.bytes.shrink_to(min_capacity)
326	}
327
328	/// Returns the number of bytes that this string buffer can hold without reallocating.
329	#[inline]
330	pub fn capacity(&self) -> usize {
331	self.bytes.capacity()
332	}
333
334	/// Append a UTF-8 slice at the end of the string.
335	#[inline]
336	pub fn push_str(&mut self, other: &str) {
337	self.bytes.extend_from_slice(other.as_bytes())
338	}
339
340	/// Append a WTF-8 slice at the end of the string.
341	///
342	/// This replaces newly paired surrogates at the boundary
343	/// with a supplementary code point,
344	/// like concatenating ill-formed UTF-16 strings effectively would.
345	#[inline]
346	pub fn push_wtf8(&mut self, other: &Wtf8) {
347	match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
348	// Replace newly paired surrogates by a supplementary code point.
349	(Some(lead), Some(trail)) => {
350	let len_without_lead_surrogate = self.len() - `3`;
351	self.bytes.truncate(len_without_lead_surrogate);
352	let other_without_trail_surrogate = &other.bytes[`3`..];
353	// 4 bytes for the supplementary code point
354	self.bytes.reserve(`4` + other_without_trail_surrogate.len());
355	self.push_char(decode_surrogate_pair(lead, trail));
356	self.bytes.extend_from_slice(other_without_trail_surrogate);
357	}
358	_ => {
359	// If we'll be pushing a string containing a surrogate, we may
360	// no longer have UTF-8.
361	if other.next_surrogate(`0`).is_some() {
362	self.is_known_utf8 = `false`;
363	}
364
365	self.bytes.extend_from_slice(&other.bytes);
366	}
367	}
368	}
369
370	/// Append a Unicode scalar value at the end of the string.
371	#[inline]
372	pub fn push_char(&mut self, c: char) {
373	self.push_code_point_unchecked(CodePoint::from_char(c))
374	}
375
376	/// Append a code point at the end of the string.
377	///
378	/// This replaces newly paired surrogates at the boundary
379	/// with a supplementary code point,
380	/// like concatenating ill-formed UTF-16 strings effectively would.
381	#[inline]
382	pub fn push(&mut self, code_point: CodePoint) {
383	if let Some(trail) = code_point.to_trail_surrogate() {
384	if let Some(lead) = (&*self).final_lead_surrogate() {
385	let len_without_lead_surrogate = self.len() - `3`;
386	self.bytes.truncate(len_without_lead_surrogate);
387	self.push_char(decode_surrogate_pair(lead, trail));
388	return;
389	}
390
391	// We're pushing a trailing surrogate.
392	self.is_known_utf8 = `false`;
393	} else if code_point.to_lead_surrogate().is_some() {
394	// We're pushing a leading surrogate.
395	self.is_known_utf8 = `false`;
396	}
397
398	// No newly paired surrogates at the boundary.
399	self.push_code_point_unchecked(code_point)
400	}
401
402	/// Shortens a string to the specified length.
403	///
404	/// # Panics
405	///
406	/// Panics if `new_len` > current length,
407	/// or if `new_len` is not a code point boundary.
408	#[inline]
409	pub fn truncate(&mut self, new_len: usize) {
410	assert!(is_code_point_boundary(self, new_len));
411	self.bytes.truncate(new_len)
412	}
413
414	/// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
415	#[inline]
416	pub fn into_bytes(self) -> Vec<u8> {
417	self.bytes
418	}
419
420	/// Consumes the WTF-8 string and tries to convert it to UTF-8.
421	///
422	/// This does not copy the data.
423	///
424	/// If the contents are not well-formed UTF-8
425	/// (that is, if the string contains surrogates),
426	/// the original WTF-8 string is returned instead.
427	pub fn into_string(self) -> Result<String, Wtf8Buf> {
428	if self.is_known_utf8 \|\| self.next_surrogate(`0`).is_none() {
429	Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
430	} else {
431	Err(self)
432	}
433	}
434
435	/// Consumes the WTF-8 string and converts it lossily to UTF-8.
436	///
437	/// This does not copy the data (but may overwrite parts of it in place).
438	///
439	/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
440	pub fn into_string_lossy(mut self) -> String {
441	// Fast path: If we already have UTF-8, we can return it immediately.
442	if self.is_known_utf8 {
443	return unsafe { String::from_utf8_unchecked(self.bytes) };
444	}
445
446	let mut pos = `0`;
447	loop {
448	match self.next_surrogate(pos) {
449	Some((surrogate_pos, _)) => {
450	pos = surrogate_pos + `3`;
451	self.bytes[surrogate_pos..pos]
452	.copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
453	}
454	None => return unsafe { String::from_utf8_unchecked(self.bytes) },
455	}
456	}
457	}
458
459	/// Converts this `Wtf8Buf` into a boxed `Wtf8`.
460	#[inline]
461	pub fn into_box(self) -> Box<Wtf8> {
462	// SAFETY: relies on `Wtf8` being `repr(transparent)`.
463	unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
464	}
465
466	/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
467	pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
468	let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
469	Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: `false` }
470	}
471	}
472
473	/// Creates a new WTF-8 string from an iterator of code points.
474	///
475	/// This replaces surrogate code point pairs with supplementary code points,
476	/// like concatenating ill-formed UTF-16 strings effectively would.
477	impl FromIterator<CodePoint> for Wtf8Buf {
478	fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
479	let mut string: Wtf8Buf = Wtf8Buf::new();
480	string.extend(iter);
481	string
482	}
483	}
484
485	/// Append code points from an iterator to the string.
486	///
487	/// This replaces surrogate code point pairs with supplementary code points,
488	/// like concatenating ill-formed UTF-16 strings effectively would.
489	impl Extend<CodePoint> for Wtf8Buf {
490	fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
491	let iterator: ::IntoIter = iter.into_iter();
492	let (low: usize, _high: Option) = iterator.size_hint();
493	// Lower bound of one byte per code point (ASCII only)
494	self.bytes.reserve(additional:low);
495	iterator.for_each(move \|code_point: CodePoint\| self.push(code_point));
496	}
497
498	#[inline]
499	fn extend_one(&mut self, code_point: CodePoint) {
500	self.push(code_point);
501	}
502
503	#[inline]
504	fn extend_reserve(&mut self, additional: usize) {
505	// Lower bound of one byte per code point (ASCII only)
506	self.bytes.reserve(additional);
507	}
508	}
509
/// A borrowed slice of well-formed WTF-8 data.
///
/// It plays the role `&str` plays for UTF-8, except that it may also hold
/// surrogate code points as long as they do not form a surrogate pair.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Wtf8 {
    /// The WTF-8 encoded contents.
    bytes: [u8],
}
519
520	impl AsInner<[u8]> for Wtf8 {
521	#[inline]
522	fn as_inner(&self) -> &[u8] {
523	&self.bytes
524	}
525	}
526
527	/// Format the slice with double quotes,
528	/// and surrogates as `\u` followed by four hexadecimal digits.
529	/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
530	impl fmt::Debug for Wtf8 {
531	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
532	fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
533	use crate::fmt::Write;
534	for c: char in s.chars().flat_map(\|c: char\| c.escape_debug()) {
535	f.write_char(c)?
536	}
537	Ok(())
538	}
539
540	formatter.write_str(data:"`\"`")?;
541	let mut pos: usize = `0`;
542	while let Some((surrogate_pos: usize, surrogate: u16)) = self.next_surrogate(pos) {
543	write_str_escaped(f:formatter, s:unsafe {
544	str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
545	})?;
546	write!(formatter, "`\\`u`{{`{:x}`}}`", surrogate)?;
547	pos = surrogate_pos + `3`;
548	}
549	write_str_escaped(f:formatter, s:unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
550	formatter.write_str(data:"`\"`")
551	}
552	}
553
554	impl fmt::Display for Wtf8 {
555	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
556	let wtf8_bytes: &[u8] = &self.bytes;
557	let mut pos: usize = `0`;
558	loop {
559	match self.next_surrogate(pos) {
560	Some((surrogate_pos: usize, _)) => {
561	formatter.write_str(data:unsafe {
562	str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
563	})?;
564	formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
565	pos = surrogate_pos + `3`;
566	}
567	None => {
568	let s: &str = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
569	if pos == `0` { return s.fmt(formatter) } else { return formatter.write_str(data:s) }
570	}
571	}
572	}
573	}
574	}
575
576	impl Wtf8 {
577	/// Creates a WTF-8 slice from a UTF-8 `&str` slice.
578	///
579	/// Since WTF-8 is a superset of UTF-8, this always succeeds.
580	#[inline]
581	pub fn from_str(value: &str) -> &Wtf8 {
582	unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
583	}
584
585	/// Creates a WTF-8 slice from a WTF-8 byte slice.
586	///
587	/// Since the byte slice is not checked for valid WTF-8, this functions is
588	/// marked unsafe.
589	#[inline]
590	pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
591	mem::transmute(value)
592	}
593
594	/// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
595	///
596	/// Since the byte slice is not checked for valid WTF-8, this functions is
597	/// marked unsafe.
598	#[inline]
599	unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
600	mem::transmute(value)
601	}
602
603	/// Returns the length, in WTF-8 bytes.
604	#[inline]
605	pub fn len(&self) -> usize {
606	self.bytes.len()
607	}
608
609	#[inline]
610	pub fn is_empty(&self) -> bool {
611	self.bytes.is_empty()
612	}
613
614	/// Returns the code point at `position` if it is in the ASCII range,
615	/// or `b'\xFF'` otherwise.
616	///
617	/// # Panics
618	///
619	/// Panics if `position` is beyond the end of the string.
620	#[inline]
621	pub fn ascii_byte_at(&self, position: usize) -> u8 {
622	match self.bytes[position] {
623	ascii_byte @ `0x00`..=`0x7F` => ascii_byte,
624	_ => `0xFF`,
625	}
626	}
627
628	/// Returns an iterator for the string’s code points.
629	#[inline]
630	pub fn code_points(&self) -> Wtf8CodePoints<'_> {
631	Wtf8CodePoints { bytes: self.bytes.iter() }
632	}
633
634	/// Access raw bytes of WTF-8 data
635	#[inline]
636	pub fn as_bytes(&self) -> &[u8] {
637	&self.bytes
638	}
639
640	/// Tries to convert the string to UTF-8 and return a `&str` slice.
641	///
642	/// Returns `None` if the string contains surrogates.
643	///
644	/// This does not copy the data.
645	#[inline]
646	pub fn as_str(&self) -> Result<&str, str::Utf8Error> {
647	str::from_utf8(&self.bytes)
648	}
649
650	/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
651	pub fn to_owned(&self) -> Wtf8Buf {
652	Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: `false` }
653	}
654
655	/// Lossily converts the string to UTF-8.
656	/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
657	///
658	/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
659	///
660	/// This only copies the data if necessary (if it contains any surrogate).
661	pub fn to_string_lossy(&self) -> Cow<'_, str> {
662	let surrogate_pos = match self.next_surrogate(`0`) {
663	None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
664	Some((pos, _)) => pos,
665	};
666	let wtf8_bytes = &self.bytes;
667	let mut utf8_bytes = Vec::with_capacity(self.len());
668	utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
669	utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
670	let mut pos = surrogate_pos + `3`;
671	loop {
672	match self.next_surrogate(pos) {
673	Some((surrogate_pos, _)) => {
674	utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
675	utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
676	pos = surrogate_pos + `3`;
677	}
678	None => {
679	utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
680	return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
681	}
682	}
683	}
684	}
685
686	/// Converts the WTF-8 string to potentially ill-formed UTF-16
687	/// and return an iterator of 16-bit code units.
688	///
689	/// This is lossless:
690	/// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
691	/// would always return the original WTF-8 string.
692	#[inline]
693	pub fn encode_wide(&self) -> EncodeWide<'_> {
694	EncodeWide { code_points: self.code_points(), extra: `0` }
695	}
696
697	#[inline]
698	fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
699	let mut iter = self.bytes[pos..].iter();
700	loop {
701	let b = *iter.next()?;
702	if b < `0x80` {
703	pos += `1`;
704	} else if b < `0xE0` {
705	iter.next();
706	pos += `2`;
707	} else if b == `0xED` {
708	match (iter.next(), iter.next()) {
709	(Some(&b2), Some(&b3)) if b2 >= `0xA0` => {
710	return Some((pos, decode_surrogate(b2, b3)));
711	}
712	_ => pos += `3`,
713	}
714	} else if b < `0xF0` {
715	iter.next();
716	iter.next();
717	pos += `3`;
718	} else {
719	iter.next();
720	iter.next();
721	iter.next();
722	pos += `4`;
723	}
724	}
725	}
726
727	#[inline]
728	fn final_lead_surrogate(&self) -> Option<u16> {
729	match self.bytes {
730	[.., `0xED`, b2 @ `0xA0`..=`0xAF`, b3] => Some(decode_surrogate(b2, b3)),
731	_ => None,
732	}
733	}
734
735	#[inline]
736	fn initial_trail_surrogate(&self) -> Option<u16> {
737	match self.bytes {
738	[`0xED`, b2 @ `0xB0`..=`0xBF`, b3, ..] => Some(decode_surrogate(b2, b3)),
739	_ => None,
740	}
741	}
742
743	pub fn clone_into(&self, buf: &mut Wtf8Buf) {
744	buf.is_known_utf8 = `false`;
745	self.bytes.clone_into(&mut buf.bytes);
746	}
747
748	/// Boxes this `Wtf8`.
749	#[inline]
750	pub fn into_box(&self) -> Box<Wtf8> {
751	let boxed: Box<[u8]> = self.bytes.into();
752	unsafe { mem::transmute(boxed) }
753	}
754
755	/// Creates a boxed, empty `Wtf8`.
756	pub fn empty_box() -> Box<Wtf8> {
757	let boxed: Box<[u8]> = Default::default();
758	unsafe { mem::transmute(boxed) }
759	}
760
761	#[inline]
762	pub fn into_arc(&self) -> Arc<Wtf8> {
763	let arc: Arc<[u8]> = Arc::from(&self.bytes);
764	unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
765	}
766
767	#[inline]
768	pub fn into_rc(&self) -> Rc<Wtf8> {
769	let rc: Rc<[u8]> = Rc::from(&self.bytes);
770	unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
771	}
772
773	#[inline]
774	pub fn make_ascii_lowercase(&mut self) {
775	self.bytes.make_ascii_lowercase()
776	}
777
778	#[inline]
779	pub fn make_ascii_uppercase(&mut self) {
780	self.bytes.make_ascii_uppercase()
781	}
782
783	#[inline]
784	pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
785	Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: `false` }
786	}
787
788	#[inline]
789	pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
790	Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: `false` }
791	}
792
793	#[inline]
794	pub fn is_ascii(&self) -> bool {
795	self.bytes.is_ascii()
796	}
797
798	#[inline]
799	pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
800	self.bytes.eq_ignore_ascii_case(&other.bytes)
801	}
802	}
803
804	/// Returns a slice of the given string for the byte range \[`begin`..`end`).
805	///
806	/// # Panics
807	///
808	/// Panics when `begin` and `end` do not point to code point boundaries,
809	/// or point beyond the end of the string.
810	impl ops::Index<ops::Range<usize>> for Wtf8 {
811	type Output = Wtf8;
812
813	#[inline]
814	fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
815	// is_code_point_boundary checks that the index is in [0, .len()]
816	if range.start <= range.end
817	&& is_code_point_boundary(self, index:range.start)
818	&& is_code_point_boundary(self, index:range.end)
819	{
820	unsafe { slice_unchecked(self, begin:range.start, range.end) }
821	} else {
822	slice_error_fail(self, begin:range.start, range.end)
823	}
824	}
825	}
826
827	/// Returns a slice of the given string from byte `begin` to its end.
828	///
829	/// # Panics
830	///
831	/// Panics when `begin` is not at a code point boundary,
832	/// or is beyond the end of the string.
833	impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
834	type Output = Wtf8;
835
836	#[inline]
837	fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
838	// is_code_point_boundary checks that the index is in [0, .len()]
839	if is_code_point_boundary(self, index:range.start) {
840	unsafe { slice_unchecked(self, begin:range.start, self.len()) }
841	} else {
842	slice_error_fail(self, begin:range.start, self.len())
843	}
844	}
845	}
846
847	/// Returns a slice of the given string from its beginning to byte `end`.
848	///
849	/// # Panics
850	///
851	/// Panics when `end` is not at a code point boundary,
852	/// or is beyond the end of the string.
853	impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
854	type Output = Wtf8;
855
856	#[inline]
857	fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
858	// is_code_point_boundary checks that the index is in [0, .len()]
859	if is_code_point_boundary(self, index:range.end) {
860	unsafe { slice_unchecked(self, begin:`0`, range.end) }
861	} else {
862	slice_error_fail(self, begin:`0`, range.end)
863	}
864	}
865	}
866
867	impl ops::Index<ops::RangeFull> for Wtf8 {
868	type Output = Wtf8;
869
870	#[inline]
871	fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
872	self
873	}
874	}
875
/// Decodes a surrogate from the second and third bytes of its three-byte
/// WTF-8 encoding.
///
/// Each continuation byte contributes its low six bits; the top bits of the
/// result are fixed to `0xD800`.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // The first byte is assumed to be 0xED
    0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
}
881
/// Combines a lead and a trail surrogate into the supplementary code point
/// they encode in UTF-16.
///
/// `lead` must be in `0xD800..=0xDBFF` and `trail` in `0xDC00..=0xDFFF`;
/// the subtractions and the unchecked conversion below rely on it.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    // Ten payload bits from each surrogate, offset by 0x10000.
    let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32);
    // SAFETY: for surrogates in the ranges above the result lies in
    // 0x10000..=0x10FFFF, which contains only Unicode scalar values.
    unsafe { char::from_u32_unchecked(code_point) }
}
887
888	/// Copied from str::is_char_boundary
889	#[inline]
890	pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
891	if index == `0` {
892	return `true`;
893	}
894	match slice.bytes.get(index) {
895	None => index == slice.len(),
896	Some(&b: u8) => (b as i8) >= `-0x40`,
897	}
898	}
899
900	/// Verify that `index` is at the edge of either a valid UTF-8 codepoint
901	/// (i.e. a codepoint that's not a surrogate) or of the whole string.
902	///
903	/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`.
904	/// Splitting between surrogates is valid as far as WTF-8 is concerned, but
905	/// we do not permit it in the public API because WTF-8 is considered an
906	/// implementation detail.
907	#[track_caller]
908	#[inline]
909	pub fn check_utf8_boundary(slice: &Wtf8, index: usize) {
910	if index == `0` {
911	return;
912	}
913	match slice.bytes.get(index) {
914	Some(`0xED`) => (), // Might be a surrogate
915	Some(&b: u8) if (b as i8) >= `-0x40` => return,
916	Some(_) => panic!("byte index {index} is not a codepoint boundary"),
917	None if index == slice.len() => return,
918	None => panic!("byte index {index} is out of bounds"),
919	}
920	if slice.bytes[index + `1`] >= `0xA0` {
921	// There's a surrogate after index. Now check before index.
922	if index >= `3` && slice.bytes[index - `3`] == `0xED` && slice.bytes[index - `2`] >= `0xA0` {
923	panic!("byte index {index} lies between surrogate codepoints");
924	}
925	}
926	}
927
928	/// Copied from core::str::raw::slice_unchecked
929	#[inline]
930	pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
931	// memory layout of a &[u8] and &Wtf8 are the same
932	Wtf8::from_bytes_unchecked(slice::from_raw_parts(data:s.bytes.as_ptr().add(begin), len:end - begin))
933	}
934
935	/// Copied from core::str::raw::slice_error_fail
936	#[inline(never)]
937	pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
938	assert!(begin <= end);
939	panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
940	}
941
/// An iterator over the code points of a WTF-8 string.
///
/// Obtained by calling `.code_points()`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    /// Remaining, not yet decoded, bytes.
    bytes: slice::Iter<'a, u8>,
}
949
950	impl<'a> Iterator for Wtf8CodePoints<'a> {
951	type Item = CodePoint;
952
953	#[inline]
954	fn next(&mut self) -> Option<CodePoint> {
955	// SAFETY: `self.bytes` has been created from a WTF-8 string
956	unsafe { next_code_point(&mut self.bytes).map(\|c: u32\| CodePoint { value: c }) }
957	}
958
959	#[inline]
960	fn size_hint(&self) -> (usize, Option<usize>) {
961	let len: usize = self.bytes.len();
962	(len.saturating_add(`3`) / `4`, Some(len))
963	}
964	}
965
966	/// Generates a wide character sequence for potentially ill-formed UTF-16.
967	#[stable(feature = "rust1", since = "1.0.0")]
968	#[derive(Clone)]
969	pub struct EncodeWide<'a> {
970	code_points: Wtf8CodePoints<'a>,
971	extra: u16,
972	}
973
974	// Copied from libunicode/u_str.rs
975	#[stable(feature = "rust1", since = "1.0.0")]
976	impl<'a> Iterator for EncodeWide<'a> {
977	type Item = u16;
978
979	#[inline]
980	fn next(&mut self) -> Option<u16> {
981	if self.extra != `0` {
982	let tmp = self.extra;
983	self.extra = `0`;
984	return Some(tmp);
985	}
986
987	let mut buf = [`0`; `2`];
988	self.code_points.next().map(\|code_point\| {
989	let n = encode_utf16_raw(code_point.value, &mut buf).len();
990	if n == `2` {
991	self.extra = buf[`1`];
992	}
993	buf[`0`]
994	})
995	}
996
997	#[inline]
998	fn size_hint(&self) -> (usize, Option<usize>) {
999	let (low, high) = self.code_points.size_hint();
1000	let ext = (self.extra != `0`) as usize;
1001	// every code point gets either one u16 or two u16,
1002	// so this iterator is between 1 or 2 times as
1003	// long as the underlying iterator.
1004	(low + ext, high.and_then(\|n\| n.checked_mul(`2`)).and_then(\|n\| n.checked_add(ext)))
1005	}
1006	}
1007
1008	#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")]
1009	impl FusedIterator for EncodeWide<'_> {}
1010
1011	impl Hash for CodePoint {
1012	#[inline]
1013	fn hash<H: Hasher>(&self, state: &mut H) {
1014	self.value.hash(state)
1015	}
1016	}
1017
1018	impl Hash for Wtf8Buf {
1019	#[inline]
1020	fn hash<H: Hasher>(&self, state: &mut H) {
1021	state.write(&self.bytes);
1022	`0xfeu8`.hash(state)
1023	}
1024	}
1025
1026	impl Hash for Wtf8 {
1027	#[inline]
1028	fn hash<H: Hasher>(&self, state: &mut H) {
1029	state.write(&self.bytes);
1030	`0xfeu8`.hash(state)
1031	}
1032	}
1033