mod.rs source code [crates/alloc/src/wtf8/mod.rs]

1	//! Heap-allocated counterpart to core `wtf8` module.
2	#![unstable(
3	feature = "wtf8_internals",
4	issue = "none",
5	reason = "this is internal code for representing OsStr on some platforms and not a public API"
6	)]
7	// rustdoc bug: doc(hidden) on the module won't stop types in the module from showing up in trait
8	// implementations, so, we'll have to add more doc(hidden)s anyway
9	#![doc(hidden)]
10
11	// Note: This module is also included in the alloctests crate using #[path] to
12	// run the tests. See the comment there for an explanation why this is the case.
13
14	#[cfg(test)]
15	mod tests;
16
17	use core::char::encode_utf8_raw;
18	use core::hash::{Hash, Hasher};
19	pub use core::wtf8::{CodePoint, Wtf8};
20	#[cfg(not(test))]
21	pub use core::wtf8::{EncodeWide, Wtf8CodePoints};
22	use core::{fmt, mem, ops, str};
23
24	use crate::borrow::{Cow, ToOwned};
25	use crate::boxed::Box;
26	use crate::collections::TryReserveError;
27	#[cfg(not(test))]
28	use crate::rc::Rc;
29	use crate::string::String;
30	#[cfg(all(not(test), target_has_atomic = "ptr"))]
31	use crate::sync::Arc;
32	use crate::vec::Vec;
33
/// An owned, growable string of well-formed WTF-8 data.
///
/// Similar to `String`, but can additionally contain surrogate code points
/// if they’re not in a surrogate pair.
///
/// Equality and ordering are defined purely by the encoded bytes, which keeps
/// them consistent with the `Hash` implementation for this type.
#[derive(Clone)]
#[doc(hidden)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,

    /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
    /// know this if we're constructed from a `String` or `&str`.
    ///
    /// It is possible for `bytes` to have valid UTF-8 without this being
    /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
    /// paired, as we don't bother to rescan the entire string.
    ///
    /// This is only a cached hint: it never takes part in comparisons, so two
    /// buffers with identical bytes are equal regardless of how they were built.
    is_known_utf8: bool,
}

impl PartialEq for Wtf8Buf {
    /// Two buffers are equal exactly when their bytes are equal.
    #[inline]
    fn eq(&self, other: &Wtf8Buf) -> bool {
        self.bytes == other.bytes
    }
}

impl Eq for Wtf8Buf {}

impl PartialOrd for Wtf8Buf {
    /// Delegates to the total order defined by [`Ord`].
    #[inline]
    fn partial_cmp(&self, other: &Wtf8Buf) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Wtf8Buf {
    /// Orders buffers lexicographically by their bytes.
    #[inline]
    fn cmp(&self, other: &Wtf8Buf) -> core::cmp::Ordering {
        self.bytes.cmp(&other.bytes)
    }
}
51
52	impl ops::Deref for Wtf8Buf {
53	type Target = Wtf8;
54
55	fn deref(&self) -> &Wtf8 {
56	self.as_slice()
57	}
58	}
59
60	impl ops::DerefMut for Wtf8Buf {
61	fn deref_mut(&mut self) -> &mut Wtf8 {
62	self.as_mut_slice()
63	}
64	}
65
66	/// Formats the string in double quotes, with characters escaped according to
67	/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
68	/// where each `x` is a hexadecimal digit.
69	///
70	/// For example, the code units [U+0061, U+D800, U+000A] are formatted as
71	/// `"a\u{D800}\n"`.
72	impl fmt::Debug for Wtf8Buf {
73	#[inline]
74	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
75	fmt::Debug::fmt(&**self, formatter)
76	}
77	}
78
79	/// Formats the string with unpaired surrogates substituted with the replacement
80	/// character, U+FFFD.
81	impl fmt::Display for Wtf8Buf {
82	fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
83	if let Some(s) = self.as_known_utf8() {
84	fmt::Display::fmt(s, formatter)
85	} else {
86	fmt::Display::fmt(&**self, formatter)
87	}
88	}
89	}
90
91	#[cfg_attr(test, allow(dead_code))]
92	impl Wtf8Buf {
93	/// Creates a new, empty WTF-8 string.
94	#[inline]
95	pub fn new() -> Wtf8Buf {
96	Wtf8Buf { bytes: Vec::new(), is_known_utf8: `true` }
97	}
98
99	/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
100	#[inline]
101	pub fn with_capacity(capacity: usize) -> Wtf8Buf {
102	Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: `true` }
103	}
104
105	/// Creates a WTF-8 string from a WTF-8 byte vec.
106	///
107	/// Since the byte vec is not checked for valid WTF-8, this function is
108	/// marked unsafe.
109	#[inline]
110	pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
111	Wtf8Buf { bytes: value, is_known_utf8: `false` }
112	}
113
114	/// Creates a WTF-8 string from a UTF-8 `String`.
115	///
116	/// This takes ownership of the `String` and does not copy.
117	///
118	/// Since WTF-8 is a superset of UTF-8, this always succeeds.
119	#[inline]
120	pub const fn from_string(string: String) -> Wtf8Buf {
121	Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: `true` }
122	}
123
124	/// Creates a WTF-8 string from a UTF-8 `&str` slice.
125	///
126	/// This copies the content of the slice.
127	///
128	/// Since WTF-8 is a superset of UTF-8, this always succeeds.
129	#[inline]
130	pub fn from_str(s: &str) -> Wtf8Buf {
131	Wtf8Buf { bytes: s.as_bytes().to_vec(), is_known_utf8: `true` }
132	}
133
134	pub fn clear(&mut self) {
135	self.bytes.clear();
136	self.is_known_utf8 = `true`;
137	}
138
139	/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
140	///
141	/// This is lossless: calling `.encode_wide()` on the resulting string
142	/// will always return the original code units.
143	pub fn from_wide(v: &[u16]) -> Wtf8Buf {
144	let mut string = Wtf8Buf::with_capacity(v.len());
145	for item in char::decode_utf16(v.iter().cloned()) {
146	match item {
147	Ok(ch) => string.push_char(ch),
148	Err(surrogate) => {
149	let surrogate = surrogate.unpaired_surrogate();
150	// Surrogates are known to be in the code point range.
151	let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
152	// The string will now contain an unpaired surrogate.
153	string.is_known_utf8 = `false`;
154	// Skip the WTF-8 concatenation check,
155	// surrogate pairs are already decoded by decode_utf16
156	unsafe {
157	string.push_code_point_unchecked(code_point);
158	}
159	}
160	}
161	}
162	string
163	}
164
165	/// Appends the given `char` to the end of this string.
166	/// This does not* include the WTF-8 concatenation check or `is_known_utf8` check.*
167	/// Copied from String::push.
168	unsafe fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
169	let mut bytes = [`0`; char::MAX_LEN_UTF8];
170	let bytes = encode_utf8_raw(code_point.to_u32(), &mut bytes);
171	self.bytes.extend_from_slice(bytes)
172	}
173
174	#[inline]
175	pub fn as_slice(&self) -> &Wtf8 {
176	unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
177	}
178
179	#[inline]
180	pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
181	// Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
182	// cause them to change from well-formed UTF-8 to ill-formed UTF-8,
183	// which would break the assumptions of the `is_known_utf8` field.
184	unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
185	}
186
187	/// Converts the string to UTF-8 without validation, if it was created from
188	/// valid UTF-8.
189	#[inline]
190	fn as_known_utf8(&self) -> Option<&str> {
191	if self.is_known_utf8 {
192	// SAFETY: The buffer is known to be valid UTF-8.
193	Some(unsafe { str::from_utf8_unchecked(self.as_bytes()) })
194	} else {
195	None
196	}
197	}
198
199	/// Reserves capacity for at least `additional` more bytes to be inserted
200	/// in the given `Wtf8Buf`.
201	/// The collection may reserve more space to avoid frequent reallocations.
202	///
203	/// # Panics
204	///
205	/// Panics if the new capacity exceeds `isize::MAX` bytes.
206	#[inline]
207	pub fn reserve(&mut self, additional: usize) {
208	self.bytes.reserve(additional)
209	}
210
211	/// Tries to reserve capacity for at least `additional` more bytes to be
212	/// inserted in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to
213	/// avoid frequent reallocations. After calling `try_reserve`, capacity will
214	/// be greater than or equal to `self.len() + additional`. Does nothing if
215	/// capacity is already sufficient. This method preserves the contents even
216	/// if an error occurs.
217	///
218	/// # Errors
219	///
220	/// If the capacity overflows, or the allocator reports a failure, then an error
221	/// is returned.
222	#[inline]
223	pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
224	self.bytes.try_reserve(additional)
225	}
226
227	#[inline]
228	pub fn reserve_exact(&mut self, additional: usize) {
229	self.bytes.reserve_exact(additional)
230	}
231
232	/// Tries to reserve the minimum capacity for exactly `additional` more
233	/// bytes to be inserted in the given `Wtf8Buf`. After calling
234	/// `try_reserve_exact`, capacity will be greater than or equal to
235	/// `self.len() + additional` if it returns `Ok(())`.
236	/// Does nothing if the capacity is already sufficient.
237	///
238	/// Note that the allocator may give the `Wtf8Buf` more space than it
239	/// requests. Therefore, capacity can not be relied upon to be precisely
240	/// minimal. Prefer [`try_reserve`] if future insertions are expected.
241	///
242	/// [`try_reserve`]: Wtf8Buf::try_reserve
243	///
244	/// # Errors
245	///
246	/// If the capacity overflows, or the allocator reports a failure, then an error
247	/// is returned.
248	#[inline]
249	pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
250	self.bytes.try_reserve_exact(additional)
251	}
252
253	#[inline]
254	pub fn shrink_to_fit(&mut self) {
255	self.bytes.shrink_to_fit()
256	}
257
258	#[inline]
259	pub fn shrink_to(&mut self, min_capacity: usize) {
260	self.bytes.shrink_to(min_capacity)
261	}
262
263	#[inline]
264	pub fn leak<'a>(self) -> &'a mut Wtf8 {
265	unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) }
266	}
267
268	/// Returns the number of bytes that this string buffer can hold without reallocating.
269	#[inline]
270	pub fn capacity(&self) -> usize {
271	self.bytes.capacity()
272	}
273
274	/// Append a UTF-8 slice at the end of the string.
275	#[inline]
276	pub fn push_str(&mut self, other: &str) {
277	self.bytes.extend_from_slice(other.as_bytes())
278	}
279
280	/// Append a WTF-8 slice at the end of the string.
281	///
282	/// This replaces newly paired surrogates at the boundary
283	/// with a supplementary code point,
284	/// like concatenating ill-formed UTF-16 strings effectively would.
285	#[inline]
286	pub fn push_wtf8(&mut self, other: &Wtf8) {
287	match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
288	// Replace newly paired surrogates by a supplementary code point.
289	(Some(lead), Some(trail)) => {
290	let len_without_lead_surrogate = self.len() - `3`;
291	self.bytes.truncate(len_without_lead_surrogate);
292	let other_without_trail_surrogate = &other.as_bytes()[`3`..];
293	// 4 bytes for the supplementary code point
294	self.bytes.reserve(`4` + other_without_trail_surrogate.len());
295	self.push_char(decode_surrogate_pair(lead, trail));
296	self.bytes.extend_from_slice(other_without_trail_surrogate);
297	}
298	_ => {
299	// If we'll be pushing a string containing a surrogate, we may
300	// no longer have UTF-8.
301	if self.is_known_utf8 && other.next_surrogate(`0`).is_some() {
302	self.is_known_utf8 = `false`;
303	}
304
305	self.bytes.extend_from_slice(other.as_bytes());
306	}
307	}
308	}
309
310	/// Append a Unicode scalar value at the end of the string.
311	#[inline]
312	pub fn push_char(&mut self, c: char) {
313	// SAFETY: It's always safe to push a char.
314	unsafe { self.push_code_point_unchecked(CodePoint::from_char(c)) }
315	}
316
317	/// Append a code point at the end of the string.
318	///
319	/// This replaces newly paired surrogates at the boundary
320	/// with a supplementary code point,
321	/// like concatenating ill-formed UTF-16 strings effectively would.
322	#[inline]
323	pub fn push(&mut self, code_point: CodePoint) {
324	if let Some(trail) = code_point.to_trail_surrogate() {
325	if let Some(lead) = (&*self).final_lead_surrogate() {
326	let len_without_lead_surrogate = self.len() - `3`;
327	self.bytes.truncate(len_without_lead_surrogate);
328	self.push_char(decode_surrogate_pair(lead, trail));
329	return;
330	}
331
332	// We're pushing a trailing surrogate.
333	self.is_known_utf8 = `false`;
334	} else if code_point.to_lead_surrogate().is_some() {
335	// We're pushing a leading surrogate.
336	self.is_known_utf8 = `false`;
337	}
338
339	// No newly paired surrogates at the boundary.
340	unsafe { self.push_code_point_unchecked(code_point) }
341	}
342
343	/// Shortens a string to the specified length.
344	///
345	/// # Panics
346	///
347	/// Panics if `new_len` > current length,
348	/// or if `new_len` is not a code point boundary.
349	#[inline]
350	pub fn truncate(&mut self, new_len: usize) {
351	assert!(self.is_code_point_boundary(new_len));
352	self.bytes.truncate(new_len)
353	}
354
355	/// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
356	#[inline]
357	pub fn into_bytes(self) -> Vec<u8> {
358	self.bytes
359	}
360
361	/// Consumes the WTF-8 string and tries to convert it to UTF-8.
362	///
363	/// This does not copy the data.
364	///
365	/// If the contents are not well-formed UTF-8
366	/// (that is, if the string contains surrogates),
367	/// the original WTF-8 string is returned instead.
368	pub fn into_string(self) -> Result<String, Wtf8Buf> {
369	if self.is_known_utf8 \|\| self.next_surrogate(`0`).is_none() {
370	Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
371	} else {
372	Err(self)
373	}
374	}
375
376	/// Consumes the WTF-8 string and converts it lossily to UTF-8.
377	///
378	/// This does not copy the data (but may overwrite parts of it in place).
379	///
380	/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
381	pub fn into_string_lossy(mut self) -> String {
382	if !self.is_known_utf8 {
383	let mut pos = `0`;
384	while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
385	pos = surrogate_pos + `3`;
386	// Surrogates and the replacement character are all 3 bytes, so
387	// they can substituted in-place.
388	self.bytes[surrogate_pos..pos].copy_from_slice("`\u{FFFD}`".as_bytes());
389	}
390	}
391	unsafe { String::from_utf8_unchecked(self.bytes) }
392	}
393
394	/// Converts this `Wtf8Buf` into a boxed `Wtf8`.
395	#[inline]
396	pub fn into_box(self) -> Box<Wtf8> {
397	// SAFETY: relies on `Wtf8` being `repr(transparent)`.
398	unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
399	}
400
401	/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
402	pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
403	let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
404	Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: `false` }
405	}
406
407	/// Provides plumbing to core `Vec::extend_from_slice`.
408	/// More well behaving alternative to allowing outer types
409	/// full mutable access to the core `Vec`.
410	#[inline]
411	pub unsafe fn extend_from_slice_unchecked(&mut self, other: &[u8]) {
412	self.bytes.extend_from_slice(other);
413	self.is_known_utf8 = `false`;
414	}
415	}
416
417	/// Creates a new WTF-8 string from an iterator of code points.
418	///
419	/// This replaces surrogate code point pairs with supplementary code points,
420	/// like concatenating ill-formed UTF-16 strings effectively would.
421	impl FromIterator<CodePoint> for Wtf8Buf {
422	fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
423	let mut string: Wtf8Buf = Wtf8Buf::new();
424	string.extend(iter);
425	string
426	}
427	}
428
429	/// Append code points from an iterator to the string.
430	///
431	/// This replaces surrogate code point pairs with supplementary code points,
432	/// like concatenating ill-formed UTF-16 strings effectively would.
433	impl Extend<CodePoint> for Wtf8Buf {
434	fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
435	let iterator = iter.into_iter();
436	let (low: usize, _high) = iterator.size_hint();
437	// Lower bound of one byte per code point (ASCII only)
438	self.bytes.reserve(additional:low);
439	iterator.for_each(move \|code_point\| self.push(code_point));
440	}
441
442	#[inline]
443	fn extend_one(&mut self, code_point: CodePoint) {
444	self.push(code_point);
445	}
446
447	#[inline]
448	fn extend_reserve(&mut self, additional: usize) {
449	// Lower bound of one byte per code point (ASCII only)
450	self.bytes.reserve(additional);
451	}
452	}
453
454	/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
455	pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf {
456	Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: `false` }
457	}
458
459	/// Lossily converts the string to UTF-8.
460	/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
461	///
462	/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
463	///
464	/// This only copies the data if necessary (if it contains any surrogate).
465	pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> {
466	let Some((surrogate_pos, _)) = slice.next_surrogate(`0`) else {
467	return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) });
468	};
469	let wtf8_bytes = slice.as_bytes();
470	let mut utf8_bytes: Vec = Vec::with_capacity(slice.len());
471	utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
472	utf8_bytes.extend_from_slice("`\u{FFFD}`".as_bytes());
473	let mut pos = surrogate_pos + `3`;
474	loop {
475	match slice.next_surrogate(pos) {
476	Some((surrogate_pos, _)) => {
477	utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
478	utf8_bytes.extend_from_slice("`\u{FFFD}`".as_bytes());
479	pos = surrogate_pos + `3`;
480	}
481	None => {
482	utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
483	return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
484	}
485	}
486	}
487	}
488
489	#[inline]
490	pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) {
491	buf.is_known_utf8 = `false`;
492	slice.as_bytes().clone_into(&mut buf.bytes);
493	}
494
495	#[cfg(not(test))]
496	impl Wtf8 {
497	#[rustc_allow_incoherent_impl]
498	pub fn to_owned(&self) -> Wtf8Buf {
499	to_owned(self)
500	}
501
502	#[rustc_allow_incoherent_impl]
503	pub fn clone_into(&self, buf: &mut Wtf8Buf) {
504	clone_into(self, buf)
505	}
506
507	#[rustc_allow_incoherent_impl]
508	pub fn to_string_lossy(&self) -> Cow<'_, str> {
509	to_string_lossy(self)
510	}
511
512	#[rustc_allow_incoherent_impl]
513	pub fn into_box(&self) -> Box<Wtf8> {
514	let boxed: Box<[u8]> = self.as_bytes().into();
515	unsafe { mem::transmute(boxed) }
516	}
517
518	#[rustc_allow_incoherent_impl]
519	pub fn empty_box() -> Box<Wtf8> {
520	let boxed: Box<[u8]> = Default::default();
521	unsafe { mem::transmute(boxed) }
522	}
523
524	#[cfg(target_has_atomic = "ptr")]
525	#[rustc_allow_incoherent_impl]
526	pub fn into_arc(&self) -> Arc<Wtf8> {
527	let arc: Arc<[u8]> = Arc::from(self.as_bytes());
528	unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
529	}
530
531	#[rustc_allow_incoherent_impl]
532	pub fn into_rc(&self) -> Rc<Wtf8> {
533	let rc: Rc<[u8]> = Rc::from(self.as_bytes());
534	unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
535	}
536
537	#[inline]
538	#[rustc_allow_incoherent_impl]
539	pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
540	Wtf8Buf { bytes: self.as_bytes().to_ascii_lowercase(), is_known_utf8: `false` }
541	}
542
543	#[inline]
544	#[rustc_allow_incoherent_impl]
545	pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
546	Wtf8Buf { bytes: self.as_bytes().to_ascii_uppercase(), is_known_utf8: `false` }
547	}
548	}
549
/// Combines a UTF-16 surrogate pair into the supplementary code point it encodes.
///
/// `lead` must be a lead surrogate (`0xD800..=0xDBFF`) and `trail` a trail
/// surrogate (`0xDC00..=0xDFFF`). Each contributes 10 bits, and the result is
/// offset by `0x10000`, giving a value in `0x10000..=0x10FFFF`.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    debug_assert!((0xD800..=0xDBFF).contains(&lead));
    debug_assert!((0xDC00..=0xDFFF).contains(&trail));
    let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32);
    // SAFETY: for in-range surrogates the value lies in 0x10000..=0x10FFFF,
    // which contains only Unicode scalar values.
    unsafe { char::from_u32_unchecked(code_point) }
}
555
556	impl Hash for Wtf8Buf {
557	#[inline]
558	fn hash<H: Hasher>(&self, state: &mut H) {
559	state.write(&self.bytes);
560	`0xfeu8`.hash(state)
561	}
562	}
563