// This module implements Identifier, a short-optimized string allowed to
// contain only the ASCII characters hyphen, dot, 0-9, A-Z, a-z.
//
// As of mid-2021, the distribution of pre-release lengths on crates.io is:
//
//   length   count        length   count        length   count
//        0  355929            11      81            24       2
//        1     208            12      48            25       6
//        2     236            13      55            26      10
//        3    1909            14      25            27       4
//        4    1284            15      15            28       1
//        5    1742            16      35            30       1
//        6    3440            17       9            31       5
//        7    5624            18       6            32       1
//        8    1321            19      12            36       2
//        9     179            20       2            37     379
//       10      65            23      11
//
// and the distribution of build metadata lengths is:
//
//   length   count        length   count        length   count
//        0  364445             8    7725            18       1
//        1      72             9      16            19       1
//        2       7            10      85            20       1
//        3      28            11      17            22       4
//        4       9            12      10            26       1
//        5      68            13       9            27       1
//        6      73            14      10            40       5
//        7      53            15       6
//
// Therefore it really behooves us to be able to use the entire 8 bytes of a
// pointer for inline storage. For both pre-release and build metadata there are
// vastly more strings with length exactly 8 bytes than the sum over all lengths
// longer than 8 bytes.
//
// To differentiate the inline representation from the heap allocated long
// representation, we'll allocate heap pointers with 2-byte alignment so that
// they are guaranteed to have an unset least significant bit. Then in the repr
// we store for pointers, we rotate a 1 into the most significant bit of the
// most significant byte, which is never set for an ASCII byte.
//
// Inline repr:
//
//     0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx
//
// Heap allocated repr:
//
//     1ppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp 0
//     ^ most significant bit   least significant bit of orig ptr, rotated out ^
//
// Since the most significant bit doubles as a sign bit for the similarly sized
// signed integer type, the CPU has an efficient instruction for inspecting it,
// meaning we can differentiate between an inline repr and a heap allocated repr
// in one instruction. Effectively an inline repr always looks like a positive
// i64 while a heap allocated repr always looks like a negative i64.
//
// For the inline repr, we store \0 padding on the end of the stored characters,
// and thus the string length is readily determined efficiently by a cttz (count
// trailing zeros) or bsf (bit scan forward) instruction.
//
// For the heap allocated repr, the length is encoded as a base-128 varint at
// the head of the allocation.
//
// Empty strings are stored as an all-1 bit pattern, corresponding to -1i64.
// Consequently the all-0 bit pattern is never a legal representation in any
// repr, leaving it available as a niche for downstream code. For example this
// allows size_of::<Version>() == size_of::<Option<Version>>().
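//
// For illustration (a sketch of the two encodings, derived from the code
// below): the 7-byte identifier "alpha.1" is stored inline as the bytes
// 61 6c 70 68 61 2e 31 00, every one of which has a clear most significant
// bit, while a 200-byte identifier is heap allocated behind the varint length
// header c8 81 (200 = 72 + 1*128, each base-128 digit stored with its most
// significant bit set) followed by the 200 bytes of ASCII content.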

use crate::alloc::alloc::{alloc, dealloc, handle_alloc_error, Layout};
use core::isize;
use core::mem;
use core::num::{NonZeroU64, NonZeroUsize};
use core::ptr::{self, NonNull};
use core::slice;
use core::str;
use core::usize;

const PTR_BYTES: usize = mem::size_of::<NonNull<u8>>();

// If pointers are already 8 bytes or bigger, then 0. If pointers are smaller
// than 8 bytes, then Identifier will contain a byte array to raise its size up
// to 8 bytes total.
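// For example, with 4-byte pointers the expression below evaluates to
// 8*1 - 4*1 = 4 tail bytes, and with 8-byte pointers to 0.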
const TAIL_BYTES: usize = 8 * (PTR_BYTES < 8) as usize - PTR_BYTES * (PTR_BYTES < 8) as usize;

#[repr(C, align(8))]
pub(crate) struct Identifier {
    head: NonNull<u8>,
    tail: [u8; TAIL_BYTES],
}

impl Identifier {
    pub(crate) const fn empty() -> Self {
        // This is a separate constant because unsafe function calls are not
        // allowed in a const fn body, only in a const, until later rustc than
        // what we support.
        const HEAD: NonNull<u8> = unsafe { NonNull::new_unchecked(!0 as *mut u8) };

        // `mov rax, -1`
        Identifier {
            head: HEAD,
            tail: [!0; TAIL_BYTES],
        }
    }

    // SAFETY: string must be ASCII and not contain \0 bytes.
    pub(crate) unsafe fn new_unchecked(string: &str) -> Self {
        let len = string.len();
        debug_assert!(len <= isize::MAX as usize);
        match len as u64 {
            0 => Self::empty(),
            1..=8 => {
                let mut bytes = [0u8; mem::size_of::<Identifier>()];
                // SAFETY: string is big enough to read len bytes, bytes is big
                // enough to write len bytes, and they do not overlap.
                unsafe { ptr::copy_nonoverlapping(string.as_ptr(), bytes.as_mut_ptr(), len) };
                // SAFETY: the head field is nonzero because the input string
                // was at least 1 byte of ASCII and did not contain \0.
                unsafe { mem::transmute::<[u8; mem::size_of::<Identifier>()], Identifier>(bytes) }
            }
            9..=0xff_ffff_ffff_ffff => {
                // SAFETY: len is in a range that does not contain 0.
                let size = bytes_for_varint(unsafe { NonZeroUsize::new_unchecked(len) }) + len;
                let align = 2;
                // On 32-bit and 16-bit architecture, check for size overflowing
                // isize::MAX. Making an allocation request bigger than this to
                // the allocator is considered UB. All allocations (including
                // static ones) are limited to isize::MAX so we're guaranteed
                // len <= isize::MAX, and we know bytes_for_varint(len) <= 5
                // because 128**5 > isize::MAX, which means the only problem
                // that can arise is when isize::MAX - 5 <= len <= isize::MAX.
                // This is pretty much guaranteed to be malicious input so we
                // don't need to care about returning a good error message.
                if mem::size_of::<usize>() < 8 {
                    let max_alloc = usize::MAX / 2 - align;
                    assert!(size <= max_alloc);
                }
                // SAFETY: align is not zero, align is a power of two, and
                // rounding size up to align does not overflow isize::MAX.
                let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
                // SAFETY: layout's size is nonzero.
                let ptr = unsafe { alloc(layout) };
                if ptr.is_null() {
                    handle_alloc_error(layout);
                }
                let mut write = ptr;
                let mut varint_remaining = len;
                while varint_remaining > 0 {
                    // SAFETY: size is bytes_for_varint(len) bytes + len bytes.
                    // This is writing the first bytes_for_varint(len) bytes.
                    unsafe { ptr::write(write, varint_remaining as u8 | 0x80) };
                    varint_remaining >>= 7;
                    // SAFETY: still in bounds of the same allocation.
                    write = unsafe { write.add(1) };
                }
                // SAFETY: size is bytes_for_varint(len) bytes + len bytes. This
                // is writing to the last len bytes.
                unsafe { ptr::copy_nonoverlapping(string.as_ptr(), write, len) };
                Identifier {
                    head: ptr_to_repr(ptr),
                    tail: [0; TAIL_BYTES],
                }
            }
            0x100_0000_0000_0000..=0xffff_ffff_ffff_ffff => {
                unreachable!("please refrain from storing >64 petabytes of text in semver version");
            }
            #[cfg(no_exhaustive_int_match)] // rustc <1.33
            _ => unreachable!(),
        }
    }

    pub(crate) fn is_empty(&self) -> bool {
        // `cmp rdi, -1` -- basically: `repr as i64 == -1`
        let empty = Self::empty();
        let is_empty = self.head == empty.head && self.tail == empty.tail;
        // The empty representation does nothing on Drop. We can't let this one
        // drop normally because `impl Drop for Identifier` calls is_empty; that
        // would be an infinite recursion.
        mem::forget(empty);
        is_empty
    }

    fn is_inline(&self) -> bool {
        // `test rdi, rdi` -- basically: `repr as i64 >= 0`
        self.head.as_ptr() as usize >> (PTR_BYTES * 8 - 1) == 0
    }

    fn is_empty_or_inline(&self) -> bool {
        // `cmp rdi, -2` -- basically: `repr as i64 > -2`
        self.is_empty() || self.is_inline()
    }

    pub(crate) fn as_str(&self) -> &str {
        if self.is_empty() {
            ""
        } else if self.is_inline() {
            // SAFETY: repr is in the inline representation.
            unsafe { inline_as_str(self) }
        } else {
            // SAFETY: repr is in the heap allocated representation.
            unsafe { ptr_as_str(&self.head) }
        }
    }

    pub(crate) fn ptr_eq(&self, rhs: &Self) -> bool {
        self.head == rhs.head && self.tail == rhs.tail
    }
}

impl Clone for Identifier {
    fn clone(&self) -> Self {
        if self.is_empty_or_inline() {
            Identifier {
                head: self.head,
                tail: self.tail,
            }
        } else {
            let ptr = repr_to_ptr(self.head);
            // SAFETY: ptr is one of our own heap allocations.
            let len = unsafe { decode_len(ptr) };
            let size = bytes_for_varint(len) + len.get();
            let align = 2;
            // SAFETY: align is not zero, align is a power of two, and rounding
            // size up to align does not overflow isize::MAX. This is just
            // duplicating a previous allocation where all of these guarantees
            // were already made.
            let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
            // SAFETY: layout's size is nonzero.
            let clone = unsafe { alloc(layout) };
            if clone.is_null() {
                handle_alloc_error(layout);
            }
            // SAFETY: new allocation cannot overlap the previous one (this was
            // not a realloc). The argument ptrs are readable/writeable
            // respectively for size bytes.
            unsafe { ptr::copy_nonoverlapping(ptr, clone, size) }
            Identifier {
                head: ptr_to_repr(clone),
                tail: [0; TAIL_BYTES],
            }
        }
    }
}

impl Drop for Identifier {
    fn drop(&mut self) {
        if self.is_empty_or_inline() {
            return;
        }
        let ptr = repr_to_ptr_mut(self.head);
        // SAFETY: ptr is one of our own heap allocations.
        let len = unsafe { decode_len(ptr) };
        let size = bytes_for_varint(len) + len.get();
        let align = 2;
        // SAFETY: align is not zero, align is a power of two, and rounding
        // size up to align does not overflow isize::MAX. These guarantees were
        // made when originally allocating this memory.
        let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
        // SAFETY: ptr was previously allocated by the same allocator with the
        // same layout.
        unsafe { dealloc(ptr, layout) }
    }
}

impl PartialEq for Identifier {
    fn eq(&self, rhs: &Self) -> bool {
        if self.ptr_eq(rhs) {
            // Fast path (most common)
            true
        } else if self.is_empty_or_inline() || rhs.is_empty_or_inline() {
            false
        } else {
            // SAFETY: both reprs are in the heap allocated representation.
            unsafe { ptr_as_str(&self.head) == ptr_as_str(&rhs.head) }
        }
    }
}

unsafe impl Send for Identifier {}
unsafe impl Sync for Identifier {}

// We use heap pointers that are 2-byte aligned, meaning they have an
// insignificant 0 in the least significant bit. We take advantage of that
// unneeded bit to rotate a 1 into the most significant bit to make the repr
// distinguishable from ASCII bytes.
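// For example (illustrative, on a 64-bit target): the 2-byte aligned pointer
// 0x1234 maps to (0x1234 | 1).rotate_right(1) == 0x8000_0000_0000_091a, and
// shifting the tag bit back out recovers 0x1234.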
fn ptr_to_repr(original: *mut u8) -> NonNull<u8> {
    // `mov eax, 1`
    // `shld rax, rdi, 63`
    let modified = (original as usize | 1).rotate_right(1);

    // `original + (modified - original)`, but being mindful of provenance.
    let diff = modified.wrapping_sub(original as usize);
    let modified = original.wrapping_add(diff);

    // SAFETY: the most significant bit of repr is known to be set, so the value
    // is not zero.
    unsafe { NonNull::new_unchecked(modified) }
}

// Shift out the 1 previously placed into the most significant bit of the least
// significant byte. Shift in a low 0 bit to reconstruct the original 2-byte
// aligned pointer.
fn repr_to_ptr(modified: NonNull<u8>) -> *const u8 {
    // `lea rax, [rdi + rdi]`
    let modified = modified.as_ptr();
    let original = (modified as usize) << 1;

    // `modified + (original - modified)`, but being mindful of provenance.
    let diff = original.wrapping_sub(modified as usize);
    modified.wrapping_add(diff)
}

fn repr_to_ptr_mut(repr: NonNull<u8>) -> *mut u8 {
    repr_to_ptr(repr) as *mut u8
}

// Compute the length of the inline string, assuming the argument is in short
// string representation. Short strings are stored as 1 to 8 nonzero ASCII
// bytes, followed by \0 padding for the remaining bytes.
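// For example, the inline identifier "rc.1" reads as the nonzero integer
// 0x0000_0000_312e_6372 on a little-endian target; its 34 leading zero bits
// give 8 - 34/8 = 4 nonzero bytes, which is the string's length.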
//
// SAFETY: the identifier must indeed be in the inline representation.
unsafe fn inline_len(repr: &Identifier) -> NonZeroUsize {
    // SAFETY: Identifier's layout is align(8) and at least size 8. We're doing
    // an aligned read of the first 8 bytes from it. The bytes are not all zero
    // because inline strings are at least 1 byte long and cannot contain \0.
    let repr = unsafe { ptr::read(repr as *const Identifier as *const NonZeroU64) };

    // Rustc >=1.53 has intrinsics for counting zeros on a non-zeroable integer.
    // On many architectures these are more efficient than counting on ordinary
    // zeroable integers (bsf vs cttz). On rustc <1.53 without those intrinsics,
    // we count zeros in the u64 rather than the NonZeroU64.
    #[cfg(no_nonzero_bitscan)]
    let repr = repr.get();

    #[cfg(target_endian = "little")]
    let zero_bits_on_string_end = repr.leading_zeros();
    #[cfg(target_endian = "big")]
    let zero_bits_on_string_end = repr.trailing_zeros();

    let nonzero_bytes = 8 - zero_bits_on_string_end as usize / 8;

    // SAFETY: repr is nonzero, so it has at most 63 zero bits on either end,
    // thus at least one nonzero byte.
    unsafe { NonZeroUsize::new_unchecked(nonzero_bytes) }
}

// SAFETY: repr must be in the inline representation, i.e. at least 1 and at
// most 8 nonzero ASCII bytes padded on the end with \0 bytes.
unsafe fn inline_as_str(repr: &Identifier) -> &str {
    let ptr = repr as *const Identifier as *const u8;
    let len = unsafe { inline_len(repr) }.get();
    // SAFETY: we are viewing the nonzero ASCII prefix of the inline repr's
    // contents as a slice of bytes. Input/output lifetimes are correctly
    // associated.
    let slice = unsafe { slice::from_raw_parts(ptr, len) };
    // SAFETY: the string contents are known to be only ASCII bytes, which are
    // always valid UTF-8.
    unsafe { str::from_utf8_unchecked(slice) }
}

// Decode varint. Varints consist of between one and eight base-128 digits, each
// of which is stored in a byte with most significant bit set. Adjacent to the
// varint in memory there is guaranteed to be at least 9 ASCII bytes, each of
// which has an unset most significant bit.
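// For example, a length of 300 is stored as the two bytes 0xac 0x82
// (300 = 44 + 2*128) and decoded as (0xac & 0x7f) + ((0x82 & 0x7f) << 7).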
//
// SAFETY: ptr must be one of our own heap allocations, with the varint header
// already written.
unsafe fn decode_len(ptr: *const u8) -> NonZeroUsize {
    // SAFETY: There is at least one byte of varint followed by at least 9 bytes
    // of string content, which is at least 10 bytes total for the allocation,
    // so reading the first two is no problem.
    let [first, second] = unsafe { ptr::read(ptr as *const [u8; 2]) };
    if second < 0x80 {
        // SAFETY: the length of this heap allocated string has been encoded as
        // one base-128 digit, so the length is at least 9 and at most 127. It
        // cannot be zero.
        unsafe { NonZeroUsize::new_unchecked((first & 0x7f) as usize) }
    } else {
        return unsafe { decode_len_cold(ptr) };

        // Identifiers 128 bytes or longer. This is not exercised by any crate
        // version currently published to crates.io.
        #[cold]
        #[inline(never)]
        unsafe fn decode_len_cold(mut ptr: *const u8) -> NonZeroUsize {
            let mut len = 0;
            let mut shift = 0;
            loop {
                // SAFETY: varint continues while there are bytes having the
                // most significant bit set, i.e. until we start hitting the
                // ASCII string content with msb unset.
                let byte = unsafe { *ptr };
                if byte < 0x80 {
                    // SAFETY: the string length is known to be 128 bytes or
                    // longer.
                    return unsafe { NonZeroUsize::new_unchecked(len) };
                }
                // SAFETY: still in bounds of the same allocation.
                ptr = unsafe { ptr.add(1) };
                len += ((byte & 0x7f) as usize) << shift;
                shift += 7;
            }
        }
    }
}

// SAFETY: repr must be in the heap allocated representation, with varint header
// and string contents already written.
unsafe fn ptr_as_str(repr: &NonNull<u8>) -> &str {
    let ptr = repr_to_ptr(*repr);
    let len = unsafe { decode_len(ptr) };
    let header = bytes_for_varint(len);
    let slice = unsafe { slice::from_raw_parts(ptr.add(header), len.get()) };
    // SAFETY: all identifier contents are ASCII bytes, which are always valid
    // UTF-8.
    unsafe { str::from_utf8_unchecked(slice) }
}

// Number of base-128 digits required for the varint representation of a length.
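// For example, lengths 9 through 127 need one digit and lengths 128 through
// 16383 need two.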
fn bytes_for_varint(len: NonZeroUsize) -> usize {
    #[cfg(no_nonzero_bitscan)] // rustc <1.53
    let len = len.get();

    let usize_bits = mem::size_of::<usize>() * 8;
    let len_bits = usize_bits - len.leading_zeros() as usize;
    (len_bits + 6) / 7
}
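
// Illustrative sketch, not part of the module proper: a few unit tests
// exercising the empty, inline, and heap allocated representations through
// the crate-internal API above. They assume the usual `cargo test` harness
// with std available; the test names and inputs are invented for the example.
#[cfg(test)]
mod tests {
    use super::Identifier;
    use core::mem;

    #[test]
    fn empty_identifier_round_trips() {
        let empty = Identifier::empty();
        assert!(empty.is_empty());
        assert_eq!(empty.as_str(), "");
    }

    #[test]
    fn short_identifier_is_stored_inline() {
        // 7 bytes of ASCII fit in the pointer-sized inline representation.
        let id = unsafe { Identifier::new_unchecked("alpha.1") };
        assert_eq!(id.as_str(), "alpha.1");
        // Equal inline identifiers are bitwise identical.
        let same = unsafe { Identifier::new_unchecked("alpha.1") };
        assert!(id.ptr_eq(&same));
    }

    #[test]
    fn long_identifier_round_trips_and_clones() {
        // More than 8 bytes forces the heap allocated representation.
        let id = unsafe { Identifier::new_unchecked("build.metadata.20210617") };
        assert_eq!(id.as_str(), "build.metadata.20210617");
        let clone = id.clone();
        // The clone is a separate allocation but compares equal by contents.
        assert!(!id.ptr_eq(&clone));
        assert!(id == clone);
    }

    #[test]
    fn all_zero_niche_is_available() {
        // The all-0 bit pattern is never a valid Identifier, so Option can
        // use it as a niche at no extra cost in size.
        assert_eq!(
            mem::size_of::<Identifier>(),
            mem::size_of::<Option<Identifier>>()
        );
    }
}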