//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::mem::MaybeUninit;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;
use crate::{cmp, intrinsics};

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least
/// `max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN)`, otherwise the
/// implementation may abort. Fully ascending and descending inputs will be
/// sorted with exactly N - 1 comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs; see below for details.
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input lengths `MIN_SQRT_RUN_LEN` would break pattern
        // detection of fully or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };
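
    // Illustrative example (not part of the algorithm's contract): for
    // len = 1024 the first branch applies and min_good_run_len =
    // min(512, 64) = 64, while for len = 1_000_000 the sqrt approximation
    // below yields min_good_run_len = 1000.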

    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that have a desire to
        // be deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired depths. We maintain the following invariants:
        //  1. The first stack_len elements of runs/desired_depths are initialized.
        //  2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        //  3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //     scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // Desired depth greater than or equal to the upcoming desired
                // depth: pop the left neighbor run from the stack and merge it
                // into prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//     x < (2^62 / n + 1) * 2n
//     x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
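//
// As an illustrative worked example of the formulas above: for n = 8 we get
// f = ceil(2^62 / 8) = 2^59. The root split [0, 4) / [4, 8) gives
// CLZ(4*f ^ 12*f) = CLZ(2^61 ^ (2^62 + 2^61)) = CLZ(2^62) = 1, while the
// quarter split [0, 2) / [2, 4) gives CLZ(2*f ^ 6*f) = CLZ(2^61) = 2, so the
// quarter split desires to sit one level deeper in the merge tree.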
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x: u64 = left as u64 + mid as u64;
    let y: u64 = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
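    //
    // Illustrative example (not load-bearing for correctness): for n = 1000 we
    // get ilog2(1001) = 9 and shift = 5, so the initial guess is 2^5 = 32 and
    // the Newton step yields (32 + (1000 >> 5)) / 2 = 31, close to the true
    // sqrt(1000) ~= 31.6.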
    let ilog: u32 = (n | 1).ilog2();
    let shift: u32 = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
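    //
    // For example (illustrative, not exhaustive): merging a sorted left run of
    // length 40 with an unsorted right run of length 60, given scratch space
    // for at least 100 elements, quicksorts the right half and then physically
    // merges, returning a sorted run of length 100. Two unsorted runs that
    // still fit in scratch are instead combined into a larger unsorted run and
    // left for a later quicksort.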
    let len: usize = v.len();
    let can_fit_in_scratch: bool = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it
/// is false an unsorted run of length `min_good_run_len` is returned.
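///
/// For example (illustrative): with `min_good_run_len = 64` and `eager_sort`
/// false, a 100-element ascending prefix is returned as a sorted run of length
/// 100, while a 10-element ascending prefix only yields an unsorted run of
/// length 64 (or the remaining length, if shorter).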
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here it can always be inlined
        // into the quicksort itself, making the recursive base case faster; it
        // is also generally more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit: u32 = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a `usize` because the maximum slice length is [`isize::MAX`].
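///
/// For example (illustrative): `new_sorted(5)` stores `(5 << 1) | 1 = 0b1011`,
/// i.e. the length in the upper bits and the sorted flag in bit 0, so `len()`
/// shifts the length back out and `sorted()` tests the low bit.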
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}