//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::mem::MaybeUninit;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;
use crate::{cmp, intrinsics};

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least
/// `max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN)`, otherwise the
/// implementation may abort.
/// Fully ascending and descending inputs will be sorted with exactly N - 1
/// comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs, see below for details.
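///
/// For illustration only, a caller in an allocating context could size
/// `scratch` like this (a sketch, not the actual call sites; note that
/// `v.len() - v.len() / 2` is the ceiling of `v.len() / 2`):
///
/// ```ignore
/// let cap = cmp::max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN);
/// let mut scratch: Vec<MaybeUninit<T>> = Vec::with_capacity(cap);
/// scratch.resize_with(cap, MaybeUninit::uninit);
/// sort(v, &mut scratch, false, &mut |a, b| a.lt(b));
/// ```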
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input length `MIN_SQRT_RUN_LEN` would break pattern
        // detection of full or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };
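
    // Illustrative values for the threshold above (derived from the formulas,
    // not measured): len = 100 gives min(100 - 50, 64) = 50, len = 4096 gives
    // min(2048, 64) = 64, and len = 1_000_000 uses sqrt_approx, which yields
    // 1000.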

    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that have a desire to
        // be deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired depths. We maintain the following invariants:
        //  1. The first stack_len elements of runs/desired_depths are initialized.
        //  2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        //  3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //     scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // The desired depth of the node on top of the stack is at
                // least as deep as the upcoming desired depth, so pop the left
                // neighbor run from the stack and merge it into prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//     x < (2^62 / n + 1) * 2n
//     x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x = left as u64 + mid as u64;
    let y = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}
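
// Worked example of the heuristic above (illustrative, not part of the
// algorithm): for n = 8, scale_factor = ceil(2^62 / 8) = 2^59. The root split
// [0, 4) / [4, 8) gives x = (0 + 4) * 2^59 = 2^61 and
// y = (4 + 8) * 2^59 = 2^62 + 2^61, so x ^ y = 2^62 and the depth is
// CLZ(2^62) = 1. The quarter split [0, 2) / [2, 4) gives x = 2^60 and
// y = 2^61 + 2^60, so x ^ y = 2^61 and the depth is CLZ(2^61) = 2, one level
// deeper, as expected.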

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
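    //
    // Worked example (illustrative): n = 1000 gives ilog = 9 and shift = 5,
    // so we return (32 + 31) / 2 = 31, close to the true sqrt(1000) of
    // about 31.6.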
    let ilog = (n | 1).ilog2();
    let shift = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
    let len = v.len();
    let can_fit_in_scratch = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}
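
// For intuition (illustrative numbers): merging two unsorted 50-element runs
// with a 100-element scratch buffer keeps the result logical, returning one
// unsorted run of length 100, while the same merge with only 64 elements of
// scratch must quicksort both halves and physically merge them, returning a
// sorted run of length 100.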

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it
/// is false an unsorted run of length `min_good_run_len` is returned.
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here, it can always be
        // inlined into the quicksort itself, which makes the recursive base
        // case faster and is generally more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}
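
// For intuition (illustrative): with min_good_run_len = 50, a 200-element view
// that starts with 80 already-ascending elements yields new_sorted(80); if no
// such run exists and eager_sort is false, it yields new_unsorted(50).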

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}
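
// For example (illustrative arithmetic): v.len() = 1000 gives
// limit = 2 * floor(log2(1001)) = 2 * 9 = 18.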

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a `usize` because the maximum slice length is [`isize::MAX`].
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}
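
// Illustrative bit layout (an example, not load-bearing documentation): the
// length lives in the upper bits and the sorted flag in bit 0, so
// `DriftsortRun::new_sorted(5)` stores 0b1011, meaning `len()` recovers 5 and
// `sorted()` is true, while `new_unsorted(5)` stores 0b1010 with `sorted()`
// false.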