//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.

use crate::mem::MaybeUninit;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;
use crate::{cmp, intrinsics};

/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least
/// `max(v.len() - v.len() / 2, SMALL_SORT_GENERAL_SCRATCH_LEN)`, otherwise the
/// implementation may abort. Fully ascending and descending inputs will be
/// sorted with exactly N - 1 comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs; see below for details.
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    eager_sort: bool,
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return; // Removing this length check *increases* code size.
    }
    let scale_factor = merge_tree_scale_factor(len);

    // It's important to have a relatively high entry barrier for pre-sorted
    // runs, as the presence of a single such run will force on average several
    // merge operations and shrink the maximum quicksort size a lot. For that
    // reason we use sqrt(len) as our pre-sorted run threshold.
    const MIN_SQRT_RUN_LEN: usize = 64;
    let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
        // For small input lengths `MIN_SQRT_RUN_LEN` would break pattern
        // detection of fully or nearly sorted inputs.
        cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
    } else {
        sqrt_approx(len)
    };
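
    // Illustrative example (not part of the algorithm's contract): for
    // len = 1024 the first branch applies and min_good_run_len =
    // min(512, 64) = 64, while for len = 1_000_000 the sqrt approximation
    // below yields min_good_run_len = 1000.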

    // (stack_len, runs, desired_depths) together form a stack maintaining run
    // information for the powersort heuristic. desired_depths[i] is the desired
    // depth of the merge node that merges runs[i] with the run that comes after
    // it.
    let mut stack_len = 0;
    let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
    let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
    let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
    let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();

    let mut scan_idx = 0;
    let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
    loop {
        // Compute the next run and the desired depth of the merge node between
        // prev_run and next_run. On the last iteration we create a dummy run
        // with root-level desired depth to fully collapse the merge tree.
        let (next_run, desired_depth);
        if scan_idx < len {
            next_run =
                create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
            desired_depth = merge_tree_depth(
                scan_idx - prev_run.len(),
                scan_idx,
                scan_idx + next_run.len(),
                scale_factor,
            );
        } else {
            next_run = DriftsortRun::new_sorted(0);
            desired_depth = 0;
        };

        // Process the merge nodes between earlier runs[i] that have a desire to
        // be deeper in the merge tree than the merge node for the splitpoint
        // between prev_run and next_run.
        //
        // SAFETY: first note that this is the only place we modify stack_len,
        // runs or desired depths. We maintain the following invariants:
        //  1. The first stack_len elements of runs/desired_depths are initialized.
        //  2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
        //  3. The sum of all valid runs[i].len() plus prev_run.len() equals
        //     scan_idx.
        unsafe {
            while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
                // Desired depth greater than or equal to the upcoming desired
                // depth: pop the left neighbor run from the stack and merge it
                // into prev_run.
                let left = *runs.add(stack_len - 1);
                let merged_len = left.len() + prev_run.len();
                let merge_start_idx = scan_idx - merged_len;
                let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
                prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
                stack_len -= 1;
            }

            // We now know that desired_depths[stack_len - 1] < desired_depth,
            // maintaining our invariant. This also guarantees we don't overflow
            // the stack as merge_tree_depth(..) <= 64 and thus we can only have
            // 64 distinct values on the stack before pushing, plus our initial
            // dummy run, while our capacity is 66.
            *runs.add(stack_len) = prev_run;
            *desired_depths.add(stack_len) = desired_depth;
            stack_len += 1;
        }

        // Break before overriding the last run with our dummy run.
        if scan_idx >= len {
            break;
        }

        scan_idx += next_run.len();
        prev_run = next_run;
    }

    if !prev_run.sorted() {
        stable_quicksort(v, scratch, is_less);
    }
}

// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
//     x < (2^62 / n + 1) * 2n
//     x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
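//
// As an illustrative worked example of the formulas above: for n = 8 we get
// f = ceil(2^62 / 8) = 2^59. The root split [0, 4) / [4, 8) gives
// CLZ(4*f ^ 12*f) = CLZ(2^61 ^ (2^62 + 2^61)) = CLZ(2^62) = 1, while the
// quarter split [0, 2) / [2, 4) gives CLZ(2*f ^ 6*f) = CLZ(2^61) = 2, so the
// quarter split desires to sit one level deeper in the merge tree.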
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
    if usize::BITS > u64::BITS {
        panic!("Platform not supported");
    }

    ((1 << 62) + n as u64 - 1) / n as u64
}

// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
    let x: u64 = left as u64 + mid as u64;
    let y: u64 = mid as u64 + right as u64;
    ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}

fn sqrt_approx(n: usize) -> usize {
    // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
    // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
    // log floors we want to add 0.5 to compensate for this on average, so our
    // initial approximation is 2^((1 + floor(log2(n))) / 2).
    //
    // We then apply an iteration of Newton's method to improve our
    // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
    //
    // Finally we note that the exponentiation / division can be done directly
    // with shifts. We OR with 1 to avoid zero-checks in the integer log.
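    //
    // Illustrative example (not load-bearing for correctness): for n = 1000 we
    // get ilog2(1001) = 9 and shift = 5, so the initial guess is 2^5 = 32 and
    // the Newton step yields (32 + (1000 >> 5)) / 2 = 31, close to the true
    // sqrt(1000) ~= 31.6.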
    let ilog: u32 = (n | 1).ilog2();
    let shift: u32 = (1 + ilog) / 2;
    ((1 << shift) + (n >> shift)) / 2
}

// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    left: DriftsortRun,
    right: DriftsortRun,
    is_less: &mut F,
) -> DriftsortRun {
    // If one or both of the runs are sorted do a physical merge, using
    // quicksort to sort the unsorted run if present. We also *need* to
    // physically merge if the combined runs would not fit in the scratch space
    // anymore (as this would mean we are no longer able to quicksort them).
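    //
    // For example (illustrative, not exhaustive): merging a sorted left run of
    // length 40 with an unsorted right run of length 60, given scratch space
    // for at least 100 elements, quicksorts the right half and then physically
    // merges, returning a sorted run of length 100. Two unsorted runs that
    // still fit in scratch are instead combined into a larger unsorted run and
    // left for a later quicksort.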
    let len: usize = v.len();
    let can_fit_in_scratch: bool = len <= scratch.len();
    if !can_fit_in_scratch || left.sorted() || right.sorted() {
        if !left.sorted() {
            stable_quicksort(&mut v[..left.len()], scratch, is_less);
        }
        if !right.sorted() {
            stable_quicksort(&mut v[left.len()..], scratch, is_less);
        }
        merge(v, scratch, left.len(), is_less);

        DriftsortRun::new_sorted(len)
    } else {
        DriftsortRun::new_unsorted(len)
    }
}

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it
/// is false an unsorted run of length `min_good_run_len` is returned.
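///
/// For example (illustrative): with `min_good_run_len = 64` and `eager_sort`
/// false, a 100-element ascending prefix is returned as a sorted run of length
/// 100, while a 10-element ascending prefix only yields an unsorted run of
/// length 64 (or the remaining length, if shorter).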
fn create_run<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    min_good_run_len: usize,
    eager_sort: bool,
    is_less: &mut F,
) -> DriftsortRun {
    let len = v.len();
    if len >= min_good_run_len {
        let (run_len, was_reversed) = find_existing_run(v, is_less);

        // SAFETY: find_existing_run promises to return a valid run_len.
        unsafe { intrinsics::assume(run_len <= len) };

        if run_len >= min_good_run_len {
            if was_reversed {
                v[..run_len].reverse();
            }

            return DriftsortRun::new_sorted(run_len);
        }
    }

    if eager_sort {
        // We call quicksort with a len that will immediately call small-sort.
        // By not calling the small-sort directly here it can always be inlined
        // into the quicksort itself, making the recursive base case faster; it
        // is also generally more binary-size efficient.
        let eager_run_len = cmp::min(T::small_sort_threshold(), len);
        quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
        DriftsortRun::new_sorted(eager_run_len)
    } else {
        DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
    }
}

fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit: u32 = 2 * (v.len() | 1).ilog2();
    quicksort(v, scratch, limit, None, is_less);
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a `usize` because the maximum slice length is [`isize::MAX`].
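///
/// For example (illustrative): `new_sorted(5)` stores `(5 << 1) | 1 = 0b1011`,
/// i.e. the length in the upper bits and the sorted flag in bit 0, so `len()`
/// shifts the length back out and `sorted()` tests the low bit.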
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    #[inline(always)]
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }

    #[inline(always)]
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }

    #[inline(always)]
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    #[inline(always)]
    fn len(self) -> usize {
        self.0 >> 1
    }
}