//! This module contains the entry points for `slice::sort`.

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::cmp;
use crate::mem::{MaybeUninit, SizedTypeProperties};
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::slice::sort::shared::smallsort::{
    SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
};
use crate::{cfg_select, intrinsics};

pub(crate) mod merge;

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod drift;
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod quicksort;

#[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))]
pub(crate) mod tiny;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
///
/// Upholds all safety properties outlined here:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md>
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    cfg_select! {
        any(feature = "optimize_for_size", target_pointer_width = "16") => {
            // Unlike driftsort, mergesort only requires len / 2 elements of
            // scratch space, not len - len / 2.
            let alloc_len = len / 2;
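            // E.g. for len = 9 the mergesort scratch is 9 / 2 = 4 elements,
            // while driftsort would size its scratch as len - len / 2 = 5.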

            cfg_select! {
                target_pointer_width = "16" => {
                    let mut heap_buf = BufT::with_capacity(alloc_len);
                    let scratch = heap_buf.as_uninit_slice_mut();
                }
                _ => {
                    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
                    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
                    let mut stack_buf = AlignedStorage::<T, 4096>::new();
                    let stack_scratch = stack_buf.as_uninit_slice_mut();
                    let mut heap_buf;
                    let scratch = if stack_scratch.len() >= alloc_len {
                        stack_scratch
                    } else {
                        heap_buf = BufT::with_capacity(alloc_len);
                        heap_buf.as_uninit_slice_mut()
                    };
                }
            }

            tiny::mergesort(v, scratch, is_less);
        }
        _ => {
            // More advanced sorting methods than insertion sort are faster if called in
            // a hot loop for small inputs, but for general-purpose code the small
            // binary size of insertion sort is more important. The instruction cache in
            // modern processors is very valuable, and for a single sort call in
            // general-purpose code any gains from an advanced method are cancelled out
            // by i-cache misses during the sort, and by thrashing the i-cache for
            // surrounding code.
            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
                insertion_sort_shift_left(v, 1, is_less);
                return;
            }

            driftsort_main::<T, F, BufT>(v, is_less);
        }
    }
}
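
// How a caller outside `core` might drive this entry point, shown as a sketch
// (the actual wrapper lives in the allocating parts of the standard library;
// `stable_sort` is an illustrative name, not an item in this module):
//
//     fn stable_sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], mut is_less: F) {
//         // `Vec<T>` can serve as the `BufGuard` implementation; see the
//         // sketch after the `BufGuard` trait below.
//         sort::<T, F, Vec<T>>(v, &mut is_less);
//     }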

/// See [`sort`].
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating n elements of memory we can ensure the entire input can
    // be sorted using stable quicksort, which allows better performance on
    // random and low-cardinality distributions. However, we still want to
    // reduce our memory usage to n - n / 2 for large inputs. We do this by scaling
    // our allocation as max(n - n / 2, min(n, 8MB)), ensuring we scale like n for
    // small inputs and n - n / 2 for large inputs, without a sudden drop off. We
    // also need to ensure our alloc >= SMALL_SORT_GENERAL_SCRATCH_LEN, as the
    // small-sort always needs this much memory.
    //
    // driftsort will produce unsorted runs of up to min_good_run_len, which
    // is at most len - len / 2.
    // Unsorted runs need to be processed by quicksort, which requires as much
    // scratch space as the run length; therefore the scratch space must be at
    // least len - len / 2.
    // If min_good_run_len is ever modified, this code must be updated to
    // allocate the correct scratch size for it.
    const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
    let max_full_alloc = MAX_FULL_ALLOC_BYTES / size_of::<T>();
    let len = v.len();
    let alloc_len = cmp::max(
        cmp::max(len - len / 2, cmp::min(len, max_full_alloc)),
        SMALL_SORT_GENERAL_SCRATCH_LEN,
    );
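
    // Worked example of the scaling, assuming size_of::<T>() == 8 (e.g. u64),
    // so max_full_alloc = 8_000_000 / 8 = 1_000_000 (numbers are illustrative):
    //
    // - len = 100:       max(50, min(100, 1_000_000)) = 100                    (scales like n)
    // - len = 1_500_000: max(750_000, min(1_500_000, 1_000_000)) = 1_000_000   (capped, no cliff)
    // - len = 4_000_000: max(2_000_000, min(4_000_000, 1_000_000)) = 2_000_000 (n - n / 2)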

    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
    let mut stack_buf = AlignedStorage::<T, 4096>::new();
    let stack_scratch = stack_buf.as_uninit_slice_mut();
    let mut heap_buf;
    let scratch = if stack_scratch.len() >= alloc_len {
        stack_scratch
    } else {
        heap_buf = BufT::with_capacity(alloc_len);
        heap_buf.as_uninit_slice_mut()
    };

    // For small inputs using quicksort is not yet beneficial, and a single
    // small-sort or two small-sorts plus a single merge outperforms it, so use
    // eager mode.
    let eager_sort = len <= T::small_sort_threshold() * 2;
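    // E.g. if T::small_sort_threshold() were 32 (an assumed value; the real
    // threshold depends on the type), inputs of up to 64 elements would be
    // handled without quicksort.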
    crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}

#[doc(hidden)]
/// Abstracts an owned memory buffer, so that the sort code can live in `core`,
/// where no allocation is possible. The trait can then be implemented in a
/// place that has access to allocation.
pub trait BufGuard<T> {
    /// Creates a new buffer that holds at least `capacity` elements.
    fn with_capacity(capacity: usize) -> Self;
    /// Returns mutable access to the uninitialized memory owned by the buffer.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
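
// A minimal sketch of how an allocating crate could implement `BufGuard` for
// `Vec<T>` (illustrative; not necessarily the exact upstream impl):
//
//     impl<T> BufGuard<T> for Vec<T> {
//         fn with_capacity(capacity: usize) -> Self {
//             Vec::with_capacity(capacity)
//         }
//
//         fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
//             // A `Vec`'s spare capacity is exactly its owned, uninitialized tail.
//             self.spare_capacity_mut()
//         }
//     }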

#[repr(C)]
struct AlignedStorage<T, const N: usize> {
    _align: [T; 0],
    storage: [MaybeUninit<u8>; N],
}

impl<T, const N: usize> AlignedStorage<T, N> {
    fn new() -> Self {
        Self { _align: [], storage: [const { MaybeUninit::uninit() }; N] }
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        // `T` is never zero-sized here: `sort` returns early for ZSTs before
        // any scratch buffer is created, so this division is never by zero.
        let len: usize = N / size_of::<T>();

        // SAFETY: `_align` ensures we are correctly aligned.
        unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
    }
}
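
// Illustrative use of `AlignedStorage` (a sketch): a 4096-byte buffer viewed
// as scratch for `u64` yields 4096 / 8 = 512 slots, and the zero-length
// `[u64; 0]` field forces `u64` alignment on the byte array:
//
//     let mut buf = AlignedStorage::<u64, 4096>::new();
//     let scratch: &mut [MaybeUninit<u64>] = buf.as_uninit_slice_mut();
//     assert_eq!(scratch.len(), 512);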