//! This module contains the entry points for `slice::sort`.

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::cmp;
use crate::mem::{MaybeUninit, SizedTypeProperties};
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
use crate::slice::sort::shared::smallsort::{
    SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
};
use crate::{cfg_select, intrinsics};

pub(crate) mod merge;

#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod drift;
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
pub(crate) mod quicksort;

#[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))]
pub(crate) mod tiny;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
///
/// Upholds all safety properties outlined here:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md>
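///
/// A hedged usage sketch (illustrative only; `VecBuf` stands in for a hypothetical
/// `BufGuard` implementation provided by an allocating crate):
///
/// ```ignore
/// let mut v = [5, 3, 1, 4, 2];
/// sort::<i32, _, VecBuf<i32>>(&mut v, &mut |a, b| a < b);
/// assert_eq!(v, [1, 2, 3, 4, 5]);
/// ```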
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    cfg_select! {
        any(feature = "optimize_for_size", target_pointer_width = "16") => {
            // Unlike driftsort, mergesort only requires len / 2,
            // not len - len / 2.
            let alloc_len = len / 2;

            cfg_select! {
                target_pointer_width = "16" => {
                    let mut heap_buf = BufT::with_capacity(alloc_len);
                    let scratch = heap_buf.as_uninit_slice_mut();
                }
                _ => {
                    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
                    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
                    let mut stack_buf = AlignedStorage::<T, 4096>::new();
                    let stack_scratch = stack_buf.as_uninit_slice_mut();
                    let mut heap_buf;
                    let scratch = if stack_scratch.len() >= alloc_len {
                        stack_scratch
                    } else {
                        heap_buf = BufT::with_capacity(alloc_len);
                        heap_buf.as_uninit_slice_mut()
                    };
                }
            }

            tiny::mergesort(v, scratch, is_less);
        }
        _ => {
            // More advanced sorting methods than insertion sort are faster if called in
            // a hot loop for small inputs, but for general-purpose code the small
            // binary size of insertion sort is more important. The instruction cache in
            // modern processors is very valuable, and for a single sort call in general
            // purpose code any gains from an advanced method are cancelled by i-cache
            // misses during the sort, and thrashing the i-cache for surrounding code.
            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
                insertion_sort_shift_left(v, 1, is_less);
                return;
            }

            driftsort_main::<T, F, BufT>(v, is_less);
        }
    }
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(any(feature = "optimize_for_size", target_pointer_width = "16")))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating n elements of memory we can ensure the entire input can
    // be sorted using stable quicksort, which allows better performance on
    // random and low-cardinality distributions. However, we still want to
    // reduce our memory usage to n - n / 2 for large inputs. We do this by scaling
    // our allocation as max(n - n / 2, min(n, 8MB)), ensuring we scale like n for
    // small inputs and n - n / 2 for large inputs, without a sudden drop off. We
    // also need to ensure our alloc >= SMALL_SORT_GENERAL_SCRATCH_LEN, as the
    // small-sort always needs this much memory.
    //
    // driftsort will produce unsorted runs of up to min_good_run_len, which
    // is at most len - len / 2.
    // Unsorted runs need to be processed by quicksort, which requires as much
    // scratch space as the run length, therefore the scratch space must be at
    // least len - len / 2.
    // If min_good_run_len is ever modified, this code must be updated to allocate
    // the correct scratch size for it.
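    //
    // As a rough illustration (hypothetical numbers, assuming `size_of::<T>() == 8` so that
    // `max_full_alloc == 1_000_000` elements, and that `SMALL_SORT_GENERAL_SCRATCH_LEN` is
    // smaller than the results below):
    //   len =   500_000 -> max(  250_000, min(  500_000, 1_000_000)) =   500_000 (= len)
    //   len = 1_500_000 -> max(  750_000, min(1_500_000, 1_000_000)) = 1_000_000 (the cap)
    //   len = 4_000_000 -> max(2_000_000, min(4_000_000, 1_000_000)) = 2_000_000 (= len - len / 2)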
    const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
    let max_full_alloc = MAX_FULL_ALLOC_BYTES / size_of::<T>();
    let len = v.len();
    let alloc_len = cmp::max(
        cmp::max(len - len / 2, cmp::min(len, max_full_alloc)),
        SMALL_SORT_GENERAL_SCRATCH_LEN,
    );

    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
    let mut stack_buf = AlignedStorage::<T, 4096>::new();
    let stack_scratch = stack_buf.as_uninit_slice_mut();
    let mut heap_buf;
    let scratch = if stack_scratch.len() >= alloc_len {
        stack_scratch
    } else {
        heap_buf = BufT::with_capacity(alloc_len);
        heap_buf.as_uninit_slice_mut()
    };

    // For small inputs using quicksort is not yet beneficial, and a single
    // small-sort or two small-sorts plus a single merge outperforms it, so use
    // eager mode.
    let eager_sort = len <= T::small_sort_threshold() * 2;
    crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}

#[doc(hidden)]
/// Abstracts an owned memory buffer so that the sort code can live in `core`, where no allocation
/// is possible. This trait can then be implemented in a place that has access to allocation.
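///
/// A minimal sketch of what an implementation might look like in an allocating crate
/// (illustrative only; the real implementation lives outside this module and may differ):
///
/// ```ignore
/// impl<T> BufGuard<T> for Vec<T> {
///     fn with_capacity(capacity: usize) -> Self {
///         Vec::with_capacity(capacity)
///     }
///
///     fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
///         // `Vec::spare_capacity_mut` exposes the unused capacity as `&mut [MaybeUninit<T>]`.
///         self.spare_capacity_mut()
///     }
/// }
/// ```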
pub trait BufGuard<T> {
    /// Creates a new buffer that can hold at least `capacity` elements.
    fn with_capacity(capacity: usize) -> Self;
    /// Returns mutable access to uninitialized memory owned by the buffer.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}

#[repr(C)]
struct AlignedStorage<T, const N: usize> {
    // Zero-sized, but raises the struct's alignment to that of `T`, so that `storage`
    // can be reinterpreted as a slice of `MaybeUninit<T>`.
    _align: [T; 0],
    storage: [MaybeUninit<u8>; N],
}

impl<T, const N: usize> AlignedStorage<T, N> {
    fn new() -> Self {
        Self { _align: [], storage: [const { MaybeUninit::uninit() }; N] }
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        // Number of whole `T` elements that fit in the `N`-byte buffer.
        let len: usize = N / size_of::<T>();

        // SAFETY: `_align` ensures we are correctly aligned.
        unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
    }
}