1 | //! Parallel iterator types for [slices][std::slice] |
2 | //! |
3 | //! You will rarely need to interact with this module directly unless you need |
4 | //! to name one of the iterator types. |
5 | //! |
6 | //! [std::slice]: https://doc.rust-lang.org/stable/std/slice/ |
7 | |
8 | mod chunks; |
9 | mod mergesort; |
10 | mod quicksort; |
11 | mod rchunks; |
12 | |
13 | mod test; |
14 | |
15 | use self::mergesort::par_mergesort; |
16 | use self::quicksort::par_quicksort; |
17 | use crate::iter::plumbing::*; |
18 | use crate::iter::*; |
19 | use crate::split_producer::*; |
20 | use std::cmp; |
21 | use std::cmp::Ordering; |
22 | use std::fmt::{self, Debug}; |
23 | use std::mem; |
24 | |
25 | pub use self::chunks::{Chunks, ChunksExact, ChunksExactMut, ChunksMut}; |
26 | pub use self::rchunks::{RChunks, RChunksExact, RChunksExactMut, RChunksMut}; |
27 | |
28 | /// Parallel extensions for slices. |
29 | pub trait ParallelSlice<T: Sync> { |
30 | /// Returns a plain slice, which is used to implement the rest of the |
31 | /// parallel methods. |
32 | fn as_parallel_slice(&self) -> &[T]; |
33 | |
34 | /// Returns a parallel iterator over subslices separated by elements that |
35 | /// match the separator. |
36 | /// |
37 | /// # Examples |
38 | /// |
39 | /// ``` |
40 | /// use rayon::prelude::*; |
41 | /// let smallest = [1, 2, 3, 0, 2, 4, 8, 0, 3, 6, 9] |
42 | /// .par_split(|i| *i == 0) |
43 | /// .map(|numbers| numbers.iter().min().unwrap()) |
44 | /// .min(); |
45 | /// assert_eq!(Some(&1), smallest); |
46 | /// ``` |
47 | fn par_split<P>(&self, separator: P) -> Split<'_, T, P> |
48 | where |
49 | P: Fn(&T) -> bool + Sync + Send, |
50 | { |
51 | Split { |
52 | slice: self.as_parallel_slice(), |
53 | separator, |
54 | } |
55 | } |
56 | |
57 | /// Returns a parallel iterator over all contiguous windows of length |
58 | /// `window_size`. The windows overlap. |
59 | /// |
60 | /// # Examples |
61 | /// |
62 | /// ``` |
63 | /// use rayon::prelude::*; |
64 | /// let windows: Vec<_> = [1, 2, 3].par_windows(2).collect(); |
65 | /// assert_eq!(vec![[1, 2], [2, 3]], windows); |
66 | /// ``` |
67 | fn par_windows(&self, window_size: usize) -> Windows<'_, T> { |
68 | Windows { |
69 | window_size, |
70 | slice: self.as_parallel_slice(), |
71 | } |
72 | } |
73 | |
74 | /// Returns a parallel iterator over at most `chunk_size` elements of |
75 | /// `self` at a time. The chunks do not overlap. |
76 | /// |
77 | /// If the number of elements in the iterator is not divisible by |
78 | /// `chunk_size`, the last chunk may be shorter than `chunk_size`. All |
79 | /// other chunks will have that exact length. |
80 | /// |
81 | /// # Examples |
82 | /// |
83 | /// ``` |
84 | /// use rayon::prelude::*; |
85 | /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_chunks(2).collect(); |
86 | /// assert_eq!(chunks, vec![&[1, 2][..], &[3, 4], &[5]]); |
87 | /// ``` |
88 | #[track_caller ] |
89 | fn par_chunks(&self, chunk_size: usize) -> Chunks<'_, T> { |
90 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
91 | Chunks::new(chunk_size, self.as_parallel_slice()) |
92 | } |
93 | |
94 | /// Returns a parallel iterator over `chunk_size` elements of |
95 | /// `self` at a time. The chunks do not overlap. |
96 | /// |
97 | /// If `chunk_size` does not divide the length of the slice, then the |
98 | /// last up to `chunk_size-1` elements will be omitted and can be |
99 | /// retrieved from the remainder function of the iterator. |
100 | /// |
101 | /// # Examples |
102 | /// |
103 | /// ``` |
104 | /// use rayon::prelude::*; |
105 | /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_chunks_exact(2).collect(); |
106 | /// assert_eq!(chunks, vec![&[1, 2][..], &[3, 4]]); |
107 | /// ``` |
108 | #[track_caller ] |
109 | fn par_chunks_exact(&self, chunk_size: usize) -> ChunksExact<'_, T> { |
110 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
111 | ChunksExact::new(chunk_size, self.as_parallel_slice()) |
112 | } |
113 | |
114 | /// Returns a parallel iterator over at most `chunk_size` elements of `self` at a time, |
115 | /// starting at the end. The chunks do not overlap. |
116 | /// |
117 | /// If the number of elements in the iterator is not divisible by |
118 | /// `chunk_size`, the last chunk may be shorter than `chunk_size`. All |
119 | /// other chunks will have that exact length. |
120 | /// |
121 | /// # Examples |
122 | /// |
123 | /// ``` |
124 | /// use rayon::prelude::*; |
125 | /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_rchunks(2).collect(); |
126 | /// assert_eq!(chunks, vec![&[4, 5][..], &[2, 3], &[1]]); |
127 | /// ``` |
128 | #[track_caller ] |
129 | fn par_rchunks(&self, chunk_size: usize) -> RChunks<'_, T> { |
130 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
131 | RChunks::new(chunk_size, self.as_parallel_slice()) |
132 | } |
133 | |
134 | /// Returns a parallel iterator over `chunk_size` elements of `self` at a time, |
135 | /// starting at the end. The chunks do not overlap. |
136 | /// |
137 | /// If `chunk_size` does not divide the length of the slice, then the |
138 | /// last up to `chunk_size-1` elements will be omitted and can be |
139 | /// retrieved from the remainder function of the iterator. |
140 | /// |
141 | /// # Examples |
142 | /// |
143 | /// ``` |
144 | /// use rayon::prelude::*; |
145 | /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_rchunks_exact(2).collect(); |
146 | /// assert_eq!(chunks, vec![&[4, 5][..], &[2, 3]]); |
147 | /// ``` |
148 | #[track_caller ] |
149 | fn par_rchunks_exact(&self, chunk_size: usize) -> RChunksExact<'_, T> { |
150 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
151 | RChunksExact::new(chunk_size, self.as_parallel_slice()) |
152 | } |
153 | } |
154 | |
155 | impl<T: Sync> ParallelSlice<T> for [T] { |
156 | #[inline ] |
157 | fn as_parallel_slice(&self) -> &[T] { |
158 | self |
159 | } |
160 | } |
161 | |
162 | /// Parallel extensions for mutable slices. |
163 | pub trait ParallelSliceMut<T: Send> { |
164 | /// Returns a plain mutable slice, which is used to implement the rest of |
165 | /// the parallel methods. |
166 | fn as_parallel_slice_mut(&mut self) -> &mut [T]; |
167 | |
168 | /// Returns a parallel iterator over mutable subslices separated by |
169 | /// elements that match the separator. |
170 | /// |
171 | /// # Examples |
172 | /// |
173 | /// ``` |
174 | /// use rayon::prelude::*; |
175 | /// let mut array = [1, 2, 3, 0, 2, 4, 8, 0, 3, 6, 9]; |
176 | /// array.par_split_mut(|i| *i == 0) |
177 | /// .for_each(|slice| slice.reverse()); |
178 | /// assert_eq!(array, [3, 2, 1, 0, 8, 4, 2, 0, 9, 6, 3]); |
179 | /// ``` |
180 | fn par_split_mut<P>(&mut self, separator: P) -> SplitMut<'_, T, P> |
181 | where |
182 | P: Fn(&T) -> bool + Sync + Send, |
183 | { |
184 | SplitMut { |
185 | slice: self.as_parallel_slice_mut(), |
186 | separator, |
187 | } |
188 | } |
189 | |
190 | /// Returns a parallel iterator over at most `chunk_size` elements of |
191 | /// `self` at a time. The chunks are mutable and do not overlap. |
192 | /// |
193 | /// If the number of elements in the iterator is not divisible by |
194 | /// `chunk_size`, the last chunk may be shorter than `chunk_size`. All |
195 | /// other chunks will have that exact length. |
196 | /// |
197 | /// # Examples |
198 | /// |
199 | /// ``` |
200 | /// use rayon::prelude::*; |
201 | /// let mut array = [1, 2, 3, 4, 5]; |
202 | /// array.par_chunks_mut(2) |
203 | /// .for_each(|slice| slice.reverse()); |
204 | /// assert_eq!(array, [2, 1, 4, 3, 5]); |
205 | /// ``` |
206 | #[track_caller ] |
207 | fn par_chunks_mut(&mut self, chunk_size: usize) -> ChunksMut<'_, T> { |
208 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
209 | ChunksMut::new(chunk_size, self.as_parallel_slice_mut()) |
210 | } |
211 | |
212 | /// Returns a parallel iterator over `chunk_size` elements of |
213 | /// `self` at a time. The chunks are mutable and do not overlap. |
214 | /// |
215 | /// If `chunk_size` does not divide the length of the slice, then the |
216 | /// last up to `chunk_size-1` elements will be omitted and can be |
217 | /// retrieved from the remainder function of the iterator. |
218 | /// |
219 | /// # Examples |
220 | /// |
221 | /// ``` |
222 | /// use rayon::prelude::*; |
223 | /// let mut array = [1, 2, 3, 4, 5]; |
224 | /// array.par_chunks_exact_mut(3) |
225 | /// .for_each(|slice| slice.reverse()); |
226 | /// assert_eq!(array, [3, 2, 1, 4, 5]); |
227 | /// ``` |
228 | #[track_caller ] |
229 | fn par_chunks_exact_mut(&mut self, chunk_size: usize) -> ChunksExactMut<'_, T> { |
230 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
231 | ChunksExactMut::new(chunk_size, self.as_parallel_slice_mut()) |
232 | } |
233 | |
234 | /// Returns a parallel iterator over at most `chunk_size` elements of `self` at a time, |
235 | /// starting at the end. The chunks are mutable and do not overlap. |
236 | /// |
237 | /// If the number of elements in the iterator is not divisible by |
238 | /// `chunk_size`, the last chunk may be shorter than `chunk_size`. All |
239 | /// other chunks will have that exact length. |
240 | /// |
241 | /// # Examples |
242 | /// |
243 | /// ``` |
244 | /// use rayon::prelude::*; |
245 | /// let mut array = [1, 2, 3, 4, 5]; |
246 | /// array.par_rchunks_mut(2) |
247 | /// .for_each(|slice| slice.reverse()); |
248 | /// assert_eq!(array, [1, 3, 2, 5, 4]); |
249 | /// ``` |
250 | #[track_caller ] |
251 | fn par_rchunks_mut(&mut self, chunk_size: usize) -> RChunksMut<'_, T> { |
252 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
253 | RChunksMut::new(chunk_size, self.as_parallel_slice_mut()) |
254 | } |
255 | |
256 | /// Returns a parallel iterator over `chunk_size` elements of `self` at a time, |
257 | /// starting at the end. The chunks are mutable and do not overlap. |
258 | /// |
259 | /// If `chunk_size` does not divide the length of the slice, then the |
260 | /// last up to `chunk_size-1` elements will be omitted and can be |
261 | /// retrieved from the remainder function of the iterator. |
262 | /// |
263 | /// # Examples |
264 | /// |
265 | /// ``` |
266 | /// use rayon::prelude::*; |
267 | /// let mut array = [1, 2, 3, 4, 5]; |
268 | /// array.par_rchunks_exact_mut(3) |
269 | /// .for_each(|slice| slice.reverse()); |
270 | /// assert_eq!(array, [1, 2, 5, 4, 3]); |
271 | /// ``` |
272 | #[track_caller ] |
273 | fn par_rchunks_exact_mut(&mut self, chunk_size: usize) -> RChunksExactMut<'_, T> { |
274 | assert!(chunk_size != 0, "chunk_size must not be zero" ); |
275 | RChunksExactMut::new(chunk_size, self.as_parallel_slice_mut()) |
276 | } |
277 | |
278 | /// Sorts the slice in parallel. |
279 | /// |
280 | /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case. |
281 | /// |
282 | /// When applicable, unstable sorting is preferred because it is generally faster than stable |
283 | /// sorting and it doesn't allocate auxiliary memory. |
284 | /// See [`par_sort_unstable`](#method.par_sort_unstable). |
285 | /// |
286 | /// # Current implementation |
287 | /// |
288 | /// The current algorithm is an adaptive merge sort inspired by |
289 | /// [timsort](https://en.wikipedia.org/wiki/Timsort). |
290 | /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of |
291 | /// two or more sorted sequences concatenated one after another. |
292 | /// |
293 | /// Also, it allocates temporary storage the same size as `self`, but for very short slices a |
294 | /// non-allocating insertion sort is used instead. |
295 | /// |
296 | /// In order to sort the slice in parallel, the slice is first divided into smaller chunks and |
297 | /// all chunks are sorted in parallel. Then, adjacent chunks that together form non-descending |
298 | /// or descending runs are concatenated. Finally, the remaining chunks are merged together using |
299 | /// parallel subdivision of chunks and parallel merge operation. |
300 | /// |
301 | /// # Examples |
302 | /// |
303 | /// ``` |
304 | /// use rayon::prelude::*; |
305 | /// |
306 | /// let mut v = [-5, 4, 1, -3, 2]; |
307 | /// |
308 | /// v.par_sort(); |
309 | /// assert_eq!(v, [-5, -3, 1, 2, 4]); |
310 | /// ``` |
311 | fn par_sort(&mut self) |
312 | where |
313 | T: Ord, |
314 | { |
315 | par_mergesort(self.as_parallel_slice_mut(), T::lt); |
316 | } |
317 | |
318 | /// Sorts the slice in parallel with a comparator function. |
319 | /// |
320 | /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case. |
321 | /// |
322 | /// The comparator function must define a total ordering for the elements in the slice. If |
323 | /// the ordering is not total, the order of the elements is unspecified. An order is a |
324 | /// total order if it is (for all `a`, `b` and `c`): |
325 | /// |
326 | /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and |
327 | /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`. |
328 | /// |
329 | /// For example, while [`f64`] doesn't implement [`Ord`] because `NaN != NaN`, we can use |
330 | /// `partial_cmp` as our sort function when we know the slice doesn't contain a `NaN`. |
331 | /// |
332 | /// ``` |
333 | /// use rayon::prelude::*; |
334 | /// |
335 | /// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0]; |
336 | /// floats.par_sort_by(|a, b| a.partial_cmp(b).unwrap()); |
337 | /// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]); |
338 | /// ``` |
339 | /// |
340 | /// When applicable, unstable sorting is preferred because it is generally faster than stable |
341 | /// sorting and it doesn't allocate auxiliary memory. |
342 | /// See [`par_sort_unstable_by`](#method.par_sort_unstable_by). |
343 | /// |
344 | /// # Current implementation |
345 | /// |
346 | /// The current algorithm is an adaptive merge sort inspired by |
347 | /// [timsort](https://en.wikipedia.org/wiki/Timsort). |
348 | /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of |
349 | /// two or more sorted sequences concatenated one after another. |
350 | /// |
351 | /// Also, it allocates temporary storage the same size as `self`, but for very short slices a |
352 | /// non-allocating insertion sort is used instead. |
353 | /// |
354 | /// In order to sort the slice in parallel, the slice is first divided into smaller chunks and |
355 | /// all chunks are sorted in parallel. Then, adjacent chunks that together form non-descending |
356 | /// or descending runs are concatenated. Finally, the remaining chunks are merged together using |
357 | /// parallel subdivision of chunks and parallel merge operation. |
358 | /// |
359 | /// # Examples |
360 | /// |
361 | /// ``` |
362 | /// use rayon::prelude::*; |
363 | /// |
364 | /// let mut v = [5, 4, 1, 3, 2]; |
365 | /// v.par_sort_by(|a, b| a.cmp(b)); |
366 | /// assert_eq!(v, [1, 2, 3, 4, 5]); |
367 | /// |
368 | /// // reverse sorting |
369 | /// v.par_sort_by(|a, b| b.cmp(a)); |
370 | /// assert_eq!(v, [5, 4, 3, 2, 1]); |
371 | /// ``` |
372 | fn par_sort_by<F>(&mut self, compare: F) |
373 | where |
374 | F: Fn(&T, &T) -> Ordering + Sync, |
375 | { |
376 | par_mergesort(self.as_parallel_slice_mut(), |a, b| { |
377 | compare(a, b) == Ordering::Less |
378 | }); |
379 | } |
380 | |
381 | /// Sorts the slice in parallel with a key extraction function. |
382 | /// |
383 | /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*)) |
384 | /// worst-case, where the key function is *O*(*m*). |
385 | /// |
386 | /// For expensive key functions (e.g. functions that are not simple property accesses or |
387 | /// basic operations), [`par_sort_by_cached_key`](#method.par_sort_by_cached_key) is likely to |
388 | /// be significantly faster, as it does not recompute element keys. |
389 | /// |
390 | /// When applicable, unstable sorting is preferred because it is generally faster than stable |
391 | /// sorting and it doesn't allocate auxiliary memory. |
392 | /// See [`par_sort_unstable_by_key`](#method.par_sort_unstable_by_key). |
393 | /// |
394 | /// # Current implementation |
395 | /// |
396 | /// The current algorithm is an adaptive merge sort inspired by |
397 | /// [timsort](https://en.wikipedia.org/wiki/Timsort). |
398 | /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of |
399 | /// two or more sorted sequences concatenated one after another. |
400 | /// |
401 | /// Also, it allocates temporary storage the same size as `self`, but for very short slices a |
402 | /// non-allocating insertion sort is used instead. |
403 | /// |
404 | /// In order to sort the slice in parallel, the slice is first divided into smaller chunks and |
405 | /// all chunks are sorted in parallel. Then, adjacent chunks that together form non-descending |
406 | /// or descending runs are concatenated. Finally, the remaining chunks are merged together using |
407 | /// parallel subdivision of chunks and parallel merge operation. |
408 | /// |
409 | /// # Examples |
410 | /// |
411 | /// ``` |
412 | /// use rayon::prelude::*; |
413 | /// |
414 | /// let mut v = [-5i32, 4, 1, -3, 2]; |
415 | /// |
416 | /// v.par_sort_by_key(|k| k.abs()); |
417 | /// assert_eq!(v, [1, 2, -3, 4, -5]); |
418 | /// ``` |
419 | fn par_sort_by_key<K, F>(&mut self, f: F) |
420 | where |
421 | K: Ord, |
422 | F: Fn(&T) -> K + Sync, |
423 | { |
424 | par_mergesort(self.as_parallel_slice_mut(), |a, b| f(a).lt(&f(b))); |
425 | } |
426 | |
427 | /// Sorts the slice in parallel with a key extraction function. |
428 | /// |
429 | /// During sorting, the key function is called at most once per element, by using |
430 | /// temporary storage to remember the results of key evaluation. |
431 | /// The key function is called in parallel, so the order of calls is completely unspecified. |
432 | /// |
433 | /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*)) |
434 | /// worst-case, where the key function is *O*(*m*). |
435 | /// |
436 | /// For simple key functions (e.g., functions that are property accesses or |
437 | /// basic operations), [`par_sort_by_key`](#method.par_sort_by_key) is likely to be |
438 | /// faster. |
439 | /// |
440 | /// # Current implementation |
441 | /// |
442 | /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, |
443 | /// which combines the fast average case of randomized quicksort with the fast worst case of |
444 | /// heapsort, while achieving linear time on slices with certain patterns. It uses some |
445 | /// randomization to avoid degenerate cases, but with a fixed seed to always provide |
446 | /// deterministic behavior. |
447 | /// |
448 | /// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the |
449 | /// length of the slice. |
450 | /// |
451 | /// All quicksorts work in two stages: partitioning into two halves followed by recursive |
452 | /// calls. The partitioning phase is sequential, but the two recursive calls are performed in |
453 | /// parallel. Finally, after sorting the cached keys, the item positions are updated sequentially. |
454 | /// |
455 | /// [pdqsort]: https://github.com/orlp/pdqsort |
456 | /// |
457 | /// # Examples |
458 | /// |
459 | /// ``` |
460 | /// use rayon::prelude::*; |
461 | /// |
462 | /// let mut v = [-5i32, 4, 32, -3, 2]; |
463 | /// |
464 | /// v.par_sort_by_cached_key(|k| k.to_string()); |
465 | /// assert!(v == [-3, -5, 2, 32, 4]); |
466 | /// ``` |
467 | fn par_sort_by_cached_key<K, F>(&mut self, f: F) |
468 | where |
469 | F: Fn(&T) -> K + Sync, |
470 | K: Ord + Send, |
471 | { |
472 | let slice = self.as_parallel_slice_mut(); |
473 | let len = slice.len(); |
474 | if len < 2 { |
475 | return; |
476 | } |
477 | |
478 | // Helper macro for indexing our vector by the smallest possible type, to reduce allocation. |
479 | macro_rules! sort_by_key { |
480 | ($t:ty) => {{ |
481 | let mut indices: Vec<_> = slice |
482 | .par_iter_mut() |
483 | .enumerate() |
484 | .map(|(i, x)| (f(&*x), i as $t)) |
485 | .collect(); |
486 | // The elements of `indices` are unique, as they are indexed, so any sort will be |
487 | // stable with respect to the original slice. We use `sort_unstable` here because |
488 | // it requires less memory allocation. |
489 | indices.par_sort_unstable(); |
490 | for i in 0..len { |
491 | let mut index = indices[i].1; |
492 | while (index as usize) < i { |
493 | index = indices[index as usize].1; |
494 | } |
495 | indices[i].1 = index; |
496 | slice.swap(i, index as usize); |
497 | } |
498 | }}; |
499 | } |
500 | |
501 | let sz_u8 = mem::size_of::<(K, u8)>(); |
502 | let sz_u16 = mem::size_of::<(K, u16)>(); |
503 | let sz_u32 = mem::size_of::<(K, u32)>(); |
504 | let sz_usize = mem::size_of::<(K, usize)>(); |
505 | |
506 | if sz_u8 < sz_u16 && len <= (std::u8::MAX as usize) { |
507 | return sort_by_key!(u8); |
508 | } |
509 | if sz_u16 < sz_u32 && len <= (std::u16::MAX as usize) { |
510 | return sort_by_key!(u16); |
511 | } |
512 | if sz_u32 < sz_usize && len <= (std::u32::MAX as usize) { |
513 | return sort_by_key!(u32); |
514 | } |
515 | sort_by_key!(usize) |
516 | } |
517 | |
518 | /// Sorts the slice in parallel, but might not preserve the order of equal elements. |
519 | /// |
520 | /// This sort is unstable (i.e., may reorder equal elements), in-place |
521 | /// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case. |
522 | /// |
523 | /// # Current implementation |
524 | /// |
525 | /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, |
526 | /// which combines the fast average case of randomized quicksort with the fast worst case of |
527 | /// heapsort, while achieving linear time on slices with certain patterns. It uses some |
528 | /// randomization to avoid degenerate cases, but with a fixed seed to always provide |
529 | /// deterministic behavior. |
530 | /// |
531 | /// It is typically faster than stable sorting, except in a few special cases, e.g., when the |
532 | /// slice consists of several concatenated sorted sequences. |
533 | /// |
534 | /// All quicksorts work in two stages: partitioning into two halves followed by recursive |
535 | /// calls. The partitioning phase is sequential, but the two recursive calls are performed in |
536 | /// parallel. |
537 | /// |
538 | /// [pdqsort]: https://github.com/orlp/pdqsort |
539 | /// |
540 | /// # Examples |
541 | /// |
542 | /// ``` |
543 | /// use rayon::prelude::*; |
544 | /// |
545 | /// let mut v = [-5, 4, 1, -3, 2]; |
546 | /// |
547 | /// v.par_sort_unstable(); |
548 | /// assert_eq!(v, [-5, -3, 1, 2, 4]); |
549 | /// ``` |
550 | fn par_sort_unstable(&mut self) |
551 | where |
552 | T: Ord, |
553 | { |
554 | par_quicksort(self.as_parallel_slice_mut(), T::lt); |
555 | } |
556 | |
557 | /// Sorts the slice in parallel with a comparator function, but might not preserve the order of |
558 | /// equal elements. |
559 | /// |
560 | /// This sort is unstable (i.e., may reorder equal elements), in-place |
561 | /// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case. |
562 | /// |
563 | /// The comparator function must define a total ordering for the elements in the slice. If |
564 | /// the ordering is not total, the order of the elements is unspecified. An order is a |
565 | /// total order if it is (for all `a`, `b` and `c`): |
566 | /// |
567 | /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and |
568 | /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`. |
569 | /// |
570 | /// For example, while [`f64`] doesn't implement [`Ord`] because `NaN != NaN`, we can use |
571 | /// `partial_cmp` as our sort function when we know the slice doesn't contain a `NaN`. |
572 | /// |
573 | /// ``` |
574 | /// use rayon::prelude::*; |
575 | /// |
576 | /// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0]; |
577 | /// floats.par_sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); |
578 | /// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]); |
579 | /// ``` |
580 | /// |
581 | /// # Current implementation |
582 | /// |
583 | /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, |
584 | /// which combines the fast average case of randomized quicksort with the fast worst case of |
585 | /// heapsort, while achieving linear time on slices with certain patterns. It uses some |
586 | /// randomization to avoid degenerate cases, but with a fixed seed to always provide |
587 | /// deterministic behavior. |
588 | /// |
589 | /// It is typically faster than stable sorting, except in a few special cases, e.g., when the |
590 | /// slice consists of several concatenated sorted sequences. |
591 | /// |
592 | /// All quicksorts work in two stages: partitioning into two halves followed by recursive |
593 | /// calls. The partitioning phase is sequential, but the two recursive calls are performed in |
594 | /// parallel. |
595 | /// |
596 | /// [pdqsort]: https://github.com/orlp/pdqsort |
597 | /// |
598 | /// # Examples |
599 | /// |
600 | /// ``` |
601 | /// use rayon::prelude::*; |
602 | /// |
603 | /// let mut v = [5, 4, 1, 3, 2]; |
604 | /// v.par_sort_unstable_by(|a, b| a.cmp(b)); |
605 | /// assert_eq!(v, [1, 2, 3, 4, 5]); |
606 | /// |
607 | /// // reverse sorting |
608 | /// v.par_sort_unstable_by(|a, b| b.cmp(a)); |
609 | /// assert_eq!(v, [5, 4, 3, 2, 1]); |
610 | /// ``` |
611 | fn par_sort_unstable_by<F>(&mut self, compare: F) |
612 | where |
613 | F: Fn(&T, &T) -> Ordering + Sync, |
614 | { |
615 | par_quicksort(self.as_parallel_slice_mut(), |a, b| { |
616 | compare(a, b) == Ordering::Less |
617 | }); |
618 | } |
619 | |
620 | /// Sorts the slice in parallel with a key extraction function, but might not preserve the order |
621 | /// of equal elements. |
622 | /// |
623 | /// This sort is unstable (i.e., may reorder equal elements), in-place |
/// (i.e., does not allocate), and *O*(*m* \* *n* \* log(*n*)) worst-case,
625 | /// where the key function is *O*(*m*). |
626 | /// |
627 | /// # Current implementation |
628 | /// |
629 | /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, |
630 | /// which combines the fast average case of randomized quicksort with the fast worst case of |
631 | /// heapsort, while achieving linear time on slices with certain patterns. It uses some |
632 | /// randomization to avoid degenerate cases, but with a fixed seed to always provide |
633 | /// deterministic behavior. |
634 | /// |
635 | /// Due to its key calling strategy, `par_sort_unstable_by_key` is likely to be slower than |
636 | /// [`par_sort_by_cached_key`](#method.par_sort_by_cached_key) in cases where the key function |
637 | /// is expensive. |
638 | /// |
639 | /// All quicksorts work in two stages: partitioning into two halves followed by recursive |
640 | /// calls. The partitioning phase is sequential, but the two recursive calls are performed in |
641 | /// parallel. |
642 | /// |
643 | /// [pdqsort]: https://github.com/orlp/pdqsort |
644 | /// |
645 | /// # Examples |
646 | /// |
647 | /// ``` |
648 | /// use rayon::prelude::*; |
649 | /// |
650 | /// let mut v = [-5i32, 4, 1, -3, 2]; |
651 | /// |
652 | /// v.par_sort_unstable_by_key(|k| k.abs()); |
653 | /// assert_eq!(v, [1, 2, -3, 4, -5]); |
654 | /// ``` |
655 | fn par_sort_unstable_by_key<K, F>(&mut self, f: F) |
656 | where |
657 | K: Ord, |
658 | F: Fn(&T) -> K + Sync, |
659 | { |
660 | par_quicksort(self.as_parallel_slice_mut(), |a, b| f(a).lt(&f(b))); |
661 | } |
662 | } |
663 | |
664 | impl<T: Send> ParallelSliceMut<T> for [T] { |
665 | #[inline ] |
666 | fn as_parallel_slice_mut(&mut self) -> &mut [T] { |
667 | self |
668 | } |
669 | } |
670 | |
671 | impl<'data, T: Sync + 'data> IntoParallelIterator for &'data [T] { |
672 | type Item = &'data T; |
673 | type Iter = Iter<'data, T>; |
674 | |
675 | fn into_par_iter(self) -> Self::Iter { |
676 | Iter { slice: self } |
677 | } |
678 | } |
679 | |
680 | impl<'data, T: Send + 'data> IntoParallelIterator for &'data mut [T] { |
681 | type Item = &'data mut T; |
682 | type Iter = IterMut<'data, T>; |
683 | |
684 | fn into_par_iter(self) -> Self::Iter { |
685 | IterMut { slice: self } |
686 | } |
687 | } |
688 | |
/// Parallel iterator over immutable items in a slice
#[derive(Debug)]
pub struct Iter<'data, T: Sync> {
    /// The borrowed items this iterator yields references to.
    slice: &'data [T],
}

// Implemented by hand so that cloning only copies the slice reference and
// does not require `T: Clone`.
impl<'data, T: Sync> Clone for Iter<'data, T> {
    fn clone(&self) -> Self {
        Iter { slice: self.slice }
    }
}
700 | |
701 | impl<'data, T: Sync + 'data> ParallelIterator for Iter<'data, T> { |
702 | type Item = &'data T; |
703 | |
704 | fn drive_unindexed<C>(self, consumer: C) -> C::Result |
705 | where |
706 | C: UnindexedConsumer<Self::Item>, |
707 | { |
708 | bridge(self, consumer) |
709 | } |
710 | |
711 | fn opt_len(&self) -> Option<usize> { |
712 | Some(self.len()) |
713 | } |
714 | } |
715 | |
716 | impl<'data, T: Sync + 'data> IndexedParallelIterator for Iter<'data, T> { |
717 | fn drive<C>(self, consumer: C) -> C::Result |
718 | where |
719 | C: Consumer<Self::Item>, |
720 | { |
721 | bridge(self, consumer) |
722 | } |
723 | |
724 | fn len(&self) -> usize { |
725 | self.slice.len() |
726 | } |
727 | |
728 | fn with_producer<CB>(self, callback: CB) -> CB::Output |
729 | where |
730 | CB: ProducerCallback<Self::Item>, |
731 | { |
732 | callback.callback(producer:IterProducer { slice: self.slice }) |
733 | } |
734 | } |
735 | |
/// Producer backing `Iter`; holds the part of the slice still to be yielded.
struct IterProducer<'data, T: Sync> {
    /// Items this producer is responsible for.
    slice: &'data [T],
}
739 | |
740 | impl<'data, T: 'data + Sync> Producer for IterProducer<'data, T> { |
741 | type Item = &'data T; |
742 | type IntoIter = ::std::slice::Iter<'data, T>; |
743 | |
744 | fn into_iter(self) -> Self::IntoIter { |
745 | self.slice.iter() |
746 | } |
747 | |
748 | fn split_at(self, index: usize) -> (Self, Self) { |
749 | let (left: &[T], right: &[T]) = self.slice.split_at(mid:index); |
750 | (IterProducer { slice: left }, IterProducer { slice: right }) |
751 | } |
752 | } |
753 | |
/// Parallel iterator over immutable overlapping windows of a slice
#[derive(Debug)]
pub struct Windows<'data, T: Sync> {
    /// Number of items in each yielded window.
    window_size: usize,
    /// The borrowed items the windows are taken from.
    slice: &'data [T],
}

// Implemented by hand so that cloning only copies the size and the slice
// reference and does not require `T: Clone`.
impl<'data, T: Sync> Clone for Windows<'data, T> {
    fn clone(&self) -> Self {
        Windows {
            window_size: self.window_size,
            slice: self.slice,
        }
    }
}
766 | |
767 | impl<'data, T: Sync + 'data> ParallelIterator for Windows<'data, T> { |
768 | type Item = &'data [T]; |
769 | |
770 | fn drive_unindexed<C>(self, consumer: C) -> C::Result |
771 | where |
772 | C: UnindexedConsumer<Self::Item>, |
773 | { |
774 | bridge(self, consumer) |
775 | } |
776 | |
777 | fn opt_len(&self) -> Option<usize> { |
778 | Some(self.len()) |
779 | } |
780 | } |
781 | |
782 | impl<'data, T: Sync + 'data> IndexedParallelIterator for Windows<'data, T> { |
783 | fn drive<C>(self, consumer: C) -> C::Result |
784 | where |
785 | C: Consumer<Self::Item>, |
786 | { |
787 | bridge(self, consumer) |
788 | } |
789 | |
790 | fn len(&self) -> usize { |
791 | assert!(self.window_size >= 1); |
792 | self.slice.len().saturating_sub(self.window_size - 1) |
793 | } |
794 | |
795 | fn with_producer<CB>(self, callback: CB) -> CB::Output |
796 | where |
797 | CB: ProducerCallback<Self::Item>, |
798 | { |
799 | callback.callback(producer:WindowsProducer { |
800 | window_size: self.window_size, |
801 | slice: self.slice, |
802 | }) |
803 | } |
804 | } |
805 | |
/// Producer backing `Windows`; holds the window size and the part of the
/// slice its windows are taken from.
struct WindowsProducer<'data, T: Sync> {
    /// Number of items in each yielded window.
    window_size: usize,
    /// Items this producer draws its windows from.
    slice: &'data [T],
}
810 | |
811 | impl<'data, T: 'data + Sync> Producer for WindowsProducer<'data, T> { |
812 | type Item = &'data [T]; |
813 | type IntoIter = ::std::slice::Windows<'data, T>; |
814 | |
815 | fn into_iter(self) -> Self::IntoIter { |
816 | self.slice.windows(self.window_size) |
817 | } |
818 | |
819 | fn split_at(self, index: usize) -> (Self, Self) { |
820 | let left_index: usize = cmp::min(self.slice.len(), v2:index + (self.window_size - 1)); |
821 | let left: &[T] = &self.slice[..left_index]; |
822 | let right: &[T] = &self.slice[index..]; |
823 | ( |
824 | WindowsProducer { |
825 | window_size: self.window_size, |
826 | slice: left, |
827 | }, |
828 | WindowsProducer { |
829 | window_size: self.window_size, |
830 | slice: right, |
831 | }, |
832 | ) |
833 | } |
834 | } |
835 | |
/// Parallel iterator over mutable items in a slice
#[derive(Debug)]
pub struct IterMut<'data, T>
where
    T: Send,
{
    /// Exclusive borrow of the elements to iterate over.
    slice: &'data mut [T],
}
841 | |
842 | impl<'data, T: Send + 'data> ParallelIterator for IterMut<'data, T> { |
843 | type Item = &'data mut T; |
844 | |
845 | fn drive_unindexed<C>(self, consumer: C) -> C::Result |
846 | where |
847 | C: UnindexedConsumer<Self::Item>, |
848 | { |
849 | bridge(self, consumer) |
850 | } |
851 | |
852 | fn opt_len(&self) -> Option<usize> { |
853 | Some(self.len()) |
854 | } |
855 | } |
856 | |
857 | impl<'data, T: Send + 'data> IndexedParallelIterator for IterMut<'data, T> { |
858 | fn drive<C>(self, consumer: C) -> C::Result |
859 | where |
860 | C: Consumer<Self::Item>, |
861 | { |
862 | bridge(self, consumer) |
863 | } |
864 | |
865 | fn len(&self) -> usize { |
866 | self.slice.len() |
867 | } |
868 | |
869 | fn with_producer<CB>(self, callback: CB) -> CB::Output |
870 | where |
871 | CB: ProducerCallback<Self::Item>, |
872 | { |
873 | callback.callback(producer:IterMutProducer { slice: self.slice }) |
874 | } |
875 | } |
876 | |
/// Splittable producer of mutable element references; built by
/// `IterMut::with_producer` from the iterator's slice.
struct IterMutProducer<'data, T>
where
    T: Send,
{
    /// Exclusive borrow of the elements this producer yields.
    slice: &'data mut [T],
}
880 | |
881 | impl<'data, T: 'data + Send> Producer for IterMutProducer<'data, T> { |
882 | type Item = &'data mut T; |
883 | type IntoIter = ::std::slice::IterMut<'data, T>; |
884 | |
885 | fn into_iter(self) -> Self::IntoIter { |
886 | self.slice.iter_mut() |
887 | } |
888 | |
889 | fn split_at(self, index: usize) -> (Self, Self) { |
890 | let (left: &mut [T], right: &mut [T]) = self.slice.split_at_mut(mid:index); |
891 | ( |
892 | IterMutProducer { slice: left }, |
893 | IterMutProducer { slice: right }, |
894 | ) |
895 | } |
896 | } |
897 | |
/// Parallel iterator over slices separated by a predicate
pub struct Split<'data, T, P> {
    /// The slice that is cut into subslices.
    slice: &'data [T],
    /// Predicate deciding which elements act as separators.
    separator: P,
}
903 | |
904 | impl<'data, T, P: Clone> Clone for Split<'data, T, P> { |
905 | fn clone(&self) -> Self { |
906 | Split { |
907 | separator: self.separator.clone(), |
908 | ..*self |
909 | } |
910 | } |
911 | } |
912 | |
913 | impl<'data, T: Debug, P> Debug for Split<'data, T, P> { |
914 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
915 | f.debug_struct("Split" ).field(name:"slice" , &self.slice).finish() |
916 | } |
917 | } |
918 | |
919 | impl<'data, T, P> ParallelIterator for Split<'data, T, P> |
920 | where |
921 | P: Fn(&T) -> bool + Sync + Send, |
922 | T: Sync, |
923 | { |
924 | type Item = &'data [T]; |
925 | |
926 | fn drive_unindexed<C>(self, consumer: C) -> C::Result |
927 | where |
928 | C: UnindexedConsumer<Self::Item>, |
929 | { |
930 | let producer: SplitProducer<'_, P, &[T]> = SplitProducer::new(self.slice, &self.separator); |
931 | bridge_unindexed(producer, consumer) |
932 | } |
933 | } |
934 | |
935 | /// Implement support for `SplitProducer`. |
936 | impl<'data, T, P> Fissile<P> for &'data [T] |
937 | where |
938 | P: Fn(&T) -> bool, |
939 | { |
940 | fn length(&self) -> usize { |
941 | self.len() |
942 | } |
943 | |
944 | fn midpoint(&self, end: usize) -> usize { |
945 | end / 2 |
946 | } |
947 | |
948 | fn find(&self, separator: &P, start: usize, end: usize) -> Option<usize> { |
949 | self[start..end].iter().position(separator) |
950 | } |
951 | |
952 | fn rfind(&self, separator: &P, end: usize) -> Option<usize> { |
953 | self[..end].iter().rposition(separator) |
954 | } |
955 | |
956 | fn split_once(self, index: usize) -> (Self, Self) { |
957 | let (left, right) = self.split_at(index); |
958 | (left, &right[1..]) // skip the separator |
959 | } |
960 | |
961 | fn fold_splits<F>(self, separator: &P, folder: F, skip_last: bool) -> F |
962 | where |
963 | F: Folder<Self>, |
964 | Self: Send, |
965 | { |
966 | let mut split = self.split(separator); |
967 | if skip_last { |
968 | split.next_back(); |
969 | } |
970 | folder.consume_iter(split) |
971 | } |
972 | } |
973 | |
/// Parallel iterator over mutable slices separated by a predicate
pub struct SplitMut<'data, T, P> {
    /// Exclusive borrow of the slice that is cut into subslices.
    slice: &'data mut [T],
    /// Predicate deciding which elements act as separators.
    separator: P,
}
979 | |
980 | impl<'data, T: Debug, P> Debug for SplitMut<'data, T, P> { |
981 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
982 | f&mut DebugStruct<'_, '_>.debug_struct("SplitMut" ) |
983 | .field(name:"slice" , &self.slice) |
984 | .finish() |
985 | } |
986 | } |
987 | |
988 | impl<'data, T, P> ParallelIterator for SplitMut<'data, T, P> |
989 | where |
990 | P: Fn(&T) -> bool + Sync + Send, |
991 | T: Send, |
992 | { |
993 | type Item = &'data mut [T]; |
994 | |
995 | fn drive_unindexed<C>(self, consumer: C) -> C::Result |
996 | where |
997 | C: UnindexedConsumer<Self::Item>, |
998 | { |
999 | let producer: SplitProducer<'_, P, &mut …> = SplitProducer::new(self.slice, &self.separator); |
1000 | bridge_unindexed(producer, consumer) |
1001 | } |
1002 | } |
1003 | |
1004 | /// Implement support for `SplitProducer`. |
1005 | impl<'data, T, P> Fissile<P> for &'data mut [T] |
1006 | where |
1007 | P: Fn(&T) -> bool, |
1008 | { |
1009 | fn length(&self) -> usize { |
1010 | self.len() |
1011 | } |
1012 | |
1013 | fn midpoint(&self, end: usize) -> usize { |
1014 | end / 2 |
1015 | } |
1016 | |
1017 | fn find(&self, separator: &P, start: usize, end: usize) -> Option<usize> { |
1018 | self[start..end].iter().position(separator) |
1019 | } |
1020 | |
1021 | fn rfind(&self, separator: &P, end: usize) -> Option<usize> { |
1022 | self[..end].iter().rposition(separator) |
1023 | } |
1024 | |
1025 | fn split_once(self, index: usize) -> (Self, Self) { |
1026 | let (left, right) = self.split_at_mut(index); |
1027 | (left, &mut right[1..]) // skip the separator |
1028 | } |
1029 | |
1030 | fn fold_splits<F>(self, separator: &P, folder: F, skip_last: bool) -> F |
1031 | where |
1032 | F: Folder<Self>, |
1033 | Self: Send, |
1034 | { |
1035 | let mut split = self.split_mut(separator); |
1036 | if skip_last { |
1037 | split.next_back(); |
1038 | } |
1039 | folder.consume_iter(split) |
1040 | } |
1041 | } |
1042 | |