//! Parallel merge sort.
//!
//! This implementation is copied verbatim from `std::slice::sort` and then parallelized.
//! The only difference from the original is that the sequential `mergesort` returns
//! `MergesortResult` and leaves descending arrays intact.

use crate::iter::*;
use crate::slice::ParallelSliceMut;
use crate::SendPtr;
use std::mem;
use std::mem::size_of;
use std::ptr;
use std::slice;

unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
    let old: *mut T = *ptr;
    *ptr = ptr.offset(1);
    old
}

unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
    *ptr = ptr.offset(-1);
    *ptr
}

/// When dropped, copies from `src` into `dest` a sequence of length `len`.
struct CopyOnDrop<T> {
    src: *const T,
    dest: *mut T,
    len: usize,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dest, self.len);
        }
    }
}

/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
///
/// This is the integral subroutine of insertion sort.
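///
/// For example (illustrative), starting from `[4, 1, 2, 3]` (where `v[1..]` is already sorted),
/// the elements `1`, `2` and `3` are shifted one slot to the left and `4` is written into the
/// freed position, yielding `[1, 2, 3, 4]`.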
fn insert_head<T, F>(v: &mut [T], is_less: &F)
where
    F: Fn(&T, &T) -> bool,
{
    if v.len() >= 2 && is_less(&v[1], &v[0]) {
        unsafe {
            // There are three ways to implement insertion here:
            //
            // 1. Swap adjacent elements until the first one gets to its final destination.
            //    However, this way we copy data around more than is necessary. If elements are big
            //    structures (costly to copy), this method will be slow.
            //
            // 2. Iterate until the right place for the first element is found. Then shift the
            //    elements succeeding it to make room for it and finally place it into the
            //    remaining hole. This is a good method.
            //
            // 3. Copy the first element into a temporary variable. Iterate until the right place
            //    for it is found. As we go along, copy every traversed element into the slot
            //    preceding it. Finally, copy data from the temporary variable into the remaining
            //    hole. This method is very good. Benchmarks demonstrated slightly better
            //    performance than with the 2nd method.
            //
            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));

            // Intermediate state of the insertion process is always tracked by `hole`, which
            // serves two purposes:
            // 1. Protects integrity of `v` from panics in `is_less`.
            // 2. Fills the remaining hole in `v` in the end.
            //
            // Panic safety:
            //
            // If `is_less` panics at any point during the process, `hole` will get dropped and
            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
            // initially held exactly once.
            let mut hole = InsertionHole {
                src: &*tmp,
                dest: &mut v[1],
            };
            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);

            for i in 2..v.len() {
                if !is_less(&v[i], &*tmp) {
                    break;
                }
                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
                hole.dest = &mut v[i];
            }
            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
        }
    }

    // When dropped, copies from `src` into `dest`.
    struct InsertionHole<T> {
        src: *const T,
        dest: *mut T,
    }

    impl<T> Drop for InsertionHole<T> {
        fn drop(&mut self) {
            unsafe {
                ptr::copy_nonoverlapping(self.src, self.dest, 1);
            }
        }
    }
}

/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
/// stores the result into `v[..]`.
///
/// # Safety
///
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &F)
where
    F: Fn(&T, &T) -> bool,
{
    let len = v.len();
    let v = v.as_mut_ptr();
    let v_mid = v.add(mid);
    let v_end = v.add(len);

    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
    // copying the lesser (or greater) one into `v`.
    //
    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
    // consumed first, then we must copy whatever is left of the shorter run into the remaining
    // hole in `v`.
    //
    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
    // 1. Protects integrity of `v` from panics in `is_less`.
    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
    //
    // Panic safety:
    //
    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
    // object it initially held exactly once.
    let mut hole;

    if mid <= len - mid {
        // The left run is shorter.
        ptr::copy_nonoverlapping(v, buf, mid);
        hole = MergeHole {
            start: buf,
            end: buf.add(mid),
            dest: v,
        };

        // Initially, these pointers point to the beginnings of their arrays.
        let left = &mut hole.start;
        let mut right = v_mid;
        let out = &mut hole.dest;

        while *left < hole.end && right < v_end {
            // Consume the lesser side.
            // If equal, prefer the left run to maintain stability.
            let to_copy = if is_less(&*right, &**left) {
                get_and_increment(&mut right)
            } else {
                get_and_increment(left)
            };
            ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
        }
    } else {
        // The right run is shorter.
        ptr::copy_nonoverlapping(v_mid, buf, len - mid);
        hole = MergeHole {
            start: buf,
            end: buf.add(len - mid),
            dest: v_mid,
        };

        // Initially, these pointers point past the ends of their arrays.
        let left = &mut hole.dest;
        let right = &mut hole.end;
        let mut out = v_end;

        while v < *left && buf < *right {
            // Consume the greater side.
            // If equal, prefer the right run to maintain stability.
            let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
                decrement_and_get(left)
            } else {
                decrement_and_get(right)
            };
            ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
        }
    }
    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
    // it will now be copied into the hole in `v`.

    // When dropped, copies the range `start..end` into `dest..`.
    struct MergeHole<T> {
        start: *mut T,
        end: *mut T,
        dest: *mut T,
    }

    impl<T> Drop for MergeHole<T> {
        fn drop(&mut self) {
            // `T` is not a zero-sized type, so it's okay to divide by its size.
            unsafe {
                let len = self.end.offset_from(self.start) as usize;
                ptr::copy_nonoverlapping(self.start, self.dest, len);
            }
        }
    }
}

/// The result of merge sort.
#[must_use]
#[derive(Clone, Copy, PartialEq, Eq)]
enum MergesortResult {
    /// The slice has already been sorted.
    NonDescending,
    /// The slice was descending and therefore it was left intact.
    Descending,
    /// The slice was sorted.
    Sorted,
}

/// A sorted run that starts at index `start` and is of length `len`.
#[derive(Clone, Copy)]
struct Run {
    start: usize,
    len: usize,
}

/// Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
/// if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
/// algorithm should continue building a new run instead, `None` is returned.
///
/// TimSort is infamous for its buggy implementations, as described here:
/// http://envisage-project.eu/timsort-specification-and-verification/
///
/// The gist of the story is: we must enforce the invariants on the top four runs on the stack.
/// Enforcing them on just the top three is not sufficient to ensure that the invariants will
/// still hold for *all* runs in the stack.
///
/// This function correctly checks invariants for the top four runs. Additionally, if the top
/// run starts at index 0, it will always demand a merge operation until the stack is fully
/// collapsed, in order to complete the sort.
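///
/// For instance (illustrative), with a stack holding runs of lengths `[30, 20, 10]` (top of the
/// stack last, none starting at index 0), the third run from the top is not longer than the top
/// two combined (`30 <= 20 + 10`), so `Some(1)` is returned and the top two runs get merged into
/// one of length 30; the resulting `[30, 30]` stack then yields `Some(0)`, merging those as well.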
#[inline]
fn collapse(runs: &[Run]) -> Option<usize> {
    let n: usize = runs.len();

    if n >= 2
        && (runs[n - 1].start == 0
            || runs[n - 2].len <= runs[n - 1].len
            || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
            || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
    {
        if n >= 3 && runs[n - 3].len < runs[n - 1].len {
            Some(n - 3)
        } else {
            Some(n - 2)
        }
    } else {
        None
    }
}

/// Sorts a slice using merge sort, unless it is already in descending order.
///
/// This function doesn't modify the slice if it is already non-descending or descending.
/// Otherwise, it sorts the slice into non-descending order.
///
/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
///
/// The algorithm identifies strictly descending and non-descending subsequences, which are called
/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
/// satisfied:
///
/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
///
/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
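///
/// For instance (illustrative), `[1, 2, 3, 6, 5, 4]` decomposes into a non-descending natural
/// run `[1, 2, 3]` followed by a strictly descending one `[6, 5, 4]`; the descending run is
/// reversed in place and the two runs are then merged (in practice, very short runs are first
/// extended via insertion sort, see `MIN_RUN` below).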
///
/// # Safety
///
/// The argument `buf` is used as a temporary buffer and must be at least as long as `v`.
unsafe fn mergesort<T, F>(v: &mut [T], buf: *mut T, is_less: &F) -> MergesortResult
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Very short runs are extended using insertion sort to span at least this many elements.
    const MIN_RUN: usize = 10;

    let len = v.len();

    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
    // strange decision, but consider the fact that merges more often go in the opposite direction
    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
    // backwards. To conclude, identifying runs by traversing backwards improves performance.
    let mut runs = vec![];
    let mut end = len;
    while end > 0 {
        // Find the next natural run, and reverse it if it's strictly descending.
        let mut start = end - 1;

        if start > 0 {
            start -= 1;

            if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
                while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
                    start -= 1;
                }

                // If this descending run covers the whole slice, return immediately.
                if start == 0 && end == len {
                    return MergesortResult::Descending;
                } else {
                    v[start..end].reverse();
                }
            } else {
                while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
                    start -= 1;
                }

                // If this non-descending run covers the whole slice, return immediately.
                if end - start == len {
                    return MergesortResult::NonDescending;
                }
            }
        }

        // Insert some more elements into the run if it's too short. Insertion sort is faster than
        // merge sort on short sequences, so this significantly improves performance.
        while start > 0 && end - start < MIN_RUN {
            start -= 1;
            insert_head(&mut v[start..end], &is_less);
        }

        // Push this run onto the stack.
        runs.push(Run {
            start,
            len: end - start,
        });
        end = start;

        // Merge some pairs of adjacent runs to satisfy the invariants.
        while let Some(r) = collapse(&runs) {
            let left = runs[r + 1];
            let right = runs[r];
            merge(
                &mut v[left.start..right.start + right.len],
                left.len,
                buf,
                &is_less,
            );

            runs[r] = Run {
                start: left.start,
                len: left.len + right.len,
            };
            runs.remove(r + 1);
        }
    }

    // Finally, exactly one run must remain in the stack.
    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);

    // The original order of the slice was neither non-descending nor descending.
    MergesortResult::Sorted
}

////////////////////////////////////////////////////////////////////////////
// Everything above this line is copied from `std::slice::sort` (with very minor tweaks).
// Everything below this line is parallelization.
////////////////////////////////////////////////////////////////////////////

/// Splits two sorted slices so that they can be merged in parallel.
///
/// Returns two indices `(a, b)` so that slices `left[..a]` and `right[..b]` come before
/// `left[a..]` and `right[b..]`.
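///
/// For example (illustrative), for `left = [1, 3, 5, 7]` and `right = [2, 4, 6, 8]` with `<` as
/// the comparison, this returns `(2, 2)`: `[1, 3]` and `[2, 4]` can be merged independently of
/// `[5, 7]` and `[6, 8]`, and the former pair precedes the latter in the merged result.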
fn split_for_merge<T, F>(left: &[T], right: &[T], is_less: &F) -> (usize, usize)
where
    F: Fn(&T, &T) -> bool,
{
    let left_len = left.len();
    let right_len = right.len();

    if left_len >= right_len {
        let left_mid = left_len / 2;

        // Find the first element in `right` that is greater than or equal to `left[left_mid]`.
        let mut a = 0;
        let mut b = right_len;
        while a < b {
            let m = a + (b - a) / 2;
            if is_less(&right[m], &left[left_mid]) {
                a = m + 1;
            } else {
                b = m;
            }
        }

        (left_mid, a)
    } else {
        let right_mid = right_len / 2;

        // Find the first element in `left` that is greater than `right[right_mid]`.
        let mut a = 0;
        let mut b = left_len;
        while a < b {
            let m = a + (b - a) / 2;
            if is_less(&right[right_mid], &left[m]) {
                b = m;
            } else {
                a = m + 1;
            }
        }

        (a, right_mid)
    }
}

/// Merges slices `left` and `right` in parallel and stores the result into `dest`.
///
/// # Safety
///
/// The `dest` pointer must have enough space to store the result.
///
/// Even if `is_less` panics at any point during the merge process, this function will fully copy
/// all elements from `left` and `right` into `dest` (not necessarily in sorted order).
unsafe fn par_merge<T, F>(left: &mut [T], right: &mut [T], dest: *mut T, is_less: &F)
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Slices whose lengths sum up to this value are merged sequentially. This number is slightly
    // larger than `CHUNK_LENGTH`, and the reason is that merging is faster than merge sorting, so
    // merging needs a bit coarser granularity in order to hide the overhead of Rayon's task
    // scheduling.
    const MAX_SEQUENTIAL: usize = 5000;

    let left_len = left.len();
    let right_len = right.len();

    // Intermediate state of the merge process, which serves two purposes:
    // 1. Protects integrity of `dest` from panics in `is_less`.
    // 2. Copies the remaining elements as soon as one of the two sides is exhausted.
    //
    // Panic safety:
    //
    // If `is_less` panics at any point during the merge process, `s` will get dropped and copy the
    // remaining parts of `left` and `right` into `dest`.
    let mut s = State {
        left_start: left.as_mut_ptr(),
        left_end: left.as_mut_ptr().add(left_len),
        right_start: right.as_mut_ptr(),
        right_end: right.as_mut_ptr().add(right_len),
        dest,
    };

    if left_len == 0 || right_len == 0 || left_len + right_len < MAX_SEQUENTIAL {
        while s.left_start < s.left_end && s.right_start < s.right_end {
            // Consume the lesser side.
            // If equal, prefer the left run to maintain stability.
            let to_copy = if is_less(&*s.right_start, &*s.left_start) {
                get_and_increment(&mut s.right_start)
            } else {
                get_and_increment(&mut s.left_start)
            };
            ptr::copy_nonoverlapping(to_copy, get_and_increment(&mut s.dest), 1);
        }
    } else {
        // Function `split_for_merge` might panic. If that happens, `s` will get dropped and copy
        // the whole `left` and `right` into `dest`.
        let (left_mid, right_mid) = split_for_merge(left, right, is_less);
        let (left_l, left_r) = left.split_at_mut(left_mid);
        let (right_l, right_r) = right.split_at_mut(right_mid);

        // Prevent the destructor of `s` from running. Rayon will ensure that both calls to
        // `par_merge` happen. If one of the two calls panics, they will ensure that elements still
        // get copied into `dest_l` and `dest_r`.
        mem::forget(s);

        // Wrap pointers in SendPtr so that they can be sent to another thread
        // See the documentation of SendPtr for a full explanation
        let dest_l = SendPtr(dest);
        let dest_r = SendPtr(dest.add(left_l.len() + right_l.len()));
        rayon_core::join(
            move || par_merge(left_l, right_l, dest_l.get(), is_less),
            move || par_merge(left_r, right_r, dest_r.get(), is_less),
        );
    }
    // Finally, `s` gets dropped if we used sequential merge, thus copying the remaining elements
    // all at once.

    // When dropped, copies arrays `left_start..left_end` and `right_start..right_end` into `dest`,
    // in that order.
    struct State<T> {
        left_start: *mut T,
        left_end: *mut T,
        right_start: *mut T,
        right_end: *mut T,
        dest: *mut T,
    }

    impl<T> Drop for State<T> {
        fn drop(&mut self) {
            let size = size_of::<T>();
            let left_len = (self.left_end as usize - self.left_start as usize) / size;
            let right_len = (self.right_end as usize - self.right_start as usize) / size;

            // Copy array `left`, followed by `right`.
            unsafe {
                ptr::copy_nonoverlapping(self.left_start, self.dest, left_len);
                self.dest = self.dest.add(left_len);
                ptr::copy_nonoverlapping(self.right_start, self.dest, right_len);
            }
        }
    }
}

/// Recursively merges pre-sorted chunks inside `v`.
///
/// Chunks of `v` are stored in `chunks` as intervals (inclusive left and exclusive right bound).
/// Argument `buf` is an auxiliary buffer that will be used during the procedure.
/// If `into_buf` is true, the result will be stored into `buf`, otherwise it will be in `v`.
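///
/// For example (illustrative), with `chunks = [(0, 2000), (2000, 4000), (4000, 5000)]` the call
/// recurses on `[(0, 2000)]` and `[(2000, 4000), (4000, 5000)]` with `into_buf` flipped, and then
/// merges the two halves `(0, 2000)` and `(2000, 5000)` from one buffer into the other.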
///
/// # Safety
///
/// The number of chunks must be positive and they must be adjacent: the right bound of each chunk
/// must equal the left bound of the following chunk.
///
/// The buffer must be at least as long as `v`.
unsafe fn recurse<T, F>(
    v: *mut T,
    buf: *mut T,
    chunks: &[(usize, usize)],
    into_buf: bool,
    is_less: &F,
) where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    let len = chunks.len();
    debug_assert!(len > 0);

    // Base case of the algorithm.
    // If only one chunk is remaining, there's no more work to split and merge.
    if len == 1 {
        if into_buf {
            // Copy the chunk from `v` into `buf`.
            let (start, end) = chunks[0];
            let src = v.add(start);
            let dest = buf.add(start);
            ptr::copy_nonoverlapping(src, dest, end - start);
        }
        return;
    }

    // Split the chunks into two halves.
    let (start, _) = chunks[0];
    let (mid, _) = chunks[len / 2];
    let (_, end) = chunks[len - 1];
    let (left, right) = chunks.split_at(len / 2);

    // After recursive calls finish we'll have to merge chunks `(start, mid)` and `(mid, end)` from
    // `src` into `dest`. If the current invocation has to store the result into `buf`, we'll
    // merge chunks from `v` into `buf`, and vice versa.
    //
    // Recursive calls flip `into_buf` at each level of recursion. More concretely, `par_merge`
    // merges chunks from `buf` into `v` at the first level, from `v` into `buf` at the second
    // level etc.
    let (src, dest) = if into_buf { (v, buf) } else { (buf, v) };

    // Panic safety:
    //
    // If `is_less` panics at any point during the recursive calls, the destructor of `guard` will
    // be executed, thus copying everything from `src` into `dest`. This way we ensure that all
    // chunks are in fact copied into `dest`, even if the merge process doesn't finish.
    let guard = CopyOnDrop {
        src: src.add(start),
        dest: dest.add(start),
        len: end - start,
    };

    // Wrap pointers in SendPtr so that they can be sent to another thread
    // See the documentation of SendPtr for a full explanation
    let v = SendPtr(v);
    let buf = SendPtr(buf);
    rayon_core::join(
        move || recurse(v.get(), buf.get(), left, !into_buf, is_less),
        move || recurse(v.get(), buf.get(), right, !into_buf, is_less),
    );

    // Everything went all right - recursive calls didn't panic.
    // Forget the guard in order to prevent its destructor from running.
    mem::forget(guard);

    // Merge chunks `(start, mid)` and `(mid, end)` from `src` into `dest`.
    let src_left = slice::from_raw_parts_mut(src.add(start), mid - start);
    let src_right = slice::from_raw_parts_mut(src.add(mid), end - mid);
    par_merge(src_left, src_right, dest.add(start), is_less);
}

/// Sorts `v` using merge sort in parallel.
///
/// The algorithm is stable, allocates memory, and runs in `O(n log n)` worst-case time.
/// The allocated temporary buffer is of the same length as `v`.
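///
/// This is internal plumbing: the stable parallel sorts on `ParallelSliceMut` (e.g. `par_sort`
/// and `par_sort_by`) eventually call it, passing the user's comparison in as `is_less`.
/// A minimal usage sketch through that public API (illustrative):
///
/// ```ignore
/// use rayon::prelude::*;
///
/// let mut v = vec![5, 1, 4, 2, 3];
/// v.par_sort(); // stable parallel merge sort, backed by `par_mergesort`
/// assert_eq!(v, [1, 2, 3, 4, 5]);
/// ```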
pub(super) fn par_mergesort<T, F>(v: &mut [T], is_less: F)
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Slices of up to this length get sorted using insertion sort in order to avoid the cost of
    // buffer allocation.
    const MAX_INSERTION: usize = 20;
    // The length of initial chunks. This number is as small as possible while keeping the
    // overhead of Rayon's task scheduling negligible.
    const CHUNK_LENGTH: usize = 2000;

    // Sorting has no meaningful behavior on zero-sized types.
    if size_of::<T>() == 0 {
        return;
    }

    let len = v.len();

    // Short slices get sorted in-place via insertion sort to avoid allocations.
    if len <= MAX_INSERTION {
        if len >= 2 {
            for i in (0..len - 1).rev() {
                insert_head(&mut v[i..], &is_less);
            }
        }
        return;
    }
    // Allocate a buffer to use as scratch memory. We keep its length at 0 so that it only ever
    // holds shallow copies of the contents of `v`, without risking destructors running on those
    // copies if `is_less` panics.
    let mut buf = Vec::<T>::with_capacity(len);
    let buf = buf.as_mut_ptr();

    // If the slice is not longer than one chunk would be, do sequential merge sort and return.
    if len <= CHUNK_LENGTH {
        let res = unsafe { mergesort(v, buf, &is_less) };
        if res == MergesortResult::Descending {
            v.reverse();
        }
        return;
    }

    // Split the slice into chunks and merge sort them in parallel.
    // However, descending chunks will not be sorted - they will be simply left intact.
    let mut iter = {
        // Wrap pointer in SendPtr so that it can be sent to another thread
        // See the documentation of SendPtr for a full explanation
        let buf = SendPtr(buf);
        let is_less = &is_less;

        v.par_chunks_mut(CHUNK_LENGTH)
            .with_max_len(1)
            .enumerate()
            .map(move |(i, chunk)| {
                let l = CHUNK_LENGTH * i;
                let r = l + chunk.len();
                unsafe {
                    let buf = buf.get().add(l);
                    (l, r, mergesort(chunk, buf, is_less))
                }
            })
            .collect::<Vec<_>>()
            .into_iter()
            .peekable()
    };

    // Now attempt to concatenate adjacent chunks that were left intact.
    let mut chunks = Vec::with_capacity(iter.len());

    while let Some((a, mut b, res)) = iter.next() {
        // If this chunk was not modified by the sort procedure...
        if res != MergesortResult::Sorted {
            while let Some(&(x, y, r)) = iter.peek() {
                // If the following chunk is of the same type and can be concatenated...
                if r == res && (r == MergesortResult::Descending) == is_less(&v[x], &v[x - 1]) {
                    // Concatenate them.
                    b = y;
                    iter.next();
                } else {
                    break;
                }
            }
        }

        // Descending chunks must be reversed.
        if res == MergesortResult::Descending {
            v[a..b].reverse();
        }

        chunks.push((a, b));
    }

    // All chunks are properly sorted.
    // Now we just have to merge them together.
    unsafe {
        recurse(v.as_mut_ptr(), buf, &chunks, false, &is_less);
    }
}

#[cfg(test)]
mod tests {
    use super::split_for_merge;
    use rand::distributions::Uniform;
    use rand::{thread_rng, Rng};

    #[test]
    fn test_split_for_merge() {
        fn check(left: &[u32], right: &[u32]) {
            let (l, r) = split_for_merge(left, right, &|&a, &b| a < b);
            assert!(left[..l]
                .iter()
                .all(|&x| right[r..].iter().all(|&y| x <= y)));
            assert!(right[..r].iter().all(|&x| left[l..].iter().all(|&y| x < y)));
        }

        check(&[1, 2, 2, 2, 2, 3], &[1, 2, 2, 2, 2, 3]);
        check(&[1, 2, 2, 2, 2, 3], &[]);
        check(&[], &[1, 2, 2, 2, 2, 3]);

        let rng = &mut thread_rng();

        for _ in 0..100 {
            let limit: u32 = rng.gen_range(1..21);
            let left_len: usize = rng.gen_range(0..20);
            let right_len: usize = rng.gen_range(0..20);

            let mut left = rng
                .sample_iter(&Uniform::new(0, limit))
                .take(left_len)
                .collect::<Vec<_>>();
            let mut right = rng
                .sample_iter(&Uniform::new(0, limit))
                .take(right_len)
                .collect::<Vec<_>>();

            left.sort();
            right.sort();
            check(&left, &right);
        }
    }
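
    // Two small sanity checks added as illustrative sketches of the sequential building blocks:
    // `collapse` (the run-stack invariant check) and `insert_head` (the insertion-sort step).
    // They exercise only behavior visible in this module, not any external API.
    #[test]
    fn test_collapse() {
        use super::{collapse, Run};

        // The invariants hold and the top run doesn't start at index 0, so no merge is needed.
        let ok = [Run { start: 10, len: 8 }, Run { start: 5, len: 5 }];
        assert_eq!(collapse(&ok), None);

        // The top run starts at index 0, so a merge is always demanded.
        let top_at_zero = [Run { start: 5, len: 5 }, Run { start: 0, len: 5 }];
        assert_eq!(collapse(&top_at_zero), Some(0));

        // The run below the top is not longer than the top run, so the two must be merged.
        let violated = [Run { start: 7, len: 3 }, Run { start: 2, len: 5 }];
        assert_eq!(collapse(&violated), Some(0));
    }

    #[test]
    fn test_insert_head() {
        // `v[1..]` is already sorted; `insert_head` moves `v[0]` into its place.
        let mut v = vec![4, 1, 2, 3];
        super::insert_head(&mut v, &|a, b| a < b);
        assert_eq!(v, [1, 2, 3, 4]);
    }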
}