| 1 | use std::{mem, ops}; |
| 2 | |
| 3 | use crate::stats::float::Float; |
| 4 | use crate::stats::tuple::{Tuple, TupledDistributionsBuilder}; |
| 5 | use crate::stats::univariate::Percentiles; |
| 6 | use crate::stats::univariate::Resamples; |
| 7 | #[cfg (feature = "rayon" )] |
| 8 | use rayon::prelude::*; |
| 9 | |
| 10 | /// A collection of data points drawn from a population |
| 11 | /// |
| 12 | /// Invariants: |
| 13 | /// |
| 14 | /// - The sample contains at least 2 data points |
| 15 | /// - The sample contains no `NaN`s |
| 16 | #[repr (transparent)] |
| 17 | pub struct Sample<A>([A]); |
| 18 | |
| 19 | // TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module |
| 20 | impl<A> Sample<A> |
| 21 | where |
| 22 | A: Float, |
| 23 | { |
| 24 | /// Creates a new sample from an existing slice |
| 25 | /// |
| 26 | /// # Panics |
| 27 | /// |
| 28 | /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements |
| 29 | #[cfg_attr (feature = "cargo-clippy" , allow(clippy::new_ret_no_self))] |
| 30 | pub fn new(slice: &[A]) -> &Sample<A> { |
| 31 | assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan())); |
| 32 | |
| 33 | unsafe { mem::transmute(slice) } |
| 34 | } |
| 35 | |
| 36 | /// Returns the biggest element in the sample |
| 37 | /// |
| 38 | /// - Time: `O(length)` |
| 39 | pub fn max(&self) -> A { |
| 40 | let mut elems = self.iter(); |
| 41 | |
| 42 | match elems.next() { |
| 43 | Some(&head) => elems.fold(head, |a, &b| a.max(b)), |
| 44 | // NB `unreachable!` because `Sample` is guaranteed to have at least one data point |
| 45 | None => unreachable!(), |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | /// Returns the arithmetic average of the sample |
| 50 | /// |
| 51 | /// - Time: `O(length)` |
| 52 | pub fn mean(&self) -> A { |
| 53 | let n = self.len(); |
| 54 | |
| 55 | self.sum() / A::cast(n) |
| 56 | } |
| 57 | |
| 58 | /// Returns the median absolute deviation |
| 59 | /// |
| 60 | /// The `median` can be optionally passed along to speed up (2X) the computation |
| 61 | /// |
| 62 | /// - Time: `O(length)` |
| 63 | /// - Memory: `O(length)` |
| 64 | pub fn median_abs_dev(&self, median: Option<A>) -> A |
| 65 | where |
| 66 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
| 67 | { |
| 68 | let median = median.unwrap_or_else(|| self.percentiles().median()); |
| 69 | |
| 70 | // NB Although this operation can be SIMD accelerated, the gain is negligible because the |
| 71 | // bottle neck is the sorting operation which is part of the computation of the median |
| 72 | let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::<Vec<_>>(); |
| 73 | |
| 74 | let abs_devs: &Self = Self::new(&abs_devs); |
| 75 | |
| 76 | abs_devs.percentiles().median() * A::cast(1.4826) |
| 77 | } |
| 78 | |
| 79 | /// Returns the median absolute deviation as a percentage of the median |
| 80 | /// |
| 81 | /// - Time: `O(length)` |
| 82 | /// - Memory: `O(length)` |
| 83 | pub fn median_abs_dev_pct(&self) -> A |
| 84 | where |
| 85 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
| 86 | { |
| 87 | let _100 = A::cast(100); |
| 88 | let median = self.percentiles().median(); |
| 89 | let mad = self.median_abs_dev(Some(median)); |
| 90 | |
| 91 | (mad / median) * _100 |
| 92 | } |
| 93 | |
| 94 | /// Returns the smallest element in the sample |
| 95 | /// |
| 96 | /// - Time: `O(length)` |
| 97 | pub fn min(&self) -> A { |
| 98 | let mut elems = self.iter(); |
| 99 | |
| 100 | match elems.next() { |
| 101 | Some(&elem) => elems.fold(elem, |a, &b| a.min(b)), |
| 102 | // NB `unreachable!` because `Sample` is guaranteed to have at least one data point |
| 103 | None => unreachable!(), |
| 104 | } |
| 105 | } |
| 106 | |
| 107 | /// Returns a "view" into the percentiles of the sample |
| 108 | /// |
| 109 | /// This "view" makes consecutive computations of percentiles much faster (`O(1)`) |
| 110 | /// |
| 111 | /// - Time: `O(N log N) where N = length` |
| 112 | /// - Memory: `O(length)` |
| 113 | pub fn percentiles(&self) -> Percentiles<A> |
| 114 | where |
| 115 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
| 116 | { |
| 117 | use std::cmp::Ordering; |
| 118 | |
| 119 | // NB This function assumes that there are no `NaN`s in the sample |
| 120 | fn cmp<T>(a: &T, b: &T) -> Ordering |
| 121 | where |
| 122 | T: PartialOrd, |
| 123 | { |
| 124 | match a.partial_cmp(b) { |
| 125 | Some(o) => o, |
| 126 | // Arbitrary way to handle NaNs that should never happen |
| 127 | None => Ordering::Equal, |
| 128 | } |
| 129 | } |
| 130 | |
| 131 | let mut v = self.to_vec().into_boxed_slice(); |
| 132 | #[cfg (feature = "rayon" )] |
| 133 | v.par_sort_unstable_by(cmp); |
| 134 | #[cfg (not(feature = "rayon" ))] |
| 135 | v.sort_unstable_by(cmp); |
| 136 | |
| 137 | // NB :-1: to intra-crate privacy rules |
| 138 | unsafe { mem::transmute(v) } |
| 139 | } |
| 140 | |
| 141 | /// Returns the standard deviation of the sample |
| 142 | /// |
| 143 | /// The `mean` can be optionally passed along to speed up (2X) the computation |
| 144 | /// |
| 145 | /// - Time: `O(length)` |
| 146 | pub fn std_dev(&self, mean: Option<A>) -> A { |
| 147 | self.var(mean).sqrt() |
| 148 | } |
| 149 | |
| 150 | /// Returns the standard deviation as a percentage of the mean |
| 151 | /// |
| 152 | /// - Time: `O(length)` |
| 153 | pub fn std_dev_pct(&self) -> A { |
| 154 | let _100 = A::cast(100); |
| 155 | let mean = self.mean(); |
| 156 | let std_dev = self.std_dev(Some(mean)); |
| 157 | |
| 158 | (std_dev / mean) * _100 |
| 159 | } |
| 160 | |
| 161 | /// Returns the sum of all the elements of the sample |
| 162 | /// |
| 163 | /// - Time: `O(length)` |
| 164 | pub fn sum(&self) -> A { |
| 165 | crate::stats::sum(self) |
| 166 | } |
| 167 | |
| 168 | /// Returns the t score between these two samples |
| 169 | /// |
| 170 | /// - Time: `O(length)` |
| 171 | pub fn t(&self, other: &Sample<A>) -> A { |
| 172 | let (x_bar, y_bar) = (self.mean(), other.mean()); |
| 173 | let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar))); |
| 174 | let n_x = A::cast(self.len()); |
| 175 | let n_y = A::cast(other.len()); |
| 176 | let num = x_bar - y_bar; |
| 177 | let den = (s2_x / n_x + s2_y / n_y).sqrt(); |
| 178 | |
| 179 | num / den |
| 180 | } |
| 181 | |
| 182 | /// Returns the variance of the sample |
| 183 | /// |
| 184 | /// The `mean` can be optionally passed along to speed up (2X) the computation |
| 185 | /// |
| 186 | /// - Time: `O(length)` |
| 187 | pub fn var(&self, mean: Option<A>) -> A { |
| 188 | use std::ops::Add; |
| 189 | |
| 190 | let mean = mean.unwrap_or_else(|| self.mean()); |
| 191 | let slice = self; |
| 192 | |
| 193 | let sum = slice |
| 194 | .iter() |
| 195 | .map(|&x| (x - mean).powi(2)) |
| 196 | .fold(A::cast(0), Add::add); |
| 197 | |
| 198 | sum / A::cast(slice.len() - 1) |
| 199 | } |
| 200 | |
| 201 | // TODO Remove the `T` parameter in favor of `S::Output` |
| 202 | /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic |
| 203 | /// |
| 204 | /// - Multi-threaded |
| 205 | /// - Time: `O(nresamples)` |
| 206 | /// - Memory: `O(nresamples)` |
| 207 | pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions |
| 208 | where |
| 209 | S: Fn(&Sample<A>) -> T + Sync, |
| 210 | T: Tuple + Send, |
| 211 | T::Distributions: Send, |
| 212 | T::Builder: Send, |
| 213 | { |
| 214 | #[cfg (feature = "rayon" )] |
| 215 | { |
| 216 | (0..nresamples) |
| 217 | .into_par_iter() |
| 218 | .map_init( |
| 219 | || Resamples::new(self), |
| 220 | |resamples, _| statistic(resamples.next()), |
| 221 | ) |
| 222 | .fold( |
| 223 | || T::Builder::new(0), |
| 224 | |mut sub_distributions, sample| { |
| 225 | sub_distributions.push(sample); |
| 226 | sub_distributions |
| 227 | }, |
| 228 | ) |
| 229 | .reduce( |
| 230 | || T::Builder::new(0), |
| 231 | |mut a, mut b| { |
| 232 | a.extend(&mut b); |
| 233 | a |
| 234 | }, |
| 235 | ) |
| 236 | .complete() |
| 237 | } |
| 238 | #[cfg (not(feature = "rayon" ))] |
| 239 | { |
| 240 | let mut resamples = Resamples::new(self); |
| 241 | (0..nresamples) |
| 242 | .map(|_| statistic(resamples.next())) |
| 243 | .fold(T::Builder::new(0), |mut sub_distributions, sample| { |
| 244 | sub_distributions.push(sample); |
| 245 | sub_distributions |
| 246 | }) |
| 247 | .complete() |
| 248 | } |
| 249 | } |
| 250 | |
| 251 | #[cfg (test)] |
| 252 | pub fn iqr(&self) -> A |
| 253 | where |
| 254 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
| 255 | { |
| 256 | self.percentiles().iqr() |
| 257 | } |
| 258 | |
| 259 | #[cfg (test)] |
| 260 | pub fn median(&self) -> A |
| 261 | where |
| 262 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
| 263 | { |
| 264 | self.percentiles().median() |
| 265 | } |
| 266 | } |
| 267 | |
| 268 | impl<A> ops::Deref for Sample<A> { |
| 269 | type Target = [A]; |
| 270 | |
| 271 | fn deref(&self) -> &[A] { |
| 272 | &self.0 |
| 273 | } |
| 274 | } |
| 275 | |