1 | use std::{mem, ops}; |
2 | |
3 | use crate::stats::float::Float; |
4 | use crate::stats::tuple::{Tuple, TupledDistributionsBuilder}; |
5 | use crate::stats::univariate::Percentiles; |
6 | use crate::stats::univariate::Resamples; |
7 | #[cfg (feature = "rayon" )] |
8 | use rayon::prelude::*; |
9 | |
10 | /// A collection of data points drawn from a population |
11 | /// |
12 | /// Invariants: |
13 | /// |
14 | /// - The sample contains at least 2 data points |
15 | /// - The sample contains no `NaN`s |
16 | #[repr (transparent)] |
17 | pub struct Sample<A>([A]); |
18 | |
19 | // TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module |
20 | impl<A> Sample<A> |
21 | where |
22 | A: Float, |
23 | { |
24 | /// Creates a new sample from an existing slice |
25 | /// |
26 | /// # Panics |
27 | /// |
28 | /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements |
29 | #[cfg_attr (feature = "cargo-clippy" , allow(clippy::new_ret_no_self))] |
30 | pub fn new(slice: &[A]) -> &Sample<A> { |
31 | assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan())); |
32 | |
33 | unsafe { mem::transmute(slice) } |
34 | } |
35 | |
36 | /// Returns the biggest element in the sample |
37 | /// |
38 | /// - Time: `O(length)` |
39 | pub fn max(&self) -> A { |
40 | let mut elems = self.iter(); |
41 | |
42 | match elems.next() { |
43 | Some(&head) => elems.fold(head, |a, &b| a.max(b)), |
44 | // NB `unreachable!` because `Sample` is guaranteed to have at least one data point |
45 | None => unreachable!(), |
46 | } |
47 | } |
48 | |
49 | /// Returns the arithmetic average of the sample |
50 | /// |
51 | /// - Time: `O(length)` |
52 | pub fn mean(&self) -> A { |
53 | let n = self.len(); |
54 | |
55 | self.sum() / A::cast(n) |
56 | } |
57 | |
58 | /// Returns the median absolute deviation |
59 | /// |
60 | /// The `median` can be optionally passed along to speed up (2X) the computation |
61 | /// |
62 | /// - Time: `O(length)` |
63 | /// - Memory: `O(length)` |
64 | pub fn median_abs_dev(&self, median: Option<A>) -> A |
65 | where |
66 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
67 | { |
68 | let median = median.unwrap_or_else(|| self.percentiles().median()); |
69 | |
70 | // NB Although this operation can be SIMD accelerated, the gain is negligible because the |
71 | // bottle neck is the sorting operation which is part of the computation of the median |
72 | let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::<Vec<_>>(); |
73 | |
74 | let abs_devs: &Self = Self::new(&abs_devs); |
75 | |
76 | abs_devs.percentiles().median() * A::cast(1.4826) |
77 | } |
78 | |
79 | /// Returns the median absolute deviation as a percentage of the median |
80 | /// |
81 | /// - Time: `O(length)` |
82 | /// - Memory: `O(length)` |
83 | pub fn median_abs_dev_pct(&self) -> A |
84 | where |
85 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
86 | { |
87 | let _100 = A::cast(100); |
88 | let median = self.percentiles().median(); |
89 | let mad = self.median_abs_dev(Some(median)); |
90 | |
91 | (mad / median) * _100 |
92 | } |
93 | |
94 | /// Returns the smallest element in the sample |
95 | /// |
96 | /// - Time: `O(length)` |
97 | pub fn min(&self) -> A { |
98 | let mut elems = self.iter(); |
99 | |
100 | match elems.next() { |
101 | Some(&elem) => elems.fold(elem, |a, &b| a.min(b)), |
102 | // NB `unreachable!` because `Sample` is guaranteed to have at least one data point |
103 | None => unreachable!(), |
104 | } |
105 | } |
106 | |
107 | /// Returns a "view" into the percentiles of the sample |
108 | /// |
109 | /// This "view" makes consecutive computations of percentiles much faster (`O(1)`) |
110 | /// |
111 | /// - Time: `O(N log N) where N = length` |
112 | /// - Memory: `O(length)` |
113 | pub fn percentiles(&self) -> Percentiles<A> |
114 | where |
115 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
116 | { |
117 | use std::cmp::Ordering; |
118 | |
119 | // NB This function assumes that there are no `NaN`s in the sample |
120 | fn cmp<T>(a: &T, b: &T) -> Ordering |
121 | where |
122 | T: PartialOrd, |
123 | { |
124 | match a.partial_cmp(b) { |
125 | Some(o) => o, |
126 | // Arbitrary way to handle NaNs that should never happen |
127 | None => Ordering::Equal, |
128 | } |
129 | } |
130 | |
131 | let mut v = self.to_vec().into_boxed_slice(); |
132 | #[cfg (feature = "rayon" )] |
133 | v.par_sort_unstable_by(cmp); |
134 | #[cfg (not(feature = "rayon" ))] |
135 | v.sort_unstable_by(cmp); |
136 | |
137 | // NB :-1: to intra-crate privacy rules |
138 | unsafe { mem::transmute(v) } |
139 | } |
140 | |
141 | /// Returns the standard deviation of the sample |
142 | /// |
143 | /// The `mean` can be optionally passed along to speed up (2X) the computation |
144 | /// |
145 | /// - Time: `O(length)` |
146 | pub fn std_dev(&self, mean: Option<A>) -> A { |
147 | self.var(mean).sqrt() |
148 | } |
149 | |
150 | /// Returns the standard deviation as a percentage of the mean |
151 | /// |
152 | /// - Time: `O(length)` |
153 | pub fn std_dev_pct(&self) -> A { |
154 | let _100 = A::cast(100); |
155 | let mean = self.mean(); |
156 | let std_dev = self.std_dev(Some(mean)); |
157 | |
158 | (std_dev / mean) * _100 |
159 | } |
160 | |
161 | /// Returns the sum of all the elements of the sample |
162 | /// |
163 | /// - Time: `O(length)` |
164 | pub fn sum(&self) -> A { |
165 | crate::stats::sum(self) |
166 | } |
167 | |
168 | /// Returns the t score between these two samples |
169 | /// |
170 | /// - Time: `O(length)` |
171 | pub fn t(&self, other: &Sample<A>) -> A { |
172 | let (x_bar, y_bar) = (self.mean(), other.mean()); |
173 | let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar))); |
174 | let n_x = A::cast(self.len()); |
175 | let n_y = A::cast(other.len()); |
176 | let num = x_bar - y_bar; |
177 | let den = (s2_x / n_x + s2_y / n_y).sqrt(); |
178 | |
179 | num / den |
180 | } |
181 | |
182 | /// Returns the variance of the sample |
183 | /// |
184 | /// The `mean` can be optionally passed along to speed up (2X) the computation |
185 | /// |
186 | /// - Time: `O(length)` |
187 | pub fn var(&self, mean: Option<A>) -> A { |
188 | use std::ops::Add; |
189 | |
190 | let mean = mean.unwrap_or_else(|| self.mean()); |
191 | let slice = self; |
192 | |
193 | let sum = slice |
194 | .iter() |
195 | .map(|&x| (x - mean).powi(2)) |
196 | .fold(A::cast(0), Add::add); |
197 | |
198 | sum / A::cast(slice.len() - 1) |
199 | } |
200 | |
201 | // TODO Remove the `T` parameter in favor of `S::Output` |
202 | /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic |
203 | /// |
204 | /// - Multi-threaded |
205 | /// - Time: `O(nresamples)` |
206 | /// - Memory: `O(nresamples)` |
207 | pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions |
208 | where |
209 | S: Fn(&Sample<A>) -> T + Sync, |
210 | T: Tuple + Send, |
211 | T::Distributions: Send, |
212 | T::Builder: Send, |
213 | { |
214 | #[cfg (feature = "rayon" )] |
215 | { |
216 | (0..nresamples) |
217 | .into_par_iter() |
218 | .map_init( |
219 | || Resamples::new(self), |
220 | |resamples, _| statistic(resamples.next()), |
221 | ) |
222 | .fold( |
223 | || T::Builder::new(0), |
224 | |mut sub_distributions, sample| { |
225 | sub_distributions.push(sample); |
226 | sub_distributions |
227 | }, |
228 | ) |
229 | .reduce( |
230 | || T::Builder::new(0), |
231 | |mut a, mut b| { |
232 | a.extend(&mut b); |
233 | a |
234 | }, |
235 | ) |
236 | .complete() |
237 | } |
238 | #[cfg (not(feature = "rayon" ))] |
239 | { |
240 | let mut resamples = Resamples::new(self); |
241 | (0..nresamples) |
242 | .map(|_| statistic(resamples.next())) |
243 | .fold(T::Builder::new(0), |mut sub_distributions, sample| { |
244 | sub_distributions.push(sample); |
245 | sub_distributions |
246 | }) |
247 | .complete() |
248 | } |
249 | } |
250 | |
251 | #[cfg (test)] |
252 | pub fn iqr(&self) -> A |
253 | where |
254 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
255 | { |
256 | self.percentiles().iqr() |
257 | } |
258 | |
259 | #[cfg (test)] |
260 | pub fn median(&self) -> A |
261 | where |
262 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
263 | { |
264 | self.percentiles().median() |
265 | } |
266 | } |
267 | |
268 | impl<A> ops::Deref for Sample<A> { |
269 | type Target = [A]; |
270 | |
271 | fn deref(&self) -> &[A] { |
272 | &self.0 |
273 | } |
274 | } |
275 | |