1use std::{mem, ops};
2
3use crate::stats::float::Float;
4use crate::stats::tuple::{Tuple, TupledDistributionsBuilder};
5use crate::stats::univariate::Percentiles;
6use crate::stats::univariate::Resamples;
7#[cfg(feature = "rayon")]
8use rayon::prelude::*;
9
10/// A collection of data points drawn from a population
11///
12/// Invariants:
13///
14/// - The sample contains at least 2 data points
15/// - The sample contains no `NaN`s
16#[repr(transparent)]
17pub struct Sample<A>([A]);
18
19// TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module
20impl<A> Sample<A>
21where
22 A: Float,
23{
24 /// Creates a new sample from an existing slice
25 ///
26 /// # Panics
27 ///
28 /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements
29 #[cfg_attr(feature = "cargo-clippy", allow(clippy::new_ret_no_self))]
30 pub fn new(slice: &[A]) -> &Sample<A> {
31 assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan()));
32
33 unsafe { mem::transmute(slice) }
34 }
35
36 /// Returns the biggest element in the sample
37 ///
38 /// - Time: `O(length)`
39 pub fn max(&self) -> A {
40 let mut elems = self.iter();
41
42 match elems.next() {
43 Some(&head) => elems.fold(head, |a, &b| a.max(b)),
44 // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
45 None => unreachable!(),
46 }
47 }
48
49 /// Returns the arithmetic average of the sample
50 ///
51 /// - Time: `O(length)`
52 pub fn mean(&self) -> A {
53 let n = self.len();
54
55 self.sum() / A::cast(n)
56 }
57
58 /// Returns the median absolute deviation
59 ///
60 /// The `median` can be optionally passed along to speed up (2X) the computation
61 ///
62 /// - Time: `O(length)`
63 /// - Memory: `O(length)`
64 pub fn median_abs_dev(&self, median: Option<A>) -> A
65 where
66 usize: cast::From<A, Output = Result<usize, cast::Error>>,
67 {
68 let median = median.unwrap_or_else(|| self.percentiles().median());
69
70 // NB Although this operation can be SIMD accelerated, the gain is negligible because the
71 // bottle neck is the sorting operation which is part of the computation of the median
72 let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::<Vec<_>>();
73
74 let abs_devs: &Self = Self::new(&abs_devs);
75
76 abs_devs.percentiles().median() * A::cast(1.4826)
77 }
78
79 /// Returns the median absolute deviation as a percentage of the median
80 ///
81 /// - Time: `O(length)`
82 /// - Memory: `O(length)`
83 pub fn median_abs_dev_pct(&self) -> A
84 where
85 usize: cast::From<A, Output = Result<usize, cast::Error>>,
86 {
87 let _100 = A::cast(100);
88 let median = self.percentiles().median();
89 let mad = self.median_abs_dev(Some(median));
90
91 (mad / median) * _100
92 }
93
94 /// Returns the smallest element in the sample
95 ///
96 /// - Time: `O(length)`
97 pub fn min(&self) -> A {
98 let mut elems = self.iter();
99
100 match elems.next() {
101 Some(&elem) => elems.fold(elem, |a, &b| a.min(b)),
102 // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
103 None => unreachable!(),
104 }
105 }
106
107 /// Returns a "view" into the percentiles of the sample
108 ///
109 /// This "view" makes consecutive computations of percentiles much faster (`O(1)`)
110 ///
111 /// - Time: `O(N log N) where N = length`
112 /// - Memory: `O(length)`
113 pub fn percentiles(&self) -> Percentiles<A>
114 where
115 usize: cast::From<A, Output = Result<usize, cast::Error>>,
116 {
117 use std::cmp::Ordering;
118
119 // NB This function assumes that there are no `NaN`s in the sample
120 fn cmp<T>(a: &T, b: &T) -> Ordering
121 where
122 T: PartialOrd,
123 {
124 match a.partial_cmp(b) {
125 Some(o) => o,
126 // Arbitrary way to handle NaNs that should never happen
127 None => Ordering::Equal,
128 }
129 }
130
131 let mut v = self.to_vec().into_boxed_slice();
132 #[cfg(feature = "rayon")]
133 v.par_sort_unstable_by(cmp);
134 #[cfg(not(feature = "rayon"))]
135 v.sort_unstable_by(cmp);
136
137 // NB :-1: to intra-crate privacy rules
138 unsafe { mem::transmute(v) }
139 }
140
141 /// Returns the standard deviation of the sample
142 ///
143 /// The `mean` can be optionally passed along to speed up (2X) the computation
144 ///
145 /// - Time: `O(length)`
146 pub fn std_dev(&self, mean: Option<A>) -> A {
147 self.var(mean).sqrt()
148 }
149
150 /// Returns the standard deviation as a percentage of the mean
151 ///
152 /// - Time: `O(length)`
153 pub fn std_dev_pct(&self) -> A {
154 let _100 = A::cast(100);
155 let mean = self.mean();
156 let std_dev = self.std_dev(Some(mean));
157
158 (std_dev / mean) * _100
159 }
160
161 /// Returns the sum of all the elements of the sample
162 ///
163 /// - Time: `O(length)`
164 pub fn sum(&self) -> A {
165 crate::stats::sum(self)
166 }
167
168 /// Returns the t score between these two samples
169 ///
170 /// - Time: `O(length)`
171 pub fn t(&self, other: &Sample<A>) -> A {
172 let (x_bar, y_bar) = (self.mean(), other.mean());
173 let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar)));
174 let n_x = A::cast(self.len());
175 let n_y = A::cast(other.len());
176 let num = x_bar - y_bar;
177 let den = (s2_x / n_x + s2_y / n_y).sqrt();
178
179 num / den
180 }
181
182 /// Returns the variance of the sample
183 ///
184 /// The `mean` can be optionally passed along to speed up (2X) the computation
185 ///
186 /// - Time: `O(length)`
187 pub fn var(&self, mean: Option<A>) -> A {
188 use std::ops::Add;
189
190 let mean = mean.unwrap_or_else(|| self.mean());
191 let slice = self;
192
193 let sum = slice
194 .iter()
195 .map(|&x| (x - mean).powi(2))
196 .fold(A::cast(0), Add::add);
197
198 sum / A::cast(slice.len() - 1)
199 }
200
201 // TODO Remove the `T` parameter in favor of `S::Output`
202 /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic
203 ///
204 /// - Multi-threaded
205 /// - Time: `O(nresamples)`
206 /// - Memory: `O(nresamples)`
207 pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions
208 where
209 S: Fn(&Sample<A>) -> T + Sync,
210 T: Tuple + Send,
211 T::Distributions: Send,
212 T::Builder: Send,
213 {
214 #[cfg(feature = "rayon")]
215 {
216 (0..nresamples)
217 .into_par_iter()
218 .map_init(
219 || Resamples::new(self),
220 |resamples, _| statistic(resamples.next()),
221 )
222 .fold(
223 || T::Builder::new(0),
224 |mut sub_distributions, sample| {
225 sub_distributions.push(sample);
226 sub_distributions
227 },
228 )
229 .reduce(
230 || T::Builder::new(0),
231 |mut a, mut b| {
232 a.extend(&mut b);
233 a
234 },
235 )
236 .complete()
237 }
238 #[cfg(not(feature = "rayon"))]
239 {
240 let mut resamples = Resamples::new(self);
241 (0..nresamples)
242 .map(|_| statistic(resamples.next()))
243 .fold(T::Builder::new(0), |mut sub_distributions, sample| {
244 sub_distributions.push(sample);
245 sub_distributions
246 })
247 .complete()
248 }
249 }
250
251 #[cfg(test)]
252 pub fn iqr(&self) -> A
253 where
254 usize: cast::From<A, Output = Result<usize, cast::Error>>,
255 {
256 self.percentiles().iqr()
257 }
258
259 #[cfg(test)]
260 pub fn median(&self) -> A
261 where
262 usize: cast::From<A, Output = Result<usize, cast::Error>>,
263 {
264 self.percentiles().median()
265 }
266}
267
268impl<A> ops::Deref for Sample<A> {
269 type Target = [A];
270
271 fn deref(&self) -> &[A] {
272 &self.0
273 }
274}
275