1 | #![allow (missing_docs)] |
2 | |
3 | use std::mem; |
4 | |
5 | #[cfg (test)] |
6 | mod tests; |
7 | |
8 | fn local_sort(v: &mut [f64]) { |
9 | v.sort_by(|x: &f64, y: &f64| x.total_cmp(y)); |
10 | } |
11 | |
12 | /// Trait that provides simple descriptive statistics on a univariate set of numeric samples. |
13 | pub trait Stats { |
14 | /// Sum of the samples. |
15 | /// |
16 | /// Note: this method sacrifices performance at the altar of accuracy |
17 | /// Depends on IEEE 754 arithmetic guarantees. See proof of correctness at: |
18 | /// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric |
19 | /// Predicates"][paper] |
20 | /// |
21 | /// [paper]: https://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps |
22 | fn sum(&self) -> f64; |
23 | |
24 | /// Minimum value of the samples. |
25 | fn min(&self) -> f64; |
26 | |
27 | /// Maximum value of the samples. |
28 | fn max(&self) -> f64; |
29 | |
30 | /// Arithmetic mean (average) of the samples: sum divided by sample-count. |
31 | /// |
32 | /// See: <https://en.wikipedia.org/wiki/Arithmetic_mean> |
33 | fn mean(&self) -> f64; |
34 | |
35 | /// Median of the samples: value separating the lower half of the samples from the higher half. |
36 | /// Equal to `self.percentile(50.0)`. |
37 | /// |
38 | /// See: <https://en.wikipedia.org/wiki/Median> |
39 | fn median(&self) -> f64; |
40 | |
41 | /// Variance of the samples: bias-corrected mean of the squares of the differences of each |
42 | /// sample from the sample mean. Note that this calculates the _sample variance_ rather than the |
43 | /// population variance, which is assumed to be unknown. It therefore corrects the `(n-1)/n` |
44 | /// bias that would appear if we calculated a population variance, by dividing by `(n-1)` rather |
45 | /// than `n`. |
46 | /// |
47 | /// See: <https://en.wikipedia.org/wiki/Variance> |
48 | fn var(&self) -> f64; |
49 | |
50 | /// Standard deviation: the square root of the sample variance. |
51 | /// |
52 | /// Note: this is not a robust statistic for non-normal distributions. Prefer the |
53 | /// `median_abs_dev` for unknown distributions. |
54 | /// |
55 | /// See: <https://en.wikipedia.org/wiki/Standard_deviation> |
56 | fn std_dev(&self) -> f64; |
57 | |
58 | /// Standard deviation as a percent of the mean value. See `std_dev` and `mean`. |
59 | /// |
60 | /// Note: this is not a robust statistic for non-normal distributions. Prefer the |
61 | /// `median_abs_dev_pct` for unknown distributions. |
62 | fn std_dev_pct(&self) -> f64; |
63 | |
64 | /// Scaled median of the absolute deviations of each sample from the sample median. This is a |
65 | /// robust (distribution-agnostic) estimator of sample variability. Use this in preference to |
66 | /// `std_dev` if you cannot assume your sample is normally distributed. Note that this is scaled |
67 | /// by the constant `1.4826` to allow its use as a consistent estimator for the standard |
68 | /// deviation. |
69 | /// |
70 | /// See: <https://en.wikipedia.org/wiki/Median_absolute_deviation> |
71 | fn median_abs_dev(&self) -> f64; |
72 | |
73 | /// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`. |
74 | fn median_abs_dev_pct(&self) -> f64; |
75 | |
76 | /// Percentile: the value below which `pct` percent of the values in `self` fall. For example, |
77 | /// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self` |
78 | /// satisfy `s <= v`. |
79 | /// |
80 | /// Calculated by linear interpolation between closest ranks. |
81 | /// |
82 | /// See: <https://en.wikipedia.org/wiki/Percentile> |
83 | fn percentile(&self, pct: f64) -> f64; |
84 | |
85 | /// Quartiles of the sample: three values that divide the sample into four equal groups, each |
86 | /// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This |
87 | /// function may calculate the 3 quartiles more efficiently than 3 calls to `percentile`, but |
88 | /// is otherwise equivalent. |
89 | /// |
90 | /// See also: <https://en.wikipedia.org/wiki/Quartile> |
91 | fn quartiles(&self) -> (f64, f64, f64); |
92 | |
93 | /// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th |
94 | /// percentile (3rd quartile). See `quartiles`. |
95 | /// |
96 | /// See also: <https://en.wikipedia.org/wiki/Interquartile_range> |
97 | fn iqr(&self) -> f64; |
98 | } |
99 | |
100 | /// Extracted collection of all the summary statistics of a sample set. |
101 | #[derive(Debug, Clone, PartialEq, Copy)] |
102 | #[allow (missing_docs)] |
103 | pub struct Summary { |
104 | pub sum: f64, |
105 | pub min: f64, |
106 | pub max: f64, |
107 | pub mean: f64, |
108 | pub median: f64, |
109 | pub var: f64, |
110 | pub std_dev: f64, |
111 | pub std_dev_pct: f64, |
112 | pub median_abs_dev: f64, |
113 | pub median_abs_dev_pct: f64, |
114 | pub quartiles: (f64, f64, f64), |
115 | pub iqr: f64, |
116 | } |
117 | |
118 | impl Summary { |
119 | /// Construct a new summary of a sample set. |
120 | pub fn new(samples: &[f64]) -> Summary { |
121 | Summary { |
122 | sum: samples.sum(), |
123 | min: samples.min(), |
124 | max: samples.max(), |
125 | mean: samples.mean(), |
126 | median: samples.median(), |
127 | var: samples.var(), |
128 | std_dev: samples.std_dev(), |
129 | std_dev_pct: samples.std_dev_pct(), |
130 | median_abs_dev: samples.median_abs_dev(), |
131 | median_abs_dev_pct: samples.median_abs_dev_pct(), |
132 | quartiles: samples.quartiles(), |
133 | iqr: samples.iqr(), |
134 | } |
135 | } |
136 | } |
137 | |
138 | impl Stats for [f64] { |
139 | // FIXME #11059 handle NaN, inf and overflow |
140 | fn sum(&self) -> f64 { |
141 | let mut partials = vec![]; |
142 | |
143 | for &x in self { |
144 | let mut x = x; |
145 | let mut j = 0; |
146 | // This inner loop applies `hi`/`lo` summation to each |
147 | // partial so that the list of partial sums remains exact. |
148 | for i in 0..partials.len() { |
149 | let mut y: f64 = partials[i]; |
150 | if x.abs() < y.abs() { |
151 | mem::swap(&mut x, &mut y); |
152 | } |
153 | // Rounded `x+y` is stored in `hi` with round-off stored in |
154 | // `lo`. Together `hi+lo` are exactly equal to `x+y`. |
155 | let hi = x + y; |
156 | let lo = y - (hi - x); |
157 | if lo != 0.0 { |
158 | partials[j] = lo; |
159 | j += 1; |
160 | } |
161 | x = hi; |
162 | } |
163 | if j >= partials.len() { |
164 | partials.push(x); |
165 | } else { |
166 | partials[j] = x; |
167 | partials.truncate(j + 1); |
168 | } |
169 | } |
170 | let zero: f64 = 0.0; |
171 | partials.iter().fold(zero, |p, q| p + *q) |
172 | } |
173 | |
174 | fn min(&self) -> f64 { |
175 | assert!(!self.is_empty()); |
176 | self.iter().fold(self[0], |p, q| p.min(*q)) |
177 | } |
178 | |
179 | fn max(&self) -> f64 { |
180 | assert!(!self.is_empty()); |
181 | self.iter().fold(self[0], |p, q| p.max(*q)) |
182 | } |
183 | |
184 | fn mean(&self) -> f64 { |
185 | assert!(!self.is_empty()); |
186 | self.sum() / (self.len() as f64) |
187 | } |
188 | |
189 | fn median(&self) -> f64 { |
190 | self.percentile(50_f64) |
191 | } |
192 | |
193 | fn var(&self) -> f64 { |
194 | if self.len() < 2 { |
195 | 0.0 |
196 | } else { |
197 | let mean = self.mean(); |
198 | let mut v: f64 = 0.0; |
199 | for s in self { |
200 | let x = *s - mean; |
201 | v += x * x; |
202 | } |
203 | // N.B., this is _supposed to be_ len-1, not len. If you |
204 | // change it back to len, you will be calculating a |
205 | // population variance, not a sample variance. |
206 | let denom = (self.len() - 1) as f64; |
207 | v / denom |
208 | } |
209 | } |
210 | |
211 | fn std_dev(&self) -> f64 { |
212 | self.var().sqrt() |
213 | } |
214 | |
215 | fn std_dev_pct(&self) -> f64 { |
216 | let hundred = 100_f64; |
217 | (self.std_dev() / self.mean()) * hundred |
218 | } |
219 | |
220 | fn median_abs_dev(&self) -> f64 { |
221 | let med = self.median(); |
222 | let abs_devs: Vec<f64> = self.iter().map(|&v| (med - v).abs()).collect(); |
223 | // This constant is derived by smarter statistics brains than me, but it is |
224 | // consistent with how R and other packages treat the MAD. |
225 | let number = 1.4826; |
226 | abs_devs.median() * number |
227 | } |
228 | |
229 | fn median_abs_dev_pct(&self) -> f64 { |
230 | let hundred = 100_f64; |
231 | (self.median_abs_dev() / self.median()) * hundred |
232 | } |
233 | |
234 | fn percentile(&self, pct: f64) -> f64 { |
235 | let mut tmp = self.to_vec(); |
236 | local_sort(&mut tmp); |
237 | percentile_of_sorted(&tmp, pct) |
238 | } |
239 | |
240 | fn quartiles(&self) -> (f64, f64, f64) { |
241 | let mut tmp = self.to_vec(); |
242 | local_sort(&mut tmp); |
243 | let first = 25_f64; |
244 | let a = percentile_of_sorted(&tmp, first); |
245 | let second = 50_f64; |
246 | let b = percentile_of_sorted(&tmp, second); |
247 | let third = 75_f64; |
248 | let c = percentile_of_sorted(&tmp, third); |
249 | (a, b, c) |
250 | } |
251 | |
252 | fn iqr(&self) -> f64 { |
253 | let (a, _, c) = self.quartiles(); |
254 | c - a |
255 | } |
256 | } |
257 | |
258 | // Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using |
259 | // linear interpolation. If samples are not sorted, return nonsensical value. |
260 | fn percentile_of_sorted(sorted_samples: &[f64], pct: f64) -> f64 { |
261 | assert!(!sorted_samples.is_empty()); |
262 | if sorted_samples.len() == 1 { |
263 | return sorted_samples[0]; |
264 | } |
265 | let zero: f64 = 0.0; |
266 | assert!(zero <= pct); |
267 | let hundred: f64 = 100_f64; |
268 | assert!(pct <= hundred); |
269 | if pct == hundred { |
270 | return sorted_samples[sorted_samples.len() - 1]; |
271 | } |
272 | let length: f64 = (sorted_samples.len() - 1) as f64; |
273 | let rank: f64 = (pct / hundred) * length; |
274 | let lrank: f64 = rank.floor(); |
275 | let d: f64 = rank - lrank; |
276 | let n: usize = lrank as usize; |
277 | let lo = sorted_samples[n]; |
278 | let hi = sorted_samples[n + 1]; |
279 | lo + (hi - lo) * d |
280 | } |
281 | |
282 | /// Winsorize a set of samples, replacing values above the `100-pct` percentile |
283 | /// and below the `pct` percentile with those percentiles themselves. This is a |
284 | /// way of minimizing the effect of outliers, at the cost of biasing the sample. |
285 | /// It differs from trimming in that it does not change the number of samples, |
286 | /// just changes the values of those that are outliers. |
287 | /// |
288 | /// See: <https://en.wikipedia.org/wiki/Winsorising> |
289 | pub fn winsorize(samples: &mut [f64], pct: f64) { |
290 | let mut tmp: [f64] = samples.to_vec(); |
291 | local_sort(&mut tmp); |
292 | let lo: f64 = percentile_of_sorted(&tmp, pct); |
293 | let hundred: f64 = 100_f64; |
294 | let hi: f64 = percentile_of_sorted(&tmp, pct:hundred - pct); |
295 | for samp in samples { |
296 | if *samp > hi { |
297 | *samp = hi |
298 | } else if *samp < lo { |
299 | *samp = lo |
300 | } |
301 | } |
302 | } |
303 | |