| 1 | #![ allow(missing_docs)] | 
| 2 |  | 
|---|
| 3 | use std::mem; | 
|---|
| 4 |  | 
|---|
| 5 | #[ cfg(test)] | 
|---|
| 6 | mod tests; | 
|---|
| 7 |  | 
|---|
| 8 | fn local_sort(v: &mut [f64]) { | 
|---|
| 9 | v.sort_by(|x: &f64, y: &f64| x.total_cmp(y)); | 
|---|
| 10 | } | 
|---|
| 11 |  | 
|---|
| 12 | /// Trait that provides simple descriptive statistics on a univariate set of numeric samples. | 
|---|
| 13 | pub trait Stats { | 
|---|
| 14 | /// Sum of the samples. | 
|---|
| 15 | /// | 
|---|
| 16 | /// Note: this method sacrifices performance at the altar of accuracy | 
|---|
| 17 | /// Depends on IEEE 754 arithmetic guarantees. See proof of correctness at: | 
|---|
| 18 | /// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric | 
|---|
| 19 | /// Predicates"][paper] | 
|---|
| 20 | /// | 
|---|
| 21 | /// [paper]: https://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps | 
|---|
| 22 | fn sum(&self) -> f64; | 
|---|
| 23 |  | 
|---|
| 24 | /// Minimum value of the samples. | 
|---|
| 25 | fn min(&self) -> f64; | 
|---|
| 26 |  | 
|---|
| 27 | /// Maximum value of the samples. | 
|---|
| 28 | fn max(&self) -> f64; | 
|---|
| 29 |  | 
|---|
| 30 | /// Arithmetic mean (average) of the samples: sum divided by sample-count. | 
|---|
| 31 | /// | 
|---|
| 32 | /// See: <https://en.wikipedia.org/wiki/Arithmetic_mean> | 
|---|
| 33 | fn mean(&self) -> f64; | 
|---|
| 34 |  | 
|---|
| 35 | /// Median of the samples: value separating the lower half of the samples from the higher half. | 
|---|
| 36 | /// Equal to `self.percentile(50.0)`. | 
|---|
| 37 | /// | 
|---|
| 38 | /// See: <https://en.wikipedia.org/wiki/Median> | 
|---|
| 39 | fn median(&self) -> f64; | 
|---|
| 40 |  | 
|---|
| 41 | /// Variance of the samples: bias-corrected mean of the squares of the differences of each | 
|---|
| 42 | /// sample from the sample mean. Note that this calculates the _sample variance_ rather than the | 
|---|
| 43 | /// population variance, which is assumed to be unknown. It therefore corrects the `(n-1)/n` | 
|---|
| 44 | /// bias that would appear if we calculated a population variance, by dividing by `(n-1)` rather | 
|---|
| 45 | /// than `n`. | 
|---|
| 46 | /// | 
|---|
| 47 | /// See: <https://en.wikipedia.org/wiki/Variance> | 
|---|
| 48 | fn var(&self) -> f64; | 
|---|
| 49 |  | 
|---|
| 50 | /// Standard deviation: the square root of the sample variance. | 
|---|
| 51 | /// | 
|---|
| 52 | /// Note: this is not a robust statistic for non-normal distributions. Prefer the | 
|---|
| 53 | /// `median_abs_dev` for unknown distributions. | 
|---|
| 54 | /// | 
|---|
| 55 | /// See: <https://en.wikipedia.org/wiki/Standard_deviation> | 
|---|
| 56 | fn std_dev(&self) -> f64; | 
|---|
| 57 |  | 
|---|
| 58 | /// Standard deviation as a percent of the mean value. See `std_dev` and `mean`. | 
|---|
| 59 | /// | 
|---|
| 60 | /// Note: this is not a robust statistic for non-normal distributions. Prefer the | 
|---|
| 61 | /// `median_abs_dev_pct` for unknown distributions. | 
|---|
| 62 | fn std_dev_pct(&self) -> f64; | 
|---|
| 63 |  | 
|---|
| 64 | /// Scaled median of the absolute deviations of each sample from the sample median. This is a | 
|---|
| 65 | /// robust (distribution-agnostic) estimator of sample variability. Use this in preference to | 
|---|
| 66 | /// `std_dev` if you cannot assume your sample is normally distributed. Note that this is scaled | 
|---|
| 67 | /// by the constant `1.4826` to allow its use as a consistent estimator for the standard | 
|---|
| 68 | /// deviation. | 
|---|
| 69 | /// | 
|---|
| 70 | /// See: <https://en.wikipedia.org/wiki/Median_absolute_deviation> | 
|---|
| 71 | fn median_abs_dev(&self) -> f64; | 
|---|
| 72 |  | 
|---|
| 73 | /// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`. | 
|---|
| 74 | fn median_abs_dev_pct(&self) -> f64; | 
|---|
| 75 |  | 
|---|
| 76 | /// Percentile: the value below which `pct` percent of the values in `self` fall. For example, | 
|---|
| 77 | /// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self` | 
|---|
| 78 | /// satisfy `s <= v`. | 
|---|
| 79 | /// | 
|---|
| 80 | /// Calculated by linear interpolation between closest ranks. | 
|---|
| 81 | /// | 
|---|
| 82 | /// See: <https://en.wikipedia.org/wiki/Percentile> | 
|---|
| 83 | fn percentile(&self, pct: f64) -> f64; | 
|---|
| 84 |  | 
|---|
| 85 | /// Quartiles of the sample: three values that divide the sample into four equal groups, each | 
|---|
| 86 | /// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This | 
|---|
| 87 | /// function may calculate the 3 quartiles more efficiently than 3 calls to `percentile`, but | 
|---|
| 88 | /// is otherwise equivalent. | 
|---|
| 89 | /// | 
|---|
| 90 | /// See also: <https://en.wikipedia.org/wiki/Quartile> | 
|---|
| 91 | fn quartiles(&self) -> (f64, f64, f64); | 
|---|
| 92 |  | 
|---|
| 93 | /// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th | 
|---|
| 94 | /// percentile (3rd quartile). See `quartiles`. | 
|---|
| 95 | /// | 
|---|
| 96 | /// See also: <https://en.wikipedia.org/wiki/Interquartile_range> | 
|---|
| 97 | fn iqr(&self) -> f64; | 
|---|
| 98 | } | 
|---|
| 99 |  | 
|---|
| 100 | /// Extracted collection of all the summary statistics of a sample set. | 
|---|
| 101 | #[ derive(Debug, Clone, PartialEq, Copy)] | 
|---|
| 102 | #[ allow(missing_docs)] | 
|---|
| 103 | pub struct Summary { | 
|---|
| 104 | pub sum: f64, | 
|---|
| 105 | pub min: f64, | 
|---|
| 106 | pub max: f64, | 
|---|
| 107 | pub mean: f64, | 
|---|
| 108 | pub median: f64, | 
|---|
| 109 | pub var: f64, | 
|---|
| 110 | pub std_dev: f64, | 
|---|
| 111 | pub std_dev_pct: f64, | 
|---|
| 112 | pub median_abs_dev: f64, | 
|---|
| 113 | pub median_abs_dev_pct: f64, | 
|---|
| 114 | pub quartiles: (f64, f64, f64), | 
|---|
| 115 | pub iqr: f64, | 
|---|
| 116 | } | 
|---|
| 117 |  | 
|---|
| 118 | impl Summary { | 
|---|
| 119 | /// Constructs a new summary of a sample set. | 
|---|
| 120 | pub fn new(samples: &[f64]) -> Summary { | 
|---|
| 121 | Summary { | 
|---|
| 122 | sum: samples.sum(), | 
|---|
| 123 | min: samples.min(), | 
|---|
| 124 | max: samples.max(), | 
|---|
| 125 | mean: samples.mean(), | 
|---|
| 126 | median: samples.median(), | 
|---|
| 127 | var: samples.var(), | 
|---|
| 128 | std_dev: samples.std_dev(), | 
|---|
| 129 | std_dev_pct: samples.std_dev_pct(), | 
|---|
| 130 | median_abs_dev: samples.median_abs_dev(), | 
|---|
| 131 | median_abs_dev_pct: samples.median_abs_dev_pct(), | 
|---|
| 132 | quartiles: samples.quartiles(), | 
|---|
| 133 | iqr: samples.iqr(), | 
|---|
| 134 | } | 
|---|
| 135 | } | 
|---|
| 136 | } | 
|---|
| 137 |  | 
|---|
| 138 | impl Stats for [f64] { | 
|---|
| 139 | // FIXME #11059 handle NaN, inf and overflow | 
|---|
| 140 | fn sum(&self) -> f64 { | 
|---|
| 141 | let mut partials = vec![]; | 
|---|
| 142 |  | 
|---|
| 143 | for &x in self { | 
|---|
| 144 | let mut x = x; | 
|---|
| 145 | let mut j = 0; | 
|---|
| 146 | // This inner loop applies `hi`/`lo` summation to each | 
|---|
| 147 | // partial so that the list of partial sums remains exact. | 
|---|
| 148 | for i in 0..partials.len() { | 
|---|
| 149 | let mut y: f64 = partials[i]; | 
|---|
| 150 | if x.abs() < y.abs() { | 
|---|
| 151 | mem::swap(&mut x, &mut y); | 
|---|
| 152 | } | 
|---|
| 153 | // Rounded `x+y` is stored in `hi` with round-off stored in | 
|---|
| 154 | // `lo`. Together `hi+lo` are exactly equal to `x+y`. | 
|---|
| 155 | let hi = x + y; | 
|---|
| 156 | let lo = y - (hi - x); | 
|---|
| 157 | if lo != 0.0 { | 
|---|
| 158 | partials[j] = lo; | 
|---|
| 159 | j += 1; | 
|---|
| 160 | } | 
|---|
| 161 | x = hi; | 
|---|
| 162 | } | 
|---|
| 163 | if j >= partials.len() { | 
|---|
| 164 | partials.push(x); | 
|---|
| 165 | } else { | 
|---|
| 166 | partials[j] = x; | 
|---|
| 167 | partials.truncate(j + 1); | 
|---|
| 168 | } | 
|---|
| 169 | } | 
|---|
| 170 | let zero: f64 = 0.0; | 
|---|
| 171 | partials.iter().fold(zero, |p, q| p + *q) | 
|---|
| 172 | } | 
|---|
| 173 |  | 
|---|
| 174 | fn min(&self) -> f64 { | 
|---|
| 175 | assert!(!self.is_empty()); | 
|---|
| 176 | self.iter().fold(self[0], |p, q| p.min(*q)) | 
|---|
| 177 | } | 
|---|
| 178 |  | 
|---|
| 179 | fn max(&self) -> f64 { | 
|---|
| 180 | assert!(!self.is_empty()); | 
|---|
| 181 | self.iter().fold(self[0], |p, q| p.max(*q)) | 
|---|
| 182 | } | 
|---|
| 183 |  | 
|---|
| 184 | fn mean(&self) -> f64 { | 
|---|
| 185 | assert!(!self.is_empty()); | 
|---|
| 186 | self.sum() / (self.len() as f64) | 
|---|
| 187 | } | 
|---|
| 188 |  | 
|---|
| 189 | fn median(&self) -> f64 { | 
|---|
| 190 | self.percentile(50_f64) | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | fn var(&self) -> f64 { | 
|---|
| 194 | if self.len() < 2 { | 
|---|
| 195 | 0.0 | 
|---|
| 196 | } else { | 
|---|
| 197 | let mean = self.mean(); | 
|---|
| 198 | let mut v: f64 = 0.0; | 
|---|
| 199 | for s in self { | 
|---|
| 200 | let x = *s - mean; | 
|---|
| 201 | v += x * x; | 
|---|
| 202 | } | 
|---|
| 203 | // N.B., this is _supposed to be_ len-1, not len. If you | 
|---|
| 204 | // change it back to len, you will be calculating a | 
|---|
| 205 | // population variance, not a sample variance. | 
|---|
| 206 | let denom = (self.len() - 1) as f64; | 
|---|
| 207 | v / denom | 
|---|
| 208 | } | 
|---|
| 209 | } | 
|---|
| 210 |  | 
|---|
| 211 | fn std_dev(&self) -> f64 { | 
|---|
| 212 | self.var().sqrt() | 
|---|
| 213 | } | 
|---|
| 214 |  | 
|---|
| 215 | fn std_dev_pct(&self) -> f64 { | 
|---|
| 216 | let hundred = 100_f64; | 
|---|
| 217 | (self.std_dev() / self.mean()) * hundred | 
|---|
| 218 | } | 
|---|
| 219 |  | 
|---|
| 220 | fn median_abs_dev(&self) -> f64 { | 
|---|
| 221 | let med = self.median(); | 
|---|
| 222 | let abs_devs: Vec<f64> = self.iter().map(|&v| (med - v).abs()).collect(); | 
|---|
| 223 | // This constant is derived by smarter statistics brains than me, but it is | 
|---|
| 224 | // consistent with how R and other packages treat the MAD. | 
|---|
| 225 | let number = 1.4826; | 
|---|
| 226 | abs_devs.median() * number | 
|---|
| 227 | } | 
|---|
| 228 |  | 
|---|
| 229 | fn median_abs_dev_pct(&self) -> f64 { | 
|---|
| 230 | let hundred = 100_f64; | 
|---|
| 231 | (self.median_abs_dev() / self.median()) * hundred | 
|---|
| 232 | } | 
|---|
| 233 |  | 
|---|
| 234 | fn percentile(&self, pct: f64) -> f64 { | 
|---|
| 235 | let mut tmp = self.to_vec(); | 
|---|
| 236 | local_sort(&mut tmp); | 
|---|
| 237 | percentile_of_sorted(&tmp, pct) | 
|---|
| 238 | } | 
|---|
| 239 |  | 
|---|
| 240 | fn quartiles(&self) -> (f64, f64, f64) { | 
|---|
| 241 | let mut tmp = self.to_vec(); | 
|---|
| 242 | local_sort(&mut tmp); | 
|---|
| 243 | let first = 25_f64; | 
|---|
| 244 | let a = percentile_of_sorted(&tmp, first); | 
|---|
| 245 | let second = 50_f64; | 
|---|
| 246 | let b = percentile_of_sorted(&tmp, second); | 
|---|
| 247 | let third = 75_f64; | 
|---|
| 248 | let c = percentile_of_sorted(&tmp, third); | 
|---|
| 249 | (a, b, c) | 
|---|
| 250 | } | 
|---|
| 251 |  | 
|---|
| 252 | fn iqr(&self) -> f64 { | 
|---|
| 253 | let (a, _, c) = self.quartiles(); | 
|---|
| 254 | c - a | 
|---|
| 255 | } | 
|---|
| 256 | } | 
|---|
| 257 |  | 
|---|
| 258 | // Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using | 
|---|
| 259 | // linear interpolation. If samples are not sorted, return nonsensical value. | 
|---|
| 260 | fn percentile_of_sorted(sorted_samples: &[f64], pct: f64) -> f64 { | 
|---|
| 261 | assert!(!sorted_samples.is_empty()); | 
|---|
| 262 | if sorted_samples.len() == 1 { | 
|---|
| 263 | return sorted_samples[0]; | 
|---|
| 264 | } | 
|---|
| 265 | let zero: f64 = 0.0; | 
|---|
| 266 | assert!(zero <= pct); | 
|---|
| 267 | let hundred: f64 = 100_f64; | 
|---|
| 268 | assert!(pct <= hundred); | 
|---|
| 269 | if pct == hundred { | 
|---|
| 270 | return sorted_samples[sorted_samples.len() - 1]; | 
|---|
| 271 | } | 
|---|
| 272 | let length: f64 = (sorted_samples.len() - 1) as f64; | 
|---|
| 273 | let rank: f64 = (pct / hundred) * length; | 
|---|
| 274 | let lrank: f64 = rank.floor(); | 
|---|
| 275 | let d: f64 = rank - lrank; | 
|---|
| 276 | let n: usize = lrank as usize; | 
|---|
| 277 | let lo: f64 = sorted_samples[n]; | 
|---|
| 278 | let hi: f64 = sorted_samples[n + 1]; | 
|---|
| 279 | lo + (hi - lo) * d | 
|---|
| 280 | } | 
|---|
| 281 |  | 
|---|
| 282 | /// Winsorize a set of samples, replacing values above the `100-pct` percentile | 
|---|
| 283 | /// and below the `pct` percentile with those percentiles themselves. This is a | 
|---|
| 284 | /// way of minimizing the effect of outliers, at the cost of biasing the sample. | 
|---|
| 285 | /// It differs from trimming in that it does not change the number of samples, | 
|---|
| 286 | /// just changes the values of those that are outliers. | 
|---|
| 287 | /// | 
|---|
| 288 | /// See: <https://en.wikipedia.org/wiki/Winsorising> | 
|---|
| 289 | pub fn winsorize(samples: &mut [f64], pct: f64) { | 
|---|
| 290 | let mut tmp: Vec = samples.to_vec(); | 
|---|
| 291 | local_sort(&mut tmp); | 
|---|
| 292 | let lo: f64 = percentile_of_sorted(&tmp, pct); | 
|---|
| 293 | let hundred: f64 = 100_f64; | 
|---|
| 294 | let hi: f64 = percentile_of_sorted(&tmp, pct:hundred - pct); | 
|---|
| 295 | for samp: &mut f64 in samples { | 
|---|
| 296 | if *samp > hi { | 
|---|
| 297 | *samp = hi | 
|---|
| 298 | } else if *samp < lo { | 
|---|
| 299 | *samp = lo | 
|---|
| 300 | } | 
|---|
| 301 | } | 
|---|
| 302 | } | 
|---|
| 303 |  | 
|---|