1 | //! Kernel density estimation |
2 | |
3 | pub mod kernel; |
4 | |
5 | use self::kernel::Kernel; |
6 | use crate::stats::float::Float; |
7 | use crate::stats::univariate::Sample; |
8 | #[cfg (feature = "rayon" )] |
9 | use rayon::prelude::*; |
10 | |
11 | /// Univariate kernel density estimator |
12 | pub struct Kde<'a, A, K> |
13 | where |
14 | A: Float, |
15 | K: Kernel<A>, |
16 | { |
17 | bandwidth: A, |
18 | kernel: K, |
19 | sample: &'a Sample<A>, |
20 | } |
21 | |
22 | impl<'a, A, K> Kde<'a, A, K> |
23 | where |
24 | A: 'a + Float, |
25 | K: Kernel<A>, |
26 | { |
27 | /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating |
28 | /// the bandwidth using the method `bw` |
29 | pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> { |
30 | Kde { |
31 | bandwidth: bw.estimate(sample), |
32 | kernel, |
33 | sample, |
34 | } |
35 | } |
36 | |
37 | /// Returns the bandwidth used by the estimator |
38 | pub fn bandwidth(&self) -> A { |
39 | self.bandwidth |
40 | } |
41 | |
42 | /// Maps the KDE over `xs` |
43 | /// |
44 | /// - Multihreaded |
45 | pub fn map(&self, xs: &[A]) -> Box<[A]> { |
46 | #[cfg (feature = "rayon" )] |
47 | let iter = xs.par_iter(); |
48 | |
49 | #[cfg (not(feature = "rayon" ))] |
50 | let iter = xs.iter(); |
51 | |
52 | iter.map(|&x| self.estimate(x)) |
53 | .collect::<Vec<_>>() |
54 | .into_boxed_slice() |
55 | } |
56 | |
57 | /// Estimates the probability density of `x` |
58 | pub fn estimate(&self, x: A) -> A { |
59 | let _0 = A::cast(0); |
60 | let slice = self.sample; |
61 | let h = self.bandwidth; |
62 | let n = A::cast(slice.len()); |
63 | |
64 | let sum = slice |
65 | .iter() |
66 | .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h)); |
67 | |
68 | sum / (h * n) |
69 | } |
70 | } |
71 | |
72 | /// Method to estimate the bandwidth |
73 | pub enum Bandwidth { |
74 | /// Use Silverman's rule of thumb to estimate the bandwidth from the sample |
75 | Silverman, |
76 | } |
77 | |
78 | impl Bandwidth { |
79 | fn estimate<A: Float>(self, sample: &Sample<A>) -> A { |
80 | match self { |
81 | Bandwidth::Silverman => { |
82 | let factor = A::cast(4. / 3.); |
83 | let exponent = A::cast(1. / 5.); |
84 | let n = A::cast(sample.len()); |
85 | let sigma = sample.std_dev(None); |
86 | |
87 | sigma * (factor / n).powf(exponent) |
88 | } |
89 | } |
90 | } |
91 | } |
92 | |
93 | #[cfg (test)] |
94 | macro_rules! test { |
95 | ($ty:ident) => { |
96 | mod $ty { |
97 | use approx::relative_eq; |
98 | use quickcheck::quickcheck; |
99 | use quickcheck::TestResult; |
100 | |
101 | use crate::stats::univariate::kde::kernel::Gaussian; |
102 | use crate::stats::univariate::kde::{Bandwidth, Kde}; |
103 | use crate::stats::univariate::Sample; |
104 | |
105 | // The [-inf inf] integral of the estimated PDF should be one |
106 | quickcheck! { |
107 | fn integral(size: u8, start: u8) -> TestResult { |
108 | let size = size as usize; |
109 | let start = start as usize; |
110 | const DX: $ty = 1e-3; |
111 | |
112 | if let Some(v) = crate::stats::test::vec::<$ty>(size, start) { |
113 | let slice = &v[start..]; |
114 | let data = Sample::new(slice); |
115 | let kde = Kde::new(data, Gaussian, Bandwidth::Silverman); |
116 | let h = kde.bandwidth(); |
117 | // NB Obviously a [-inf inf] integral is not feasible, but this range works |
118 | // quite well |
119 | let (a, b) = (data.min() - 5. * h, data.max() + 5. * h); |
120 | |
121 | let mut acc = 0.; |
122 | let mut x = a; |
123 | let mut y = kde.estimate(a); |
124 | |
125 | while x < b { |
126 | acc += DX * y / 2.; |
127 | |
128 | x += DX; |
129 | y = kde.estimate(x); |
130 | |
131 | acc += DX * y / 2.; |
132 | } |
133 | |
134 | TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5)) |
135 | } else { |
136 | TestResult::discard() |
137 | } |
138 | } |
139 | } |
140 | } |
141 | }; |
142 | } |
143 | |
144 | #[cfg (test)] |
145 | mod test { |
146 | test!(f32); |
147 | test!(f64); |
148 | } |
149 | |