1 | //! Tukey's method |
2 | //! |
3 | //! The original method uses two "fences" to classify the data. All the observations "inside" the |
4 | //! fences are considered "normal", and the rest are considered outliers. |
5 | //! |
6 | //! The fences are computed from the quartiles of the sample, according to the following formula: |
7 | //! |
8 | //! ``` ignore |
9 | //! // q1, q3 are the first and third quartiles |
10 | //! let iqr = q3 - q1; // The interquartile range |
11 | //! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr); // the "fences" |
12 | //! |
13 | //! let is_outlier = |x| if x > f1 && x < f2 { true } else { false }; |
14 | //! ``` |
15 | //! |
16 | //! The classifier provided here adds two extra outer fences: |
17 | //! |
18 | //! ``` ignore |
19 | //! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr); // the outer "fences" |
20 | //! ``` |
21 | //! |
22 | //! The extra fences add a sense of "severity" to the classification. Data points outside of the |
23 | //! outer fences are considered "severe" outliers, whereas points outside the inner fences are just |
24 | //! "mild" outliers, and, as the original method, everything inside the inner fences is considered |
25 | //! "normal" data. |
26 | //! |
27 | //! Some ASCII art for the visually oriented people: |
28 | //! |
29 | //! ``` ignore |
30 | //! LOW-ish NORMAL-ish HIGH-ish |
31 | //! x | + | o o o o o o o | + | x |
32 | //! f3 f1 f2 f4 |
33 | //! |
34 | //! Legend: |
35 | //! o: "normal" data (not an outlier) |
36 | //! +: "mild" outlier |
37 | //! x: "severe" outlier |
38 | //! ``` |
39 | |
40 | use std::iter::IntoIterator; |
41 | use std::ops::{Deref, Index}; |
42 | use std::slice; |
43 | |
44 | use crate::stats::float::Float; |
45 | use crate::stats::univariate::Sample; |
46 | |
47 | use self::Label::*; |
48 | |
49 | /// A classified/labeled sample. |
50 | /// |
51 | /// The labeled data can be accessed using the indexing operator. The order of the data points is |
52 | /// retained. |
53 | /// |
54 | /// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the |
55 | /// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)` |
56 | /// pair. |
57 | #[derive(Clone, Copy)] |
58 | pub struct LabeledSample<'a, A> |
59 | where |
60 | A: Float, |
61 | { |
62 | fences: (A, A, A, A), |
63 | sample: &'a Sample<A>, |
64 | } |
65 | |
66 | impl<'a, A> LabeledSample<'a, A> |
67 | where |
68 | A: Float, |
69 | { |
70 | /// Returns the number of data points per label |
71 | /// |
72 | /// - Time: `O(length)` |
73 | #[cfg_attr (feature = "cargo-clippy" , allow(clippy::similar_names))] |
74 | pub fn count(&self) -> (usize, usize, usize, usize, usize) { |
75 | let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0); |
76 | |
77 | for (_, label) in self { |
78 | match label { |
79 | LowSevere => { |
80 | los += 1; |
81 | } |
82 | LowMild => { |
83 | lom += 1; |
84 | } |
85 | NotAnOutlier => { |
86 | noa += 1; |
87 | } |
88 | HighMild => { |
89 | him += 1; |
90 | } |
91 | HighSevere => { |
92 | his += 1; |
93 | } |
94 | } |
95 | } |
96 | |
97 | (los, lom, noa, him, his) |
98 | } |
99 | |
100 | /// Returns the fences used to classify the outliers |
101 | pub fn fences(&self) -> (A, A, A, A) { |
102 | self.fences |
103 | } |
104 | |
105 | /// Returns an iterator over the labeled data |
106 | pub fn iter(&self) -> Iter<'a, A> { |
107 | Iter { |
108 | fences: self.fences, |
109 | iter: self.sample.iter(), |
110 | } |
111 | } |
112 | } |
113 | |
114 | impl<'a, A> Deref for LabeledSample<'a, A> |
115 | where |
116 | A: Float, |
117 | { |
118 | type Target = Sample<A>; |
119 | |
120 | fn deref(&self) -> &Sample<A> { |
121 | self.sample |
122 | } |
123 | } |
124 | |
125 | // FIXME Use the `IndexGet` trait |
126 | impl<'a, A> Index<usize> for LabeledSample<'a, A> |
127 | where |
128 | A: Float, |
129 | { |
130 | type Output = Label; |
131 | |
132 | #[cfg_attr (feature = "cargo-clippy" , allow(clippy::similar_names))] |
133 | fn index(&self, i: usize) -> &Label { |
134 | static LOW_SEVERE: Label = LowSevere; |
135 | static LOW_MILD: Label = LowMild; |
136 | static HIGH_MILD: Label = HighMild; |
137 | static HIGH_SEVERE: Label = HighSevere; |
138 | static NOT_AN_OUTLIER: Label = NotAnOutlier; |
139 | |
140 | let x = self.sample[i]; |
141 | let (lost, lomt, himt, hist) = self.fences; |
142 | |
143 | if x < lost { |
144 | &LOW_SEVERE |
145 | } else if x > hist { |
146 | &HIGH_SEVERE |
147 | } else if x < lomt { |
148 | &LOW_MILD |
149 | } else if x > himt { |
150 | &HIGH_MILD |
151 | } else { |
152 | &NOT_AN_OUTLIER |
153 | } |
154 | } |
155 | } |
156 | |
157 | impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A> |
158 | where |
159 | A: Float, |
160 | { |
161 | type Item = (A, Label); |
162 | type IntoIter = Iter<'a, A>; |
163 | |
164 | fn into_iter(self) -> Iter<'a, A> { |
165 | self.iter() |
166 | } |
167 | } |
168 | |
169 | /// Iterator over the labeled data |
170 | pub struct Iter<'a, A> |
171 | where |
172 | A: Float, |
173 | { |
174 | fences: (A, A, A, A), |
175 | iter: slice::Iter<'a, A>, |
176 | } |
177 | |
178 | impl<'a, A> Iterator for Iter<'a, A> |
179 | where |
180 | A: Float, |
181 | { |
182 | type Item = (A, Label); |
183 | |
184 | #[cfg_attr (feature = "cargo-clippy" , allow(clippy::similar_names))] |
185 | fn next(&mut self) -> Option<(A, Label)> { |
186 | self.iter.next().map(|&x| { |
187 | let (lost, lomt, himt, hist) = self.fences; |
188 | |
189 | let label = if x < lost { |
190 | LowSevere |
191 | } else if x > hist { |
192 | HighSevere |
193 | } else if x < lomt { |
194 | LowMild |
195 | } else if x > himt { |
196 | HighMild |
197 | } else { |
198 | NotAnOutlier |
199 | }; |
200 | |
201 | (x, label) |
202 | }) |
203 | } |
204 | |
205 | fn size_hint(&self) -> (usize, Option<usize>) { |
206 | self.iter.size_hint() |
207 | } |
208 | } |
209 | |
210 | /// Labels used to classify outliers |
211 | pub enum Label { |
212 | /// A "mild" outlier in the "high" spectrum |
213 | HighMild, |
214 | /// A "severe" outlier in the "high" spectrum |
215 | HighSevere, |
216 | /// A "mild" outlier in the "low" spectrum |
217 | LowMild, |
218 | /// A "severe" outlier in the "low" spectrum |
219 | LowSevere, |
220 | /// A normal data point |
221 | NotAnOutlier, |
222 | } |
223 | |
224 | impl Label { |
225 | /// Checks if the data point has an "unusually" high value |
226 | pub fn is_high(&self) -> bool { |
227 | matches!(*self, HighMild | HighSevere) |
228 | } |
229 | |
230 | /// Checks if the data point is labeled as a "mild" outlier |
231 | pub fn is_mild(&self) -> bool { |
232 | matches!(*self, HighMild | LowMild) |
233 | } |
234 | |
235 | /// Checks if the data point has an "unusually" low value |
236 | pub fn is_low(&self) -> bool { |
237 | matches!(*self, LowMild | LowSevere) |
238 | } |
239 | |
240 | /// Checks if the data point is labeled as an outlier |
241 | pub fn is_outlier(&self) -> bool { |
242 | matches!(*self, NotAnOutlier) |
243 | } |
244 | |
245 | /// Checks if the data point is labeled as a "severe" outlier |
246 | pub fn is_severe(&self) -> bool { |
247 | matches!(*self, HighSevere | LowSevere) |
248 | } |
249 | } |
250 | |
251 | /// Classifies the sample, and returns a labeled sample. |
252 | /// |
253 | /// - Time: `O(N log N) where N = length` |
254 | pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A> |
255 | where |
256 | A: Float, |
257 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |
258 | { |
259 | let (q1, _, q3) = sample.percentiles().quartiles(); |
260 | let iqr = q3 - q1; |
261 | |
262 | // Mild |
263 | let k_m = A::cast(1.5_f32); |
264 | // Severe |
265 | let k_s = A::cast(3); |
266 | |
267 | LabeledSample { |
268 | fences: ( |
269 | q1 - k_s * iqr, |
270 | q1 - k_m * iqr, |
271 | q3 + k_m * iqr, |
272 | q3 + k_s * iqr, |
273 | ), |
274 | sample, |
275 | } |
276 | } |
277 | |