1 | //! Tukey's method |
---|---|

2 | //! |

3 | //! The original method uses two "fences" to classify the data. All the observations "inside" the |

4 | //! fences are considered "normal", and the rest are considered outliers. |

5 | //! |

6 | //! The fences are computed from the quartiles of the sample, according to the following formula: |

7 | //! |

8 | //! ``` ignore |

9 | //! // q1, q3 are the first and third quartiles |

10 | //! let iqr = q3 - q1; // The interquartile range |

11 | //! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr); // the "fences" |

12 | //! |

13 | //! let is_outlier = |x| if x > f1 && x < f2 { true } else { false }; |

14 | //! ``` |

15 | //! |

16 | //! The classifier provided here adds two extra outer fences: |

17 | //! |

18 | //! ``` ignore |

19 | //! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr); // the outer "fences" |

20 | //! ``` |

21 | //! |

22 | //! The extra fences add a sense of "severity" to the classification. Data points outside of the |

23 | //! outer fences are considered "severe" outliers, whereas points outside the inner fences are just |

24 | //! "mild" outliers, and, as the original method, everything inside the inner fences is considered |

25 | //! "normal" data. |

26 | //! |

27 | //! Some ASCII art for the visually oriented people: |

28 | //! |

29 | //! ``` ignore |

30 | //! LOW-ish NORMAL-ish HIGH-ish |

31 | //! x | + | o o o o o o o | + | x |

32 | //! f3 f1 f2 f4 |

33 | //! |

34 | //! Legend: |

35 | //! o: "normal"data (not an outlier) |

36 | //! +: "mild"outlier |

37 | //! x: "severe"outlier |

38 | //! ``` |

39 | |

40 | use std::iter::IntoIterator; |

41 | use std::ops::{Deref, Index}; |

42 | use std::slice; |

43 | |

44 | use crate::stats::float::Float; |

45 | use crate::stats::univariate::Sample; |

46 | |

47 | use self::Label::*; |

48 | |

49 | /// A classified/labeled sample. |

50 | /// |

51 | /// The labeled data can be accessed using the indexing operator. The order of the data points is |

52 | /// retained. |

53 | /// |

54 | /// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the |

55 | /// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)` |

56 | /// pair. |

57 | #[derive(Clone, Copy)] |

58 | pub struct LabeledSample<'a, A> |

59 | where |

60 | A: Float, |

61 | { |

62 | fences: (A, A, A, A), |

63 | sample: &'a Sample<A>, |

64 | } |

65 | |

66 | impl<'a, A> LabeledSample<'a, A> |

67 | where |

68 | A: Float, |

69 | { |

70 | /// Returns the number of data points per label |

71 | /// |

72 | /// - Time: `O(length)` |

73 | #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))] |

74 | pub fn count(&self) -> (usize, usize, usize, usize, usize) { |

75 | let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0); |

76 | |

77 | for (_, label) in self { |

78 | match label { |

79 | LowSevere => { |

80 | los += 1; |

81 | } |

82 | LowMild => { |

83 | lom += 1; |

84 | } |

85 | NotAnOutlier => { |

86 | noa += 1; |

87 | } |

88 | HighMild => { |

89 | him += 1; |

90 | } |

91 | HighSevere => { |

92 | his += 1; |

93 | } |

94 | } |

95 | } |

96 | |

97 | (los, lom, noa, him, his) |

98 | } |

99 | |

100 | /// Returns the fences used to classify the outliers |

101 | pub fn fences(&self) -> (A, A, A, A) { |

102 | self.fences |

103 | } |

104 | |

105 | /// Returns an iterator over the labeled data |

106 | pub fn iter(&self) -> Iter<'a, A> { |

107 | Iter { |

108 | fences: self.fences, |

109 | iter: self.sample.iter(), |

110 | } |

111 | } |

112 | } |

113 | |

114 | impl<'a, A> Deref for LabeledSample<'a, A> |

115 | where |

116 | A: Float, |

117 | { |

118 | type Target = Sample<A>; |

119 | |

120 | fn deref(&self) -> &Sample<A> { |

121 | self.sample |

122 | } |

123 | } |

124 | |

125 | // FIXME Use the `IndexGet` trait |

126 | impl<'a, A> Index<usize> for LabeledSample<'a, A> |

127 | where |

128 | A: Float, |

129 | { |

130 | type Output = Label; |

131 | |

132 | #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))] |

133 | fn index(&self, i: usize) -> &Label { |

134 | static LOW_SEVERE: Label = LowSevere; |

135 | static LOW_MILD: Label = LowMild; |

136 | static HIGH_MILD: Label = HighMild; |

137 | static HIGH_SEVERE: Label = HighSevere; |

138 | static NOT_AN_OUTLIER: Label = NotAnOutlier; |

139 | |

140 | let x = self.sample[i]; |

141 | let (lost, lomt, himt, hist) = self.fences; |

142 | |

143 | if x < lost { |

144 | &LOW_SEVERE |

145 | } else if x > hist { |

146 | &HIGH_SEVERE |

147 | } else if x < lomt { |

148 | &LOW_MILD |

149 | } else if x > himt { |

150 | &HIGH_MILD |

151 | } else { |

152 | &NOT_AN_OUTLIER |

153 | } |

154 | } |

155 | } |

156 | |

157 | impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A> |

158 | where |

159 | A: Float, |

160 | { |

161 | type Item = (A, Label); |

162 | type IntoIter = Iter<'a, A>; |

163 | |

164 | fn into_iter(self) -> Iter<'a, A> { |

165 | self.iter() |

166 | } |

167 | } |

168 | |

169 | /// Iterator over the labeled data |

170 | pub struct Iter<'a, A> |

171 | where |

172 | A: Float, |

173 | { |

174 | fences: (A, A, A, A), |

175 | iter: slice::Iter<'a, A>, |

176 | } |

177 | |

178 | impl<'a, A> Iterator for Iter<'a, A> |

179 | where |

180 | A: Float, |

181 | { |

182 | type Item = (A, Label); |

183 | |

184 | #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))] |

185 | fn next(&mut self) -> Option<(A, Label)> { |

186 | self.iter.next().map(|&x| { |

187 | let (lost, lomt, himt, hist) = self.fences; |

188 | |

189 | let label = if x < lost { |

190 | LowSevere |

191 | } else if x > hist { |

192 | HighSevere |

193 | } else if x < lomt { |

194 | LowMild |

195 | } else if x > himt { |

196 | HighMild |

197 | } else { |

198 | NotAnOutlier |

199 | }; |

200 | |

201 | (x, label) |

202 | }) |

203 | } |

204 | |

205 | fn size_hint(&self) -> (usize, Option<usize>) { |

206 | self.iter.size_hint() |

207 | } |

208 | } |

209 | |

210 | /// Labels used to classify outliers |

211 | pub enum Label { |

212 | /// A "mild" outlier in the "high" spectrum |

213 | HighMild, |

214 | /// A "severe" outlier in the "high" spectrum |

215 | HighSevere, |

216 | /// A "mild" outlier in the "low" spectrum |

217 | LowMild, |

218 | /// A "severe" outlier in the "low" spectrum |

219 | LowSevere, |

220 | /// A normal data point |

221 | NotAnOutlier, |

222 | } |

223 | |

224 | impl Label { |

225 | /// Checks if the data point has an "unusually" high value |

226 | pub fn is_high(&self) -> bool { |

227 | matches!(*self, HighMild | HighSevere) |

228 | } |

229 | |

230 | /// Checks if the data point is labeled as a "mild" outlier |

231 | pub fn is_mild(&self) -> bool { |

232 | matches!(*self, HighMild | LowMild) |

233 | } |

234 | |

235 | /// Checks if the data point has an "unusually" low value |

236 | pub fn is_low(&self) -> bool { |

237 | matches!(*self, LowMild | LowSevere) |

238 | } |

239 | |

240 | /// Checks if the data point is labeled as an outlier |

241 | pub fn is_outlier(&self) -> bool { |

242 | matches!(*self, NotAnOutlier) |

243 | } |

244 | |

245 | /// Checks if the data point is labeled as a "severe" outlier |

246 | pub fn is_severe(&self) -> bool { |

247 | matches!(*self, HighSevere | LowSevere) |

248 | } |

249 | } |

250 | |

251 | /// Classifies the sample, and returns a labeled sample. |

252 | /// |

253 | /// - Time: `O(N log N) where N = length` |

254 | pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A> |

255 | where |

256 | A: Float, |

257 | usize: cast::From<A, Output = Result<usize, cast::Error>>, |

258 | { |

259 | let (q1, _, q3) = sample.percentiles().quartiles(); |

260 | let iqr = q3 - q1; |

261 | |

262 | // Mild |

263 | let k_m = A::cast(1.5_f32); |

264 | // Severe |

265 | let k_s = A::cast(3); |

266 | |

267 | LabeledSample { |

268 | fences: ( |

269 | q1 - k_s * iqr, |

270 | q1 - k_m * iqr, |

271 | q3 + k_m * iqr, |

272 | q3 + k_s * iqr, |

273 | ), |

274 | sample, |

275 | } |

276 | } |

277 |