// Copyright 2018 Developers of the Rand project.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Math helper functions

#[cfg(feature = "simd_support")] use packed_simd::*;


pub(crate) trait WideningMultiply<RHS = Self> {
    type Output;

    fn wmul(self, x: RHS) -> Self::Output;
}
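
// Illustrative contract note (an addition, not in the original source):
// `wmul` computes the full double-width product and splits it into
// (high, low) halves, i.e. for an unsigned scalar type with BITS bits,
// `self * x == ((high as Wide) << BITS) | (low as Wide)`.
// For example, `3u8.wmul(100) == (1, 44)`, since 3 * 100 = 300 = 1 * 256 + 44.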

macro_rules! wmul_impl {
    ($ty:ty, $wide:ty, $shift:expr) => {
        impl WideningMultiply for $ty {
            type Output = ($ty, $ty);

            #[inline(always)]
            fn wmul(self, x: $ty) -> Self::Output {
                let tmp = (self as $wide) * (x as $wide);
                ((tmp >> $shift) as $ty, tmp as $ty)
            }
        }
    };

    // simd bulk implementation
    ($(($ty:ident, $wide:ident),)+, $shift:expr) => {
        $(
            impl WideningMultiply for $ty {
                type Output = ($ty, $ty);

                #[inline(always)]
                fn wmul(self, x: $ty) -> Self::Output {
                    // For supported vectors, this should compile to a couple
                    // of supported multiply & swizzle instructions (no actual
                    // casting).
                    // TODO: optimize
                    let y: $wide = self.cast();
                    let x: $wide = x.cast();
                    let tmp = y * x;
                    let hi: $ty = (tmp >> $shift).cast();
                    let lo: $ty = tmp.cast();
                    (hi, lo)
                }
            }
        )+
    };
}
wmul_impl! { u8, u16, 8 }
wmul_impl! { u16, u32, 16 }
wmul_impl! { u32, u64, 32 }
wmul_impl! { u64, u128, 64 }
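
// Example (illustrative, not in the original source):
// `0xFFu8.wmul(0xFF) == (0xFE, 0x01)`, because 255 * 255 = 65025 = 0xFE01.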

// This code is a translation of the __mulddi3 function in LLVM's
// compiler-rt. It is an optimised variant of the common method
// `(a + b) * (c + d) = ac + ad + bc + bd`.
//
// For some reason LLVM can optimise the C version very well, but
// keeps shuffling registers in this Rust translation.
macro_rules! wmul_impl_large {
    ($ty:ty, $half:expr) => {
        impl WideningMultiply for $ty {
            type Output = ($ty, $ty);

            #[inline(always)]
            fn wmul(self, b: $ty) -> Self::Output {
                const LOWER_MASK: $ty = !0 >> $half;
                let mut low = (self & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
                let mut t = low >> $half;
                low &= LOWER_MASK;
                t += (self >> $half).wrapping_mul(b & LOWER_MASK);
                low += (t & LOWER_MASK) << $half;
                let mut high = t >> $half;
                t = low >> $half;
                low &= LOWER_MASK;
                t += (b >> $half).wrapping_mul(self & LOWER_MASK);
                low += (t & LOWER_MASK) << $half;
                high += t >> $half;
                high += (self >> $half).wrapping_mul(b >> $half);

                (high, low)
            }
        }
    };

    // simd bulk implementation
    (($($ty:ty,)+) $scalar:ty, $half:expr) => {
        $(
            impl WideningMultiply for $ty {
                type Output = ($ty, $ty);

                #[inline(always)]
                fn wmul(self, b: $ty) -> Self::Output {
                    // needs wrapping multiplication
                    const LOWER_MASK: $scalar = !0 >> $half;
                    let mut low = (self & LOWER_MASK) * (b & LOWER_MASK);
                    let mut t = low >> $half;
                    low &= LOWER_MASK;
                    t += (self >> $half) * (b & LOWER_MASK);
                    low += (t & LOWER_MASK) << $half;
                    let mut high = t >> $half;
                    t = low >> $half;
                    low &= LOWER_MASK;
                    t += (b >> $half) * (self & LOWER_MASK);
                    low += (t & LOWER_MASK) << $half;
                    high += t >> $half;
                    high += (self >> $half) * (b >> $half);

                    (high, low)
                }
            }
        )+
    };
}
wmul_impl_large! { u128, 64 }
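
// Worked example (illustrative, not in the original source): with each
// operand split into halves, self = a * 2^64 + b and rhs = c * 2^64 + d,
// the product is a*c * 2^128 + (a*d + b*c) * 2^64 + b*d; the code above
// accumulates the cross terms into `high`/`low` with explicit carry
// handling. E.g. `u128::MAX.wmul(2) == (1, u128::MAX - 1)`, since
// (2^128 - 1) * 2 = 2^129 - 2.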

macro_rules! wmul_impl_usize {
    ($ty:ty) => {
        impl WideningMultiply for usize {
            type Output = (usize, usize);

            #[inline(always)]
            fn wmul(self, x: usize) -> Self::Output {
                let (high, low) = (self as $ty).wmul(x as $ty);
                (high as usize, low as usize)
            }
        }
    };
}
#[cfg(target_pointer_width = "16")]
wmul_impl_usize! { u16 }
#[cfg(target_pointer_width = "32")]
wmul_impl_usize! { u32 }
#[cfg(target_pointer_width = "64")]
wmul_impl_usize! { u64 }

#[cfg(feature = "simd_support")]
mod simd_wmul {
    use super::*;
    #[cfg(target_arch = "x86")] use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*;

    wmul_impl! {
        (u8x2, u16x2),
        (u8x4, u16x4),
        (u8x8, u16x8),
        (u8x16, u16x16),
        (u8x32, u16x32),,
        8
    }

    wmul_impl! { (u16x2, u32x2),, 16 }
    wmul_impl! { (u16x4, u32x4),, 16 }
    #[cfg(not(target_feature = "sse2"))]
    wmul_impl! { (u16x8, u32x8),, 16 }
    #[cfg(not(target_feature = "avx2"))]
    wmul_impl! { (u16x16, u32x16),, 16 }

    // 16-bit lane widths allow use of the x86 `mulhi` instructions, which
    // means `wmul` can be implemented with only two instructions.
    #[allow(unused_macros)]
    macro_rules! wmul_impl_16 {
        ($ty:ident, $intrinsic:ident, $mulhi:ident, $mullo:ident) => {
            impl WideningMultiply for $ty {
                type Output = ($ty, $ty);

                #[inline(always)]
                fn wmul(self, x: $ty) -> Self::Output {
                    let b = $intrinsic::from_bits(x);
                    let a = $intrinsic::from_bits(self);
                    let hi = $ty::from_bits(unsafe { $mulhi(a, b) });
                    let lo = $ty::from_bits(unsafe { $mullo(a, b) });
                    (hi, lo)
                }
            }
        };
    }

    #[cfg(target_feature = "sse2")]
    wmul_impl_16! { u16x8, __m128i, _mm_mulhi_epu16, _mm_mullo_epi16 }
    #[cfg(target_feature = "avx2")]
    wmul_impl_16! { u16x16, __m256i, _mm256_mulhi_epu16, _mm256_mullo_epi16 }
    // FIXME: there are no `__m512i` types in stdsimd yet, so `wmul::<u16x32>`
    // cannot use the same implementation.
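
    // Illustrative note (not in the original source): per 16-bit lane,
    // `mulhi`/`mullo` return the upper/lower 16 bits of the 32-bit product,
    // e.g. 0xFFFF * 0xFFFF = 0xFFFE_0001, so hi = 0xFFFE and lo = 0x0001.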

    wmul_impl! {
        (u32x2, u64x2),
        (u32x4, u64x4),
        (u32x8, u64x8),,
        32
    }

    // TODO: optimize, this seems to seriously slow things down
    wmul_impl_large! { (u8x64,) u8, 4 }
    wmul_impl_large! { (u16x32,) u16, 8 }
    wmul_impl_large! { (u32x16,) u32, 16 }
    wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 }
}

/// Helper trait when dealing with scalar and SIMD floating point types.
pub(crate) trait FloatSIMDUtils {
    // `PartialOrd` for vectors compares lexicographically. We want to compare all
    // the individual SIMD lanes instead, and get the combined result over all
    // lanes. This is possible using something like `a.lt(b).all()`, but we
    // implement it as a trait so we can write the same code for `f32` and `f64`.
    // Only the comparison functions we need are implemented.
    fn all_lt(self, other: Self) -> bool;
    fn all_le(self, other: Self) -> bool;
    fn all_finite(self) -> bool;

    type Mask;
    fn finite_mask(self) -> Self::Mask;
    fn gt_mask(self, other: Self) -> Self::Mask;
    fn ge_mask(self, other: Self) -> Self::Mask;

    // Decrease all lanes where the mask is `true` to the next lower value
    // representable by the floating-point type. At least one of the lanes
    // must be set.
    fn decrease_masked(self, mask: Self::Mask) -> Self;

    // Convert from int value. Conversion is done while retaining the numerical
    // value, not by retaining the binary representation.
    type UInt;
    fn cast_from_int(i: Self::UInt) -> Self;
}
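
// Illustrative example (not part of the original source): for scalars the
// mask is a plain `bool`, so `1.0f32.decrease_masked(true)` yields the
// largest float below 1.0 (bit pattern 0x3F7F_FFFF); for SIMD types only
// the lanes whose mask lane is set are decreased.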

/// Implement functions available in std builds but missing from core primitives
#[cfg(not(std))]
// False positive: We are following `std` here.
#[allow(clippy::wrong_self_convention)]
pub(crate) trait Float: Sized {
    fn is_nan(self) -> bool;
    fn is_infinite(self) -> bool;
    fn is_finite(self) -> bool;
}

/// Implement functions on f32/f64 to give them APIs similar to SIMD types
pub(crate) trait FloatAsSIMD: Sized {
    #[inline(always)]
    fn lanes() -> usize {
        1
    }
    #[inline(always)]
    fn splat(scalar: Self) -> Self {
        scalar
    }
    #[inline(always)]
    fn extract(self, index: usize) -> Self {
        debug_assert_eq!(index, 0);
        self
    }
    #[inline(always)]
    fn replace(self, index: usize, new_value: Self) -> Self {
        debug_assert_eq!(index, 0);
        new_value
    }
}

pub(crate) trait BoolAsSIMD: Sized {
    fn any(self) -> bool;
    fn all(self) -> bool;
    fn none(self) -> bool;
}

impl BoolAsSIMD for bool {
    #[inline(always)]
    fn any(self) -> bool {
        self
    }

    #[inline(always)]
    fn all(self) -> bool {
        self
    }

    #[inline(always)]
    fn none(self) -> bool {
        !self
    }
}

macro_rules! scalar_float_impl {
    ($ty:ident, $uty:ident) => {
        #[cfg(not(std))]
        impl Float for $ty {
            #[inline]
            fn is_nan(self) -> bool {
                self != self
            }

            #[inline]
            fn is_infinite(self) -> bool {
                self == ::core::$ty::INFINITY || self == ::core::$ty::NEG_INFINITY
            }

            #[inline]
            fn is_finite(self) -> bool {
                !(self.is_nan() || self.is_infinite())
            }
        }

        impl FloatSIMDUtils for $ty {
            type Mask = bool;
            type UInt = $uty;

            #[inline(always)]
            fn all_lt(self, other: Self) -> bool {
                self < other
            }

            #[inline(always)]
            fn all_le(self, other: Self) -> bool {
                self <= other
            }

            #[inline(always)]
            fn all_finite(self) -> bool {
                self.is_finite()
            }

            #[inline(always)]
            fn finite_mask(self) -> Self::Mask {
                self.is_finite()
            }

            #[inline(always)]
            fn gt_mask(self, other: Self) -> Self::Mask {
                self > other
            }

            #[inline(always)]
            fn ge_mask(self, other: Self) -> Self::Mask {
                self >= other
            }

            #[inline(always)]
            fn decrease_masked(self, mask: Self::Mask) -> Self {
                debug_assert!(mask, "At least one lane must be set");
                <$ty>::from_bits(self.to_bits() - 1)
            }

            #[inline]
            fn cast_from_int(i: Self::UInt) -> Self {
                i as $ty
            }
        }

        impl FloatAsSIMD for $ty {}
    };
}

scalar_float_impl!(f32, u32);
scalar_float_impl!(f64, u64);


#[cfg(feature = "simd_support")]
macro_rules! simd_impl {
    ($ty:ident, $f_scalar:ident, $mty:ident, $uty:ident) => {
        impl FloatSIMDUtils for $ty {
            type Mask = $mty;
            type UInt = $uty;

            #[inline(always)]
            fn all_lt(self, other: Self) -> bool {
                self.lt(other).all()
            }

            #[inline(always)]
            fn all_le(self, other: Self) -> bool {
                self.le(other).all()
            }

            #[inline(always)]
            fn all_finite(self) -> bool {
                self.finite_mask().all()
            }

            #[inline(always)]
            fn finite_mask(self) -> Self::Mask {
                // This can possibly be done faster by checking bit patterns
                let neg_inf = $ty::splat(::core::$f_scalar::NEG_INFINITY);
                let pos_inf = $ty::splat(::core::$f_scalar::INFINITY);
                self.gt(neg_inf) & self.lt(pos_inf)
            }

            #[inline(always)]
            fn gt_mask(self, other: Self) -> Self::Mask {
                self.gt(other)
            }

            #[inline(always)]
            fn ge_mask(self, other: Self) -> Self::Mask {
                self.ge(other)
            }

            #[inline(always)]
            fn decrease_masked(self, mask: Self::Mask) -> Self {
                // Casting a mask into ints will produce all bits set for
                // true, and 0 for false. Adding that to the binary
                // representation of a float means subtracting one from
                // the binary representation, resulting in the next lower
                // value representable by $ty. This works even when the
                // current value is infinity.
                debug_assert!(mask.any(), "At least one lane must be set");
                <$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask))
            }

            #[inline]
            fn cast_from_int(i: Self::UInt) -> Self {
                i.cast()
            }
        }
    };
}

#[cfg(feature = "simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 }
#[cfg(feature = "simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 }
#[cfg(feature = "simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 }
#[cfg(feature = "simd_support")] simd_impl! { f32x16, f32, m32x16, u32x16 }
#[cfg(feature = "simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 }
#[cfg(feature = "simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 }
#[cfg(feature = "simd_support")] simd_impl! { f64x8, f64, m64x8, u64x8 }
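
// A minimal sanity-check sketch (an illustrative addition, not in the
// original source; the expected values follow from the scalar impls above):
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn wmul_scalar() {
        // 300 = 1 * 256 + 44, so the u8 widening product splits as (1, 44).
        assert_eq!(3u8.wmul(100), (1, 44));
        // (2^64 - 1)^2 = (2^64 - 2) * 2^64 + 1.
        let max = ::core::u64::MAX;
        assert_eq!(max.wmul(max), (max - 1, 1));
    }

    #[test]
    fn decrease_masked_scalar() {
        // Stepping 1.0 down by one ULP gives the largest f32 below 1.0.
        assert_eq!(1.0f32.decrease_masked(true), f32::from_bits(0x3F7F_FFFF));
    }
}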