utils.rs - Codebrowser

1	// Copyright 2018 Developers of the Rand project.
2	//
3	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
6	// option. This file may not be copied, modified, or distributed
7	// except according to those terms.
8
9	//! Math helper functions
10
11	#[cfg(feature = "simd_support")] use packed_simd::*;
12
13
14	pub(crate) trait WideningMultiply<RHS = Self> {
15	type Output;
16
17	fn wmul(self, x: RHS) -> Self::Output;
18	}
19
20	macro_rules! wmul_impl {
21	($ty:ty, $wide:ty, $shift:expr) => {
22	impl WideningMultiply for $ty {
23	type Output = ($ty, $ty);
24
25	#[inline(always)]
26	fn wmul(self, x: $ty) -> Self::Output {
27	let tmp = (self as $wide) * (x as $wide);
28	((tmp >> $shift) as $ty, tmp as $ty)
29	}
30	}
31	};
32
33	// simd bulk implementation
34	($(($ty:ident, $wide:ident),)+, $shift:expr) => {
35	$(
36	impl WideningMultiply for $ty {
37	type Output = ($ty, $ty);
38
39	#[inline(always)]
40	fn wmul(self, x: $ty) -> Self::Output {
41	// For supported vectors, this should compile to a couple
42	// supported multiply & swizzle instructions (no actual
43	// casting).
44	// TODO: optimize
45	let y: $wide = self.cast();
46	let x: $wide = x.cast();
47	let tmp = y * x;
48	let hi: $ty = (tmp >> $shift).cast();
49	let lo: $ty = tmp.cast();
50	(hi, lo)
51	}
52	}
53	)+
54	};
55	}
56	wmul_impl! { u8, u16, `8` }
57	wmul_impl! { u16, u32, `16` }
58	wmul_impl! { u32, u64, `32` }
59	wmul_impl! { u64, u128, `64` }
60
61	// This code is a translation of the __mulddi3 function in LLVM's
62	// compiler-rt. It is an optimised variant of the common method
63	// `(a + b) (c + d) = ac + ad + bc + bd`.*
64	//
65	// For some reason LLVM can optimise the C version very well, but
66	// keeps shuffling registers in this Rust translation.
67	macro_rules! wmul_impl_large {
68	($ty:ty, $half:expr) => {
69	impl WideningMultiply for $ty {
70	type Output = ($ty, $ty);
71
72	#[inline(always)]
73	fn wmul(self, b: $ty) -> Self::Output {
74	const LOWER_MASK: $ty = !`0` >> $half;
75	let mut low = (self & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
76	let mut t = low >> $half;
77	low &= LOWER_MASK;
78	t += (self >> $half).wrapping_mul(b & LOWER_MASK);
79	low += (t & LOWER_MASK) << $half;
80	let mut high = t >> $half;
81	t = low >> $half;
82	low &= LOWER_MASK;
83	t += (b >> $half).wrapping_mul(self & LOWER_MASK);
84	low += (t & LOWER_MASK) << $half;
85	high += t >> $half;
86	high += (self >> $half).wrapping_mul(b >> $half);
87
88	(high, low)
89	}
90	}
91	};
92
93	// simd bulk implementation
94	(($($ty:ty,)+) $scalar:ty, $half:expr) => {
95	$(
96	impl WideningMultiply for $ty {
97	type Output = ($ty, $ty);
98
99	#[inline(always)]
100	fn wmul(self, b: $ty) -> Self::Output {
101	// needs wrapping multiplication
102	const LOWER_MASK: $scalar = !`0` >> $half;
103	let mut low = (self & LOWER_MASK) * (b & LOWER_MASK);
104	let mut t = low >> $half;
105	low &= LOWER_MASK;
106	t += (self >> $half) * (b & LOWER_MASK);
107	low += (t & LOWER_MASK) << $half;
108	let mut high = t >> $half;
109	t = low >> $half;
110	low &= LOWER_MASK;
111	t += (b >> $half) * (self & LOWER_MASK);
112	low += (t & LOWER_MASK) << $half;
113	high += t >> $half;
114	high += (self >> $half) * (b >> $half);
115
116	(high, low)
117	}
118	}
119	)+
120	};
121	}
122	wmul_impl_large! { u128, `64` }
123
124	macro_rules! wmul_impl_usize {
125	($ty:ty) => {
126	impl WideningMultiply for usize {
127	type Output = (usize, usize);
128
129	#[inline(always)]
130	fn wmul(self, x: usize) -> Self::Output {
131	let (high, low) = (self as $ty).wmul(x as $ty);
132	(high as usize, low as usize)
133	}
134	}
135	};
136	}
137	#[cfg(target_pointer_width = "16")]
138	wmul_impl_usize! { u16 }
139	#[cfg(target_pointer_width = "32")]
140	wmul_impl_usize! { u32 }
141	#[cfg(target_pointer_width = "64")]
142	wmul_impl_usize! { u64 }
143
144	#[cfg(feature = "simd_support")]
145	mod simd_wmul {
146	use super::*;
147	#[cfg(target_arch = "x86")] use core::arch::x86::*;
148	#[cfg(target_arch = "x86_64")] use core::arch::x86_64::*;
149
150	wmul_impl! {
151	(u8x2, u16x2),
152	(u8x4, u16x4),
153	(u8x8, u16x8),
154	(u8x16, u16x16),
155	(u8x32, u16x32),,
156	`8`
157	}
158
159	wmul_impl! { (u16x2, u32x2),, `16` }
160	wmul_impl! { (u16x4, u32x4),, `16` }
161	#[cfg(not(target_feature = "sse2"))]
162	wmul_impl! { (u16x8, u32x8),, `16` }
163	#[cfg(not(target_feature = "avx2"))]
164	wmul_impl! { (u16x16, u32x16),, `16` }
165
166	// 16-bit lane widths allow use of the x86 `mulhi` instructions, which
167	// means `wmul` can be implemented with only two instructions.
168	#[allow(unused_macros)]
169	macro_rules! wmul_impl_16 {
170	($ty:ident, $intrinsic:ident, $mulhi:ident, $mullo:ident) => {
171	impl WideningMultiply for $ty {
172	type Output = ($ty, $ty);
173
174	#[inline(always)]
175	fn wmul(self, x: $ty) -> Self::Output {
176	let b = $intrinsic::from_bits(x);
177	let a = $intrinsic::from_bits(self);
178	let hi = $ty::from_bits(unsafe { $mulhi(a, b) });
179	let lo = $ty::from_bits(unsafe { $mullo(a, b) });
180	(hi, lo)
181	}
182	}
183	};
184	}
185
186	#[cfg(target_feature = "sse2")]
187	wmul_impl_16! { u16x8, __m128i, _mm_mulhi_epu16, _mm_mullo_epi16 }
188	#[cfg(target_feature = "avx2")]
189	wmul_impl_16! { u16x16, __m256i, _mm256_mulhi_epu16, _mm256_mullo_epi16 }
190	// FIXME: there are no `__m512i` types in stdsimd yet, so `wmul::<u16x32>`
191	// cannot use the same implementation.
192
193	wmul_impl! {
194	(u32x2, u64x2),
195	(u32x4, u64x4),
196	(u32x8, u64x8),,
197	`32`
198	}
199
200	// TODO: optimize, this seems to seriously slow things down
201	wmul_impl_large! { (u8x64,) u8, `4` }
202	wmul_impl_large! { (u16x32,) u16, `8` }
203	wmul_impl_large! { (u32x16,) u32, `16` }
204	wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, `32` }
205	}
206
207	/// Helper trait when dealing with scalar and SIMD floating point types.
208	pub(crate) trait FloatSIMDUtils {
209	// `PartialOrd` for vectors compares lexicographically. We want to compare all
210	// the individual SIMD lanes instead, and get the combined result over all
211	// lanes. This is possible using something like `a.lt(b).all()`, but we
212	// implement it as a trait so we can write the same code for `f32` and `f64`.
213	// Only the comparison functions we need are implemented.
214	fn all_lt(self, other: Self) -> bool;
215	fn all_le(self, other: Self) -> bool;
216	fn all_finite(self) -> bool;
217
218	type Mask;
219	fn finite_mask(self) -> Self::Mask;
220	fn gt_mask(self, other: Self) -> Self::Mask;
221	fn ge_mask(self, other: Self) -> Self::Mask;
222
223	// Decrease all lanes where the mask is `true` to the next lower value
224	// representable by the floating-point type. At least one of the lanes
225	// must be set.
226	fn decrease_masked(self, mask: Self::Mask) -> Self;
227
228	// Convert from int value. Conversion is done while retaining the numerical
229	// value, not by retaining the binary representation.
230	type UInt;
231	fn cast_from_int(i: Self::UInt) -> Self;
232	}
233
234	/// Implement functions available in std builds but missing from core primitives
235	#[cfg(not(std))]
236	// False positive: We are following `std` here.
237	#[allow(clippy::wrong_self_convention)]
238	pub(crate) trait Float: Sized {
239	fn is_nan(self) -> bool;
240	fn is_infinite(self) -> bool;
241	fn is_finite(self) -> bool;
242	}
243
244	/// Implement functions on f32/f64 to give them APIs similar to SIMD types
245	pub(crate) trait FloatAsSIMD: Sized {
246	#[inline(always)]
247	fn lanes() -> usize {
248	`1`
249	}
250	#[inline(always)]
251	fn splat(scalar: Self) -> Self {
252	scalar
253	}
254	#[inline(always)]
255	fn extract(self, index: usize) -> Self {
256	debug_assert_eq!(index, `0`);
257	self
258	}
259	#[inline(always)]
260	fn replace(self, index: usize, new_value: Self) -> Self {
261	debug_assert_eq!(index, `0`);
262	new_value
263	}
264	}
265
266	pub(crate) trait BoolAsSIMD: Sized {
267	fn any(self) -> bool;
268	fn all(self) -> bool;
269	fn none(self) -> bool;
270	}
271
272	impl BoolAsSIMD for bool {
273	#[inline(always)]
274	fn any(self) -> bool {
275	self
276	}
277
278	#[inline(always)]
279	fn all(self) -> bool {
280	self
281	}
282
283	#[inline(always)]
284	fn none(self) -> bool {
285	!self
286	}
287	}
288
289	macro_rules! scalar_float_impl {
290	($ty:ident, $uty:ident) => {
291	#[cfg(not(std))]
292	impl Float for $ty {
293	#[inline]
294	fn is_nan(self) -> bool {
295	self != self
296	}
297
298	#[inline]
299	fn is_infinite(self) -> bool {
300	self == ::core::$ty::INFINITY \|\| self == ::core::$ty::NEG_INFINITY
301	}
302
303	#[inline]
304	fn is_finite(self) -> bool {
305	!(self.is_nan() \|\| self.is_infinite())
306	}
307	}
308
309	impl FloatSIMDUtils for $ty {
310	type Mask = bool;
311	type UInt = $uty;
312
313	#[inline(always)]
314	fn all_lt(self, other: Self) -> bool {
315	self < other
316	}
317
318	#[inline(always)]
319	fn all_le(self, other: Self) -> bool {
320	self <= other
321	}
322
323	#[inline(always)]
324	fn all_finite(self) -> bool {
325	self.is_finite()
326	}
327
328	#[inline(always)]
329	fn finite_mask(self) -> Self::Mask {
330	self.is_finite()
331	}
332
333	#[inline(always)]
334	fn gt_mask(self, other: Self) -> Self::Mask {
335	self > other
336	}
337
338	#[inline(always)]
339	fn ge_mask(self, other: Self) -> Self::Mask {
340	self >= other
341	}
342
343	#[inline(always)]
344	fn decrease_masked(self, mask: Self::Mask) -> Self {
345	debug_assert!(mask, "At least one lane must be set");
346	<$ty>::from_bits(self.to_bits() - `1`)
347	}
348
349	#[inline]
350	fn cast_from_int(i: Self::UInt) -> Self {
351	i as $ty
352	}
353	}
354
355	impl FloatAsSIMD for $ty {}
356	};
357	}
358
359	scalar_float_impl!(f32, u32);
360	scalar_float_impl!(f64, u64);
361
362
363	#[cfg(feature = "simd_support")]
364	macro_rules! simd_impl {
365	($ty:ident, $f_scalar:ident, $mty:ident, $uty:ident) => {
366	impl FloatSIMDUtils for $ty {
367	type Mask = $mty;
368	type UInt = $uty;
369
370	#[inline(always)]
371	fn all_lt(self, other: Self) -> bool {
372	self.lt(other).all()
373	}
374
375	#[inline(always)]
376	fn all_le(self, other: Self) -> bool {
377	self.le(other).all()
378	}
379
380	#[inline(always)]
381	fn all_finite(self) -> bool {
382	self.finite_mask().all()
383	}
384
385	#[inline(always)]
386	fn finite_mask(self) -> Self::Mask {
387	// This can possibly be done faster by checking bit patterns
388	let neg_inf = $ty::splat(::core::$f_scalar::NEG_INFINITY);
389	let pos_inf = $ty::splat(::core::$f_scalar::INFINITY);
390	self.gt(neg_inf) & self.lt(pos_inf)
391	}
392
393	#[inline(always)]
394	fn gt_mask(self, other: Self) -> Self::Mask {
395	self.gt(other)
396	}
397
398	#[inline(always)]
399	fn ge_mask(self, other: Self) -> Self::Mask {
400	self.ge(other)
401	}
402
403	#[inline(always)]
404	fn decrease_masked(self, mask: Self::Mask) -> Self {
405	// Casting a mask into ints will produce all bits set for
406	// true, and 0 for false. Adding that to the binary
407	// representation of a float means subtracting one from
408	// the binary representation, resulting in the next lower
409	// value representable by $ty. This works even when the
410	// current value is infinity.
411	debug_assert!(mask.any(), "At least one lane must be set");
412	<$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask))
413	}
414
415	#[inline]
416	fn cast_from_int(i: Self::UInt) -> Self {
417	i.cast()
418	}
419	}
420	};
421	}
422
423	#[cfg(feature="simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 }
424	#[cfg(feature="simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 }
425	#[cfg(feature="simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 }
426	#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m32x16, u32x16 }
427	#[cfg(feature="simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 }
428	#[cfg(feature="simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 }
429	#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m64x8, u64x8 }
430