// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// No need to use explicit 256-bit AVX2 SIMD here:
// `-C target-cpu=native` will autovectorize this code better than we can by hand.
// It's not even clear why the explicit instructions end up so much slower...
//
// On ARM AArch64, on the other hand, explicit SIMD gives us up to a 2x performance boost.
//
// We also have to inline all the methods. They are pretty large,
// but without the inlining, performance plummets.
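//
// (For reference: the autovectorized build mentioned above is what you get from
// something like `RUSTFLAGS="-C target-cpu=native" cargo build --release`.)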

#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use bytemuck::cast;
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
use core::arch::aarch64::uint16x8_t;

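/// A 16-lane vector of `u16` values stored as a plain array.
///
/// On AArch64 with the `simd` feature enabled it is processed as two NEON
/// `uint16x8_t` halves; on every other target the per-lane fallbacks below are used.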
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, PartialEq, Default, Debug)]
pub struct u16x16(pub [u16; 16]);

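// Portable fallback: applies a scalar method (`min`, `add`, `shr`, ...) to each
// of the 16 lanes in turn, relying on the compiler to autovectorize the result.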
macro_rules! impl_u16x16_op {
    ($a:expr, $op:ident, $b:expr) => {
        u16x16([
            $a.0[0].$op($b.0[0]),
            $a.0[1].$op($b.0[1]),
            $a.0[2].$op($b.0[2]),
            $a.0[3].$op($b.0[3]),
            $a.0[4].$op($b.0[4]),
            $a.0[5].$op($b.0[5]),
            $a.0[6].$op($b.0[6]),
            $a.0[7].$op($b.0[7]),
            $a.0[8].$op($b.0[8]),
            $a.0[9].$op($b.0[9]),
            $a.0[10].$op($b.0[10]),
            $a.0[11].$op($b.0[11]),
            $a.0[12].$op($b.0[12]),
            $a.0[13].$op($b.0[13]),
            $a.0[14].$op($b.0[14]),
            $a.0[15].$op($b.0[15]),
        ])
    };
}

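// NEON path: splits both operands into two `uint16x8_t` halves, applies the
// given AArch64 intrinsic to each half and reassembles a `u16x16` from the result.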
#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
macro_rules! impl_aarch64_call {
    ($f:ident, $a:expr, $b:expr) => {{
        let a = $a.split();
        let b = $b.split();
        Self(bytemuck::cast([
            unsafe { core::arch::aarch64::$f(a.0, b.0) },
            unsafe { core::arch::aarch64::$f(a.1, b.1) },
        ]))
    }};
}

impl u16x16 {
    #[inline]
    pub fn splat(n: u16) -> Self {
        Self([n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n])
    }

    #[inline]
    pub fn as_slice(&self) -> &[u16; 16] {
        &self.0
    }

    #[inline]
    pub fn min(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vminq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, min, rhs)
            }
        }
    }

    #[inline]
    pub fn max(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmaxq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, max, rhs)
            }
        }
    }

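    /// Lane-wise `self <= rhs`: each lane becomes `!0` (all bits set) when the
    /// comparison holds and `0` otherwise, so the result can serve as a `blend` mask.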
    #[inline]
    pub fn cmp_le(&self, rhs: &Self) -> Self {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vcleq_u16, self, rhs)
            } else {
                Self([
                    if self.0[ 0] <= rhs.0[ 0] { !0 } else { 0 },
                    if self.0[ 1] <= rhs.0[ 1] { !0 } else { 0 },
                    if self.0[ 2] <= rhs.0[ 2] { !0 } else { 0 },
                    if self.0[ 3] <= rhs.0[ 3] { !0 } else { 0 },
                    if self.0[ 4] <= rhs.0[ 4] { !0 } else { 0 },
                    if self.0[ 5] <= rhs.0[ 5] { !0 } else { 0 },
                    if self.0[ 6] <= rhs.0[ 6] { !0 } else { 0 },
                    if self.0[ 7] <= rhs.0[ 7] { !0 } else { 0 },
                    if self.0[ 8] <= rhs.0[ 8] { !0 } else { 0 },
                    if self.0[ 9] <= rhs.0[ 9] { !0 } else { 0 },
                    if self.0[10] <= rhs.0[10] { !0 } else { 0 },
                    if self.0[11] <= rhs.0[11] { !0 } else { 0 },
                    if self.0[12] <= rhs.0[12] { !0 } else { 0 },
                    if self.0[13] <= rhs.0[13] { !0 } else { 0 },
                    if self.0[14] <= rhs.0[14] { !0 } else { 0 },
                    if self.0[15] <= rhs.0[15] { !0 } else { 0 },
                ])
            }
        }
    }

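    /// Selects lanes from `t` where the mask (`self`) is all ones and lanes
    /// from `e` where it is all zeros, i.e. a mask of the kind `cmp_le` produces.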
    #[inline]
    pub fn blend(self, t: Self, e: Self) -> Self {
        (t & self) | (e & !self)
    }

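    /// Reinterprets the 16 lanes as two NEON `uint16x8_t` registers
    /// (low half, high half) via a bit-for-bit `bytemuck` cast.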
    #[inline]
    #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))]
    pub fn split(self) -> (uint16x8_t, uint16x8_t) {
        let pair: [uint16x8_t; 2] = cast(self.0);
        (pair[0], pair[1])
    }
}

impl core::ops::Add<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vaddq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, add, rhs)
            }
        }
    }
}

impl core::ops::Sub<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vsubq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, sub, rhs)
            }
        }
    }
}

impl core::ops::Mul<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vmulq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, mul, rhs)
            }
        }
    }
}

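// Division has no NEON path: AArch64 NEON provides no integer vector division
// intrinsic, so this stays scalar on all targets.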
impl core::ops::Div<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn div(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, div, rhs)
    }
}

impl core::ops::BitAnd<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitand(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vandq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitand, rhs)
            }
        }
    }
}

impl core::ops::BitOr<u16x16> for u16x16 {
    type Output = Self;

    #[inline]
    fn bitor(self, rhs: Self) -> Self::Output {
        cfg_if::cfg_if! {
            if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
                impl_aarch64_call!(vorrq_u16, self, rhs)
            } else {
                impl_u16x16_op!(self, bitor, rhs)
            }
        }
    }
}

impl core::ops::Not for u16x16 {
    type Output = Self;

    #[inline]
    fn not(self) -> Self::Output {
        u16x16([
            !self.0[0],
            !self.0[1],
            !self.0[2],
            !self.0[3],
            !self.0[4],
            !self.0[5],
            !self.0[6],
            !self.0[7],
            !self.0[8],
            !self.0[9],
            !self.0[10],
            !self.0[11],
            !self.0[12],
            !self.0[13],
            !self.0[14],
            !self.0[15],
        ])
    }
}

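// Shifts each lane of `self` right by the amount in the corresponding lane of `rhs`.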
impl core::ops::Shr for u16x16 {
    type Output = Self;

    #[inline]
    fn shr(self, rhs: Self) -> Self::Output {
        impl_u16x16_op!(self, shr, rhs)
    }
}
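
// A minimal usage sketch (illustrative only): `cmp_le` yields an all-ones /
// all-zeros lane mask, which `blend` then uses to pick lanes.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cmp_le_mask_drives_blend() {
        let a = u16x16::splat(10);
        let b = u16x16::splat(20);
        // Every lane of `a` is <= the matching lane of `b`, so the mask is all
        // ones and `blend` picks `a` in every lane.
        let mask = a.cmp_le(&b);
        assert_eq!(mask.blend(a, b), u16x16::splat(10));
    }
}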