int_ops.rs source code [crates/tinystr/src/int_ops.rs]

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5	use crate::asciibyte::AsciiByte;
6
/// Internal helper struct that performs operations on aligned integers.
/// Supports strings up to 4 bytes long.
///
/// The wrapped `u32` holds the string's bytes in native (in-memory) order, as
/// produced by `u32::from_ne_bytes` in `from_bytes`; unused trailing positions
/// are NUL (`0x00`). The bit-parallel predicates and case conversions assume
/// that every stored byte is ASCII (`<= 0x7f`).
#[repr(transparent)]
pub struct Aligned4(u32);
11
12	impl Aligned4 {
13	/// # Panics
14	/// Panics if N is greater than 4
15	#[inline]
16	pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self {
17	let mut bytes = [`0`; `4`];
18	let mut i = `0`;
19	// The function documentation defines when panics may occur
20	#[allow(clippy::indexing_slicing)]
21	while i < N {
22	bytes[i] = src[i];
23	i += `1`;
24	}
25	Self(u32::from_ne_bytes(bytes))
26	}
27
28	#[inline]
29	pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
30	Self::from_bytes::<N>(unsafe { core::mem::transmute(src) })
31	}
32
33	#[inline]
34	pub const fn to_bytes(&self) -> [u8; `4`] {
35	self.0.to_ne_bytes()
36	}
37
38	#[inline]
39	pub const fn to_ascii_bytes(&self) -> [AsciiByte; `4`] {
40	unsafe { core::mem::transmute(self.to_bytes()) }
41	}
42
43	pub const fn len(&self) -> usize {
44	let word = self.0;
45	#[cfg(target_endian = "little")]
46	let len = (`4` - word.leading_zeros() / `8`) as usize;
47	#[cfg(target_endian = "big")]
48	let len = (`4` - word.trailing_zeros() / `8`) as usize;
49	len
50	}
51
52	pub const fn is_ascii_alphabetic(&self) -> bool {
53	let word = self.0;
54	// Each of the following bitmasks set the high bit* (0x8) to 0 for valid and 1 for invalid.*
55	// `mask` sets all NUL bytes to 0.
56	let mask = (word + `0x7f7f_7f7f`) & `0x8080_8080`;
57	// `lower` converts the string to lowercase. It may also change the value of non-alpha
58	// characters, but this does not matter for the alphabetic test that follows.
59	let lower = word \| `0x2020_2020`;
60	// `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters.
61	let alpha = !(lower + `0x1f1f_1f1f`) \| (lower + `0x0505_0505`);
62	// The overall string is valid if every character passes at least one test.
63	// We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`).
64	(alpha & mask) == `0`
65	}
66
67	pub const fn is_ascii_alphanumeric(&self) -> bool {
68	let word = self.0;
69	// See explanatory comments in is_ascii_alphabetic
70	let mask = (word + `0x7f7f_7f7f`) & `0x8080_8080`;
71	let numeric = !(word + `0x5050_5050`) \| (word + `0x4646_4646`);
72	let lower = word \| `0x2020_2020`;
73	let alpha = !(lower + `0x1f1f_1f1f`) \| (lower + `0x0505_0505`);
74	(alpha & numeric & mask) == `0`
75	}
76
77	pub const fn is_ascii_numeric(&self) -> bool {
78	let word = self.0;
79	// See explanatory comments in is_ascii_alphabetic
80	let mask = (word + `0x7f7f_7f7f`) & `0x8080_8080`;
81	let numeric = !(word + `0x5050_5050`) \| (word + `0x4646_4646`);
82	(numeric & mask) == `0`
83	}
84
85	pub const fn is_ascii_lowercase(&self) -> bool {
86	let word = self.0;
87	// For efficiency, this function tests for an invalid string rather than a valid string.
88	// A string is ASCII lowercase iff it contains no uppercase ASCII characters.
89	// `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1.
90	let invalid_case = !(word + `0x3f3f_3f3f`) \| (word + `0x2525_2525`);
91	// The string is valid if it contains no invalid characters (if all high bits are 1).
92	(invalid_case & `0x8080_8080`) == `0x8080_8080`
93	}
94
95	pub const fn is_ascii_titlecase(&self) -> bool {
96	let word = self.0;
97	// See explanatory comments in is_ascii_lowercase
98	let invalid_case = if cfg!(target_endian = "little") {
99	!(word + `0x3f3f_3f1f`) \| (word + `0x2525_2505`)
100	} else {
101	!(word + `0x1f3f_3f3f`) \| (word + `0x0525_2525`)
102	};
103	(invalid_case & `0x8080_8080`) == `0x8080_8080`
104	}
105
106	pub const fn is_ascii_uppercase(&self) -> bool {
107	let word = self.0;
108	// See explanatory comments in is_ascii_lowercase
109	let invalid_case = !(word + `0x1f1f_1f1f`) \| (word + `0x0505_0505`);
110	(invalid_case & `0x8080_8080`) == `0x8080_8080`
111	}
112
113	pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
114	let word = self.0;
115	// `mask` sets all NUL bytes to 0.
116	let mask = (word + `0x7f7f_7f7f`) & `0x8080_8080`;
117	// `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
118	let lower_alpha = !(word + `0x1f1f_1f1f`) \| (word + `0x0505_0505`);
119	// The overall string is valid if every character passes at least one test.
120	// We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
121	(lower_alpha & mask) == `0`
122	}
123
124	pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
125	let word = self.0;
126	// See explanatory comments in is_ascii_alphabetic_lowercase
127	let mask = (word + `0x7f7f_7f7f`) & `0x8080_8080`;
128	let title_case = if cfg!(target_endian = "little") {
129	!(word + `0x1f1f_1f3f`) \| (word + `0x0505_0525`)
130	} else {
131	!(word + `0x3f1f_1f1f`) \| (word + `0x2505_0505`)
132	};
133	(title_case & mask) == `0`
134	}
135
136	pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
137	let word = self.0;
138	// See explanatory comments in is_ascii_alphabetic_lowercase
139	let mask = (word + `0x7f7f_7f7f`) & `0x8080_8080`;
140	let upper_alpha = !(word + `0x3f3f_3f3f`) \| (word + `0x2525_2525`);
141	(upper_alpha & mask) == `0`
142	}
143
144	pub const fn to_ascii_lowercase(&self) -> Self {
145	let word = self.0;
146	let result = word \| (((word + `0x3f3f_3f3f`) & !(word + `0x2525_2525`) & `0x8080_8080`) >> `2`);
147	Self(result)
148	}
149
150	pub const fn to_ascii_titlecase(&self) -> Self {
151	let word = self.0.to_le();
152	let mask = ((word + `0x3f3f_3f1f`) & !(word + `0x2525_2505`) & `0x8080_8080`) >> `2`;
153	let result = (word \| mask) & !(`0x20` & mask);
154	Self(u32::from_le(result))
155	}
156
157	pub const fn to_ascii_uppercase(&self) -> Self {
158	let word = self.0;
159	let result = word & !(((word + `0x1f1f_1f1f`) & !(word + `0x0505_0505`) & `0x8080_8080`) >> `2`);
160	Self(result)
161	}
162	}
163
164	/// Internal helper struct that performs operations on aligned integers.
165	/// Supports strings up to 8 bytes long.
// The wrapped `u64` holds the string's bytes in native (in-memory) order, as
// produced by `u64::from_ne_bytes` in `from_bytes`; unused trailing positions
// are NUL (`0x00`). The bit-parallel predicates and case conversions assume
// that every stored byte is ASCII (`<= 0x7f`).
#[repr(transparent)]
pub struct Aligned8(u64);
168
169	impl Aligned8 {
170	/// # Panics
171	/// Panics if N is greater than 8
172	#[inline]
173	pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self {
174	let mut bytes = [`0`; `8`];
175	let mut i = `0`;
176	// The function documentation defines when panics may occur
177	#[allow(clippy::indexing_slicing)]
178	while i < N {
179	bytes[i] = src[i];
180	i += `1`;
181	}
182	Self(u64::from_ne_bytes(bytes))
183	}
184
185	#[inline]
186	pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
187	Self::from_bytes::<N>(unsafe { core::mem::transmute(src) })
188	}
189
190	#[inline]
191	pub const fn to_bytes(&self) -> [u8; `8`] {
192	self.0.to_ne_bytes()
193	}
194
195	#[inline]
196	pub const fn to_ascii_bytes(&self) -> [AsciiByte; `8`] {
197	unsafe { core::mem::transmute(self.to_bytes()) }
198	}
199
200	pub const fn len(&self) -> usize {
201	let word = self.0;
202	#[cfg(target_endian = "little")]
203	let len = (`8` - word.leading_zeros() / `8`) as usize;
204	#[cfg(target_endian = "big")]
205	let len = (`8` - word.trailing_zeros() / `8`) as usize;
206	len
207	}
208
209	pub const fn is_ascii_alphabetic(&self) -> bool {
210	let word = self.0;
211	let mask = (word + `0x7f7f_7f7f_7f7f_7f7f`) & `0x8080_8080_8080_8080`;
212	let lower = word \| `0x2020_2020_2020_2020`;
213	let alpha = !(lower + `0x1f1f_1f1f_1f1f_1f1f`) \| (lower + `0x0505_0505_0505_0505`);
214	(alpha & mask) == `0`
215	}
216
217	pub const fn is_ascii_alphanumeric(&self) -> bool {
218	let word = self.0;
219	let mask = (word + `0x7f7f_7f7f_7f7f_7f7f`) & `0x8080_8080_8080_8080`;
220	let numeric = !(word + `0x5050_5050_5050_5050`) \| (word + `0x4646_4646_4646_4646`);
221	let lower = word \| `0x2020_2020_2020_2020`;
222	let alpha = !(lower + `0x1f1f_1f1f_1f1f_1f1f`) \| (lower + `0x0505_0505_0505_0505`);
223	(alpha & numeric & mask) == `0`
224	}
225
226	pub const fn is_ascii_numeric(&self) -> bool {
227	let word = self.0;
228	let mask = (word + `0x7f7f_7f7f_7f7f_7f7f`) & `0x8080_8080_8080_8080`;
229	let numeric = !(word + `0x5050_5050_5050_5050`) \| (word + `0x4646_4646_4646_4646`);
230	(numeric & mask) == `0`
231	}
232
233	pub const fn is_ascii_lowercase(&self) -> bool {
234	let word = self.0;
235	let invalid_case = !(word + `0x3f3f_3f3f_3f3f_3f3f`) \| (word + `0x2525_2525_2525_2525`);
236	(invalid_case & `0x8080_8080_8080_8080`) == `0x8080_8080_8080_8080`
237	}
238
239	pub const fn is_ascii_titlecase(&self) -> bool {
240	let word = self.0;
241	let invalid_case = if cfg!(target_endian = "little") {
242	!(word + `0x3f3f_3f3f_3f3f_3f1f`) \| (word + `0x2525_2525_2525_2505`)
243	} else {
244	!(word + `0x1f3f_3f3f_3f3f_3f3f`) \| (word + `0x0525_2525_2525_2525`)
245	};
246	(invalid_case & `0x8080_8080_8080_8080`) == `0x8080_8080_8080_8080`
247	}
248
249	pub const fn is_ascii_uppercase(&self) -> bool {
250	let word = self.0;
251	let invalid_case = !(word + `0x1f1f_1f1f_1f1f_1f1f`) \| (word + `0x0505_0505_0505_0505`);
252	(invalid_case & `0x8080_8080_8080_8080`) == `0x8080_8080_8080_8080`
253	}
254
255	pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
256	let word = self.0;
257	// `mask` sets all NUL bytes to 0.
258	let mask = (word + `0x7f7f_7f7f_7f7f_7f7f`) & `0x8080_8080_8080_8080`;
259	// `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
260	let lower_alpha = !(word + `0x1f1f_1f1f_1f1f_1f1f`) \| (word + `0x0505_0505_0505_0505`);
261	// The overall string is valid if every character passes at least one test.
262	// We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
263	(lower_alpha & mask) == `0`
264	}
265
266	pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
267	let word = self.0;
268	// See explanatory comments in is_ascii_alphabetic_lowercase
269	let mask = (word + `0x7f7f_7f7f_7f7f_7f7f`) & `0x8080_8080_8080_8080`;
270	let title_case = if cfg!(target_endian = "little") {
271	!(word + `0x1f1f_1f1f_1f1f_1f3f`) \| (word + `0x0505_0505_0505_0525`)
272	} else {
273	!(word + `0x3f1f_1f1f_1f1f_1f1f`) \| (word + `0x2505_0505_0505_0505`)
274	};
275	(title_case & mask) == `0`
276	}
277
278	pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
279	let word = self.0;
280	// See explanatory comments in is_ascii_alphabetic_lowercase
281	let mask = (word + `0x7f7f_7f7f_7f7f_7f7f`) & `0x8080_8080_8080_8080`;
282	let upper_alpha = !(word + `0x3f3f_3f3f_3f3f_3f3f`) \| (word + `0x2525_2525_2525_2525`);
283	(upper_alpha & mask) == `0`
284	}
285
286	pub const fn to_ascii_lowercase(&self) -> Self {
287	let word = self.0;
288	let result = word
289	\| (((word + `0x3f3f_3f3f_3f3f_3f3f`)
290	& !(word + `0x2525_2525_2525_2525`)
291	& `0x8080_8080_8080_8080`)
292	>> `2`);
293	Self(result)
294	}
295
296	pub const fn to_ascii_titlecase(&self) -> Self {
297	let word = self.0.to_le();
298	let mask = ((word + `0x3f3f_3f3f_3f3f_3f1f`)
299	& !(word + `0x2525_2525_2525_2505`)
300	& `0x8080_8080_8080_8080`)
301	>> `2`;
302	let result = (word \| mask) & !(`0x20` & mask);
303	Self(u64::from_le(result))
304	}
305
306	pub const fn to_ascii_uppercase(&self) -> Self {
307	let word = self.0;
308	let result = word
309	& !(((word + `0x1f1f_1f1f_1f1f_1f1f`)
310	& !(word + `0x0505_0505_0505_0505`)
311	& `0x8080_8080_8080_8080`)
312	>> `2`);
313	Self(result)
314	}
315	}
316