1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::asciibyte::AsciiByte;
6
7/// Internal helper struct that performs operations on aligned integers.
8/// Supports strings up to 4 bytes long.
9#[repr(transparent)]
10pub struct Aligned4(u32);
11
12impl Aligned4 {
13 /// # Panics
14 /// Panics if N is greater than 4
15 #[inline]
16 pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self {
17 let mut bytes = [0; 4];
18 let mut i = 0;
19 // The function documentation defines when panics may occur
20 #[allow(clippy::indexing_slicing)]
21 while i < N {
22 bytes[i] = src[i];
23 i += 1;
24 }
25 Self(u32::from_ne_bytes(bytes))
26 }
27
28 #[inline]
29 pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
30 Self::from_bytes::<N>(unsafe { core::mem::transmute(src) })
31 }
32
33 #[inline]
34 pub const fn to_bytes(&self) -> [u8; 4] {
35 self.0.to_ne_bytes()
36 }
37
38 #[inline]
39 pub const fn to_ascii_bytes(&self) -> [AsciiByte; 4] {
40 unsafe { core::mem::transmute(self.to_bytes()) }
41 }
42
43 pub const fn len(&self) -> usize {
44 let word = self.0;
45 #[cfg(target_endian = "little")]
46 let len = (4 - word.leading_zeros() / 8) as usize;
47 #[cfg(target_endian = "big")]
48 let len = (4 - word.trailing_zeros() / 8) as usize;
49 len
50 }
51
52 pub const fn is_ascii_alphabetic(&self) -> bool {
53 let word = self.0;
54 // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid.
55 // `mask` sets all NUL bytes to 0.
56 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
57 // `lower` converts the string to lowercase. It may also change the value of non-alpha
58 // characters, but this does not matter for the alphabetic test that follows.
59 let lower = word | 0x2020_2020;
60 // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters.
61 let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
62 // The overall string is valid if every character passes at least one test.
63 // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`).
64 (alpha & mask) == 0
65 }
66
67 pub const fn is_ascii_alphanumeric(&self) -> bool {
68 let word = self.0;
69 // See explanatory comments in is_ascii_alphabetic
70 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
71 let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
72 let lower = word | 0x2020_2020;
73 let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
74 (alpha & numeric & mask) == 0
75 }
76
77 pub const fn is_ascii_numeric(&self) -> bool {
78 let word = self.0;
79 // See explanatory comments in is_ascii_alphabetic
80 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
81 let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
82 (numeric & mask) == 0
83 }
84
85 pub const fn is_ascii_lowercase(&self) -> bool {
86 let word = self.0;
87 // For efficiency, this function tests for an invalid string rather than a valid string.
88 // A string is ASCII lowercase iff it contains no uppercase ASCII characters.
89 // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1.
90 let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
91 // The string is valid if it contains no invalid characters (if all high bits are 1).
92 (invalid_case & 0x8080_8080) == 0x8080_8080
93 }
94
95 pub const fn is_ascii_titlecase(&self) -> bool {
96 let word = self.0;
97 // See explanatory comments in is_ascii_lowercase
98 let invalid_case = if cfg!(target_endian = "little") {
99 !(word + 0x3f3f_3f1f) | (word + 0x2525_2505)
100 } else {
101 !(word + 0x1f3f_3f3f) | (word + 0x0525_2525)
102 };
103 (invalid_case & 0x8080_8080) == 0x8080_8080
104 }
105
106 pub const fn is_ascii_uppercase(&self) -> bool {
107 let word = self.0;
108 // See explanatory comments in is_ascii_lowercase
109 let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
110 (invalid_case & 0x8080_8080) == 0x8080_8080
111 }
112
113 pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
114 let word = self.0;
115 // `mask` sets all NUL bytes to 0.
116 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
117 // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
118 let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
119 // The overall string is valid if every character passes at least one test.
120 // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
121 (lower_alpha & mask) == 0
122 }
123
124 pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
125 let word = self.0;
126 // See explanatory comments in is_ascii_alphabetic_lowercase
127 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
128 let title_case = if cfg!(target_endian = "little") {
129 !(word + 0x1f1f_1f3f) | (word + 0x0505_0525)
130 } else {
131 !(word + 0x3f1f_1f1f) | (word + 0x2505_0505)
132 };
133 (title_case & mask) == 0
134 }
135
136 pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
137 let word = self.0;
138 // See explanatory comments in is_ascii_alphabetic_lowercase
139 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
140 let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
141 (upper_alpha & mask) == 0
142 }
143
144 pub const fn to_ascii_lowercase(&self) -> Self {
145 let word = self.0;
146 let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2);
147 Self(result)
148 }
149
150 pub const fn to_ascii_titlecase(&self) -> Self {
151 let word = self.0.to_le();
152 let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2;
153 let result = (word | mask) & !(0x20 & mask);
154 Self(u32::from_le(result))
155 }
156
157 pub const fn to_ascii_uppercase(&self) -> Self {
158 let word = self.0;
159 let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2);
160 Self(result)
161 }
162}
163
164/// Internal helper struct that performs operations on aligned integers.
165/// Supports strings up to 8 bytes long.
166#[repr(transparent)]
167pub struct Aligned8(u64);
168
169impl Aligned8 {
170 /// # Panics
171 /// Panics if N is greater than 8
172 #[inline]
173 pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self {
174 let mut bytes = [0; 8];
175 let mut i = 0;
176 // The function documentation defines when panics may occur
177 #[allow(clippy::indexing_slicing)]
178 while i < N {
179 bytes[i] = src[i];
180 i += 1;
181 }
182 Self(u64::from_ne_bytes(bytes))
183 }
184
185 #[inline]
186 pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
187 Self::from_bytes::<N>(unsafe { core::mem::transmute(src) })
188 }
189
190 #[inline]
191 pub const fn to_bytes(&self) -> [u8; 8] {
192 self.0.to_ne_bytes()
193 }
194
195 #[inline]
196 pub const fn to_ascii_bytes(&self) -> [AsciiByte; 8] {
197 unsafe { core::mem::transmute(self.to_bytes()) }
198 }
199
200 pub const fn len(&self) -> usize {
201 let word = self.0;
202 #[cfg(target_endian = "little")]
203 let len = (8 - word.leading_zeros() / 8) as usize;
204 #[cfg(target_endian = "big")]
205 let len = (8 - word.trailing_zeros() / 8) as usize;
206 len
207 }
208
209 pub const fn is_ascii_alphabetic(&self) -> bool {
210 let word = self.0;
211 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
212 let lower = word | 0x2020_2020_2020_2020;
213 let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
214 (alpha & mask) == 0
215 }
216
217 pub const fn is_ascii_alphanumeric(&self) -> bool {
218 let word = self.0;
219 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
220 let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
221 let lower = word | 0x2020_2020_2020_2020;
222 let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
223 (alpha & numeric & mask) == 0
224 }
225
226 pub const fn is_ascii_numeric(&self) -> bool {
227 let word = self.0;
228 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
229 let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
230 (numeric & mask) == 0
231 }
232
233 pub const fn is_ascii_lowercase(&self) -> bool {
234 let word = self.0;
235 let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
236 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
237 }
238
239 pub const fn is_ascii_titlecase(&self) -> bool {
240 let word = self.0;
241 let invalid_case = if cfg!(target_endian = "little") {
242 !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505)
243 } else {
244 !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525)
245 };
246 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
247 }
248
249 pub const fn is_ascii_uppercase(&self) -> bool {
250 let word = self.0;
251 let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
252 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
253 }
254
255 pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
256 let word = self.0;
257 // `mask` sets all NUL bytes to 0.
258 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
259 // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
260 let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
261 // The overall string is valid if every character passes at least one test.
262 // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
263 (lower_alpha & mask) == 0
264 }
265
266 pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
267 let word = self.0;
268 // See explanatory comments in is_ascii_alphabetic_lowercase
269 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
270 let title_case = if cfg!(target_endian = "little") {
271 !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525)
272 } else {
273 !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505)
274 };
275 (title_case & mask) == 0
276 }
277
278 pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
279 let word = self.0;
280 // See explanatory comments in is_ascii_alphabetic_lowercase
281 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
282 let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
283 (upper_alpha & mask) == 0
284 }
285
286 pub const fn to_ascii_lowercase(&self) -> Self {
287 let word = self.0;
288 let result = word
289 | (((word + 0x3f3f_3f3f_3f3f_3f3f)
290 & !(word + 0x2525_2525_2525_2525)
291 & 0x8080_8080_8080_8080)
292 >> 2);
293 Self(result)
294 }
295
296 pub const fn to_ascii_titlecase(&self) -> Self {
297 let word = self.0.to_le();
298 let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f)
299 & !(word + 0x2525_2525_2525_2505)
300 & 0x8080_8080_8080_8080)
301 >> 2;
302 let result = (word | mask) & !(0x20 & mask);
303 Self(u64::from_le(result))
304 }
305
306 pub const fn to_ascii_uppercase(&self) -> Self {
307 let word = self.0;
308 let result = word
309 & !(((word + 0x1f1f_1f1f_1f1f_1f1f)
310 & !(word + 0x0505_0505_0505_0505)
311 & 0x8080_8080_8080_8080)
312 >> 2);
313 Self(result)
314 }
315}
316