1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use crate::asciibyte::AsciiByte; |
6 | |
7 | /// Internal helper struct that performs operations on aligned integers. |
8 | /// Supports strings up to 4 bytes long. |
9 | #[repr (transparent)] |
10 | pub struct Aligned4(u32); |
11 | |
12 | impl Aligned4 { |
13 | /// # Panics |
14 | /// Panics if N is greater than 4 |
15 | #[inline ] |
16 | pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self { |
17 | let mut bytes = [0; 4]; |
18 | let mut i = 0; |
19 | // The function documentation defines when panics may occur |
20 | #[allow (clippy::indexing_slicing)] |
21 | while i < N { |
22 | bytes[i] = src[i]; |
23 | i += 1; |
24 | } |
25 | Self(u32::from_ne_bytes(bytes)) |
26 | } |
27 | |
28 | #[inline ] |
29 | pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self { |
30 | Self::from_bytes::<N>(unsafe { core::mem::transmute(src) }) |
31 | } |
32 | |
33 | #[inline ] |
34 | pub const fn to_bytes(&self) -> [u8; 4] { |
35 | self.0.to_ne_bytes() |
36 | } |
37 | |
38 | #[inline ] |
39 | pub const fn to_ascii_bytes(&self) -> [AsciiByte; 4] { |
40 | unsafe { core::mem::transmute(self.to_bytes()) } |
41 | } |
42 | |
43 | pub const fn len(&self) -> usize { |
44 | let word = self.0; |
45 | #[cfg (target_endian = "little" )] |
46 | let len = (4 - word.leading_zeros() / 8) as usize; |
47 | #[cfg (target_endian = "big" )] |
48 | let len = (4 - word.trailing_zeros() / 8) as usize; |
49 | len |
50 | } |
51 | |
52 | pub const fn is_ascii_alphabetic(&self) -> bool { |
53 | let word = self.0; |
54 | // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid. |
55 | // `mask` sets all NUL bytes to 0. |
56 | let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; |
57 | // `lower` converts the string to lowercase. It may also change the value of non-alpha |
58 | // characters, but this does not matter for the alphabetic test that follows. |
59 | let lower = word | 0x2020_2020; |
60 | // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters. |
61 | let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505); |
62 | // The overall string is valid if every character passes at least one test. |
63 | // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`). |
64 | (alpha & mask) == 0 |
65 | } |
66 | |
67 | pub const fn is_ascii_alphanumeric(&self) -> bool { |
68 | let word = self.0; |
69 | // See explanatory comments in is_ascii_alphabetic |
70 | let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; |
71 | let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646); |
72 | let lower = word | 0x2020_2020; |
73 | let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505); |
74 | (alpha & numeric & mask) == 0 |
75 | } |
76 | |
77 | pub const fn is_ascii_numeric(&self) -> bool { |
78 | let word = self.0; |
79 | // See explanatory comments in is_ascii_alphabetic |
80 | let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; |
81 | let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646); |
82 | (numeric & mask) == 0 |
83 | } |
84 | |
85 | pub const fn is_ascii_lowercase(&self) -> bool { |
86 | let word = self.0; |
87 | // For efficiency, this function tests for an invalid string rather than a valid string. |
88 | // A string is ASCII lowercase iff it contains no uppercase ASCII characters. |
89 | // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1. |
90 | let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525); |
91 | // The string is valid if it contains no invalid characters (if all high bits are 1). |
92 | (invalid_case & 0x8080_8080) == 0x8080_8080 |
93 | } |
94 | |
95 | pub const fn is_ascii_titlecase(&self) -> bool { |
96 | let word = self.0; |
97 | // See explanatory comments in is_ascii_lowercase |
98 | let invalid_case = if cfg!(target_endian = "little" ) { |
99 | !(word + 0x3f3f_3f1f) | (word + 0x2525_2505) |
100 | } else { |
101 | !(word + 0x1f3f_3f3f) | (word + 0x0525_2525) |
102 | }; |
103 | (invalid_case & 0x8080_8080) == 0x8080_8080 |
104 | } |
105 | |
106 | pub const fn is_ascii_uppercase(&self) -> bool { |
107 | let word = self.0; |
108 | // See explanatory comments in is_ascii_lowercase |
109 | let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505); |
110 | (invalid_case & 0x8080_8080) == 0x8080_8080 |
111 | } |
112 | |
113 | pub const fn is_ascii_alphabetic_lowercase(&self) -> bool { |
114 | let word = self.0; |
115 | // `mask` sets all NUL bytes to 0. |
116 | let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; |
117 | // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1. |
118 | let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505); |
119 | // The overall string is valid if every character passes at least one test. |
120 | // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`). |
121 | (lower_alpha & mask) == 0 |
122 | } |
123 | |
124 | pub const fn is_ascii_alphabetic_titlecase(&self) -> bool { |
125 | let word = self.0; |
126 | // See explanatory comments in is_ascii_alphabetic_lowercase |
127 | let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; |
128 | let title_case = if cfg!(target_endian = "little" ) { |
129 | !(word + 0x1f1f_1f3f) | (word + 0x0505_0525) |
130 | } else { |
131 | !(word + 0x3f1f_1f1f) | (word + 0x2505_0505) |
132 | }; |
133 | (title_case & mask) == 0 |
134 | } |
135 | |
136 | pub const fn is_ascii_alphabetic_uppercase(&self) -> bool { |
137 | let word = self.0; |
138 | // See explanatory comments in is_ascii_alphabetic_lowercase |
139 | let mask = (word + 0x7f7f_7f7f) & 0x8080_8080; |
140 | let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525); |
141 | (upper_alpha & mask) == 0 |
142 | } |
143 | |
144 | pub const fn to_ascii_lowercase(&self) -> Self { |
145 | let word = self.0; |
146 | let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2); |
147 | Self(result) |
148 | } |
149 | |
150 | pub const fn to_ascii_titlecase(&self) -> Self { |
151 | let word = self.0.to_le(); |
152 | let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2; |
153 | let result = (word | mask) & !(0x20 & mask); |
154 | Self(u32::from_le(result)) |
155 | } |
156 | |
157 | pub const fn to_ascii_uppercase(&self) -> Self { |
158 | let word = self.0; |
159 | let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2); |
160 | Self(result) |
161 | } |
162 | } |
163 | |
164 | /// Internal helper struct that performs operations on aligned integers. |
165 | /// Supports strings up to 8 bytes long. |
166 | #[repr (transparent)] |
167 | pub struct Aligned8(u64); |
168 | |
169 | impl Aligned8 { |
170 | /// # Panics |
171 | /// Panics if N is greater than 8 |
172 | #[inline ] |
173 | pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self { |
174 | let mut bytes = [0; 8]; |
175 | let mut i = 0; |
176 | // The function documentation defines when panics may occur |
177 | #[allow (clippy::indexing_slicing)] |
178 | while i < N { |
179 | bytes[i] = src[i]; |
180 | i += 1; |
181 | } |
182 | Self(u64::from_ne_bytes(bytes)) |
183 | } |
184 | |
185 | #[inline ] |
186 | pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self { |
187 | Self::from_bytes::<N>(unsafe { core::mem::transmute(src) }) |
188 | } |
189 | |
190 | #[inline ] |
191 | pub const fn to_bytes(&self) -> [u8; 8] { |
192 | self.0.to_ne_bytes() |
193 | } |
194 | |
195 | #[inline ] |
196 | pub const fn to_ascii_bytes(&self) -> [AsciiByte; 8] { |
197 | unsafe { core::mem::transmute(self.to_bytes()) } |
198 | } |
199 | |
200 | pub const fn len(&self) -> usize { |
201 | let word = self.0; |
202 | #[cfg (target_endian = "little" )] |
203 | let len = (8 - word.leading_zeros() / 8) as usize; |
204 | #[cfg (target_endian = "big" )] |
205 | let len = (8 - word.trailing_zeros() / 8) as usize; |
206 | len |
207 | } |
208 | |
209 | pub const fn is_ascii_alphabetic(&self) -> bool { |
210 | let word = self.0; |
211 | let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; |
212 | let lower = word | 0x2020_2020_2020_2020; |
213 | let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505); |
214 | (alpha & mask) == 0 |
215 | } |
216 | |
217 | pub const fn is_ascii_alphanumeric(&self) -> bool { |
218 | let word = self.0; |
219 | let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; |
220 | let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646); |
221 | let lower = word | 0x2020_2020_2020_2020; |
222 | let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505); |
223 | (alpha & numeric & mask) == 0 |
224 | } |
225 | |
226 | pub const fn is_ascii_numeric(&self) -> bool { |
227 | let word = self.0; |
228 | let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; |
229 | let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646); |
230 | (numeric & mask) == 0 |
231 | } |
232 | |
233 | pub const fn is_ascii_lowercase(&self) -> bool { |
234 | let word = self.0; |
235 | let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525); |
236 | (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 |
237 | } |
238 | |
239 | pub const fn is_ascii_titlecase(&self) -> bool { |
240 | let word = self.0; |
241 | let invalid_case = if cfg!(target_endian = "little" ) { |
242 | !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505) |
243 | } else { |
244 | !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525) |
245 | }; |
246 | (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 |
247 | } |
248 | |
249 | pub const fn is_ascii_uppercase(&self) -> bool { |
250 | let word = self.0; |
251 | let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505); |
252 | (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080 |
253 | } |
254 | |
255 | pub const fn is_ascii_alphabetic_lowercase(&self) -> bool { |
256 | let word = self.0; |
257 | // `mask` sets all NUL bytes to 0. |
258 | let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; |
259 | // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1. |
260 | let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505); |
261 | // The overall string is valid if every character passes at least one test. |
262 | // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`). |
263 | (lower_alpha & mask) == 0 |
264 | } |
265 | |
266 | pub const fn is_ascii_alphabetic_titlecase(&self) -> bool { |
267 | let word = self.0; |
268 | // See explanatory comments in is_ascii_alphabetic_lowercase |
269 | let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; |
270 | let title_case = if cfg!(target_endian = "little" ) { |
271 | !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525) |
272 | } else { |
273 | !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505) |
274 | }; |
275 | (title_case & mask) == 0 |
276 | } |
277 | |
278 | pub const fn is_ascii_alphabetic_uppercase(&self) -> bool { |
279 | let word = self.0; |
280 | // See explanatory comments in is_ascii_alphabetic_lowercase |
281 | let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080; |
282 | let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525); |
283 | (upper_alpha & mask) == 0 |
284 | } |
285 | |
286 | pub const fn to_ascii_lowercase(&self) -> Self { |
287 | let word = self.0; |
288 | let result = word |
289 | | (((word + 0x3f3f_3f3f_3f3f_3f3f) |
290 | & !(word + 0x2525_2525_2525_2525) |
291 | & 0x8080_8080_8080_8080) |
292 | >> 2); |
293 | Self(result) |
294 | } |
295 | |
296 | pub const fn to_ascii_titlecase(&self) -> Self { |
297 | let word = self.0.to_le(); |
298 | let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f) |
299 | & !(word + 0x2525_2525_2525_2505) |
300 | & 0x8080_8080_8080_8080) |
301 | >> 2; |
302 | let result = (word | mask) & !(0x20 & mask); |
303 | Self(u64::from_le(result)) |
304 | } |
305 | |
306 | pub const fn to_ascii_uppercase(&self) -> Self { |
307 | let word = self.0; |
308 | let result = word |
309 | & !(((word + 0x1f1f_1f1f_1f1f_1f1f) |
310 | & !(word + 0x0505_0505_0505_0505) |
311 | & 0x8080_8080_8080_8080) |
312 | >> 2); |
313 | Self(result) |
314 | } |
315 | } |
316 | |