1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use super::{AsULE, RawBytesULE, VarULE}; |
6 | use crate::ule::EqULE; |
7 | use crate::{map::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroVecError}; |
8 | use alloc::boxed::Box; |
9 | use core::cmp::Ordering; |
10 | use core::fmt; |
11 | use core::ops::Deref; |
12 | |
/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`UnvalidatedStr`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// Equality and ordering are derived from the wrapped bytes, so two values compare
/// exactly as their byte slices do.
///
/// # Examples
///
/// Using an [`UnvalidatedStr`] as the key of a [`ZeroMap`]:
///
/// ```
/// use zerovec::ule::UnvalidatedStr;
/// use zerovec::ZeroMap;
///
/// let map: ZeroMap<UnvalidatedStr, usize> = [
///     (UnvalidatedStr::from_str("abc"), 11),
///     (UnvalidatedStr::from_str("def"), 22),
///     (UnvalidatedStr::from_str("ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied_by(|uvstr| uvstr.as_bytes().cmp(key.as_bytes()));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: crate::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct UnvalidatedStr([u8]);

impl fmt::Debug for UnvalidatedStr {
    /// Formats the value as a quoted string when the bytes are valid UTF-8,
    /// and as a list of bytes otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        match self.try_as_str() {
            Ok(s) => fmt::Debug::fmt(s, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl UnvalidatedStr {
    /// Create a [`UnvalidatedStr`] from a byte slice.
    ///
    /// The bytes are not inspected; no UTF-8 validation takes place.
    #[inline]
    pub const fn from_bytes(other: &[u8]) -> &Self {
        // SAFETY: `UnvalidatedStr` is `#[repr(transparent)]` over `[u8]`, so
        // `&[u8]` and `&UnvalidatedStr` share the same layout. The source and
        // target types are spelled out so inference can never pick another pair.
        unsafe { core::mem::transmute::<&[u8], &Self>(other) }
    }

    /// Create a [`UnvalidatedStr`] from a string slice.
    #[inline]
    pub const fn from_str(s: &str) -> &Self {
        Self::from_bytes(s.as_bytes())
    }

    /// Create a [`UnvalidatedStr`] from boxed bytes.
    ///
    /// Ownership of the existing allocation is transferred; nothing is copied.
    #[inline]
    pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
        let raw = Box::into_raw(other) as *mut Self;
        // SAFETY: `raw` comes straight from `Box::into_raw`, and `UnvalidatedStr`
        // is `#[repr(transparent)]` over `[u8]`, so the pointee layout and the
        // slice length metadata are unchanged by the cast.
        unsafe { Box::from_raw(raw) }
    }

    /// Create a [`UnvalidatedStr`] from a boxed `str`.
    #[inline]
    pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
        Self::from_boxed_bytes(other.into_boxed_bytes())
    }

    /// Get the bytes from a [`UnvalidatedStr`].
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Attempt to convert a [`UnvalidatedStr`] to a `str`.
    ///
    /// # Errors
    ///
    /// Returns the [`core::str::Utf8Error`] reported by [`core::str::from_utf8`]
    /// if the bytes are not valid UTF-8.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::UnvalidatedStr;
    ///
    /// static A: &UnvalidatedStr = UnvalidatedStr::from_bytes(b"abc");
    ///
    /// let b = A.try_as_str().unwrap();
    /// assert_eq!(b, "abc");
    /// ```
    // Note: this is const starting in 1.63
    #[inline]
    pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
        core::str::from_utf8(&self.0)
    }
}
111 | |
112 | impl<'a> From<&'a str> for &'a UnvalidatedStr { |
113 | #[inline ] |
114 | fn from(other: &'a str) -> Self { |
115 | UnvalidatedStr::from_str(other) |
116 | } |
117 | } |
118 | |
119 | impl From<Box<str>> for Box<UnvalidatedStr> { |
120 | #[inline ] |
121 | fn from(other: Box<str>) -> Self { |
122 | UnvalidatedStr::from_boxed_str(other) |
123 | } |
124 | } |
125 | |
126 | impl Deref for UnvalidatedStr { |
127 | type Target = [u8]; |
128 | fn deref(&self) -> &Self::Target { |
129 | &self.0 |
130 | } |
131 | } |
132 | |
133 | impl<'a> ZeroMapKV<'a> for UnvalidatedStr { |
134 | type Container = VarZeroVec<'a, UnvalidatedStr>; |
135 | type Slice = VarZeroSlice<UnvalidatedStr>; |
136 | type GetType = UnvalidatedStr; |
137 | type OwnedType = Box<UnvalidatedStr>; |
138 | } |
139 | |
140 | // Safety (based on the safety checklist on the VarULE trait): |
141 | // 1. UnvalidatedStr does not include any uninitialized or padding bytes (transparent over a ULE) |
142 | // 2. UnvalidatedStr is aligned to 1 byte (transparent over a ULE) |
143 | // 3. The impl of `validate_byte_slice()` returns an error if any byte is not valid (impossible) |
144 | // 4. The impl of `validate_byte_slice()` returns an error if the slice cannot be used in its entirety (impossible) |
145 | // 5. The impl of `from_byte_slice_unchecked()` returns a reference to the same data (returns the argument directly) |
146 | // 6. All other methods are defaulted |
147 | // 7. `[T]` byte equality is semantic equality (transparent over a ULE) |
148 | unsafe impl VarULE for UnvalidatedStr { |
149 | #[inline ] |
150 | fn validate_byte_slice(_: &[u8]) -> Result<(), ZeroVecError> { |
151 | Ok(()) |
152 | } |
153 | #[inline ] |
154 | unsafe fn from_byte_slice_unchecked(bytes: &[u8]) -> &Self { |
155 | UnvalidatedStr::from_bytes(bytes) |
156 | } |
157 | } |
158 | |
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl serde::Serialize for UnvalidatedStr {
    /// Serializes as a string for human-readable formats and as raw bytes
    /// otherwise.
    ///
    /// # Errors
    ///
    /// Fails with a custom error when the bytes are not valid UTF-8. The check
    /// runs before the format is consulted, so it applies to both encodings.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        let text = match self.try_as_str() {
            Ok(text) => text,
            Err(_) => return Err(S::Error::custom("invalid UTF-8 in UnvalidatedStr")),
        };
        if !serializer.is_human_readable() {
            return serializer.serialize_bytes(text.as_bytes());
        }
        serializer.serialize_str(text)
    }
}
177 | |
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Box<UnvalidatedStr> {
    /// Reads an owned value: a string from human-readable formats, a byte
    /// sequence from all others. The bytes of the latter are taken as-is.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            return Box::<[u8]>::deserialize(deserializer).map(UnvalidatedStr::from_boxed_bytes);
        }
        Box::<str>::deserialize(deserializer).map(UnvalidatedStr::from_boxed_str)
    }
}
194 | |
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a UnvalidatedStr
where
    'de: 'a,
{
    /// Reads a borrowed value: a `&str` from human-readable formats, a `&[u8]`
    /// from all others. The bytes of the latter are taken as-is.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            let raw = <&[u8]>::deserialize(deserializer)?;
            return Ok(UnvalidatedStr::from_bytes(raw));
        }
        let text = <&str>::deserialize(deserializer)?;
        Ok(UnvalidatedStr::from_str(text))
    }
}
214 | |
/// Three little-endian bytes that are expected to hold a Unicode scalar value, without
/// any guarantee that they actually do.
///
/// Reach for this type instead of `char` when the data should consist of valid Unicode
/// scalar values but you want to decide yourself when, or whether, that is checked.
///
/// # Examples
///
/// ```
/// use zerovec::ule::{RawBytesULE, UnvalidatedChar, ULE};
/// use zerovec::{ZeroSlice, ZeroVec};
///
/// // data known to be little-endian three-byte chunks of valid Unicode scalar values
/// let data = [0x68, 0x00, 0x00, 0x69, 0x00, 0x00, 0x4B, 0xF4, 0x01];
/// // ground truth expectation
/// let real = ['h', 'i', '👋'];
///
/// let chars: &ZeroSlice<UnvalidatedChar> = ZeroSlice::parse_byte_slice(&data).expect("invalid data length");
/// let parsed: Vec<_> = chars.iter().map(|c| unsafe { c.to_char_unchecked() }).collect();
/// assert_eq!(&parsed, &real);
///
/// let real_chars: ZeroVec<_> = real.iter().copied().map(UnvalidatedChar::from_char).collect();
/// let serialized_data = chars.as_bytes();
/// assert_eq!(serialized_data, &data);
/// ```
#[repr(transparent)]
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct UnvalidatedChar([u8; 3]);

impl UnvalidatedChar {
    /// Create a [`UnvalidatedChar`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::UnvalidatedChar;
    ///
    /// let a = UnvalidatedChar::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        Self::from_u24(c as u32)
    }

    /// Stores the three low-order bytes of `c`; the most significant byte is dropped.
    #[inline]
    #[doc(hidden)]
    pub const fn from_u24(c: u32) -> Self {
        let le = c.to_le_bytes();
        Self([le[0], le[1], le[2]])
    }

    /// Zero-extends the three stored little-endian bytes to a `u32`.
    #[inline]
    fn to_u32(self) -> u32 {
        let Self([lo, mid, hi]) = self;
        u32::from_le_bytes([lo, mid, hi, 0])
    }

    /// Attempt to convert a [`UnvalidatedChar`] to a `char`.
    ///
    /// # Errors
    ///
    /// Returns a [`core::char::CharTryFromError`] when the stored value is not a
    /// Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::{AsULE, UnvalidatedChar};
    ///
    /// let a = UnvalidatedChar::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = UnvalidatedChar::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(matches!(b.try_to_char(), Err(_)));
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        char::try_from(self.to_u32())
    }

    /// Convert a [`UnvalidatedChar`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `UnvalidatedChar` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::{AsULE, UnvalidatedChar};
    ///
    /// let a = UnvalidatedChar::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        match self.try_to_char() {
            Ok(c) => c,
            Err(_) => char::REPLACEMENT_CHARACTER,
        }
    }

    /// Convert a [`UnvalidatedChar`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `UnvalidatedChar` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerovec::ule::UnvalidatedChar;
    ///
    /// let a = UnvalidatedChar::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        // The caller upholds the contract in `# Safety` above.
        char::from_u32_unchecked(self.to_u32())
    }
}
324 | |
325 | impl RawBytesULE<3> { |
326 | /// Converts a [`UnvalidatedChar`] to its ULE type. This is equivalent to calling |
327 | /// [`AsULE::to_unaligned`]. |
328 | #[inline ] |
329 | pub const fn from_unvalidated_char(uc: UnvalidatedChar) -> Self { |
330 | RawBytesULE(uc.0) |
331 | } |
332 | } |
333 | |
334 | impl AsULE for UnvalidatedChar { |
335 | type ULE = RawBytesULE<3>; |
336 | |
337 | #[inline ] |
338 | fn to_unaligned(self) -> Self::ULE { |
339 | RawBytesULE(self.0) |
340 | } |
341 | |
342 | #[inline ] |
343 | fn from_unaligned(unaligned: Self::ULE) -> Self { |
344 | Self(unaligned.0) |
345 | } |
346 | } |
347 | |
// Safety: UnvalidatedChar stores exactly the three little-endian bytes that its
// AsULE::ULE type (RawBytesULE<3>) stores; `to_unaligned` and `from_unaligned`
// only move that array between the two wrappers, so both share one byte
// representation.
unsafe impl EqULE for UnvalidatedChar {}
351 | |
352 | impl fmt::Debug for UnvalidatedChar { |
353 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
354 | // Debug as a char if possible |
355 | match self.try_to_char() { |
356 | Ok(c: char) => fmt::Debug::fmt(&c, f), |
357 | Err(_) => fmt::Debug::fmt(&self.0, f), |
358 | } |
359 | } |
360 | } |
361 | |
362 | impl PartialOrd for UnvalidatedChar { |
363 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
364 | Some(self.cmp(other)) |
365 | } |
366 | } |
367 | |
368 | impl Ord for UnvalidatedChar { |
369 | // custom implementation, as derived Ord would compare lexicographically |
370 | fn cmp(&self, other: &Self) -> Ordering { |
371 | let [a0: u8, a1: u8, a2: u8] = self.0; |
372 | let a: u32 = u32::from_le_bytes([a0, a1, a2, 0]); |
373 | let [b0: u8, b1: u8, b2: u8] = other.0; |
374 | let b: u32 = u32::from_le_bytes([b0, b1, b2, 0]); |
375 | a.cmp(&b) |
376 | } |
377 | } |
378 | |
379 | impl From<char> for UnvalidatedChar { |
380 | #[inline ] |
381 | fn from(value: char) -> Self { |
382 | Self::from_char(value) |
383 | } |
384 | } |
385 | |
386 | impl TryFrom<UnvalidatedChar> for char { |
387 | type Error = core::char::CharTryFromError; |
388 | |
389 | #[inline ] |
390 | fn try_from(value: UnvalidatedChar) -> Result<char, Self::Error> { |
391 | value.try_to_char() |
392 | } |
393 | } |
394 | |
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl serde::Serialize for UnvalidatedChar {
    /// Serializes as a `char` for human-readable formats and as the three raw
    /// bytes otherwise.
    ///
    /// # Errors
    ///
    /// Fails with a custom error when the value is not a Unicode scalar value.
    /// The check runs before the format is consulted, so it applies to both
    /// encodings.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        let c = match self.try_to_char() {
            Ok(c) => c,
            Err(_) => {
                return Err(S::Error::custom(
                    "invalid Unicode scalar value in UnvalidatedChar",
                ))
            }
        };
        if !serializer.is_human_readable() {
            return self.0.serialize(serializer);
        }
        serializer.serialize_char(c)
    }
}
413 | |
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for UnvalidatedChar {
    /// Reads a `char` from human-readable formats and three raw bytes from all
    /// others. The raw bytes are stored as-is, without validation.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            let raw = <[u8; 3]>::deserialize(deserializer)?;
            return Ok(UnvalidatedChar(raw));
        }
        let scalar = <char>::deserialize(deserializer)?;
        Ok(UnvalidatedChar::from_char(scalar))
    }
}
430 | |
#[cfg(feature = "databake")]
impl databake::Bake for UnvalidatedChar {
    /// Emits a `from_char` call for valid scalar values and a `from_u24` call
    /// with the raw integer for everything else.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both outcomes emit a path into `zerovec`, so register the crate up front.
        env.insert("zerovec");
        if let Ok(ch) = self.try_to_char() {
            let ch = ch.bake(env);
            return databake::quote! {
                zerovec::ule::UnvalidatedChar::from_char(#ch)
            };
        }
        let [b0, b1, b2] = self.0;
        let u24 = u32::from_le_bytes([b0, b1, b2, 0]);
        databake::quote! {
            zerovec::ule::UnvalidatedChar::from_u24(#u24)
        }
    }
}
452 | |
// Unit tests for `UnvalidatedChar`: serde behaviour in both a human-readable
// format (JSON) and a binary one (bincode), the in-memory byte layout, and
// databake output.
#[cfg(test)]
mod test {
    use super::*;
    use crate::ZeroVec;

    // 0xFFFFFF is above `char::MAX`, so serialization must be refused in both
    // the human-readable and the binary format.
    #[test]
    fn test_serde_fail() {
        let uc = UnvalidatedChar([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    // JSON round trip: the value is written as a one-character string.
    #[test]
    fn test_serde_json() {
        let c = '🙃';
        let uc = UnvalidatedChar::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();

        assert_eq!(json_ser, r#""🙃""#);

        let json_de: UnvalidatedChar = serde_json::from_str(&json_ser).unwrap();

        assert_eq!(uc, json_de);
    }

    // bincode round trip: the value is written as its three little-endian
    // bytes (U+1F643 -> 43 F6 01).
    #[test]
    fn test_serde_bincode() {
        let c = '🙃';
        let uc = UnvalidatedChar::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();

        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);

        let bytes_de: UnvalidatedChar = bincode::deserialize(&bytes_ser).unwrap();

        assert_eq!(uc, bytes_de);
    }

    // The bytes of a `[UnvalidatedChar]` must match the bytes of the
    // equivalent `ZeroVec`, and both must be the expected little-endian data.
    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [UnvalidatedChar]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(UnvalidatedChar::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();

        let ule_bytes = zvec.as_bytes();
        let uvbytes;
        // View the `Vec<UnvalidatedChar>` storage as plain bytes, using the
        // ZeroVec's byte length as the length of the view.
        unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
        }

        // UnvalidatedChar is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        // U+0077, U+03C9, U+6587, U+11103, U+1F643 as three little-endian bytes each
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    // Baking must produce `from_char` for a valid char and `from_u24` for a
    // value that is not a Unicode scalar value.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(UnvalidatedChar, const: crate::ule::UnvalidatedChar::from_char('b'), zerovec);
        // surrogate code point (55296 == 0xD800)
        databake::test_bake!(UnvalidatedChar, const: crate::ule::UnvalidatedChar::from_u24(55296u32), zerovec);
    }
}
528 | |