// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use super::{AsULE, RawBytesULE, VarULE};
6use crate::ule::EqULE;
7use crate::{map::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroVecError};
8use alloc::boxed::Box;
9use core::cmp::Ordering;
10use core::fmt;
11use core::ops::Deref;
12
/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`UnvalidatedStr`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// Equality and ordering are derived from the wrapped bytes.
///
/// # Examples
///
/// Using an [`UnvalidatedStr`] as the key of a [`ZeroMap`]:
///
/// ```
/// use zerovec::ule::UnvalidatedStr;
/// use zerovec::ZeroMap;
///
/// let map: ZeroMap<UnvalidatedStr, usize> = [
///     (UnvalidatedStr::from_str("abc"), 11),
///     (UnvalidatedStr::from_str("def"), 22),
///     (UnvalidatedStr::from_str("ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied_by(|uvstr| uvstr.as_bytes().cmp(key.as_bytes()));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: crate::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct UnvalidatedStr([u8]);
49
50impl fmt::Debug for UnvalidatedStr {
51 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
52 // Debug as a string if possible
53 match self.try_as_str() {
54 Ok(s: &str) => fmt::Debug::fmt(self:s, f),
55 Err(_) => fmt::Debug::fmt(&self.0, f),
56 }
57 }
58}
59
60impl UnvalidatedStr {
61 /// Create a [`UnvalidatedStr`] from a byte slice.
62 #[inline]
63 pub const fn from_bytes(other: &[u8]) -> &Self {
64 // Safety: UnvalidatedStr is transparent over [u8]
65 unsafe { core::mem::transmute(other) }
66 }
67
68 /// Create a [`UnvalidatedStr`] from a string slice.
69 #[inline]
70 pub const fn from_str(s: &str) -> &Self {
71 Self::from_bytes(s.as_bytes())
72 }
73
74 /// Create a [`UnvalidatedStr`] from boxed bytes.
75 #[inline]
76 pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
77 // Safety: UnvalidatedStr is transparent over [u8]
78 unsafe { core::mem::transmute(other) }
79 }
80
81 /// Create a [`UnvalidatedStr`] from a boxed `str`.
82 #[inline]
83 pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
84 Self::from_boxed_bytes(other.into_boxed_bytes())
85 }
86
87 /// Get the bytes from a [`UnvalidatedStr].
88 #[inline]
89 pub const fn as_bytes(&self) -> &[u8] {
90 &self.0
91 }
92
93 /// Attempt to convert a [`UnvalidatedStr`] to a `str`.
94 ///
95 /// # Examples
96 ///
97 /// ```
98 /// use zerovec::ule::UnvalidatedStr;
99 ///
100 /// static A: &UnvalidatedStr = UnvalidatedStr::from_bytes(b"abc");
101 ///
102 /// let b = A.try_as_str().unwrap();
103 /// assert_eq!(b, "abc");
104 /// ```
105 // Note: this is const starting in 1.63
106 #[inline]
107 pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
108 core::str::from_utf8(&self.0)
109 }
110}
111
112impl<'a> From<&'a str> for &'a UnvalidatedStr {
113 #[inline]
114 fn from(other: &'a str) -> Self {
115 UnvalidatedStr::from_str(other)
116 }
117}
118
119impl From<Box<str>> for Box<UnvalidatedStr> {
120 #[inline]
121 fn from(other: Box<str>) -> Self {
122 UnvalidatedStr::from_boxed_str(other)
123 }
124}
125
126impl Deref for UnvalidatedStr {
127 type Target = [u8];
128 fn deref(&self) -> &Self::Target {
129 &self.0
130 }
131}
132
133impl<'a> ZeroMapKV<'a> for UnvalidatedStr {
134 type Container = VarZeroVec<'a, UnvalidatedStr>;
135 type Slice = VarZeroSlice<UnvalidatedStr>;
136 type GetType = UnvalidatedStr;
137 type OwnedType = Box<UnvalidatedStr>;
138}
139
140// Safety (based on the safety checklist on the VarULE trait):
141// 1. UnvalidatedStr does not include any uninitialized or padding bytes (transparent over a ULE)
142// 2. UnvalidatedStr is aligned to 1 byte (transparent over a ULE)
143// 3. The impl of `validate_byte_slice()` returns an error if any byte is not valid (impossible)
144// 4. The impl of `validate_byte_slice()` returns an error if the slice cannot be used in its entirety (impossible)
145// 5. The impl of `from_byte_slice_unchecked()` returns a reference to the same data (returns the argument directly)
146// 6. All other methods are defaulted
147// 7. `[T]` byte equality is semantic equality (transparent over a ULE)
148unsafe impl VarULE for UnvalidatedStr {
149 #[inline]
150 fn validate_byte_slice(_: &[u8]) -> Result<(), ZeroVecError> {
151 Ok(())
152 }
153 #[inline]
154 unsafe fn from_byte_slice_unchecked(bytes: &[u8]) -> &Self {
155 UnvalidatedStr::from_bytes(bytes)
156 }
157}
158
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl serde::Serialize for UnvalidatedStr {
    /// Serializes as a string in human-readable formats and as bytes otherwise.
    ///
    /// # Errors
    ///
    /// Fails with a custom error if the bytes are not valid UTF-8. The check runs
    /// before the format is inspected, so it applies to both output forms.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        let s = self
            .try_as_str()
            .map_err(|_| S::Error::custom("invalid UTF-8 in UnvalidatedStr"))?;
        if serializer.is_human_readable() {
            serializer.serialize_str(s)
        } else {
            serializer.serialize_bytes(s.as_bytes())
        }
    }
}
177
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Box<UnvalidatedStr> {
    /// Reads an owned value: a string in human-readable formats, raw bytes otherwise.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let boxed_str = Box::<str>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_boxed_str(boxed_str))
        } else {
            let boxed_bytes = Box::<[u8]>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_boxed_bytes(boxed_bytes))
        }
    }
}
194
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a UnvalidatedStr
where
    'de: 'a,
{
    /// Reads a value borrowed from the input: a `&str` in human-readable formats,
    /// a `&[u8]` otherwise.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let s = <&str>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_str(s))
        } else {
            let bytes = <&[u8]>::deserialize(deserializer)?;
            Ok(UnvalidatedStr::from_bytes(bytes))
        }
    }
}
214
/// A u8 array of little-endian data that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// # Examples
///
/// ```
/// use zerovec::ule::{RawBytesULE, UnvalidatedChar, ULE};
/// use zerovec::{ZeroSlice, ZeroVec};
///
/// // data known to be little-endian three-byte chunks of valid Unicode scalar values
/// let data = [0x68, 0x00, 0x00, 0x69, 0x00, 0x00, 0x4B, 0xF4, 0x01];
/// // ground truth expectation
/// let real = ['h', 'i', '👋'];
///
/// let chars: &ZeroSlice<UnvalidatedChar> = ZeroSlice::parse_byte_slice(&data).expect("invalid data length");
/// let parsed: Vec<_> = chars.iter().map(|c| unsafe { c.to_char_unchecked() }).collect();
/// assert_eq!(&parsed, &real);
///
/// // going the other way: chars -> bytes
/// let real_chars: ZeroVec<_> = real.iter().copied().map(UnvalidatedChar::from_char).collect();
/// let serialized_data = real_chars.as_bytes();
/// assert_eq!(serialized_data, &data);
/// ```
#[repr(transparent)]
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct UnvalidatedChar([u8; 3]);
243
244impl UnvalidatedChar {
245 /// Create a [`UnvalidatedChar`] from a `char`.
246 ///
247 /// # Examples
248 ///
249 /// ```
250 /// use zerovec::ule::UnvalidatedChar;
251 ///
252 /// let a = UnvalidatedChar::from_char('a');
253 /// assert_eq!(a.try_to_char().unwrap(), 'a');
254 /// ```
255 #[inline]
256 pub const fn from_char(c: char) -> Self {
257 let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
258 Self([u0, u1, u2])
259 }
260
261 #[inline]
262 #[doc(hidden)]
263 pub const fn from_u24(c: u32) -> Self {
264 let [u0, u1, u2, _u3] = c.to_le_bytes();
265 Self([u0, u1, u2])
266 }
267
268 /// Attempt to convert a [`UnvalidatedChar`] to a `char`.
269 ///
270 /// # Examples
271 ///
272 /// ```
273 /// use zerovec::ule::{AsULE, UnvalidatedChar};
274 ///
275 /// let a = UnvalidatedChar::from_char('a');
276 /// assert_eq!(a.try_to_char(), Ok('a'));
277 ///
278 /// let b = UnvalidatedChar::from_unaligned([0xFF, 0xFF, 0xFF].into());
279 /// assert!(matches!(b.try_to_char(), Err(_)));
280 /// ```
281 #[inline]
282 pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
283 let [u0, u1, u2] = self.0;
284 char::try_from(u32::from_le_bytes([u0, u1, u2, 0]))
285 }
286
287 /// Convert a [`UnvalidatedChar`] to a `char', returning [`char::REPLACEMENT_CHARACTER`]
288 /// if the `UnvalidatedChar` does not represent a valid Unicode scalar value.
289 ///
290 /// # Examples
291 ///
292 /// ```
293 /// use zerovec::ule::{AsULE, UnvalidatedChar};
294 ///
295 /// let a = UnvalidatedChar::from_unaligned([0xFF, 0xFF, 0xFF].into());
296 /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
297 /// ```
298 #[inline]
299 pub fn to_char_lossy(self) -> char {
300 self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
301 }
302
303 /// Convert a [`UnvalidatedChar`] to a `char` without checking that it is
304 /// a valid Unicode scalar value.
305 ///
306 /// # Safety
307 ///
308 /// The `UnvalidatedChar` must be a valid Unicode scalar value in little-endian order.
309 ///
310 /// # Examples
311 ///
312 /// ```
313 /// use zerovec::ule::UnvalidatedChar;
314 ///
315 /// let a = UnvalidatedChar::from_char('a');
316 /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
317 /// ```
318 #[inline]
319 pub unsafe fn to_char_unchecked(self) -> char {
320 let [u0, u1, u2] = self.0;
321 char::from_u32_unchecked(u32::from_le_bytes([u0, u1, u2, 0]))
322 }
323}
324
325impl RawBytesULE<3> {
326 /// Converts a [`UnvalidatedChar`] to its ULE type. This is equivalent to calling
327 /// [`AsULE::to_unaligned`].
328 #[inline]
329 pub const fn from_unvalidated_char(uc: UnvalidatedChar) -> Self {
330 RawBytesULE(uc.0)
331 }
332}
333
334impl AsULE for UnvalidatedChar {
335 type ULE = RawBytesULE<3>;
336
337 #[inline]
338 fn to_unaligned(self) -> Self::ULE {
339 RawBytesULE(self.0)
340 }
341
342 #[inline]
343 fn from_unaligned(unaligned: Self::ULE) -> Self {
344 Self(unaligned.0)
345 }
346}
347
348// Safety: UnvalidatedChar is always the little-endian representation of a char,
349// which corresponds to its AsULE::ULE type
350unsafe impl EqULE for UnvalidatedChar {}
351
352impl fmt::Debug for UnvalidatedChar {
353 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
354 // Debug as a char if possible
355 match self.try_to_char() {
356 Ok(c: char) => fmt::Debug::fmt(&c, f),
357 Err(_) => fmt::Debug::fmt(&self.0, f),
358 }
359 }
360}
361
362impl PartialOrd for UnvalidatedChar {
363 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
364 Some(self.cmp(other))
365 }
366}
367
368impl Ord for UnvalidatedChar {
369 // custom implementation, as derived Ord would compare lexicographically
370 fn cmp(&self, other: &Self) -> Ordering {
371 let [a0: u8, a1: u8, a2: u8] = self.0;
372 let a: u32 = u32::from_le_bytes([a0, a1, a2, 0]);
373 let [b0: u8, b1: u8, b2: u8] = other.0;
374 let b: u32 = u32::from_le_bytes([b0, b1, b2, 0]);
375 a.cmp(&b)
376 }
377}
378
379impl From<char> for UnvalidatedChar {
380 #[inline]
381 fn from(value: char) -> Self {
382 Self::from_char(value)
383 }
384}
385
386impl TryFrom<UnvalidatedChar> for char {
387 type Error = core::char::CharTryFromError;
388
389 #[inline]
390 fn try_from(value: UnvalidatedChar) -> Result<char, Self::Error> {
391 value.try_to_char()
392 }
393}
394
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl serde::Serialize for UnvalidatedChar {
    /// Serializes as a `char` in human-readable formats and as the raw three-byte
    /// array otherwise.
    ///
    /// # Errors
    ///
    /// Fails with a custom error if the value is not a valid `char`. The check
    /// runs before the format is inspected, so it applies to both output forms.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        let c = self
            .try_to_char()
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in UnvalidatedChar"))?;
        if serializer.is_human_readable() {
            serializer.serialize_char(c)
        } else {
            self.0.serialize(serializer)
        }
    }
}
413
/// This impl requires enabling the optional `serde` Cargo feature of the `zerovec` crate
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for UnvalidatedChar {
    /// Reads a `char` in human-readable formats and a raw `[u8; 3]` otherwise.
    ///
    /// The raw-bytes path stores the bytes as-is, without checking that they
    /// form a valid `char`.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let c = <char>::deserialize(deserializer)?;
            Ok(UnvalidatedChar::from_char(c))
        } else {
            let bytes = <[u8; 3]>::deserialize(deserializer)?;
            Ok(UnvalidatedChar(bytes))
        }
    }
}
430
#[cfg(feature = "databake")]
impl databake::Bake for UnvalidatedChar {
    /// Emits a `from_char(..)` constructor call when the value is a valid `char`,
    /// and a `from_u24(..)` call carrying the raw 24-bit value otherwise.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both generated forms reference the `zerovec` crate.
        env.insert("zerovec");
        match self.try_to_char() {
            Ok(ch) => {
                let ch = ch.bake(env);
                databake::quote! {
                    zerovec::ule::UnvalidatedChar::from_char(#ch)
                }
            }
            Err(_) => {
                let [u0, u1, u2] = self.0;
                let u24 = u32::from_le_bytes([u0, u1, u2, 0]);
                databake::quote! {
                    zerovec::ule::UnvalidatedChar::from_u24(#u24)
                }
            }
        }
    }
}
452
#[cfg(test)]
mod test {
    use super::*;
    use crate::ZeroVec;

    // Invalid scalar values must be rejected by both the JSON and bincode paths.
    #[test]
    fn test_serde_fail() {
        let uc = UnvalidatedChar([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    #[test]
    fn test_serde_json() {
        let c = '🙃';
        let uc = UnvalidatedChar::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();

        assert_eq!(json_ser, r#""🙃""#);

        let json_de: UnvalidatedChar = serde_json::from_str(&json_ser).unwrap();

        assert_eq!(uc, json_de);
    }

    #[test]
    fn test_serde_bincode() {
        let c = '🙃';
        let uc = UnvalidatedChar::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();

        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);

        let bytes_de: UnvalidatedChar = bincode::deserialize(&bytes_ser).unwrap();

        assert_eq!(uc, bytes_de);
    }

    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [UnvalidatedChar]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(UnvalidatedChar::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();

        let ule_bytes = zvec.as_bytes();
        let uvbytes;
        // SAFETY: `UnvalidatedChar` is `repr(transparent)` over `[u8; 3]`, so the
        // slice of `uvchars` can be viewed as plain bytes; the assertions below
        // expect its byte length to equal `ule_bytes.len()`.
        unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
        }

        // UnvalidatedChar is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    #[test]
    fn test_char_bake() {
        databake::test_bake!(UnvalidatedChar, const: crate::ule::UnvalidatedChar::from_char('b'), zerovec);
        // surrogate code point
        databake::test_bake!(UnvalidatedChar, const: crate::ule::UnvalidatedChar::from_u24(55296u32), zerovec);
    }
}
528