1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#![allow(clippy::upper_case_acronyms)]
6//! ULE implementation for the `char` type.
7
8use super::*;
9use crate::impl_ule_from_array;
10use core::cmp::Ordering;
11use core::convert::TryFrom;
12
/// A u8 array of little-endian data corresponding to a Unicode scalar value.
///
/// The bytes of a `CharULE` are guaranteed to represent a little-endian-encoded u32 that is a
/// valid `char` and can be converted without validation.
///
/// Only the three least-significant bytes of the u32 are stored: every `char` is at most
/// `0x10FFFF`, so the most-significant byte is always zero and is re-added on conversion.
///
/// # Examples
///
/// Convert a `char` to a `CharULE` and back again:
///
/// ```
/// use zerovec::ule::{AsULE, CharULE, ULE};
///
/// let c1 = '𑄃';
/// let ule = c1.to_unaligned();
/// assert_eq!(CharULE::as_byte_slice(&[ule]), &[0x03, 0x11, 0x01]);
/// let c2 = char::from_unaligned(ule);
/// assert_eq!(c1, c2);
/// ```
///
/// Attempt to parse invalid bytes to a `CharULE`:
///
/// ```
/// use zerovec::ule::{CharULE, ULE};
///
/// // The length is not a multiple of 3 bytes.
/// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF, 0xFF];
/// CharULE::parse_byte_slice(bytes).expect_err("Invalid bytes");
///
/// // The length is fine, but 0xFFFFFF is larger than `char::MAX` (0x10FFFF).
/// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF];
/// CharULE::parse_byte_slice(bytes).expect_err("Invalid bytes");
/// ```
#[repr(transparent)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub struct CharULE([u8; 3]);
43
44impl CharULE {
45 /// Converts a [`char`] to a [`CharULE`]. This is equivalent to calling
46 /// [`AsULE::to_unaligned()`]
47 ///
48 /// See the type-level documentation for [`CharULE`] for more information.
49 #[inline]
50 pub const fn from_aligned(c: char) -> Self {
51 let [u0: u8, u1: u8, u2: u8, _u3: u8] = (c as u32).to_le_bytes();
52 Self([u0, u1, u2])
53 }
54
55 impl_ule_from_array!(char, CharULE, Self([0; 3]));
56}
57
58// Safety (based on the safety checklist on the ULE trait):
59// 1. CharULE does not include any uninitialized or padding bytes.
60// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
61// 2. CharULE is aligned to 1 byte.
62// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
63// 3. The impl of validate_byte_slice() returns an error if any byte is not valid.
64// 4. The impl of validate_byte_slice() returns an error if there are extra bytes.
65// 5. The other ULE methods use the default impl.
66// 6. CharULE byte equality is semantic equality
67unsafe impl ULE for CharULE {
68 #[inline]
69 fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> {
70 if bytes.len() % 3 != 0 {
71 return Err(ZeroVecError::length::<Self>(bytes.len()));
72 }
73 // Validate the bytes
74 for chunk: &[u8] in bytes.chunks_exact(chunk_size:3) {
75 // TODO: Use slice::as_chunks() when stabilized
76 #[allow(clippy::indexing_slicing)]
77 // Won't panic because the chunks are always 3 bytes long
78 let u: u32 = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]);
79 char::try_from(u).map_err(|_| ZeroVecError::parse::<Self>())?;
80 }
81 Ok(())
82 }
83}
84
85impl AsULE for char {
86 type ULE = CharULE;
87
88 #[inline]
89 fn to_unaligned(self) -> Self::ULE {
90 CharULE::from_aligned(self)
91 }
92
93 #[inline]
94 fn from_unaligned(unaligned: Self::ULE) -> Self {
95 // Safe because the bytes of CharULE are defined to represent a valid Unicode scalar value.
96 unsafe {
97 Self::from_u32_unchecked(u32::from_le_bytes([
98 unaligned.0[0],
99 unaligned.0[1],
100 unaligned.0[2],
101 0,
102 ]))
103 }
104 }
105}
106
107impl PartialOrd for CharULE {
108 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
109 char::from_unaligned(*self).partial_cmp(&char::from_unaligned(*other))
110 }
111}
112
113impl Ord for CharULE {
114 fn cmp(&self, other: &Self) -> Ordering {
115 char::from_unaligned(*self).cmp(&char::from_unaligned(*other))
116 }
117}
118
#[cfg(test)]
mod test {
    use super::*;

    /// `from_array` is usable in const context and emits 3 little-endian bytes per `char`.
    #[test]
    fn test_from_array() {
        const CHARS: [char; 2] = ['a', '🙃'];
        const CHARS_ULE: [CharULE; 2] = CharULE::from_array(CHARS);
        assert_eq!(
            CharULE::as_byte_slice(&CHARS_ULE),
            &[0x61, 0x00, 0x00, 0x43, 0xF6, 0x01]
        );
    }

    /// `from_array` also works for a zero-length array.
    #[test]
    fn test_from_array_zst() {
        const CHARS: [char; 0] = [];
        const CHARS_ULE: [CharULE; 0] = CharULE::from_array(CHARS);
        let bytes = CharULE::as_byte_slice(&CHARS_ULE);
        let empty: &[u8] = &[];
        assert_eq!(bytes, empty);
    }

    /// Round-trips chars through `CharULE` bytes and checks the exact byte layout.
    #[test]
    fn test_parse() {
        // 1-byte, 2-byte, 3-byte, and two 4-byte characters in UTF-8 (not as relevant in UTF-32)
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];
        let char_ules: Vec<CharULE> = chars.iter().copied().map(char::to_unaligned).collect();
        let char_bytes: &[u8] = CharULE::as_byte_slice(&char_ules);

        // Check parsing
        let parsed_ules: &[CharULE] = CharULE::parse_byte_slice(char_bytes).unwrap();
        assert_eq!(char_ules, parsed_ules);
        let parsed_chars: Vec<char> = parsed_ules
            .iter()
            .copied()
            .map(char::from_unaligned)
            .collect();
        assert_eq!(&chars, parsed_chars.as_slice());

        // Compare to golden expected data
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            char_bytes
        );
    }

    /// Each rejection path is exercised with an input that reaches it.
    ///
    /// Inputs are written as 3-byte little-endian groups so that a scalar-value
    /// failure is not masked by the length check or by chunk misalignment.
    #[test]
    fn test_failures() {
        // Sanity check: the valid neighbours on their own parse fine.
        let valid_bytes: &[u8] = &[119, 0x00, 0x00, 120, 0x00, 0x00];
        assert!(CharULE::parse_byte_slice(valid_bytes).is_ok());

        // 119 and 120 are valid, but not 0xD800 (high surrogate)
        let surrogate_bytes: &[u8] = &[119, 0x00, 0x00, 0x00, 0xD8, 0x00, 120, 0x00, 0x00];
        assert!(CharULE::parse_byte_slice(surrogate_bytes).is_err());

        // 0x10FFFF is the largest valid char; 0x110000 is the first invalid value
        let max_bytes: &[u8] = &[0xFF, 0xFF, 0x10];
        assert!(CharULE::parse_byte_slice(max_bytes).is_ok());
        let above_max_bytes: &[u8] = &[0x00, 0x00, 0x11];
        assert!(CharULE::parse_byte_slice(above_max_bytes).is_err());

        // 0x20FFFF is out of range for a char
        let out_of_range_bytes: &[u8] = &[0xFF, 0xFF, 0x20];
        assert!(CharULE::parse_byte_slice(out_of_range_bytes).is_err());

        // A length that is not a multiple of 3 is rejected regardless of content
        let wrong_length_bytes: &[u8] = &[119, 0x00, 0x00, 0x00];
        assert!(CharULE::parse_byte_slice(wrong_length_bytes).is_err());
    }
}
191