// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
| 4 | |
| 5 | #![allow (clippy::upper_case_acronyms)] |
//! ULE implementation for the `char` type.
| 7 | |
| 8 | use super::*; |
| 9 | use crate::impl_ule_from_array; |
| 10 | use core::cmp::Ordering; |
| 11 | use core::convert::TryFrom; |
| 12 | |
/// A u8 array of little-endian data corresponding to a Unicode scalar value.
///
/// The three bytes of a `CharULE` are guaranteed to be the low-order bytes of a
/// little-endian-encoded u32 (whose fourth, most significant byte is zero) that is a
/// valid `char` and can be converted without validation.
///
/// # Examples
///
/// Convert a `char` to a `CharULE` and back again:
///
/// ```
/// use zerovec::ule::{AsULE, CharULE, ULE};
///
/// let c1 = '𑄃';
/// let ule = c1.to_unaligned();
/// assert_eq!(CharULE::as_byte_slice(&[ule]), &[0x03, 0x11, 0x01]);
/// let c2 = char::from_unaligned(ule);
/// assert_eq!(c1, c2);
/// ```
///
/// Attempt to parse invalid bytes to a `CharULE`:
///
/// ```
/// use zerovec::ule::{CharULE, ULE};
///
/// // Exactly one element wide, but 0xFFFFFF is greater than `char::MAX` (0x10FFFF).
/// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF];
/// CharULE::parse_byte_slice(bytes).expect_err("Invalid bytes");
///
/// // A length that is not a multiple of three bytes is rejected as well.
/// let bytes: &[u8] = &[0x61, 0x00, 0x00, 0x00];
/// CharULE::parse_byte_slice(bytes).expect_err("Invalid length");
/// ```
#[repr(transparent)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub struct CharULE([u8; 3]);
| 43 | |
| 44 | impl CharULE { |
| 45 | /// Converts a [`char`] to a [`CharULE`]. This is equivalent to calling |
| 46 | /// [`AsULE::to_unaligned()`] |
| 47 | /// |
| 48 | /// See the type-level documentation for [`CharULE`] for more information. |
| 49 | #[inline ] |
| 50 | pub const fn from_aligned(c: char) -> Self { |
| 51 | let [u0: u8, u1: u8, u2: u8, _u3: u8] = (c as u32).to_le_bytes(); |
| 52 | Self([u0, u1, u2]) |
| 53 | } |
| 54 | |
| 55 | impl_ule_from_array!(char, CharULE, Self([0; 3])); |
| 56 | } |
| 57 | |
| 58 | // Safety (based on the safety checklist on the ULE trait): |
| 59 | // 1. CharULE does not include any uninitialized or padding bytes. |
| 60 | // (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) |
| 61 | // 2. CharULE is aligned to 1 byte. |
| 62 | // (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) |
| 63 | // 3. The impl of validate_byte_slice() returns an error if any byte is not valid. |
| 64 | // 4. The impl of validate_byte_slice() returns an error if there are extra bytes. |
| 65 | // 5. The other ULE methods use the default impl. |
| 66 | // 6. CharULE byte equality is semantic equality |
| 67 | unsafe impl ULE for CharULE { |
| 68 | #[inline ] |
| 69 | fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> { |
| 70 | if bytes.len() % 3 != 0 { |
| 71 | return Err(ZeroVecError::length::<Self>(bytes.len())); |
| 72 | } |
| 73 | // Validate the bytes |
| 74 | for chunk: &[u8] in bytes.chunks_exact(chunk_size:3) { |
| 75 | // TODO: Use slice::as_chunks() when stabilized |
| 76 | #[allow (clippy::indexing_slicing)] |
| 77 | // Won't panic because the chunks are always 3 bytes long |
| 78 | let u: u32 = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]); |
| 79 | char::try_from(u).map_err(|_| ZeroVecError::parse::<Self>())?; |
| 80 | } |
| 81 | Ok(()) |
| 82 | } |
| 83 | } |
| 84 | |
| 85 | impl AsULE for char { |
| 86 | type ULE = CharULE; |
| 87 | |
| 88 | #[inline ] |
| 89 | fn to_unaligned(self) -> Self::ULE { |
| 90 | CharULE::from_aligned(self) |
| 91 | } |
| 92 | |
| 93 | #[inline ] |
| 94 | fn from_unaligned(unaligned: Self::ULE) -> Self { |
| 95 | // Safe because the bytes of CharULE are defined to represent a valid Unicode scalar value. |
| 96 | unsafe { |
| 97 | Self::from_u32_unchecked(u32::from_le_bytes([ |
| 98 | unaligned.0[0], |
| 99 | unaligned.0[1], |
| 100 | unaligned.0[2], |
| 101 | 0, |
| 102 | ])) |
| 103 | } |
| 104 | } |
| 105 | } |
| 106 | |
| 107 | impl PartialOrd for CharULE { |
| 108 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
| 109 | Some(self.cmp(other)) |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | impl Ord for CharULE { |
| 114 | fn cmp(&self, other: &Self) -> Ordering { |
| 115 | char::from_unaligned(*self).cmp(&char::from_unaligned(*other)) |
| 116 | } |
| 117 | } |
| 118 | |
#[cfg(test)]
mod test {
    use super::*;

    /// Packs each value into the three-byte little-endian layout used by `CharULE`,
    /// dropping the most significant byte of the `u32`.
    fn three_byte_le(values: &[u32]) -> Vec<u8> {
        let mut out = Vec::with_capacity(values.len() * 3);
        for value in values {
            let [b0, b1, b2, _] = value.to_le_bytes();
            out.extend_from_slice(&[b0, b1, b2]);
        }
        out
    }

    #[test]
    fn test_from_array() {
        const CHARS: [char; 2] = ['a', '🙃'];
        const CHARS_ULE: [CharULE; 2] = CharULE::from_array(CHARS);
        assert_eq!(
            CharULE::as_byte_slice(&CHARS_ULE),
            &[0x61, 0x00, 0x00, 0x43, 0xF6, 0x01]
        );
    }

    #[test]
    fn test_from_array_zst() {
        const CHARS: [char; 0] = [];
        const CHARS_ULE: [CharULE; 0] = CharULE::from_array(CHARS);
        let bytes = CharULE::as_byte_slice(&CHARS_ULE);
        let empty: &[u8] = &[];
        assert_eq!(bytes, empty);
    }

    #[test]
    fn test_parse() {
        // 1-byte, 2-byte, 3-byte, and two 4-byte character in UTF-8 (not as relevant in UTF-32)
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];
        let char_ules: Vec<CharULE> = chars.iter().copied().map(char::to_unaligned).collect();
        let char_bytes: &[u8] = CharULE::as_byte_slice(&char_ules);

        // Check parsing
        let parsed_ules: &[CharULE] = CharULE::parse_byte_slice(char_bytes).unwrap();
        assert_eq!(char_ules, parsed_ules);
        let parsed_chars: Vec<char> = parsed_ules
            .iter()
            .copied()
            .map(char::from_unaligned)
            .collect();
        assert_eq!(&chars, parsed_chars.as_slice());

        // Compare to golden expected data
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            char_bytes
        );
    }

    #[test]
    fn test_failures() {
        // The inputs below are laid out as three-byte elements so that each case
        // reaches the scalar-value check instead of failing for an unrelated reason.

        // 119 and 120 are valid, but not 0xD800 (high surrogate)
        let bytes = three_byte_le(&[119, 0xD800, 120]);
        assert_eq!(bytes, [119, 0, 0, 0x00, 0xD8, 0x00, 120, 0, 0]);
        assert!(CharULE::parse_byte_slice(&bytes).is_err());

        // 0x20FFFF is out of range for a char
        let bytes = three_byte_le(&[0x20FFFF]);
        assert_eq!(bytes, [0xFF, 0xFF, 0x20]);
        assert!(CharULE::parse_byte_slice(&bytes).is_err());

        // A length that is not a multiple of 3 is rejected even when the leading
        // element is a valid char
        let bytes: &[u8] = &[119, 0, 0, 120];
        assert!(CharULE::parse_byte_slice(bytes).is_err());
    }
}
| 191 | |