1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
6//!
7//! <div class="stab unstable">
8//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10//! to be stable, their Rust representation might not be. Use with caution.
11//! </div>
12//!
13//! Read more about data providers: [`icu_provider`]
14//!
15//! This module provides an efficient storage of data serving the following
16//! properties:
17//! - `Bidi_Paired_Bracket`
18//! - `Bidi_Paired_Bracket_Type`
19//! - `Bidi_Mirrored`
20//! - `Bidi_Mirroring_Glyph`
21
22use displaydoc::Display;
23use icu_collections::codepointtrie::{CodePointTrie, TrieValue};
24use icu_provider::prelude::*;
25use zerovec::ule::{AsULE, CharULE, ULE};
26use zerovec::ZeroVecError;
27
/// A data provider struct for properties related to Bidi algorithms, including
/// mirroring and bracket pairing.
///
/// The struct wraps a single code point trie; each trie value is a
/// `MirroredPairedBracketData` that bundles the mirroring glyph, the mirrored
/// flag and the paired bracket type of one code point.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(
    BidiAuxiliaryPropertiesV1Marker,
    "props/bidiauxiliaryprops@1",
    singleton
))]
#[derive(Debug, Eq, PartialEq, Clone)]
// Serialization and baking are only derived when the `datagen` feature is enabled.
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_properties::provider::bidi_data),
)]
// Deserialization is derived whenever the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct BidiAuxiliaryPropertiesV1<'data> {
    /// A `CodePointTrie` efficiently storing the data from which property values
    /// can be extracted or derived for the supported Bidi properties.
    // `serde(borrow)` lets the deserialized trie borrow from the `'data` input.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, MirroredPairedBracketData>,
}
54
55impl<'data> BidiAuxiliaryPropertiesV1<'data> {
56 #[doc(hidden)]
57 pub fn new(
58 trie: CodePointTrie<'data, MirroredPairedBracketData>,
59 ) -> BidiAuxiliaryPropertiesV1<'data> {
60 BidiAuxiliaryPropertiesV1 { trie }
61 }
62}
63
/// The Bidi auxiliary property values stored for a single code point.
///
/// The three fields are packed into 24 bits when converted to `u32` or to the
/// unaligned `MirroredPairedBracketDataULE` form; the bit positions are noted
/// on each field.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))]
#[doc(hidden)] // needed for datagen but not intended for users
pub struct MirroredPairedBracketData {
    /// The `Bidi_Mirroring_Glyph` value (bits 0..=20 of the packed form).
    pub mirroring_glyph: char,
    /// The `Bidi_Mirrored` value (bit 21 of the packed form).
    pub mirrored: bool,
    /// The `Bidi_Paired_Bracket_Type` value (bits 22..=23 of the packed form).
    pub paired_bracket_type: CheckedBidiPairedBracketType,
}
74
75impl Default for MirroredPairedBracketData {
76 fn default() -> Self {
77 Self {
78 mirroring_glyph: 0 as char,
79 mirrored: false,
80 paired_bracket_type: CheckedBidiPairedBracketType::None,
81 }
82 }
83}
84
85impl From<MirroredPairedBracketData> for u32 {
86 fn from(mpbd: MirroredPairedBracketData) -> u32 {
87 let mut result: u32 = mpbd.mirroring_glyph as u32;
88 result |= (mpbd.mirrored as u32) << 21;
89 result |= (mpbd.paired_bracket_type as u32) << 22;
90 result
91 }
92}
93
/// Error returned when a `u32` serialized value of `MirroredPairedBracketData` does not encode both a valid Bidi_Mirroring_Glyph and a valid Bidi_Paired_Bracket_Type.
// The wrapped `u32` is the rejected input; it is interpolated into the
// `Display` message below through `{0}`.
#[derive(Display, Debug, Clone, Copy, PartialEq, Eq)]
#[displaydoc("Invalid MirroredPairedBracketData serialized in int: {0}")]
pub struct MirroredPairedBracketDataTryFromError(u32);
98
99impl TryFrom<u32> for MirroredPairedBracketData {
100 type Error = MirroredPairedBracketDataTryFromError;
101
102 fn try_from(i: u32) -> Result<Self, MirroredPairedBracketDataTryFromError> {
103 let code_point = i & 0x1FFFFF;
104 let mirroring_glyph =
105 char::try_from_u32(code_point).map_err(|_| MirroredPairedBracketDataTryFromError(i))?;
106 let mirrored = ((i >> 21) & 0x1) == 1;
107 let paired_bracket_type = {
108 let value = ((i >> 22) & 0x3) as u8;
109 match value {
110 0 => CheckedBidiPairedBracketType::None,
111 1 => CheckedBidiPairedBracketType::Open,
112 2 => CheckedBidiPairedBracketType::Close,
113 _ => {
114 return Err(MirroredPairedBracketDataTryFromError(i));
115 }
116 }
117 };
118 Ok(MirroredPairedBracketData {
119 mirroring_glyph,
120 mirrored,
121 paired_bracket_type,
122 })
123 }
124}
125
/// A closed Rust enum representing a closed set of the incoming Bidi_Paired_Bracket_Type
/// property values necessary in the internal representation of `MirroredPairedBracketData`
/// to satisfy the ULE invariants on valid values.
///
/// The explicit discriminants (0, 1, 2) are the values stored in the two-bit
/// bracket type field of the packed `MirroredPairedBracketData` form; the
/// fourth value expressible in two bits (3) is never valid.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))]
#[repr(u8)]
#[zerovec::make_ule(CheckedBidiPairedBracketTypeULE)]
// This enum is closed in order to help with ULE validation for MirroredPairedBracketData.
#[allow(clippy::exhaustive_enums)]
pub enum CheckedBidiPairedBracketType {
    /// Not a paired bracket.
    None = 0,
    /// Open paired bracket.
    Open = 1,
    /// Close paired bracket.
    Close = 2,
}
145
/// The unaligned, three-byte representation of `MirroredPairedBracketData`.
///
/// Bit layout for the 24 bits (0..=23) of the `[u8; 3]` ULE raw type.
/// LE means the first byte is bits 0..=7, the second byte 8..=15, and the
/// third byte 16..=23.
///
/// - Bits 0..=20: code point return value for the Bidi_Mirroring_Glyph value.
///   Extracted with: mask = 0x1FFFFF <=> `[bytes[0], bytes[1], bytes[2] & 0x1F]`
/// - Bit 21: boolean for Bidi_Mirrored.
///   Extracted with: bitshift right by 21 followed by mask = 0x1
///   <=> `(bytes[2] >> 5) & 0x1`
/// - Bits 22..=23: enum discriminant value for Bidi_Paired_Bracket_Type.
///   Extracted with: bitshift right by 22 followed by mask = 0x3
///   <=> `(bytes[2] >> 6) & 0x3` <=> `(bytes[2] >> 6)`, because a right shift
///   of an unsigned number fills with 0s from the left and a byte has 8 bits.
#[doc(hidden)]
// needed for datagen but not intended for users
#[derive(Copy, Clone, Hash, PartialEq, Eq, Debug)]
#[repr(C, packed)]
pub struct MirroredPairedBracketDataULE([u8; 3]);
161
162// Safety (based on the safety checklist on the ULE trait):
163// 1. MirroredPairedBracketDataULE does not include any uninitialized or padding bytes
164// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
165// 2. MirroredPairedBracketDataULE is aligned to 1 byte.
166// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
167// 3. The impl of validate_byte_slice() returns an error if any byte is not valid.
168// 4. The impl of validate_byte_slice() returns an error if there are extra bytes.
169// 5. The other ULE methods use the default impl.
170// 6. MirroredPairedBracketDataULE byte equality is semantic equality because all bits
171// are used, so no unused bits requires no extra work to zero out unused bits
172unsafe impl ULE for MirroredPairedBracketDataULE {
173 #[inline]
174 fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> {
175 if bytes.len() % 3 != 0 {
176 return Err(ZeroVecError::length::<Self>(bytes.len()));
177 }
178 // Validate the bytes
179 #[allow(clippy::indexing_slicing)] // Won't panic because the chunks are always 3 bytes long
180 for byte_triple in bytes.chunks_exact(3) {
181 // Bidi_Mirroring_Glyph validation
182 #[allow(clippy::unwrap_used)] // chunks_exact returns slices of length 3
183 let [byte0, byte1, byte2] = *<&[u8; 3]>::try_from(byte_triple).unwrap();
184 let mut mirroring_glyph_code_point: u32 = (byte2 & 0x1F) as u32;
185 mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte1 as u32);
186 mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte0 as u32);
187 let _mirroring_glyph =
188 char::from_u32(mirroring_glyph_code_point).ok_or(ZeroVecError::parse::<Self>())?;
189
190 // skip validating the Bidi_Mirrored boolean since it is always valid
191
192 // assert that Bidi_Paired_Bracket_Type cannot have a 4th value because it only
193 // has 3 values: Open, Close, None
194 if (byte2 & 0xC0) == 0xC0 {
195 return Err(ZeroVecError::parse::<Self>());
196 }
197 }
198
199 Ok(())
200 }
201}
202
203impl AsULE for MirroredPairedBracketData {
204 type ULE = MirroredPairedBracketDataULE;
205
206 #[inline]
207 fn to_unaligned(self) -> Self::ULE {
208 let mut ch = u32::from(self.mirroring_glyph);
209 ch |= u32::from(self.mirrored) << 21;
210 ch |= (self.paired_bracket_type as u32) << 22;
211 let [byte0, byte1, byte2, _] = ch.to_le_bytes();
212 MirroredPairedBracketDataULE([byte0, byte1, byte2])
213 }
214
215 #[inline]
216 fn from_unaligned(unaligned: Self::ULE) -> Self {
217 let [unaligned_byte0, unaligned_byte1, unaligned_byte2] = unaligned.0;
218 let mirroring_glyph_ule_bytes = &[unaligned_byte0, unaligned_byte1, unaligned_byte2 & 0x1F];
219 // Safe because the lower bits 20..0 of MirroredPairedBracketDataULE bytes are the CharULE bytes,
220 // and CharULE::from_unaligned is safe because bytes are defined to represent a valid Unicode code point.
221 let mirroring_glyph_ule =
222 unsafe { CharULE::from_byte_slice_unchecked(mirroring_glyph_ule_bytes) };
223 let mirroring_glyph = mirroring_glyph_ule
224 .first()
225 .map(|ule| char::from_unaligned(*ule))
226 .unwrap_or(char::REPLACEMENT_CHARACTER);
227 let mirrored = ((unaligned.0[2] >> 5) & 0x1) == 1;
228 let paired_bracket_type = {
229 let discriminant = unaligned.0[2] >> 6;
230 debug_assert!(
231 discriminant != 3,
232 "Bidi_Paired_Bracket_Type can only be Open/Close/None in MirroredPairedBracketData"
233 );
234 match discriminant {
235 1 => CheckedBidiPairedBracketType::Open,
236 2 => CheckedBidiPairedBracketType::Close,
237 _ => CheckedBidiPairedBracketType::None,
238 }
239 };
240
241 MirroredPairedBracketData {
242 mirroring_glyph,
243 mirrored,
244 paired_bracket_type,
245 }
246 }
247}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips the data for U+007B LEFT CURLY BRACKET through its ULE bytes.
    #[test]
    fn test_parse() {
        let left_curly = MirroredPairedBracketData {
            mirroring_glyph: '}',
            mirrored: true,
            paired_bracket_type: CheckedBidiPairedBracketType::Open,
        };
        // 0x7D is '}'; 0x60 sets the mirrored bit (0x20) and the Open discriminant (0x40).
        let expected_bytes = &[0x7D, 0x0, 0x60];

        // serialize to ULE bytes
        let serialized = [left_curly.to_unaligned()];
        assert_eq!(
            expected_bytes,
            MirroredPairedBracketDataULE::as_byte_slice(&serialized)
        );

        // deserialize from ULE bytes
        let parsed = MirroredPairedBracketDataULE::parse_byte_slice(expected_bytes).unwrap();
        let round_tripped = MirroredPairedBracketData::from_unaligned(*parsed.first().unwrap());
        assert_eq!(left_curly, round_tripped);
    }

    /// Corrupts the bracket type bits of an otherwise valid entry and expects
    /// parsing to fail.
    #[test]
    fn test_parse_error() {
        // Start from the bytes for U+007B LEFT CURLY BRACKET.
        let mut corrupted = [0x7D, 0x0, 0x60];

        // CheckedBidiPairedBracketType only has 3 values (discriminants => 0..=2), so the
        // 4th value expressible in the 2 bits (3) should not parse successfully.
        corrupted[2] |= 0xC0;

        let parse_result = MirroredPairedBracketDataULE::parse_byte_slice(&corrupted);
        assert!(parse_result.is_err());
    }
}
290