1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | //! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. |
6 | //! |
7 | //! <div class="stab unstable"> |
8 | //! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
9 | //! including in SemVer minor releases. While the serde representation of data structs is guaranteed |
10 | //! to be stable, their Rust representation might not be. Use with caution. |
11 | //! </div> |
12 | //! |
13 | //! Read more about data providers: [`icu_provider`] |
14 | |
15 | // Provider structs must be stable |
16 | #![allow (clippy::exhaustive_structs, clippy::exhaustive_enums)] |
17 | |
18 | use icu_collections::char16trie::Char16Trie; |
19 | use icu_collections::codepointtrie::CodePointTrie; |
20 | use icu_provider::prelude::*; |
21 | use zerovec::ZeroVec; |
22 | |
/// Unit struct that the baked (compiled-in) data provider implementations
/// of this component are attached to.
///
/// Only present when the `compiled_data` feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
33 | |
// The anonymous `const` block keeps the helper `icu` module below scoped to
// this block, so it does not become part of the crate's namespace.
#[cfg(feature = "compiled_data")]
const _: () = {
    // Aliases this crate as `icu::normalizer` and `icu_collections` as
    // `icu::collections`. Presumably the macro-generated code below resolves
    // its type paths through these aliases — TODO confirm against
    // `icu_normalizer_data`.
    pub mod icu {
        pub use crate as normalizer;
        pub use icu_collections as collections;
    }

    // Set up `Baked` as a provider, then generate one implementation per
    // data key of this component.
    icu_normalizer_data::make_provider!(Baked);
    icu_normalizer_data::impl_normalizer_comp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_decomp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_uts46d_v1!(Baked);
};
49 | |
/// The latest minimum set of data keys this component requires.
///
/// Lists the `KEY` of every data marker declared in this module; only
/// compiled when the `datagen` feature is enabled.
#[cfg(feature = "datagen")]
pub const KEYS: &[DataKey] = &[
    CanonicalCompositionsV1Marker::KEY,
    CanonicalDecompositionDataV1Marker::KEY,
    CanonicalDecompositionTablesV1Marker::KEY,
    CompatibilityDecompositionSupplementV1Marker::KEY,
    CompatibilityDecompositionTablesV1Marker::KEY,
    NonRecursiveDecompositionSupplementV1Marker::KEY,
    Uts46DecompositionSupplementV1Marker::KEY,
];
61 | |
62 | /// Main data for NFD |
63 | /// |
64 | /// <div class="stab unstable"> |
65 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
66 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
67 | /// to be stable, their Rust representation might not be. Use with caution. |
68 | /// </div> |
69 | #[icu_provider::data_struct (marker( |
70 | CanonicalDecompositionDataV1Marker, |
71 | "normalizer/nfd@1" , |
72 | singleton |
73 | ))] |
74 | #[derive (Debug, PartialEq, Clone)] |
75 | #[cfg_attr (feature = "datagen" , derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))] |
76 | #[cfg_attr (feature = "serde" , derive(serde::Deserialize))] |
77 | pub struct DecompositionDataV1<'data> { |
78 | /// Trie for NFD decomposition. |
79 | #[cfg_attr (feature = "serde" , serde(borrow))] |
80 | pub trie: CodePointTrie<'data, u32>, |
81 | } |
82 | |
83 | /// Data that either NFKD or the decomposed form of UTS 46 needs |
84 | /// _in addition to_ the NFD data. |
85 | /// |
86 | /// <div class="stab unstable"> |
87 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
88 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
89 | /// to be stable, their Rust representation might not be. Use with caution. |
90 | /// </div> |
91 | #[icu_provider::data_struct ( |
92 | marker( |
93 | CompatibilityDecompositionSupplementV1Marker, |
94 | "normalizer/nfkd@1" , |
95 | singleton |
96 | ), |
97 | marker(Uts46DecompositionSupplementV1Marker, "normalizer/uts46d@1" , singleton) |
98 | )] |
99 | #[derive (Debug, PartialEq, Clone)] |
100 | #[cfg_attr (feature = "datagen" , derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))] |
101 | #[cfg_attr (feature = "serde" , derive(serde::Deserialize))] |
102 | pub struct DecompositionSupplementV1<'data> { |
103 | /// Trie for the decompositions that differ from NFD. |
104 | /// Getting a zero from this trie means that you need |
105 | /// to make another lookup from `DecompositionDataV1::trie`. |
106 | #[cfg_attr (feature = "serde" , serde(borrow))] |
107 | pub trie: CodePointTrie<'data, u32>, |
108 | /// Flags that indicate how the set of characters whose |
109 | /// decompositions starts with a non-starter differs from |
110 | /// the set for NFD. |
111 | /// |
112 | /// Bit 0: Whether half-width kana voicing marks decompose |
113 | /// into non-starters (their full-width combining |
114 | /// counterparts). |
115 | /// Bit 1: Whether U+0345 COMBINING GREEK YPOGEGRAMMENI |
116 | /// decomposes into a starter (U+03B9 GREEK SMALL |
117 | /// LETTER IOTA). |
118 | /// (Other bits unused.) |
119 | pub flags: u8, |
120 | /// The passthrough bounds of NFD/NFC are lowered to this |
121 | /// maximum instead. (16-bit, because cannot be higher |
122 | /// than 0x0300, which is the bound for NFC.) |
123 | pub passthrough_cap: u16, |
124 | } |
125 | |
126 | impl DecompositionSupplementV1<'_> { |
127 | const HALF_WIDTH_VOICING_MARK_MASK: u8 = 1; |
128 | |
129 | /// Whether half-width kana voicing marks decompose into non-starters |
130 | /// (their full-width combining counterparts). |
131 | pub fn half_width_voicing_marks_become_non_starters(&self) -> bool { |
132 | (self.flags & DecompositionSupplementV1::HALF_WIDTH_VOICING_MARK_MASK) != 0 |
133 | } |
134 | } |
135 | |
136 | /// The expansion tables for cases where the decomposition isn't |
137 | /// contained in the trie value |
138 | /// |
139 | /// <div class="stab unstable"> |
140 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
141 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
142 | /// to be stable, their Rust representation might not be. Use with caution. |
143 | /// </div> |
144 | #[icu_provider::data_struct ( |
145 | marker(CanonicalDecompositionTablesV1Marker, "normalizer/nfdex@1" , singleton), |
146 | marker( |
147 | CompatibilityDecompositionTablesV1Marker, |
148 | "normalizer/nfkdex@1" , |
149 | singleton |
150 | ) |
151 | )] |
152 | #[derive (Debug, PartialEq, Clone)] |
153 | #[cfg_attr (feature = "datagen" , derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))] |
154 | #[cfg_attr (feature = "serde" , derive(serde::Deserialize))] |
155 | pub struct DecompositionTablesV1<'data> { |
156 | /// Decompositions that are fully within the BMP |
157 | #[cfg_attr (feature = "serde" , serde(borrow))] |
158 | pub scalars16: ZeroVec<'data, u16>, |
159 | /// Decompositions with at least one character outside |
160 | /// the BMP |
161 | #[cfg_attr (feature = "serde" , serde(borrow))] |
162 | pub scalars24: ZeroVec<'data, char>, |
163 | } |
164 | |
165 | /// Non-Hangul canonical compositions |
166 | /// |
167 | /// <div class="stab unstable"> |
168 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
169 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
170 | /// to be stable, their Rust representation might not be. Use with caution. |
171 | /// </div> |
172 | #[icu_provider::data_struct (marker(CanonicalCompositionsV1Marker, "normalizer/comp@1" , singleton))] |
173 | #[derive (Debug, PartialEq, Clone)] |
174 | #[cfg_attr (feature = "datagen" , derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))] |
175 | #[cfg_attr (feature = "serde" , derive(serde::Deserialize))] |
176 | pub struct CanonicalCompositionsV1<'data> { |
177 | /// Trie keys are two-`char` strings with the second |
178 | /// character coming first. The value, if any, is the |
179 | /// (non-Hangul) canonical composition. |
180 | #[cfg_attr (feature = "serde" , serde(borrow))] |
181 | pub canonical_compositions: Char16Trie<'data>, |
182 | } |
183 | |
184 | /// Non-recursive canonical decompositions that differ from |
185 | /// `DecompositionDataV1`. |
186 | /// |
187 | /// <div class="stab unstable"> |
188 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
189 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
190 | /// to be stable, their Rust representation might not be. Use with caution. |
191 | /// </div> |
192 | #[icu_provider::data_struct (marker( |
193 | NonRecursiveDecompositionSupplementV1Marker, |
194 | "normalizer/decomp@1" , |
195 | singleton |
196 | ))] |
197 | #[derive (Debug, PartialEq, Clone)] |
198 | #[cfg_attr (feature = "datagen" , derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))] |
199 | #[cfg_attr (feature = "serde" , derive(serde::Deserialize))] |
200 | pub struct NonRecursiveDecompositionSupplementV1<'data> { |
201 | /// Trie for the supplementary non-recursive decompositions |
202 | #[cfg_attr (feature = "serde" , serde(borrow))] |
203 | pub trie: CodePointTrie<'data, u32>, |
204 | /// Decompositions with at least one character outside |
205 | /// the BMP |
206 | #[cfg_attr (feature = "serde" , serde(borrow))] |
207 | pub scalars24: ZeroVec<'data, char>, |
208 | } |
209 | |