1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use alloc::borrow::Cow; |
6 | use icu_provider::prelude::*; |
7 | use regex_automata::dfa::sparse::DFA; |
8 | |
9 | /// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement |
10 | /// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian |
11 | /// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`. |
12 | /// |
13 | /// <div class="stab unstable"> |
14 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
15 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
16 | /// to be stable, their Rust representation might not be. Use with caution. |
17 | /// </div> |
18 | #[derive (Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] |
19 | pub struct SerdeDFA<'data> { |
20 | // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) |
21 | dfa_bytes: Cow<'data, [u8]>, |
22 | pattern: Option<Cow<'data, str>>, |
23 | } |
24 | |
25 | impl PartialEq for SerdeDFA<'_> { |
26 | fn eq(&self, other: &Self) -> bool { |
27 | self.dfa_bytes == other.dfa_bytes |
28 | } |
29 | } |
30 | |
#[cfg(feature = "datagen")]
impl databake::Bake for SerdeDFA<'_> {
    /// Bakes this DFA as a call to `from_dfa_bytes_unchecked`, embedding both the
    /// little-endian and the big-endian serialization; the emitted code selects between
    /// them with `cfg!(target_endian = "little")`.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("icu_list");

        // Serialize once per byte order, then bake each byte slice.
        let dfa = self.deref();
        let little_endian = dfa.to_bytes_little_endian();
        let big_endian = dfa.to_bytes_big_endian();
        let le_bytes = databake::Bake::bake(&little_endian.as_slice(), env);
        let be_bytes = databake::Bake::bake(&big_endian.as_slice(), env);

        // Safe because of `to_bytes_little_endian`/`to_bytes_big_endian`'s invariant: they
        // produce valid DFA representations, and the emitted code consumes the one that
        // matches the endianness of the target platform.
        databake::quote! {
            unsafe {
                icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
                    if cfg!(target_endian = "little") {
                        #le_bytes
                    } else {
                        #be_bytes
                    }
                )
            }
        }
    }
}
52 | |
#[cfg(feature = "datagen")]
impl serde::Serialize for SerdeDFA<'_> {
    /// Serializes the source pattern for human-readable formats and the little-endian DFA
    /// bytes for all other formats.
    ///
    /// # Errors
    ///
    /// Fails for human-readable formats when no pattern is stored, since the pattern
    /// cannot be recovered from the DFA bytes alone.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        // Binary formats always receive the little-endian serialization.
        if !serializer.is_human_readable() {
            let le_bytes = self.deref().to_bytes_little_endian();
            return serializer.serialize_bytes(&le_bytes);
        }

        match &self.pattern {
            Some(pattern) => pattern.serialize(serializer),
            None => {
                use serde::ser::Error;
                Err(S::Error::custom(
                    "cannot serialize a deserialized bincode SerdeDFA to JSON",
                ))
            }
        }
    }
}
74 | |
#[cfg(feature = "serde")]
impl<'data> SerdeDFA<'data> {
    /// Deserializes to `Option<Self>`.
    ///
    /// With the `serde_human` feature enabled and a human-readable format, the input is
    /// read as a regex pattern and compiled via [`SerdeDFA::new`]. Otherwise the input is
    /// read as serialized DFA bytes, which are validated before being accepted.
    ///
    /// Returns `Ok(None)` for non-human-readable formats on big-endian systems, as
    /// `regex_automata` serialization is endian-sensitive.
    ///
    /// # Errors
    ///
    /// Propagates errors from the deserializer, and reports a custom error when the
    /// pattern cannot be compiled or the bytes are not a valid DFA.
    pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        use icu_provider::serde::borrow_de_utils::CowBytesWrap;
        use serde::de::Error;
        use serde::Deserialize;

        #[cfg(feature = "serde_human")]
        if deserializer.is_human_readable() {
            #[cfg(not(feature = "std"))]
            use alloc::string::ToString;

            let pattern = Cow::<str>::deserialize(deserializer)?;
            return match SerdeDFA::new(pattern) {
                Ok(dfa) => Ok(Some(dfa)),
                Err(e) => Err(D::Error::custom(e.to_string())),
            };
        }

        // The bytes are read even on big-endian targets, before bailing out below.
        let dfa_bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0;

        if cfg!(target_endian = "big") {
            return Ok(None);
        }

        // Verify safety invariant
        if let Err(e) = DFA::from_bytes(&dfa_bytes) {
            return Err(D::Error::custom(alloc::format!("Invalid DFA bytes: {e}")));
        }

        Ok(Some(SerdeDFA {
            pattern: None,
            dfa_bytes,
        }))
    }
}
114 | |
115 | impl<'data> SerdeDFA<'data> { |
116 | /// Creates a `SerdeDFA` from raw bytes. Used internally by databake. |
117 | /// |
118 | /// # Safety |
119 | /// |
120 | /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) |
121 | pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { |
122 | Self { |
123 | dfa_bytes: Cow::Borrowed(dfa_bytes), |
124 | pattern: None, |
125 | } |
126 | } |
127 | |
128 | /// Creates a `SerdeDFA` from a regex. |
129 | #[cfg (any(feature = "datagen" , feature = "serde_human" ,))] |
130 | pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> { |
131 | use regex_automata::{ |
132 | dfa::dense::{Builder, Config}, |
133 | SyntaxConfig, |
134 | }; |
135 | |
136 | let mut builder = Builder::new(); |
137 | let dfa = builder |
138 | .syntax(SyntaxConfig::new().case_insensitive(true)) |
139 | .configure(Config::new().anchored(true).minimize(true)) |
140 | .build(&pattern) |
141 | .map_err(|_| { |
142 | icu_provider::DataError::custom("Cannot build DFA" ).with_display_context(&pattern) |
143 | })? |
144 | .to_sparse() |
145 | .map_err(|_| { |
146 | icu_provider::DataError::custom("Cannot sparsify DFA" ) |
147 | .with_display_context(&pattern) |
148 | })?; |
149 | |
150 | Ok(Self { |
151 | dfa_bytes: dfa.to_bytes_native_endian().into(), |
152 | pattern: Some(pattern), |
153 | }) |
154 | } |
155 | |
156 | /// Returns the represented [`DFA`] |
157 | #[allow (clippy::unwrap_used)] // by invariant |
158 | pub fn deref(&'data self) -> DFA<&'data [u8]> { |
159 | // Safe due to struct invariant. |
160 | unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } |
161 | } |
162 | } |
163 | |
164 | #[cfg (all(test, feature = "datagen" ))] |
165 | mod test { |
166 | use super::*; |
167 | |
#[test]
fn test_serde_dfa() {
    use regex_automata::dfa::Automaton;

    let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap();

    // Each case pairs a haystack with whether a match is expected. The DFA is anchored,
    // so a leading space prevents a match.
    let cases: [(&[u8], bool); 4] = [
        (&b"ab"[..], false),
        (&b"abc"[..], true),
        (&b"abcde"[..], true),
        (&b" abcde"[..], false),
    ];

    for &(haystack, expect_match) in cases.iter() {
        let found = matcher.deref().find_earliest_fwd(haystack).unwrap();
        assert_eq!(found.is_some(), expect_match);
    }
}
187 | |
188 | #[derive (serde::Deserialize)] |
189 | struct OptionSerdeDFA<'data>( |
190 | #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize" )] Option<SerdeDFA<'data>>, |
191 | ); |
192 | |
#[test]
#[cfg(target_endian = "little")]
fn test_postcard_serialization() {
    let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
    let bytes = postcard::to_stdvec(&matcher).unwrap();

    // The pristine encoding round-trips to an equal DFA.
    let decoded = postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap();
    assert_eq!(decoded.0, Some(matcher));

    // A corrupted byte leads to an error
    let mut corrupted = bytes.clone();
    corrupted[17] ^= 255;
    assert!(postcard::from_bytes::<OptionSerdeDFA>(&corrupted).is_err());

    // An extra byte leads to an error
    let mut padded = bytes.clone();
    padded.insert(123, 40);
    assert!(postcard::from_bytes::<OptionSerdeDFA>(&padded).is_err());

    // Missing bytes lead to an error
    let truncated = &bytes[..bytes.len() - 5];
    assert!(postcard::from_bytes::<OptionSerdeDFA>(truncated).is_err());
}
217 | |
// Round-trips a DFA through MessagePack (`rmp_serde`).
//
// Restricted to little-endian targets, like the postcard test: for non-human-readable
// formats `SerdeDFA::maybe_deserialize` returns `None` on big-endian targets, so the
// equality assertion below could not hold there.
#[test]
#[cfg(target_endian = "little")]
fn test_rmp_serialization() {
    let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();

    let bytes = rmp_serde::to_vec(&matcher).unwrap();
    assert_eq!(
        rmp_serde::from_slice::<OptionSerdeDFA>(&bytes).unwrap().0,
        Some(matcher)
    );
}
228 | |
// Round-trips a DFA through JSON, where it is represented by its regex pattern, and
// checks that bad input is rejected.
#[test]
#[cfg(feature = "serde_human")]
fn test_json_serialization() {
    let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();

    let json = serde_json::to_string(&matcher).unwrap();
    assert_eq!(
        serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0,
        Some(matcher)
    );

    // Input that is not JSON at all is rejected by the JSON parser itself.
    assert!(serde_json::from_str::<OptionSerdeDFA>(".*[").is_err());

    // A well-formed JSON string holding an invalid regex (unclosed character class) must
    // be rejected too; this is the case that actually reaches the regex compilation.
    assert!(serde_json::from_str::<OptionSerdeDFA>(r#"".*[""#).is_err());
}
241 | |
242 | #[test ] |
243 | fn databake() { |
244 | // This is the DFA for ".*" |
245 | databake::test_bake!( |
246 | SerdeDFA, |
247 | const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little" ) { |
248 | b"rust-regex-automata-dfa-sparse \0\0\xFF\xFE\0\0\x02\0\0\0\0\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x06\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\t\t\t\t\t\t\t\t\t\n\x0B\x0B\x0C\r\r\r\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x98\x01\0\0\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\x80\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q \0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\x01\0\0\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0# \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x05\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x04\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x04\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x03\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\0\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q 
\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\0\x02\0\x03\x05\0\0\x89\x01\0\0\0\0\0\0\0\x02\0\x03\x05\0\0# \x01\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0# \x01\0\0# \x01\0\0# \x01\0\0# \x01\0\0# \x01\0\0\t\0\0\0\x12\0\0\0\x0C\x01\0\0\0\0\0\0\0\0\0\0# \x01\0\0# \x01\0\0" |
249 | } else { |
250 | b"rust-regex-automata-dfa-sparse \0\0\0\0\xFE\xFF\0\0\0\x02\0\0\0\0\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x06\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\t\t\t\t\t\t\t\t\t\n\x0B\x0B\x0C\r\r\r\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\0\0\x01\x98\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\x80\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q \0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\x01\0\0\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0# \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x05\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x04\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x04\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x03\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\0\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q 
\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\0\x02\0\x03\x05\0\0\x89\x01\0\0\0\0\0\0\0\x02\0\x03\x05\0\0# \x01\0\0\0\0\0\0\0\0\0\0\x04\0\0\0\0# \x01\0\0# \x01\0\0# \x01\0\0# \x01\0\0\0\0\x01# \0\0\0\t\0\0\0\x12\0\0\x01\x0C\0\0\0\0\0\0\0\0\0\0\x01# \0\0\x01#" |
251 | })}, |
252 | icu_list |
253 | ); |
254 | } |
255 | } |
256 | |