1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use alloc::borrow::Cow; |
6 | use icu_provider::prelude::*; |
7 | use regex_automata::dfa::sparse::DFA; |
8 | |
9 | /// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement |
10 | /// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian |
11 | /// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`. |
12 | /// |
13 | /// <div class="stab unstable"> |
14 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
15 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
16 | /// to be stable, their Rust representation might not be. Use with caution. |
17 | /// </div> |
18 | #[derive (Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] |
19 | pub struct SerdeDFA<'data> { |
20 | // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) |
21 | dfa_bytes: Cow<'data, [u8]>, |
22 | pattern: Option<Cow<'data, str>>, |
23 | } |
24 | |
25 | impl PartialEq for SerdeDFA<'_> { |
26 | fn eq(&self, other: &Self) -> bool { |
27 | self.dfa_bytes == other.dfa_bytes |
28 | } |
29 | } |
30 | |
#[cfg(feature = "datagen")]
impl databake::Bake for SerdeDFA<'_> {
    /// Emits an expression that rebuilds this value through
    /// `icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked`.
    ///
    /// Both the little-endian and the big-endian encoding of the DFA are baked; the generated
    /// code selects one of them with `cfg!(target_endian = "little")`.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("icu_list");

        let dfa = self.deref();

        let little_endian = dfa.to_bytes_little_endian();
        let le_tokens = databake::Bake::bake(&little_endian.as_slice(), env);

        let big_endian = dfa.to_bytes_big_endian();
        let be_tokens = databake::Bake::bake(&big_endian.as_slice(), env);

        // Safe because of `to_bytes_little_endian`/`to_bytes_big_endian`'s invariant.
        databake::quote! {
            unsafe {
                icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
                    if cfg!(target_endian = "little") {
                        #le_tokens
                    } else {
                        #be_tokens
                    }
                )
            }
        }
    }
}
51 | |
#[cfg(feature = "datagen")]
impl serde::Serialize for SerdeDFA<'_> {
    /// Serializes the DFA.
    ///
    /// * Non-human-readable formats receive the little-endian byte encoding of the DFA.
    /// * Human-readable formats receive the source regex pattern. If no pattern is stored,
    ///   serialization fails with a custom error.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        if !serializer.is_human_readable() {
            return self.deref().to_bytes_little_endian().serialize(serializer);
        }

        match self.pattern.as_ref() {
            Some(pattern) => pattern.serialize(serializer),
            None => {
                use serde::ser::Error;
                Err(S::Error::custom(
                    "cannot serialize a deserialized bincode SerdeDFA to JSON",
                ))
            }
        }
    }
}
73 | |
#[cfg(feature = "serde")]
impl<'data> SerdeDFA<'data> {
    /// Deserializes to `Option<Self>`.
    ///
    /// With the `serde_human` feature, human-readable formats are read as a regex pattern
    /// string that is compiled via [`SerdeDFA::new`]. Otherwise the input is read as raw DFA
    /// bytes, which are validated before being accepted.
    ///
    /// Returns `Ok(None)` for non-human-readable serialization formats on big-endian systems,
    /// as `regex_automata` serialization is endian-sensitive.
    ///
    /// # Errors
    ///
    /// Fails if the underlying deserializer fails, if the pattern cannot be compiled, or if the
    /// bytes do not describe a valid DFA.
    pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        use icu_provider::serde::borrow_de_utils::CowBytesWrap;
        use serde::Deserialize;

        #[cfg(feature = "serde_human")]
        if deserializer.is_human_readable() {
            #[cfg(not(feature = "std"))]
            use alloc::string::ToString;
            use serde::de::Error;

            let pattern = Cow::<str>::deserialize(deserializer)?;
            return match SerdeDFA::new(pattern) {
                Ok(dfa) => Ok(Some(dfa)),
                Err(e) => Err(D::Error::custom(e.to_string())),
            };
        }

        // The bytes are always consumed from the deserializer, even when they end up unused
        // on big-endian targets.
        let CowBytesWrap(dfa_bytes) = <CowBytesWrap<'de>>::deserialize(deserializer)?;

        if cfg!(target_endian = "big") {
            return Ok(None);
        }

        // Verify safety invariant
        if let Err(e) = DFA::from_bytes(&dfa_bytes) {
            use serde::de::Error;
            return Err(D::Error::custom(alloc::format!("Invalid DFA bytes: {e}")));
        }

        Ok(Some(SerdeDFA {
            dfa_bytes,
            pattern: None,
        }))
    }
}
113 | |
114 | impl<'data> SerdeDFA<'data> { |
115 | /// Creates a `SerdeDFA` from raw bytes. Used internally by databake. |
116 | /// |
117 | /// # Safety |
118 | /// |
119 | /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) |
120 | pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { |
121 | Self { |
122 | dfa_bytes: Cow::Borrowed(dfa_bytes), |
123 | pattern: None, |
124 | } |
125 | } |
126 | |
127 | /// Creates a `SerdeDFA` from a regex. |
128 | #[cfg (any(feature = "datagen" , feature = "serde_human" ,))] |
129 | pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> { |
130 | use regex_automata::{ |
131 | dfa::dense::{Builder, Config}, |
132 | SyntaxConfig, |
133 | }; |
134 | |
135 | let mut builder = Builder::new(); |
136 | let dfa = builder |
137 | .syntax(SyntaxConfig::new().case_insensitive(true)) |
138 | .configure(Config::new().anchored(true).minimize(true)) |
139 | .build(&pattern) |
140 | .map_err(|_| { |
141 | icu_provider::DataError::custom("Cannot build DFA" ).with_display_context(&pattern) |
142 | })? |
143 | .to_sparse() |
144 | .map_err(|_| { |
145 | icu_provider::DataError::custom("Cannot sparsify DFA" ) |
146 | .with_display_context(&pattern) |
147 | })?; |
148 | |
149 | Ok(Self { |
150 | dfa_bytes: dfa.to_bytes_native_endian().into(), |
151 | pattern: Some(pattern), |
152 | }) |
153 | } |
154 | |
155 | /// Returns the represented [`DFA`] |
156 | #[allow (clippy::unwrap_used)] // by invariant |
157 | pub fn deref(&'data self) -> DFA<&'data [u8]> { |
158 | // Safe due to struct invariant. |
159 | unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } |
160 | } |
161 | } |
162 | |
#[cfg(all(test, feature = "datagen"))]
mod test {
    use super::*;

    /// Checks matching behavior of a DFA built from the pattern `abc`.
    #[test]
    fn test_serde_dfa() {
        use regex_automata::dfa::Automaton;

        let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap();

        // An incomplete prefix does not match.
        assert!(matcher.deref().find_earliest_fwd(b"ab").unwrap().is_none());
        assert!(matcher.deref().find_earliest_fwd(b"abc").unwrap().is_some());
        // Trailing input after the match is fine.
        assert!(matcher
            .deref()
            .find_earliest_fwd(b"abcde")
            .unwrap()
            .is_some());
        // The DFA is built anchored, so leading input prevents a match.
        assert!(matcher
            .deref()
            .find_earliest_fwd(b" abcde")
            .unwrap()
            .is_none());
    }

    /// Test helper that routes deserialization through `SerdeDFA::maybe_deserialize`.
    #[derive(serde::Deserialize)]
    struct OptionSerdeDFA<'data>(
        #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option<SerdeDFA<'data>>,
    );

    /// Round-trips through postcard and checks that damaged payloads are rejected.
    #[test]
    #[cfg(target_endian = "little")]
    fn test_postcard_serialization() {
        let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();

        let mut bytes = postcard::to_stdvec(&matcher).unwrap();
        assert_eq!(
            postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap().0,
            Some(matcher)
        );

        // A corrupted byte leads to an error
        bytes[17] ^= 255;
        assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
        bytes[17] ^= 255;

        // An extra byte leads to an error
        bytes.insert(123, 40);
        assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
        bytes.remove(123);

        // Missing bytes lead to an error
        assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes[0..bytes.len() - 5]).is_err());
    }

    /// Round-trips through JSON and checks that bad input is rejected.
    #[test]
    #[cfg(feature = "serde_human")]
    fn test_json_serialization() {
        let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();

        let json = serde_json::to_string(&matcher).unwrap();
        assert_eq!(
            serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0,
            Some(matcher)
        );

        // Input that is not valid JSON is rejected.
        assert!(serde_json::from_str::<OptionSerdeDFA>(".*[").is_err());

        // A well-formed JSON string containing an invalid regex is rejected as well. Without
        // the surrounding quotes the input above fails as malformed JSON and the regex is
        // never compiled, so this is the case that covers pattern validation.
        assert!(serde_json::from_str::<OptionSerdeDFA>(r#"".*[""#).is_err());
    }

    #[test]
    #[ignore] // https://github.com/rust-lang/rust/issues/98906
    fn databake() {
        databake::test_bake!(
            SerdeDFA,
            const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") {
                b"foo" // TODO: set this when activating the test
            } else {
                b"bar" // TODO: set this when activating the test
            })},
            icu_list
        );
    }
}
244 | |