1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::borrow::Cow;
6use icu_provider::prelude::*;
7use regex_automata::dfa::sparse::DFA;
8
/// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement
/// [`serde::Deserialize`] directly, as binary deserialization is not supported on big-endian
/// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`.
///
/// The automaton is held as its serialized bytes, optionally together with the regex pattern
/// it was compiled from. Equality only compares the serialized bytes; the pattern is ignored.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
pub struct SerdeDFA<'data> {
    // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok())
    dfa_bytes: Cow<'data, [u8]>,
    // The regex source. `Some` when the value was built by `SerdeDFA::new`, `None` when it was
    // created from raw DFA bytes. Human-readable serialization emits this pattern and fails
    // when it is absent.
    pattern: Option<Cow<'data, str>>,
}
24
25impl PartialEq for SerdeDFA<'_> {
26 fn eq(&self, other: &Self) -> bool {
27 self.dfa_bytes == other.dfa_bytes
28 }
29}
30
#[cfg(feature = "datagen")]
impl databake::Bake for SerdeDFA<'_> {
    /// Emits an expression that reconstructs this DFA from baked byte literals.
    ///
    /// Both the little-endian and the big-endian serializations are baked; the
    /// generated code picks the one matching the compilation target.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("icu_list");

        let little_endian_vec = self.deref().to_bytes_little_endian();
        let little_endian = databake::Bake::bake(&little_endian_vec.as_slice(), env);

        let big_endian_vec = self.deref().to_bytes_big_endian();
        let big_endian = databake::Bake::bake(&big_endian_vec.as_slice(), env);

        // SAFETY (of the generated code): the bytes come straight out of
        // `to_bytes_little_endian`/`to_bytes_big_endian`, which uphold the invariant
        // required by `from_dfa_bytes_unchecked`.
        databake::quote! {
            unsafe {
                icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
                    if cfg!(target_endian = "little") {
                        #little_endian
                    } else {
                        #big_endian
                    }
                )
            }
        }
    }
}
51
#[cfg(feature = "datagen")]
impl serde::Serialize for SerdeDFA<'_> {
    /// Serializes the DFA.
    ///
    /// Non-human-readable formats receive the little-endian DFA bytes. Human-readable
    /// formats receive the source pattern; if no pattern is stored, serialization
    /// fails with a custom error.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        if !serializer.is_human_readable() {
            let le_bytes = self.deref().to_bytes_little_endian();
            return serde::Serialize::serialize(&le_bytes, serializer);
        }

        match self.pattern.as_ref() {
            Some(pattern) => serde::Serialize::serialize(pattern, serializer),
            None => Err(<S::Error as serde::ser::Error>::custom(
                "cannot serialize a deserialized bincode SerdeDFA to JSON",
            )),
        }
    }
}
73
#[cfg(feature = "serde")]
impl<'data> SerdeDFA<'data> {
    /// Deserializes to `Option<Self>`.
    ///
    /// With the `serde_human` feature, human-readable formats are read as a regex
    /// pattern and compiled via [`SerdeDFA::new`]. Otherwise the input is read as
    /// raw DFA bytes, which are validated before being accepted.
    ///
    /// Returns `Ok(None)` for non-human-readable serialization formats on big-endian
    /// systems, as `regex_automata` serialization is endian-sensitive.
    pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        use icu_provider::serde::borrow_de_utils::CowBytesWrap;
        use serde::Deserialize;

        #[cfg(feature = "serde_human")]
        if deserializer.is_human_readable() {
            #[cfg(not(feature = "std"))]
            use alloc::string::ToString;

            let pattern = Cow::<str>::deserialize(deserializer)?;
            return match SerdeDFA::new(pattern) {
                Ok(dfa) => Ok(Some(dfa)),
                Err(e) => Err(<D::Error as serde::de::Error>::custom(e.to_string())),
            };
        }

        // The bytes are always consumed from the deserializer, even when they end up
        // being discarded on big-endian targets.
        let dfa_bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0;

        if cfg!(target_endian = "big") {
            return Ok(None);
        }

        // Verify safety invariant
        if let Err(e) = DFA::from_bytes(&dfa_bytes) {
            return Err(<D::Error as serde::de::Error>::custom(alloc::format!(
                "Invalid DFA bytes: {e}"
            )));
        }

        Ok(Some(SerdeDFA {
            dfa_bytes,
            pattern: None,
        }))
    }
}
113
114impl<'data> SerdeDFA<'data> {
115 /// Creates a `SerdeDFA` from raw bytes. Used internally by databake.
116 ///
117 /// # Safety
118 ///
119 /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok())
120 pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self {
121 Self {
122 dfa_bytes: Cow::Borrowed(dfa_bytes),
123 pattern: None,
124 }
125 }
126
127 /// Creates a `SerdeDFA` from a regex.
128 #[cfg(any(feature = "datagen", feature = "serde_human",))]
129 pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> {
130 use regex_automata::{
131 dfa::dense::{Builder, Config},
132 SyntaxConfig,
133 };
134
135 let mut builder = Builder::new();
136 let dfa = builder
137 .syntax(SyntaxConfig::new().case_insensitive(true))
138 .configure(Config::new().anchored(true).minimize(true))
139 .build(&pattern)
140 .map_err(|_| {
141 icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern)
142 })?
143 .to_sparse()
144 .map_err(|_| {
145 icu_provider::DataError::custom("Cannot sparsify DFA")
146 .with_display_context(&pattern)
147 })?;
148
149 Ok(Self {
150 dfa_bytes: dfa.to_bytes_native_endian().into(),
151 pattern: Some(pattern),
152 })
153 }
154
155 /// Returns the represented [`DFA`]
156 #[allow(clippy::unwrap_used)] // by invariant
157 pub fn deref(&'data self) -> DFA<&'data [u8]> {
158 // Safe due to struct invariant.
159 unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 }
160 }
161}
162
#[cfg(all(test, feature = "datagen"))]
mod test {
    use super::*;

    /// Wrapper that routes deserialization through `SerdeDFA::maybe_deserialize`.
    #[derive(serde::Deserialize)]
    struct OptionSerdeDFA<'data>(
        #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option<SerdeDFA<'data>>,
    );

    /// Decodes postcard bytes into the optional DFA carried by `OptionSerdeDFA`.
    fn decode(bytes: &[u8]) -> Result<Option<SerdeDFA<'_>>, postcard::Error> {
        postcard::from_bytes::<OptionSerdeDFA>(bytes).map(|wrapper| wrapper.0)
    }

    // Checks that matching is anchored at the start of the haystack and that a
    // trailing suffix after the pattern is allowed.
    #[test]
    fn test_serde_dfa() {
        use regex_automata::dfa::Automaton;

        let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap();

        let cases: [(&[u8], bool); 4] = [
            (&b"ab"[..], false),
            (&b"abc"[..], true),
            (&b"abcde"[..], true),
            (&b" abcde"[..], false),
        ];

        for &(haystack, should_match) in &cases {
            let found = matcher.deref().find_earliest_fwd(haystack).unwrap();
            assert_eq!(found.is_some(), should_match);
        }
    }

    #[test]
    #[cfg(target_endian = "little")]
    fn test_postcard_serialization() {
        const CORRUPTED_INDEX: usize = 17;
        const INSERTED_INDEX: usize = 123;
        const INSERTED_BYTE: u8 = 40;
        const TRUNCATED_BY: usize = 5;

        let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
        let mut bytes = postcard::to_stdvec(&matcher).unwrap();

        // Round trip
        assert_eq!(decode(&bytes).unwrap(), Some(matcher));

        // A corrupted byte leads to an error
        bytes[CORRUPTED_INDEX] = !bytes[CORRUPTED_INDEX];
        assert!(decode(&bytes).is_err());
        bytes[CORRUPTED_INDEX] = !bytes[CORRUPTED_INDEX];

        // An extra byte leads to an error
        bytes.insert(INSERTED_INDEX, INSERTED_BYTE);
        assert!(decode(&bytes).is_err());
        bytes.remove(INSERTED_INDEX);

        // Missing bytes lead to an error
        let truncated = &bytes[..bytes.len() - TRUNCATED_BY];
        assert!(decode(truncated).is_err());
    }

    #[test]
    #[cfg(feature = "serde_human")]
    fn test_json_serialization() {
        let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
        let json = serde_json::to_string(&matcher).unwrap();

        let round_tripped = serde_json::from_str::<OptionSerdeDFA>(&json).map(|wrapper| wrapper.0);
        assert_eq!(round_tripped.unwrap(), Some(matcher));

        let invalid = serde_json::from_str::<OptionSerdeDFA>(".*[");
        assert!(invalid.is_err());
    }

    #[test]
    #[ignore] // https://github.com/rust-lang/rust/issues/98906
    fn databake() {
        databake::test_bake!(
            SerdeDFA,
            const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") {
                b"foo" // TODO: set this when activating the test
            } else {
                b"bar" // TODO: set this when activating the test
            })},
            icu_list
        );
    }
}
244