1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::borrow::Cow;
6use icu_provider::prelude::*;
7use regex_automata::dfa::sparse::DFA;
8
/// A serde-compatible version of [`regex_automata::dfa::sparse::DFA`].
///
/// This does not implement [`serde::Deserialize`] directly, as binary deserialization is not
/// supported on big-endian platforms. `Self::maybe_deserialize` can be used to deserialize to
/// `Option<SerdeDFA>`.
///
/// Equality between two values only takes the serialized DFA bytes into account; the source
/// pattern is not compared.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
pub struct SerdeDFA<'data> {
    // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok())
    dfa_bytes: Cow<'data, [u8]>,
    // The regex the DFA was compiled from. Only `Some` when the value was built through `new`;
    // values created from raw DFA bytes carry `None`, and human-readable serialization (which
    // emits this pattern) fails for them.
    pattern: Option<Cow<'data, str>>,
}
24
25impl PartialEq for SerdeDFA<'_> {
26 fn eq(&self, other: &Self) -> bool {
27 self.dfa_bytes == other.dfa_bytes
28 }
29}
30
#[cfg(feature = "datagen")]
impl databake::Bake for SerdeDFA<'_> {
    /// Bakes this DFA into an expression that rebuilds it from a byte literal.
    ///
    /// Both the little-endian and the big-endian serialization are embedded in the output; the
    /// generated code picks the one matching the target with `cfg!(target_endian = ...)`.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("icu_list");

        let dfa = self.deref();
        let le_vec = dfa.to_bytes_little_endian();
        let be_vec = dfa.to_bytes_big_endian();
        let le_tokens = databake::Bake::bake(&le_vec.as_slice(), env);
        let be_tokens = databake::Bake::bake(&be_vec.as_slice(), env);

        // The emitted `unsafe` block is sound because of the invariant of
        // `to_bytes_little_endian`/`to_bytes_big_endian`: they produce valid DFA
        // representations, and the generated code consumes the one that matches the
        // endianness of the target platform.
        databake::quote! {
            unsafe {
                icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
                    if cfg!(target_endian = "little") {
                        #le_tokens
                    } else {
                        #be_tokens
                    }
                )
            }
        }
    }
}
52
#[cfg(feature = "datagen")]
impl serde::Serialize for SerdeDFA<'_> {
    /// Serializes the DFA.
    ///
    /// * Non-human-readable formats receive the little-endian DFA bytes.
    /// * Human-readable formats receive the source pattern.
    ///
    /// # Errors
    ///
    /// Human-readable serialization fails when no pattern is stored, i.e. when the value was
    /// created from raw DFA bytes rather than from a regex.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        if !serializer.is_human_readable() {
            return serializer.serialize_bytes(&self.deref().to_bytes_little_endian());
        }

        match self.pattern.as_ref() {
            Some(pattern) => serde::Serialize::serialize(pattern, serializer),
            None => {
                use serde::ser::Error;
                Err(S::Error::custom(
                    "cannot serialize a deserialized bincode SerdeDFA to JSON",
                ))
            }
        }
    }
}
74
#[cfg(feature = "serde")]
impl<'data> SerdeDFA<'data> {
    /// Deserializes to `Option<Self>`. Will return `None` for non-human-readable serialization
    /// formats on big-endian systems, as `regex_automata` serialization is endian-sensitive.
    ///
    /// With the `serde_human` feature enabled, human-readable formats are expected to contain
    /// the regex pattern, from which the DFA is rebuilt via [`SerdeDFA::new`]. In every other
    /// case the input is read as DFA bytes, which are validated before being accepted.
    ///
    /// # Errors
    ///
    /// Returns a deserializer error if the underlying value cannot be read, if the pattern
    /// cannot be compiled, or if the bytes do not form a valid DFA.
    pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        use icu_provider::serde::borrow_de_utils::CowBytesWrap;
        use serde::de::Error;
        use serde::Deserialize;

        #[cfg(feature = "serde_human")]
        if deserializer.is_human_readable() {
            #[cfg(not(feature = "std"))]
            use alloc::string::ToString;

            let pattern = Cow::<str>::deserialize(deserializer)?;
            return match SerdeDFA::new(pattern) {
                Ok(dfa) => Ok(Some(dfa)),
                Err(e) => Err(D::Error::custom(e.to_string())),
            };
        }

        // The bytes are always consumed from the deserializer, even when they end up unused.
        let dfa_bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0;

        // Binary deserialization is not supported on big-endian targets; signal that with `None`.
        if cfg!(target_endian = "big") {
            return Ok(None);
        }

        // Verify safety invariant
        if let Err(e) = DFA::from_bytes(&dfa_bytes) {
            return Err(D::Error::custom(alloc::format!("Invalid DFA bytes: {e}")));
        }

        Ok(Some(SerdeDFA {
            dfa_bytes,
            pattern: None,
        }))
    }
}
114
115impl<'data> SerdeDFA<'data> {
116 /// Creates a `SerdeDFA` from raw bytes. Used internally by databake.
117 ///
118 /// # Safety
119 ///
120 /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok())
121 pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self {
122 Self {
123 dfa_bytes: Cow::Borrowed(dfa_bytes),
124 pattern: None,
125 }
126 }
127
128 /// Creates a `SerdeDFA` from a regex.
129 #[cfg(any(feature = "datagen", feature = "serde_human",))]
130 pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> {
131 use regex_automata::{
132 dfa::dense::{Builder, Config},
133 SyntaxConfig,
134 };
135
136 let mut builder = Builder::new();
137 let dfa = builder
138 .syntax(SyntaxConfig::new().case_insensitive(true))
139 .configure(Config::new().anchored(true).minimize(true))
140 .build(&pattern)
141 .map_err(|_| {
142 icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern)
143 })?
144 .to_sparse()
145 .map_err(|_| {
146 icu_provider::DataError::custom("Cannot sparsify DFA")
147 .with_display_context(&pattern)
148 })?;
149
150 Ok(Self {
151 dfa_bytes: dfa.to_bytes_native_endian().into(),
152 pattern: Some(pattern),
153 })
154 }
155
156 /// Returns the represented [`DFA`]
157 #[allow(clippy::unwrap_used)] // by invariant
158 pub fn deref(&'data self) -> DFA<&'data [u8]> {
159 // Safe due to struct invariant.
160 unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 }
161 }
162}
163
164#[cfg(all(test, feature = "datagen"))]
165mod test {
166 use super::*;
167
168 #[test]
169 fn test_serde_dfa() {
170 use regex_automata::dfa::Automaton;
171
172 let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap();
173
174 assert!(matcher.deref().find_earliest_fwd(b"ab").unwrap().is_none());
175 assert!(matcher.deref().find_earliest_fwd(b"abc").unwrap().is_some());
176 assert!(matcher
177 .deref()
178 .find_earliest_fwd(b"abcde")
179 .unwrap()
180 .is_some());
181 assert!(matcher
182 .deref()
183 .find_earliest_fwd(b" abcde")
184 .unwrap()
185 .is_none());
186 }
187
    /// Test-only wrapper that makes `SerdeDFA::maybe_deserialize` usable through the regular
    /// `serde::Deserialize` entry points of the formats exercised below.
    #[derive(serde::Deserialize)]
    struct OptionSerdeDFA<'data>(
        #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option<SerdeDFA<'data>>,
    );
192
193 #[test]
194 #[cfg(target_endian = "little")]
195 fn test_postcard_serialization() {
196 let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
197
198 let mut bytes = postcard::to_stdvec(&matcher).unwrap();
199 assert_eq!(
200 postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap().0,
201 Some(matcher)
202 );
203
204 // A corrupted byte leads to an error
205 bytes[17] ^= 255;
206 assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
207 bytes[17] ^= 255;
208
209 // An extra byte leads to an error
210 bytes.insert(123, 40);
211 assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
212 bytes.remove(123);
213
214 // Missing bytes lead to an error
215 assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes[0..bytes.len() - 5]).is_err());
216 }
217
218 #[test]
219 fn test_rmp_serialization() {
220 let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
221
222 let bytes = rmp_serde::to_vec(&matcher).unwrap();
223 assert_eq!(
224 rmp_serde::from_slice::<OptionSerdeDFA>(&bytes).unwrap().0,
225 Some(matcher)
226 );
227 }
228
229 #[test]
230 #[cfg(feature = "serde_human")]
231 fn test_json_serialization() {
232 let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
233
234 let json = serde_json::to_string(&matcher).unwrap();
235 assert_eq!(
236 serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0,
237 Some(matcher)
238 );
239 assert!(serde_json::from_str::<OptionSerdeDFA>(".*[").is_err());
240 }
241
242 #[test]
243 fn databake() {
244 // This is the DFA for ".*"
245 databake::test_bake!(
246 SerdeDFA,
247 const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") {
248 b"rust-regex-automata-dfa-sparse\0\0\xFF\xFE\0\0\x02\0\0\0\0\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x06\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\t\t\t\t\t\t\t\t\t\n\x0B\x0B\x0C\r\r\r\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x98\x01\0\0\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\x80\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q\0\0\0\x12\0\0\0q\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q\0\0\0q\0\0\0\x01\0\0\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0#\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x05\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x04\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x04\x05\0\0z\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0z\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x03\0\0z\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\0\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q\0\0\0\x12\0\0\0q\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0
\0\0\xF5\0\0\0\x0C\x01\0\0q\0\0\0q\0\0\0\0\x02\0\x03\x05\0\0\x89\x01\0\0\0\0\0\0\0\x02\0\x03\x05\0\0#\x01\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0#\x01\0\0#\x01\0\0#\x01\0\0#\x01\0\0#\x01\0\0\t\0\0\0\x12\0\0\0\x0C\x01\0\0\0\0\0\0\0\0\0\0#\x01\0\0#\x01\0\0"
249 } else {
250 b"rust-regex-automata-dfa-sparse\0\0\0\0\xFE\xFF\0\0\0\x02\0\0\0\0\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x06\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\t\t\t\t\t\t\t\t\t\n\x0B\x0B\x0C\r\r\r\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\0\0\x01\x98\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\x80\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q\0\0\0\x12\0\0\0q\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q\0\0\0q\0\0\0\x01\0\0\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0#\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x05\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x04\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x04\x05\0\0z\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0z\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x03\0\0z\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\0\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q\0\0\0\x12\0\0\0q\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0
\0\0\xF5\0\0\0\x0C\x01\0\0q\0\0\0q\0\0\0\0\x02\0\x03\x05\0\0\x89\x01\0\0\0\0\0\0\0\x02\0\x03\x05\0\0#\x01\0\0\0\0\0\0\0\0\0\0\x04\0\0\0\0#\x01\0\0#\x01\0\0#\x01\0\0#\x01\0\0\0\0\x01#\0\0\0\t\0\0\0\x12\0\0\x01\x0C\0\0\0\0\0\0\0\0\0\0\x01#\0\0\x01#"
251 })},
252 icu_list
253 );
254 }
255}
256