| 1 | // This file is part of ICU4X. For terms of use, please see the file |
| 2 | // called LICENSE at the top level of the ICU4X source tree |
| 3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| 4 | |
| 5 | use alloc::borrow::Cow; |
| 6 | use icu_provider::prelude::*; |
| 7 | use regex_automata::dfa::sparse::DFA; |
| 8 | |
| 9 | /// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement |
| 10 | /// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian |
| 11 | /// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`. |
| 12 | /// |
| 13 | /// <div class="stab unstable"> |
| 14 | /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, |
| 15 | /// including in SemVer minor releases. While the serde representation of data structs is guaranteed |
| 16 | /// to be stable, their Rust representation might not be. Use with caution. |
| 17 | /// </div> |
| 18 | #[derive (Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] |
| 19 | pub struct SerdeDFA<'data> { |
| 20 | // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) |
| 21 | dfa_bytes: Cow<'data, [u8]>, |
| 22 | pattern: Option<Cow<'data, str>>, |
| 23 | } |
| 24 | |
| 25 | impl PartialEq for SerdeDFA<'_> { |
| 26 | fn eq(&self, other: &Self) -> bool { |
| 27 | self.dfa_bytes == other.dfa_bytes |
| 28 | } |
| 29 | } |
| 30 | |
#[cfg(feature = "datagen")]
impl databake::Bake for SerdeDFA<'_> {
    /// Bakes this DFA as a call to `SerdeDFA::from_dfa_bytes_unchecked`.
    ///
    /// Both the little-endian and the big-endian serialization are embedded in the
    /// generated expression, which picks one of them with `cfg!(target_endian = ...)`.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("icu_list");

        let little_endian = self.deref().to_bytes_little_endian();
        let le_bytes = databake::Bake::bake(&little_endian.as_slice(), env);

        let big_endian = self.deref().to_bytes_big_endian();
        let be_bytes = databake::Bake::bake(&big_endian.as_slice(), env);

        // SAFETY (of the generated `unsafe` block): `to_bytes_little_endian` and
        // `to_bytes_big_endian` produce valid DFA representations, and the generated code
        // consumes the one that matches the endianness of the target platform.
        databake::quote! {
            unsafe {
                icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
                    if cfg!(target_endian = "little") {
                        #le_bytes
                    } else {
                        #be_bytes
                    }
                )
            }
        }
    }
}
| 52 | |
#[cfg(feature = "datagen")]
impl serde::Serialize for SerdeDFA<'_> {
    /// Serializes the DFA.
    ///
    /// Non-human-readable formats receive the little-endian DFA bytes. Human-readable
    /// formats receive the source pattern, and serialization fails if this value does not
    /// carry one.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        // Binary formats always get the little-endian byte representation.
        if !serializer.is_human_readable() {
            let le_bytes = self.deref().to_bytes_little_endian();
            return serializer.serialize_bytes(&le_bytes);
        }

        match self.pattern.as_ref() {
            Some(pattern) => pattern.serialize(serializer),
            // Without the pattern there is nothing human-readable to emit.
            None => Err(<S::Error as serde::ser::Error>::custom(
                "cannot serialize a deserialized bincode SerdeDFA to JSON",
            )),
        }
    }
}
| 74 | |
#[cfg(feature = "serde")]
impl<'data> SerdeDFA<'data> {
    /// Deserializes to `Option<Self>`.
    ///
    /// Returns `None` for non-human-readable serialization formats on big-endian systems,
    /// as `regex_automata` serialization is endian-sensitive.
    ///
    /// With the `serde_human` feature enabled, human-readable formats are read as a regex
    /// pattern that is compiled with `SerdeDFA::new`.
    ///
    /// # Errors
    ///
    /// Fails if the underlying deserializer fails, if the pattern cannot be turned into a
    /// DFA (human-readable path), or if the bytes are not a valid DFA (binary path).
    pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        use icu_provider::serde::borrow_de_utils::CowBytesWrap;
        use serde::de::Error;
        use serde::Deserialize;

        #[cfg(feature = "serde_human")]
        if deserializer.is_human_readable() {
            #[cfg(not(feature = "std"))]
            use alloc::string::ToString;

            let pattern = Cow::<str>::deserialize(deserializer)?;
            return match SerdeDFA::new(pattern) {
                Ok(dfa) => Ok(Some(dfa)),
                Err(e) => Err(D::Error::custom(e.to_string())),
            };
        }

        // The payload is always consumed, even if it turns out to be unusable below.
        let bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0;

        if cfg!(target_endian = "big") {
            return Ok(None);
        }

        // Verify the safety invariant before constructing the struct.
        if let Err(e) = DFA::from_bytes(&bytes) {
            return Err(D::Error::custom(alloc::format!("Invalid DFA bytes: {e}")));
        }

        Ok(Some(SerdeDFA {
            dfa_bytes: bytes,
            pattern: None,
        }))
    }
}
| 114 | |
| 115 | impl<'data> SerdeDFA<'data> { |
| 116 | /// Creates a `SerdeDFA` from raw bytes. Used internally by databake. |
| 117 | /// |
| 118 | /// # Safety |
| 119 | /// |
| 120 | /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) |
| 121 | pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { |
| 122 | Self { |
| 123 | dfa_bytes: Cow::Borrowed(dfa_bytes), |
| 124 | pattern: None, |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | /// Creates a `SerdeDFA` from a regex. |
| 129 | #[cfg (any(feature = "datagen" , feature = "serde_human" ,))] |
| 130 | pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> { |
| 131 | use regex_automata::{ |
| 132 | dfa::dense::{Builder, Config}, |
| 133 | SyntaxConfig, |
| 134 | }; |
| 135 | |
| 136 | let mut builder = Builder::new(); |
| 137 | let dfa = builder |
| 138 | .syntax(SyntaxConfig::new().case_insensitive(true)) |
| 139 | .configure(Config::new().anchored(true).minimize(true)) |
| 140 | .build(&pattern) |
| 141 | .map_err(|_| { |
| 142 | icu_provider::DataError::custom("Cannot build DFA" ).with_display_context(&pattern) |
| 143 | })? |
| 144 | .to_sparse() |
| 145 | .map_err(|_| { |
| 146 | icu_provider::DataError::custom("Cannot sparsify DFA" ) |
| 147 | .with_display_context(&pattern) |
| 148 | })?; |
| 149 | |
| 150 | Ok(Self { |
| 151 | dfa_bytes: dfa.to_bytes_native_endian().into(), |
| 152 | pattern: Some(pattern), |
| 153 | }) |
| 154 | } |
| 155 | |
| 156 | /// Returns the represented [`DFA`] |
| 157 | #[allow (clippy::unwrap_used)] // by invariant |
| 158 | pub fn deref(&'data self) -> DFA<&'data [u8]> { |
| 159 | // Safe due to struct invariant. |
| 160 | unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } |
| 161 | } |
| 162 | } |
| 163 | |
| 164 | #[cfg (all(test, feature = "datagen" ))] |
| 165 | mod test { |
| 166 | use super::*; |
| 167 | |
#[test]
fn test_serde_dfa() {
    use regex_automata::dfa::Automaton;

    let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap();
    let dfa = matcher.deref();
    let is_match = |haystack: &[u8]| dfa.find_earliest_fwd(haystack).unwrap().is_some();

    // A strict prefix of the pattern does not match.
    assert!(!is_match(b"ab"));
    // The pattern itself matches, as does any haystack that starts with it.
    assert!(is_match(b"abc"));
    assert!(is_match(b"abcde"));
    // The DFA is anchored, so a match that does not start at offset 0 is rejected.
    assert!(!is_match(b" abcde"));
}
| 187 | |
| 188 | #[derive (serde::Deserialize)] |
| 189 | struct OptionSerdeDFA<'data>( |
| 190 | #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize" )] Option<SerdeDFA<'data>>, |
| 191 | ); |
| 192 | |
#[test]
#[cfg(target_endian = "little")]
fn test_postcard_serialization() {
    let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
    let bytes = postcard::to_stdvec(&matcher).unwrap();

    // The untouched bytes round-trip to an equal DFA.
    let decoded = postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap().0;
    assert_eq!(decoded, Some(matcher));

    // A corrupted byte leads to an error
    let mut corrupted = bytes.clone();
    corrupted[17] ^= 255;
    assert!(postcard::from_bytes::<OptionSerdeDFA>(&corrupted).is_err());

    // An extra byte leads to an error
    let mut extended = bytes.clone();
    extended.insert(123, 40);
    assert!(postcard::from_bytes::<OptionSerdeDFA>(&extended).is_err());

    // Missing bytes lead to an error
    let truncated = &bytes[..bytes.len() - 5];
    assert!(postcard::from_bytes::<OptionSerdeDFA>(truncated).is_err());
}
| 217 | |
#[test]
// Binary deserialization yields `None` on big-endian targets (see `maybe_deserialize`),
// so the `Some(matcher)` round trip below can only hold on little-endian ones.
#[cfg(target_endian = "little")]
fn test_rmp_serialization() {
    let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();

    let bytes = rmp_serde::to_vec(&matcher).unwrap();
    assert_eq!(
        rmp_serde::from_slice::<OptionSerdeDFA>(&bytes).unwrap().0,
        Some(matcher)
    );
}
| 228 | |
#[test]
#[cfg(feature = "serde_human")]
fn test_json_serialization() {
    let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();

    // In human-readable formats the DFA round-trips through its source pattern.
    let json = serde_json::to_string(&matcher).unwrap();
    assert_eq!(
        serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0,
        Some(matcher)
    );

    // A well-formed JSON string holding an invalid regex (unclosed character class) must be
    // rejected. The input has to be quoted: a bare `.*[` is not JSON at all and would fail
    // in the JSON parser before the pattern is ever compiled.
    assert!(serde_json::from_str::<OptionSerdeDFA>(r#"".*[""#).is_err());
}
| 241 | |
| 242 | #[test ] |
| 243 | fn databake() { |
| 244 | // This is the DFA for ".*" |
| 245 | databake::test_bake!( |
| 246 | SerdeDFA, |
| 247 | const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little" ) { |
| 248 | b"rust-regex-automata-dfa-sparse \0\0\xFF\xFE\0\0\x02\0\0\0\0\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x06\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\t\t\t\t\t\t\t\t\t\n\x0B\x0B\x0C\r\r\r\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x98\x01\0\0\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\x80\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q \0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\x01\0\0\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0# \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x05\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x04\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x04\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x03\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\0\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q 
\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\0\x02\0\x03\x05\0\0\x89\x01\0\0\0\0\0\0\0\x02\0\x03\x05\0\0# \x01\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0# \x01\0\0# \x01\0\0# \x01\0\0# \x01\0\0# \x01\0\0\t\0\0\0\x12\0\0\0\x0C\x01\0\0\0\0\0\0\0\0\0\0# \x01\0\0# \x01\0\0" |
| 249 | } else { |
| 250 | b"rust-regex-automata-dfa-sparse \0\0\0\0\xFE\xFF\0\0\0\x02\0\0\0\0\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x06\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\t\t\t\t\t\t\t\t\t\n\x0B\x0B\x0C\r\r\r\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\0\0\x01\x98\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\x80\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q \0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\x01\0\0\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0# \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x05\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x04\0\0\x89\x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x04\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x05\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x02\x80\x03\x03\0\0z \x01\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x0E\0\0\0\x01\x01\x02\x02\x03\x06\x07\x07\x08\x08\t\t\n\n\x0B\x0B\x0C\x0C\r\r\x0E\x0E\x0F\x0F\0\0\x12\0\0\0q \0\0\0\x12\0\0\0q 
\0\0\0\x82\0\0\0\x99\0\0\0\xB0\0\0\0\xC7\0\0\0\xB0\0\0\0\xDE\0\0\0\xF5\0\0\0\x0C\x01\0\0q \0\0\0q \0\0\0\0\x02\0\x03\x05\0\0\x89\x01\0\0\0\0\0\0\0\x02\0\x03\x05\0\0# \x01\0\0\0\0\0\0\0\0\0\0\x04\0\0\0\0# \x01\0\0# \x01\0\0# \x01\0\0# \x01\0\0\0\0\x01# \0\0\0\t\0\0\0\x12\0\0\x01\x0C\0\0\0\0\0\0\0\0\0\0\x01# \0\0\x01#" |
| 251 | })}, |
| 252 | icu_list |
| 253 | ); |
| 254 | } |
| 255 | } |
| 256 | |