| 1 | /*! |
| 2 | Types and routines specific to sparse DFAs. |
| 3 | |
| 4 | This module is the home of [`sparse::DFA`](DFA). |
| 5 | |
| 6 | Unlike the [`dense`] module, this module does not contain a builder or |
| 7 | configuration specific for sparse DFAs. Instead, the intended way to build a |
| 8 | sparse DFA is either by using a default configuration with its constructor |
| 9 | [`sparse::DFA::new`](DFA::new), or by first configuring the construction of a |
| 10 | dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. |
| 11 | For example, this configures a sparse DFA to do an overlapping search: |
| 12 | |
| 13 | ``` |
| 14 | use regex_automata::{ |
| 15 | dfa::{Automaton, OverlappingState, dense}, |
| 16 | HalfMatch, Input, MatchKind, |
| 17 | }; |
| 18 | |
| 19 | let dense_re = dense::Builder::new() |
| 20 | .configure(dense::Config::new().match_kind(MatchKind::All)) |
| 21 | .build(r"Samwise|Sam" )?; |
| 22 | let sparse_re = dense_re.to_sparse()?; |
| 23 | |
| 24 | // Setup our haystack and initial start state. |
| 25 | let input = Input::new("Samwise" ); |
| 26 | let mut state = OverlappingState::start(); |
| 27 | |
| 28 | // First, 'Sam' will match. |
| 29 | sparse_re.try_search_overlapping_fwd(&input, &mut state)?; |
| 30 | assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match()); |
| 31 | |
| 32 | // And now 'Samwise' will match. |
| 33 | sparse_re.try_search_overlapping_fwd(&input, &mut state)?; |
| 34 | assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); |
| 35 | # Ok::<(), Box<dyn std::error::Error>>(()) |
| 36 | ``` |
| 37 | */ |
| 38 | |
| 39 | #[cfg (feature = "dfa-build" )] |
| 40 | use core::iter; |
| 41 | use core::{fmt, mem::size_of}; |
| 42 | |
| 43 | #[cfg (feature = "dfa-build" )] |
| 44 | use alloc::{vec, vec::Vec}; |
| 45 | |
| 46 | #[cfg (feature = "dfa-build" )] |
| 47 | use crate::dfa::dense::{self, BuildError}; |
| 48 | use crate::{ |
| 49 | dfa::{ |
| 50 | automaton::{fmt_state_indicator, Automaton, StartError}, |
| 51 | dense::Flags, |
| 52 | special::Special, |
| 53 | StartKind, DEAD, |
| 54 | }, |
| 55 | util::{ |
| 56 | alphabet::{ByteClasses, ByteSet}, |
| 57 | escape::DebugByte, |
| 58 | int::{Pointer, Usize, U16, U32}, |
| 59 | prefilter::Prefilter, |
| 60 | primitives::{PatternID, StateID}, |
| 61 | search::Anchored, |
| 62 | start::{self, Start, StartByteMap}, |
| 63 | wire::{self, DeserializeError, Endian, SerializeError}, |
| 64 | }, |
| 65 | }; |
| 66 | |
// The label and version embedded in the serialized form of a sparse DFA.
// Deserialization (`DFA::from_bytes`) checks these to reject byte buffers
// that were not produced by this crate's sparse-DFA serializer or that use
// an incompatible format revision.
const LABEL: &str = "rust-regex-automata-dfa-sparse";
const VERSION: u32 = 2;
| 69 | |
/// A sparse deterministic finite automaton (DFA) with variable sized states.
///
/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient
/// representation for its transitions. Consequently, sparse DFAs may use much
/// less memory than dense DFAs, but this comes at a price. In particular,
/// reading the more space efficient transitions takes more work, and
/// consequently, searching using a sparse DFA is typically slower than a dense
/// DFA.
///
/// A sparse DFA can be built using the default configuration via the
/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a
/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse
/// DFA using [`dense::DFA::to_sparse`].
///
/// In general, a sparse DFA supports all the same search operations as a dense
/// DFA.
///
/// Making the choice between a dense and sparse DFA depends on your specific
/// work load. If you can sacrifice a bit of search time performance, then a
/// sparse DFA might be the best choice. In particular, while sparse DFAs are
/// probably always slower than dense DFAs, you may find that they are easily
/// fast enough for your purposes!
///
/// # Type parameters
///
/// A `DFA` has one type parameter, `T`, which is used to represent the parts
/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
///
/// # The `Automaton` trait
///
/// This type implements the [`Automaton`] trait, which means it can be used
/// for searching. For example:
///
/// ```
/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone)]
pub struct DFA<T> {
    // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
    // representation-wise. In reality, it is perhaps more complicated. Namely,
    // in a dense DFA, all information needs to be very cheaply accessible
    // using only state IDs. In a sparse DFA however, each state uses a
    // variable amount of space because each state encodes more information
    // than just its transitions. Each state also includes an accelerator if
    // one exists, along with the matching pattern IDs if the state is a match
    // state.
    //
    // That is, a lot of the complexity is pushed down into how each state
    // itself is represented.
    //
    // The transition table. States are variable width, so a state's ID is
    // its byte offset into this table (see `from_dense` below).
    tt: Transitions<T>,
    // The table of start states, which determines where a search begins.
    st: StartTable<T>,
    // Bookkeeping about "special" states, remapped from the dense DFA this
    // sparse DFA was built from.
    special: Special,
    // An optional prefilter, copied from the originating dense DFA.
    pre: Option<Prefilter>,
    // The set of "quit" bytes, copied from the originating dense DFA.
    quitset: ByteSet,
    // Flags describing this DFA, copied from the originating dense DFA.
    flags: Flags,
}
| 131 | |
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Parse the given regular expression using a default configuration and
    /// return the corresponding sparse DFA.
    ///
    /// If a non-default configuration is desired, build a dense DFA via
    /// [`dense::Builder`] with your chosen configuration and then convert it
    /// with [`dense::DFA::to_sparse`].
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
    ///
    /// let expected = Some(HalfMatch::must(0, 11));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
        // Build a dense DFA with default options, then shrink it down.
        let dense = dense::Builder::new().build(pattern)?;
        dense.to_sparse()
    }

    /// Parse the given regular expressions using a default configuration and
    /// return the corresponding multi-DFA.
    ///
    /// If a non-default configuration is desired, build a dense DFA via
    /// [`dense::Builder`] with your chosen configuration and then convert it
    /// with [`dense::DFA::to_sparse`].
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let expected = Some(HalfMatch::must(1, 3));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        // Build a dense multi-DFA with default options, then shrink it down.
        let dense = dense::Builder::new().build_many(patterns)?;
        dense.to_sparse()
    }
}
| 185 | |
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Create a new DFA that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = sparse::DFA::always_match()?;
    ///
    /// let expected = Some(HalfMatch::must(0, 0));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
        // Delegate to the dense construction and convert the result.
        dense::DFA::always_match()?.to_sparse()
    }

    /// Create a new sparse DFA that never matches any input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, Input};
    ///
    /// let dfa = sparse::DFA::never_match()?;
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
        // Delegate to the dense construction and convert the result.
        dense::DFA::never_match()?.to_sparse()
    }

    /// The implementation for constructing a sparse DFA from a dense DFA.
    ///
    /// This is the workhorse behind [`dense::DFA::to_sparse`]. It serializes
    /// every dense state into the variable-width sparse byte format and
    /// remaps all state IDs from dense indices to byte offsets.
    ///
    /// Errors if there are too many states to be identified by byte offset
    /// (i.e., an offset exceeds `StateID`'s limit).
    pub(crate) fn from_dense<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        // In order to build the transition table, we need to be able to write
        // state identifiers for each of the "next" transitions in each state.
        // Our state identifiers correspond to the byte offset in the
        // transition table at which the state is encoded. Therefore, we do not
        // actually know what the state identifiers are until we've allocated
        // exactly as much space as we need for each state. Thus, construction
        // of the transition table happens in two passes.
        //
        // In the first pass, we fill out the shell of each state, which
        // includes the transition length, the input byte ranges and
        // zero-filled space for the transitions and accelerators, if present.
        // In this first pass, we also build up a map from the state identifier
        // index of the dense DFA to the state identifier in this sparse DFA.
        //
        // In the second pass, we fill in the transitions based on the map
        // built in the first pass.

        // The capacity given here reflects a minimum. (Well, the true minimum
        // is likely even bigger, but hopefully this saves a few reallocs.)
        let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len());
        // This maps state indices from the dense DFA to StateIDs in the sparse
        // DFA. We build out this map on the first pass, and then use it in the
        // second pass to back-fill our transitions.
        let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()];
        // First pass: lay out each state's shell and record its new ID.
        for state in dfa.states() {
            // This state's sparse ID is its byte offset in `sparse`.
            let pos = sparse.len();

            remap[dfa.to_index(state.id())] = StateID::new(pos)
                .map_err(|_| BuildError::too_many_states())?;
            // zero-filled space for the transition length (a u16, written
            // below once the true length is known)
            sparse.push(0);
            sparse.push(0);

            // Write the input byte ranges, one (start, end) byte pair per
            // transition, while counting how many transitions there are.
            let mut transition_len = 0;
            for (unit1, unit2, _) in state.sparse_transitions() {
                match (unit1.as_u8(), unit2.as_u8()) {
                    (Some(b1), Some(b2)) => {
                        transition_len += 1;
                        sparse.push(b1);
                        sparse.push(b2);
                    }
                    // An EOI-only transition; handled by the dummy EOI
                    // transition added unconditionally below.
                    (None, None) => {}
                    (Some(_), None) | (None, Some(_)) => {
                        // can never occur because sparse_transitions never
                        // groups EOI with any other transition.
                        unreachable!()
                    }
                }
            }
            // Add dummy EOI transition. This is never actually read while
            // searching, but having space equivalent to the total number
            // of transitions is convenient. Otherwise, we'd need to track
            // a different number of transitions for the byte ranges as for
            // the 'next' states.
            //
            // N.B. The loop above is not guaranteed to yield the EOI
            // transition, since it may point to a DEAD state. By putting
            // it here, we always write the EOI transition, and thus
            // guarantee that our transition length is >0. Why do we always
            // need the EOI transition? Because in order to implement
            // Automaton::next_eoi_state, this lets us just ask for the last
            // transition. There are probably other/better ways to do this.
            transition_len += 1;
            sparse.push(0);
            sparse.push(0);

            // Check some assumptions about transition length.
            assert_ne!(
                transition_len, 0,
                "transition length should be non-zero",
            );
            // 256 byte ranges at most, plus the EOI transition.
            assert!(
                transition_len <= 257,
                "expected transition length {} to be <= 257",
                transition_len,
            );

            // Fill in the transition length.
            // Since transition length is always <= 257, we use the most
            // significant bit to indicate whether this is a match state or
            // not.
            let ntrans = if dfa.is_match_state(state.id()) {
                transition_len | (1 << 15)
            } else {
                transition_len
            };
            // Back-fill the u16 length at the start of this state's record.
            wire::NE::write_u16(ntrans, &mut sparse[pos..]);

            // zero-fill the actual transitions.
            // Unwraps are OK since transition_length <= 257 and our minimum
            // support usize size is 16-bits.
            let zeros = usize::try_from(transition_len)
                .unwrap()
                .checked_mul(StateID::SIZE)
                .unwrap();
            sparse.extend(iter::repeat(0).take(zeros));

            // If this is a match state, write the pattern IDs matched by this
            // state.
            if dfa.is_match_state(state.id()) {
                let plen = dfa.match_pattern_len(state.id());
                // Write the actual pattern IDs with a u32 length prefix.
                // First, zero-fill space.
                let mut pos = sparse.len();
                // Unwraps are OK since it's guaranteed that plen <=
                // PatternID::LIMIT, which is in turn guaranteed to fit into a
                // u32.
                let zeros = size_of::<u32>()
                    .checked_mul(plen)
                    .unwrap()
                    .checked_add(size_of::<u32>())
                    .unwrap();
                sparse.extend(iter::repeat(0).take(zeros));

                // Now write the length prefix.
                wire::NE::write_u32(
                    // Will never fail since u32::MAX is invalid pattern ID.
                    // Thus, the number of pattern IDs is representable by a
                    // u32.
                    plen.try_into().expect("pattern ID length fits in u32"),
                    &mut sparse[pos..],
                );
                pos += size_of::<u32>();

                // Now write the pattern IDs.
                for &pid in dfa.pattern_id_slice(state.id()) {
                    pos += wire::write_pattern_id::<wire::NE>(
                        pid,
                        &mut sparse[pos..],
                    );
                }
            }

            // And now add the accelerator, if one exists. An accelerator is
            // at most 4 bytes and at least 1 byte. The first byte is the
            // length, N. N bytes follow the length. The set of bytes that
            // follow correspond (exhaustively) to the bytes that must be seen
            // to leave this state.
            let accel = dfa.accelerator(state.id());
            sparse.push(accel.len().try_into().unwrap());
            sparse.extend_from_slice(accel);
        }

        // Assemble the sparse DFA, remapping all auxiliary state IDs
        // (start states and special-state markers) via `remap`.
        let mut new = DFA {
            tt: Transitions {
                sparse,
                classes: dfa.byte_classes().clone(),
                state_len: dfa.state_len(),
                pattern_len: dfa.pattern_len(),
            },
            st: StartTable::from_dense_dfa(dfa, &remap)?,
            special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
            pre: dfa.get_prefilter().map(|p| p.clone()),
            quitset: dfa.quitset().clone(),
            flags: dfa.flags().clone(),
        };
        // And here's our second pass. Iterate over all of the dense states
        // again, and update the transitions in each of the states in the
        // sparse DFA.
        for old_state in dfa.states() {
            let new_id = remap[dfa.to_index(old_state.id())];
            let mut new_state = new.tt.state_mut(new_id);
            let sparse = old_state.sparse_transitions();
            for (i, (_, _, next)) in sparse.enumerate() {
                let next = remap[dfa.to_index(next)];
                new_state.set_next_at(i, next);
            }
        }
        debug!(
            "created sparse DFA, memory usage: {} (dense memory usage: {})",
            new.memory_usage(),
            dfa.memory_usage(),
        );
        Ok(new)
    }
}
| 405 | |
| 406 | impl<T: AsRef<[u8]>> DFA<T> { |
| 407 | /// Cheaply return a borrowed version of this sparse DFA. Specifically, the |
| 408 | /// DFA returned always uses `&[u8]` for its transitions. |
| 409 | pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { |
| 410 | DFA { |
| 411 | tt: self.tt.as_ref(), |
| 412 | st: self.st.as_ref(), |
| 413 | special: self.special, |
| 414 | pre: self.pre.clone(), |
| 415 | quitset: self.quitset, |
| 416 | flags: self.flags, |
| 417 | } |
| 418 | } |
| 419 | |
| 420 | /// Return an owned version of this sparse DFA. Specifically, the DFA |
| 421 | /// returned always uses `Vec<u8>` for its transitions. |
| 422 | /// |
| 423 | /// Effectively, this returns a sparse DFA whose transitions live on the |
| 424 | /// heap. |
| 425 | #[cfg (feature = "alloc" )] |
| 426 | pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> { |
| 427 | DFA { |
| 428 | tt: self.tt.to_owned(), |
| 429 | st: self.st.to_owned(), |
| 430 | special: self.special, |
| 431 | pre: self.pre.clone(), |
| 432 | quitset: self.quitset, |
| 433 | flags: self.flags, |
| 434 | } |
| 435 | } |
| 436 | |
| 437 | /// Returns the starting state configuration for this DFA. |
| 438 | /// |
| 439 | /// The default is [`StartKind::Both`], which means the DFA supports both |
| 440 | /// unanchored and anchored searches. However, this can generally lead to |
| 441 | /// bigger DFAs. Therefore, a DFA might be compiled with support for just |
| 442 | /// unanchored or anchored searches. In that case, running a search with |
| 443 | /// an unsupported configuration will panic. |
| 444 | pub fn start_kind(&self) -> StartKind { |
| 445 | self.st.kind |
| 446 | } |
| 447 | |
| 448 | /// Returns true only if this DFA has starting states for each pattern. |
| 449 | /// |
| 450 | /// When a DFA has starting states for each pattern, then a search with the |
| 451 | /// DFA can be configured to only look for anchored matches of a specific |
| 452 | /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can |
| 453 | /// accept a [`Anchored::Pattern`] if and only if this method returns true. |
| 454 | /// Otherwise, an error will be returned. |
| 455 | /// |
| 456 | /// Note that if the DFA is empty, this always returns false. |
| 457 | pub fn starts_for_each_pattern(&self) -> bool { |
| 458 | self.st.pattern_len.is_some() |
| 459 | } |
| 460 | |
| 461 | /// Returns the equivalence classes that make up the alphabet for this DFA. |
| 462 | /// |
| 463 | /// Unless [`dense::Config::byte_classes`] was disabled, it is possible |
| 464 | /// that multiple distinct bytes are grouped into the same equivalence |
| 465 | /// class if it is impossible for them to discriminate between a match and |
| 466 | /// a non-match. This has the effect of reducing the overall alphabet size |
| 467 | /// and in turn potentially substantially reducing the size of the DFA's |
| 468 | /// transition table. |
| 469 | /// |
| 470 | /// The downside of using equivalence classes like this is that every state |
| 471 | /// transition will automatically use this map to convert an arbitrary |
| 472 | /// byte to its corresponding equivalence class. In practice this has a |
| 473 | /// negligible impact on performance. |
| 474 | pub fn byte_classes(&self) -> &ByteClasses { |
| 475 | &self.tt.classes |
| 476 | } |
| 477 | |
| 478 | /// Returns the memory usage, in bytes, of this DFA. |
| 479 | /// |
| 480 | /// The memory usage is computed based on the number of bytes used to |
| 481 | /// represent this DFA. |
| 482 | /// |
| 483 | /// This does **not** include the stack size used up by this DFA. To |
| 484 | /// compute that, use `std::mem::size_of::<sparse::DFA>()`. |
| 485 | pub fn memory_usage(&self) -> usize { |
| 486 | self.tt.memory_usage() + self.st.memory_usage() |
| 487 | } |
| 488 | } |
| 489 | |
| 490 | /// Routines for converting a sparse DFA to other representations, such as raw |
| 491 | /// bytes suitable for persistent storage. |
| 492 | impl<T: AsRef<[u8]>> DFA<T> { |
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs has been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_little_endian would work on a little endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Sparse DFAs have no alignment requirements, so no initial padding
    /// // is added and the buffer can be deserialized from its start.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_little_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::LE>()
    }
| 533 | |
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs has been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_big_endian would work on a big endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Sparse DFAs have no alignment requirements, so no initial padding
    /// // is added and the buffer can be deserialized from its start.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_big_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::BE>()
    }
| 574 | |
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs has been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling DFA. For example,
    /// if serialization and deserialization happen in the same process or on
    /// the same machine. Otherwise, when serializing a DFA for use in a
    /// portable environment, you'll almost certainly want to serialize _both_
    /// a little endian and a big endian version and then load the correct one
    /// based on the target's configuration.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Sparse DFAs have no alignment requirements, so no initial padding
    /// // is added and the buffer can be deserialized from its start.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_native_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::NE>()
    }
| 622 | |
| 623 | /// The implementation of the public `to_bytes` serialization methods, |
| 624 | /// which is generic over endianness. |
| 625 | #[cfg (feature = "dfa-build" )] |
| 626 | fn to_bytes<E: Endian>(&self) -> Vec<u8> { |
| 627 | let mut buf = vec![0; self.write_to_len()]; |
| 628 | // This should always succeed since the only possible serialization |
| 629 | // error is providing a buffer that's too small, but we've ensured that |
| 630 | // `buf` is big enough here. |
| 631 | self.write_to::<E>(&mut buf).unwrap(); |
| 632 | buf |
| 633 | } |
| 634 | |
    /// Serialize this DFA as raw bytes to the given slice, in little endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs has been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_little_endian would work on a little endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_little_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endian-generic implementation.
        self.write_to::<wire::LE>(dst)
    }
| 681 | |
    /// Serialize this DFA as raw bytes to the given slice, in big endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs has been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_big_endian would work on a big endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_big_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endian-generic implementation.
        self.write_to::<wire::BE>(dst)
    }
| 728 | |
    /// Serialize this DFA as raw bytes to the given slice, in native endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs has been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling DFA. For example,
    /// if serialization and deserialization happen in the same process or on
    /// the same machine. Otherwise, when serializing a DFA for use in a
    /// portable environment, you'll almost certainly want to serialize _both_
    /// a little endian and a big endian version and then load the correct one
    /// based on the target's configuration.
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_native_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endianness-generic implementation, fixing the
        // serialized format to the target's native endianness.
        self.write_to::<wire::NE>(dst)
    }
| 782 | |
| 783 | /// The implementation of the public `write_to` serialization methods, |
| 784 | /// which is generic over endianness. |
| 785 | fn write_to<E: Endian>( |
| 786 | &self, |
| 787 | dst: &mut [u8], |
| 788 | ) -> Result<usize, SerializeError> { |
| 789 | let mut nw = 0; |
| 790 | nw += wire::write_label(LABEL, &mut dst[nw..])?; |
| 791 | nw += wire::write_endianness_check::<E>(&mut dst[nw..])?; |
| 792 | nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?; |
| 793 | nw += { |
| 794 | // Currently unused, intended for future flexibility |
| 795 | E::write_u32(0, &mut dst[nw..]); |
| 796 | size_of::<u32>() |
| 797 | }; |
| 798 | nw += self.flags.write_to::<E>(&mut dst[nw..])?; |
| 799 | nw += self.tt.write_to::<E>(&mut dst[nw..])?; |
| 800 | nw += self.st.write_to::<E>(&mut dst[nw..])?; |
| 801 | nw += self.special.write_to::<E>(&mut dst[nw..])?; |
| 802 | nw += self.quitset.write_to::<E>(&mut dst[nw..])?; |
| 803 | Ok(nw) |
| 804 | } |
| 805 | |
| 806 | /// Return the total number of bytes required to serialize this DFA. |
| 807 | /// |
| 808 | /// This is useful for determining the size of the buffer required to pass |
| 809 | /// to one of the serialization routines: |
| 810 | /// |
| 811 | /// * [`DFA::write_to_little_endian`] |
| 812 | /// * [`DFA::write_to_big_endian`] |
| 813 | /// * [`DFA::write_to_native_endian`] |
| 814 | /// |
| 815 | /// Passing a buffer smaller than the size returned by this method will |
| 816 | /// result in a serialization error. |
| 817 | /// |
| 818 | /// # Example |
| 819 | /// |
| 820 | /// This example shows how to dynamically allocate enough room to serialize |
| 821 | /// a sparse DFA. |
| 822 | /// |
| 823 | /// ``` |
| 824 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |
| 825 | /// |
| 826 | /// // Compile our original DFA. |
| 827 | /// let original_dfa = DFA::new("foo[0-9]+" )?; |
| 828 | /// |
| 829 | /// let mut buf = vec![0; original_dfa.write_to_len()]; |
| 830 | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |
| 831 | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |
| 832 | /// |
| 833 | /// let expected = Some(HalfMatch::must(0, 8)); |
| 834 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345" ))?); |
| 835 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 836 | /// ``` |
| 837 | pub fn write_to_len(&self) -> usize { |
| 838 | wire::write_label_len(LABEL) |
| 839 | + wire::write_endianness_check_len() |
| 840 | + wire::write_version_len() |
| 841 | + size_of::<u32>() // unused, intended for future flexibility |
| 842 | + self.flags.write_to_len() |
| 843 | + self.tt.write_to_len() |
| 844 | + self.st.write_to_len() |
| 845 | + self.special.write_to_len() |
| 846 | + self.quitset.write_to_len() |
| 847 | } |
| 848 | } |
| 849 | |
impl<'a> DFA<&'a [u8]> {
    /// Safely deserialize a sparse DFA with a specific state identifier
    /// representation. Upon success, this returns both the deserialized DFA
    /// and the number of bytes read from the given slice. Namely, the contents
    /// of the slice beyond the DFA are not read.
    ///
    /// Deserializing a DFA using this routine will never allocate heap memory.
    /// For safety purposes, the DFA's transitions will be verified such that
    /// every transition points to a valid state. If this verification is too
    /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
    /// will always execute in constant time.
    ///
    /// The bytes given must be generated by one of the serialization APIs
    /// of a `DFA` using a semver compatible release of this crate. Those
    /// include:
    ///
    /// * [`DFA::to_bytes_little_endian`]
    /// * [`DFA::to_bytes_big_endian`]
    /// * [`DFA::to_bytes_native_endian`]
    /// * [`DFA::write_to_little_endian`]
    /// * [`DFA::write_to_big_endian`]
    /// * [`DFA::write_to_native_endian`]
    ///
    /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
    /// `write_to` methods do not allocate and write to an existing slice
    /// (which may be on the stack). Since deserialization always uses the
    /// native endianness of the target platform, the serialization API you use
    /// should match the endianness of the target platform. (It's often a good
    /// idea to generate serialized DFAs for both forms of endianness and then
    /// load the correct one based on endianness.)
    ///
    /// # Errors
    ///
    /// Generally speaking, it's easier to state the conditions in which an
    /// error is _not_ returned. All of the following must be true:
    ///
    /// * The bytes given must be produced by one of the serialization APIs
    ///   on this DFA, as mentioned above.
    /// * The endianness of the target platform matches the endianness used to
    ///   serialized the provided DFA.
    ///
    /// If any of the above are not true, then an error will be returned.
    ///
    /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse
    /// DFA has no alignment requirements. That is, an alignment of `1` is
    /// valid.
    ///
    /// # Panics
    ///
    /// This routine will never panic for any input.
    ///
    /// # Example
    ///
    /// This example shows how to serialize a DFA to raw bytes, deserialize it
    /// and then use it for searching.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_bytes_native_endian();
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: loading a DFA from static memory
    ///
    /// One use case this library supports is the ability to serialize a
    /// DFA to disk and then use `include_bytes!` to store it in a compiled
    /// Rust program. Those bytes can then be cheaply deserialized into a
    /// `DFA` structure at runtime and used for searching without having to
    /// re-compile the DFA (which can be quite costly).
    ///
    /// We can show this in two parts. The first part is serializing the DFA to
    /// a file:
    ///
    /// ```no_run
    /// use regex_automata::dfa::sparse::DFA;
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Write a big endian serialized version of this DFA to a file.
    /// let bytes = dfa.to_bytes_big_endian();
    /// std::fs::write("foo.bigendian.dfa", &bytes)?;
    ///
    /// // Do it again, but this time for little endian.
    /// let bytes = dfa.to_bytes_little_endian();
    /// std::fs::write("foo.littleendian.dfa", &bytes)?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And now the second part is embedding the DFA into the compiled program
    /// and deserializing it at runtime on first use. We use conditional
    /// compilation to choose the correct endianness. We do not need to employ
    /// any special tricks to ensure a proper alignment, since a sparse DFA has
    /// no alignment requirements.
    ///
    /// ```no_run
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse::DFA},
    ///     util::lazy::Lazy,
    ///     HalfMatch, Input,
    /// };
    ///
    /// // This crate provides its own "lazy" type, kind of like
    /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
    /// // no-std environments and let's us write this using completely
    /// // safe code.
    /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    ///     # const _: &str = stringify! {
    ///     #[cfg(target_endian = "big")]
    ///     static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
    ///     #[cfg(target_endian = "little")]
    ///     static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
    ///     # };
    ///     # static BYTES: &[u8] = b"";
    ///
    ///     let (dfa, _) = DFA::from_bytes(BYTES)
    ///         .expect("serialized DFA should be valid");
    ///     dfa
    /// });
    ///
    /// let expected = Ok(Some(HalfMatch::must(0, 8)));
    /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
    /// ```
    ///
    /// Alternatively, consider using
    /// [`lazy_static`](https://crates.io/crates/lazy_static)
    /// or
    /// [`once_cell`](https://crates.io/crates/once_cell),
    /// which will guarantee safety for you.
    pub fn from_bytes(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
        // SAFETY: This is safe because we validate both the sparse transitions
        // (by trying to decode every state) and start state ID list below. If
        // either validation fails, then we return an error.
        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
        // Transition validation also returns the set of state IDs seen while
        // decoding, which is then used to validate the start state IDs.
        let seen = dfa.tt.validate(&dfa.special)?;
        dfa.st.validate(&dfa.special, &seen)?;
        // N.B. dfa.special doesn't have a way to do unchecked deserialization,
        // so it has already been validated.
        Ok((dfa, nread))
    }

    /// Deserialize a DFA with a specific state identifier representation in
    /// constant time by omitting the verification of the validity of the
    /// sparse transitions.
    ///
    /// This is just like [`DFA::from_bytes`], except it can potentially return
    /// a DFA that exhibits undefined behavior if its transitions contains
    /// invalid state identifiers.
    ///
    /// This routine is useful if you need to deserialize a DFA cheaply and
    /// cannot afford the transition validation performed by `from_bytes`.
    ///
    /// # Safety
    ///
    /// This routine is not safe because it permits callers to provide
    /// arbitrary transitions with possibly incorrect state identifiers. While
    /// the various serialization routines will never return an incorrect
    /// DFA, there is no guarantee that the bytes provided here are correct.
    /// While `from_bytes_unchecked` will still do several forms of basic
    /// validation, this routine does not check that the transitions themselves
    /// are correct. Given an incorrect transition table, it is possible for
    /// the search routines to access out-of-bounds memory because of explicit
    /// bounds check elision.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_bytes_native_endian();
    /// // SAFETY: This is guaranteed to be safe since the bytes given come
    /// // directly from a compatible serialization routine.
    /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub unsafe fn from_bytes_unchecked(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
        // Total number of bytes read from `slice` so far.
        let mut nr = 0;

        // Read the header: a label identifying the serialized object, an
        // endianness check and a version number. Each of these reads fails
        // if the bytes don't match what this crate expects.
        nr += wire::read_label(&slice[nr..], LABEL)?;
        nr += wire::read_endianness_check(&slice[nr..])?;
        nr += wire::read_version(&slice[nr..], VERSION)?;

        // Skip over the padding u32 written by serialization. It is
        // currently unused and reserved for future flexibility.
        let _unused = wire::try_read_u32(&slice[nr..], "unused space" )?;
        nr += size_of::<u32>();

        let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
        nr += nread;

        let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (special, nread) = Special::from_bytes(&slice[nr..])?;
        nr += nread;
        // State IDs in a sparse DFA are offsets into the raw transition
        // bytes, so the maximum special state ID must be a valid offset
        // into those bytes.
        if special.max.as_usize() >= tt.sparse().len() {
            return Err(DeserializeError::generic(
                "max should not be greater than or equal to sparse bytes" ,
            ));
        }

        let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
        nr += nread;

        // Prefilters don't support serialization, so they're always absent.
        let pre = None;
        Ok((DFA { tt, st, special, pre, quitset, flags }, nr))
    }
}
| 1073 | |
| 1074 | impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> { |
| 1075 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 1076 | writeln!(f, "sparse::DFA(" )?; |
| 1077 | for state in self.tt.states() { |
| 1078 | fmt_state_indicator(f, self, state.id())?; |
| 1079 | writeln!(f, " {:06?}: {:?}" , state.id().as_usize(), state)?; |
| 1080 | } |
| 1081 | writeln!(f, "" )?; |
| 1082 | for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() { |
| 1083 | if i % self.st.stride == 0 { |
| 1084 | match anchored { |
| 1085 | Anchored::No => writeln!(f, "START-GROUP(unanchored)" )?, |
| 1086 | Anchored::Yes => writeln!(f, "START-GROUP(anchored)" )?, |
| 1087 | Anchored::Pattern(pid) => writeln!( |
| 1088 | f, |
| 1089 | "START_GROUP(pattern: {:?})" , |
| 1090 | pid.as_usize() |
| 1091 | )?, |
| 1092 | } |
| 1093 | } |
| 1094 | writeln!(f, " {:?} => {:06?}" , sty, start_id.as_usize())?; |
| 1095 | } |
| 1096 | writeln!(f, "state length: {:?}" , self.tt.state_len)?; |
| 1097 | writeln!(f, "pattern length: {:?}" , self.pattern_len())?; |
| 1098 | writeln!(f, "flags: {:?}" , self.flags)?; |
| 1099 | writeln!(f, ")" )?; |
| 1100 | Ok(()) |
| 1101 | } |
| 1102 | } |
| 1103 | |
// SAFETY: We assert that our implementation of each method is correct.
unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
    #[inline ]
    fn is_special_state(&self, id: StateID) -> bool {
        self.special.is_special_state(id)
    }

    #[inline ]
    fn is_dead_state(&self, id: StateID) -> bool {
        self.special.is_dead_state(id)
    }

    #[inline ]
    fn is_quit_state(&self, id: StateID) -> bool {
        self.special.is_quit_state(id)
    }

    #[inline ]
    fn is_match_state(&self, id: StateID) -> bool {
        self.special.is_match_state(id)
    }

    #[inline ]
    fn is_start_state(&self, id: StateID) -> bool {
        self.special.is_start_state(id)
    }

    #[inline ]
    fn is_accel_state(&self, id: StateID) -> bool {
        self.special.is_accel_state(id)
    }

    // This is marked as inline to help dramatically boost sparse searching,
    // which decodes each state it enters to follow the next transition.
    #[cfg_attr (feature = "perf-inline" , inline(always))]
    fn next_state(&self, current: StateID, input: u8) -> StateID {
        // Map the raw input byte to its equivalence class before looking up
        // the transition in the decoded state.
        let input = self.tt.classes.get(input);
        self.tt.state(current).next(input)
    }

    // No unchecked variant is provided for sparse DFAs; this simply
    // delegates to the checked `next_state` above.
    #[inline ]
    unsafe fn next_state_unchecked(
        &self,
        current: StateID,
        input: u8,
    ) -> StateID {
        self.next_state(current, input)
    }

    #[inline ]
    fn next_eoi_state(&self, current: StateID) -> StateID {
        self.tt.state(current).next_eoi()
    }

    #[inline ]
    fn pattern_len(&self) -> usize {
        self.tt.pattern_len
    }

    #[inline ]
    fn match_len(&self, id: StateID) -> usize {
        self.tt.state(id).pattern_len()
    }

    #[inline ]
    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
        // This is an optimization for the very common case of a DFA with a
        // single pattern. This conditional avoids a somewhat more costly path
        // that finds the pattern ID from the state machine, which requires
        // a bit of slicing/pointer-chasing. This optimization tends to only
        // matter when matches are frequent.
        if self.tt.pattern_len == 1 {
            return PatternID::ZERO;
        }
        self.tt.state(id).pattern_id(match_index)
    }

    #[inline ]
    fn has_empty(&self) -> bool {
        self.flags.has_empty
    }

    #[inline ]
    fn is_utf8(&self) -> bool {
        self.flags.is_utf8
    }

    #[inline ]
    fn is_always_start_anchored(&self) -> bool {
        self.flags.is_always_start_anchored
    }

    #[inline ]
    fn start_state(
        &self,
        config: &start::Config,
    ) -> Result<StateID, StartError> {
        let anchored = config.get_anchored();
        let start = match config.get_look_behind() {
            // No look-behind byte means the search starts at the beginning
            // of the haystack.
            None => Start::Text,
            Some(byte) => {
                // If the look-behind byte is in the quit set, then a search
                // cannot begin from this position.
                if !self.quitset.is_empty() && self.quitset.contains(byte) {
                    return Err(StartError::quit(byte));
                }
                self.st.start_map.get(byte)
            }
        };
        self.st.start(anchored, start)
    }

    #[inline ]
    fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
        // There is no universal start state for pattern-specific searches.
        match mode {
            Anchored::No => self.st.universal_start_unanchored,
            Anchored::Yes => self.st.universal_start_anchored,
            Anchored::Pattern(_) => None,
        }
    }

    #[inline ]
    fn accelerator(&self, id: StateID) -> &[u8] {
        self.tt.state(id).accelerator()
    }

    #[inline ]
    fn get_prefilter(&self) -> Option<&Prefilter> {
        // Prefilters are never serialized, so this is only ever `Some` for
        // DFAs that had a prefilter attached at build time.
        self.pre.as_ref()
    }
}
| 1233 | |
/// The transition table portion of a sparse DFA.
///
/// The transition table is the core part of the DFA in that it describes how
/// to move from one state to another based on the input sequence observed.
///
/// Unlike a typical dense table based DFA, states in a sparse transition
/// table have variable size. That is, states with more transitions use more
/// space than states with fewer transitions. This means that finding the next
/// transition takes more work than with a dense DFA, but also typically uses
/// much less space.
#[derive (Clone)]
struct Transitions<T> {
    /// The raw encoding of each state in this DFA.
    ///
    /// Each state has the following information:
    ///
    /// * A set of transitions to subsequent states. Transitions to the dead
    /// state are omitted.
    /// * If the state can be accelerated, then any additional accelerator
    /// information.
    /// * If the state is a match state, then the state contains all pattern
    /// IDs that match when in that state.
    ///
    /// To decode a state, use Transitions::state.
    ///
    /// In practice, T is either Vec<u8> or &[u8].
    sparse: T,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to a
    /// single character in this DFA's alphabet, where the maximum number of
    /// characters is 257 (each possible value of a byte plus the special
    /// EOI transition). Consequently, the number of equivalence classes
    /// corresponds to the number of transitions for each DFA state. Note
    /// though that the *space* used by each DFA state in the transition table
    /// may be larger. The total space used by each DFA state is known as the
    /// stride and is documented above.
    ///
    /// The only time the number of equivalence classes is fewer than 257 is
    /// if the DFA's kind uses byte classes which is the default. Equivalence
    /// classes should generally only be disabled when debugging, so that
    /// the transitions themselves aren't obscured. Disabling them has no
    /// other benefit, since the equivalence class map is always used while
    /// searching. In the vast majority of cases, the number of equivalence
    /// classes is substantially smaller than 257, particularly when large
    /// Unicode classes aren't used.
    ///
    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
    /// in the current implementation, since equivalence classes generally tend
    /// to correspond to continuous ranges of bytes that map to the same
    /// transition. So in a sparse DFA, equivalence classes don't really lead
    /// to a space savings. In the future, it would be good to try and remove
    /// them from sparse DFAs entirely, but requires a bit of work since sparse
    /// DFAs are built from dense DFAs, which are in turn built on top of
    /// equivalence classes.
    classes: ByteClasses,
    /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even for the empty DFA. In
    /// particular, the dead state always has ID 0 and is correspondingly
    /// always the first state. The dead state is never a match state.
    state_len: usize,
    /// The total number of unique patterns represented by these match states.
    pattern_len: usize,
}
| 1298 | |
impl<'a> Transitions<&'a [u8]> {
    /// Deserialize a transition table starting at the beginning of `slice`.
    /// Upon success, this returns both the transition table and the number
    /// of bytes read from `slice`.
    ///
    /// # Safety
    ///
    /// This does not verify that the state identifiers embedded in the raw
    /// sparse transition bytes are valid; see [`DFA::from_bytes_unchecked`]
    /// for the contract callers must uphold.
    unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
        // Record the starting address so that the total number of bytes
        // read can be computed at the end, after `slice` has been advanced
        // past each component.
        let slice_start = slice.as_ptr().as_usize();

        let (state_len, nr) =
            wire::try_read_u32_as_usize(&slice, "state length" )?;
        slice = &slice[nr..];

        let (pattern_len, nr) =
            wire::try_read_u32_as_usize(&slice, "pattern length" )?;
        slice = &slice[nr..];

        let (classes, nr) = ByteClasses::from_bytes(&slice)?;
        slice = &slice[nr..];

        let (len, nr) =
            wire::try_read_u32_as_usize(&slice, "sparse transitions length" )?;
        slice = &slice[nr..];

        // The next `len` bytes encode the states themselves. Borrow them
        // directly from `slice` without copying.
        wire::check_slice_len(slice, len, "sparse states byte length" )?;
        let sparse = &slice[..len];
        slice = &slice[len..];

        let trans = Transitions { sparse, classes, state_len, pattern_len };
        Ok((trans, slice.as_ptr().as_usize() - slice_start))
    }
}
| 1328 | |
| 1329 | impl<T: AsRef<[u8]>> Transitions<T> { |
    /// Writes a serialized form of this transition table to the buffer given.
    /// If the buffer is too small, then an error is returned. To determine
    /// how big the buffer must be, use `write_to_len`.
    fn write_to<E: Endian>(
        &self,
        mut dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small(
                "sparse transition table" ,
            ));
        }
        dst = &mut dst[..nwrite];

        // write state length
        E::write_u32(u32::try_from(self.state_len).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write pattern length
        E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write byte class map
        let n = self.classes.write_to(dst)?;
        dst = &mut dst[n..];

        // write number of bytes in sparse transitions
        E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write actual transitions
        //
        // State IDs are offsets into the sparse byte representation, and
        // the dead state is always first at offset 0, so iteration starts
        // at DEAD.
        let mut id = DEAD;
        while id.as_usize() < self.sparse().len() {
            let state = self.state(id);
            let n = state.write_to::<E>(&mut dst)?;
            dst = &mut dst[n..];
            // The next ID is the offset immediately following `state`.
            id = StateID::new(id.as_usize() + state.write_to_len()).unwrap();
        }
        Ok(nwrite)
    }
| 1372 | |
| 1373 | /// Returns the number of bytes the serialized form of this transition |
| 1374 | /// table will use. |
| 1375 | fn write_to_len(&self) -> usize { |
| 1376 | size_of::<u32>() // state length |
| 1377 | + size_of::<u32>() // pattern length |
| 1378 | + self.classes.write_to_len() |
| 1379 | + size_of::<u32>() // sparse transitions length |
| 1380 | + self.sparse().len() |
| 1381 | } |
| 1382 | |
    /// Validates that every state ID in this transition table is valid.
    ///
    /// That is, every state ID can be used to correctly index a state in this
    /// table.
    ///
    /// Returns the set of state IDs that were successfully decoded, which is
    /// subsequently used to validate the start state table.
    fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
        let mut verified = Seen::new();
        // We need to make sure that we decode the correct number of states.
        // Otherwise, an empty set of transitions would validate even if the
        // recorded state length is non-empty.
        let mut len = 0;
        // We can't use the self.states() iterator because it assumes the state
        // encodings are valid. It could panic if they aren't.
        let mut id = DEAD;
        while id.as_usize() < self.sparse().len() {
            // Before we even decode the state, we check that the ID itself
            // is well formed. That is, if it's a special state then it must
            // actually be a quit, dead, accel, match or start state.
            if sp.is_special_state(id) {
                let is_actually_special = sp.is_dead_state(id)
                    || sp.is_quit_state(id)
                    || sp.is_match_state(id)
                    || sp.is_start_state(id)
                    || sp.is_accel_state(id);
                if !is_actually_special {
                    // This is kind of a cryptic error message...
                    return Err(DeserializeError::generic(
                        "found sparse state tagged as special but \
                         wasn't actually special" ,
                    ));
                }
            }
            // try_state decodes the state at offset `id`, returning an
            // error if the encoding is malformed.
            let state = self.try_state(sp, id)?;
            verified.insert(id);
            // The next ID should be the offset immediately following `state`.
            id = StateID::new(wire::add(
                id.as_usize(),
                state.write_to_len(),
                "next state ID offset" ,
            )?)
            .map_err(|err| {
                DeserializeError::state_id_error(err, "next state ID offset" )
            })?;
            len += 1;
        }
        // Now that we've checked that all top-level states are correct and
        // importantly, collected a set of valid state IDs, we have all the
        // information we need to check that all transitions are correct too.
        //
        // Note that we can't use `valid_ids` to iterate because it will
        // be empty in no-std no-alloc contexts. (And yes, that means our
        // verification isn't quite as good.) We can use `self.states()`
        // though at least, since we know that all states can at least be
        // decoded and traversed correctly.
        for state in self.states() {
            // Check that all transitions in this state are correct.
            for i in 0..state.ntrans {
                let to = state.next_at(i);
                // For no-alloc, we just check that the state can decode. It is
                // technically possible that the state ID could still point to
                // a non-existent state even if it decodes (fuzzing proved this
                // to be true), but it shouldn't result in any memory unsafety
                // or panics in non-debug mode.
                #[cfg (not(feature = "alloc" ))]
                {
                    let _ = self.try_state(sp, to)?;
                }
                #[cfg (feature = "alloc" )]
                {
                    if !verified.contains(&to) {
                        return Err(DeserializeError::generic(
                            "found transition that points to a \
                             non-existent state" ,
                        ));
                    }
                }
            }
        }
        // The number of states decoded must agree with the state length
        // recorded in the serialized form.
        if len != self.state_len {
            return Err(DeserializeError::generic(
                "mismatching sparse state length" ,
            ));
        }
        Ok(verified)
    }
| 1467 | |
| 1468 | /// Converts these transitions to a borrowed value. |
| 1469 | fn as_ref(&self) -> Transitions<&'_ [u8]> { |
| 1470 | Transitions { |
| 1471 | sparse: self.sparse(), |
| 1472 | classes: self.classes.clone(), |
| 1473 | state_len: self.state_len, |
| 1474 | pattern_len: self.pattern_len, |
| 1475 | } |
| 1476 | } |
| 1477 | |
| 1478 | /// Converts these transitions to an owned value. |
| 1479 | #[cfg (feature = "alloc" )] |
| 1480 | fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> { |
| 1481 | Transitions { |
| 1482 | sparse: self.sparse().to_vec(), |
| 1483 | classes: self.classes.clone(), |
| 1484 | state_len: self.state_len, |
| 1485 | pattern_len: self.pattern_len, |
| 1486 | } |
| 1487 | } |
| 1488 | |
    /// Return a convenient representation of the given state.
    ///
    /// This panics if the state is invalid (e.g., if `id` does not point at
    /// the start of a correctly encoded state). It should only be called
    /// with IDs that have already been validated via `try_state`.
    ///
    /// This is marked as inline to help dramatically boost sparse searching,
    /// which decodes each state it enters to follow the next transition. Other
    /// functions involved are also inlined, which should hopefully eliminate
    /// a lot of the extraneous decoding that is never needed just to follow
    /// the next transition.
    #[cfg_attr (feature = "perf-inline" , inline(always))]
    fn state(&self, id: StateID) -> State<'_> {
        let mut state = &self.sparse()[id.as_usize()..];
        // The first u16 packs the transition count in the low 15 bits and
        // the "is a match state" flag in the high bit.
        let mut ntrans = wire::read_u16(&state).as_usize();
        let is_match = (1 << 15) & ntrans != 0;
        ntrans &= !(1 << 15);
        state = &state[2..];

        // Next: `ntrans` inclusive byte ranges (2 bytes each), then `ntrans`
        // native endian encoded state IDs.
        let (input_ranges, state) = state.split_at(ntrans * 2);
        let (next, state) = state.split_at(ntrans * StateID::SIZE);
        // Match states additionally carry a u32 count of pattern IDs
        // followed by that many 4-byte pattern IDs.
        let (pattern_ids, state) = if is_match {
            let npats = wire::read_u32(&state).as_usize();
            state[4..].split_at(npats * 4)
        } else {
            (&[][..], state)
        };

        // Finally, a 1-byte accelerator length (0..=3) followed by that many
        // accelerator bytes.
        let accel_len = usize::from(state[0]);
        let accel = &state[1..accel_len + 1];
        State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
    }
| 1519 | |
    /// Like `state`, but will return an error if the state encoding is
    /// invalid. This is useful for verifying states after deserialization,
    /// which is required for a safe deserialization API.
    ///
    /// Note that this only verifies that this state is decodable and that
    /// all of its data is consistent. It does not verify that its state ID
    /// transitions point to valid states themselves, nor does it verify that
    /// every pattern ID is valid.
    ///
    /// # Errors
    ///
    /// This returns an error whenever any piece of the encoding is malformed
    /// or inconsistent with the special state ranges in `sp`: a bad ID, a
    /// bad transition count, a range with start > end, a mismatch between
    /// the match flag and the match ID range, a missing/oversized
    /// accelerator, or an EOI transition to a quit state.
    fn try_state(
        &self,
        sp: &Special,
        id: StateID,
    ) -> Result<State<'_>, DeserializeError> {
        // Note: `>` (not `>=`) is sufficient here, since an ID equal to the
        // length yields an empty slice below, which then fails the u16 read.
        if id.as_usize() > self.sparse().len() {
            return Err(DeserializeError::generic(
                "invalid caller provided sparse state ID" ,
            ));
        }
        let mut state = &self.sparse()[id.as_usize()..];
        // Encoding format starts with a u16 that stores the total number of
        // transitions in this state. (The number of bytes read is discarded
        // because the 2-byte header size is accounted for explicitly below.)
        let (mut ntrans, _) =
            wire::try_read_u16_as_usize(state, "state transition length" )?;
        // The high bit of the u16 is the "is a match state" flag; the low 15
        // bits are the actual transition count.
        let is_match = ((1 << 15) & ntrans) != 0;
        ntrans &= !(1 << 15);
        state = &state[2..];
        // 256 byte values plus one EOI transition is the maximum possible.
        if ntrans > 257 || ntrans == 0 {
            return Err(DeserializeError::generic(
                "invalid transition length" ,
            ));
        }
        // The inline match flag must agree with the special state ranges.
        if is_match && !sp.is_match_state(id) {
            return Err(DeserializeError::generic(
                "state marked as match but not in match ID range" ,
            ));
        } else if !is_match && sp.is_match_state(id) {
            return Err(DeserializeError::generic(
                "state in match ID range but not marked as match state" ,
            ));
        }

        // Each transition has two pieces: an inclusive range of bytes on which
        // it is defined, and the state ID that those bytes transition to. The
        // pairs come first, followed by a corresponding sequence of state IDs.
        let input_ranges_len = ntrans.checked_mul(2).unwrap();
        wire::check_slice_len(state, input_ranges_len, "sparse byte pairs" )?;
        let (input_ranges, state) = state.split_at(input_ranges_len);
        // Every range should be of the form A-B, where A<=B.
        for pair in input_ranges.chunks(2) {
            let (start, end) = (pair[0], pair[1]);
            if start > end {
                return Err(DeserializeError::generic("invalid input range" ));
            }
        }

        // And now extract the corresponding sequence of state IDs. We leave
        // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
        // not have any alignment requirements.
        let next_len = ntrans
            .checked_mul(self.id_len())
            .expect("state size * #trans should always fit in a usize" );
        wire::check_slice_len(state, next_len, "sparse trans state IDs" )?;
        let (next, state) = state.split_at(next_len);
        // We can at least verify that every state ID is in bounds.
        for idbytes in next.chunks(self.id_len()) {
            let (id, _) =
                wire::read_state_id(idbytes, "sparse state ID in try_state" )?;
            wire::check_slice_len(
                self.sparse(),
                id.as_usize(),
                "invalid sparse state ID" ,
            )?;
        }

        // If this is a match state, then read the pattern IDs for this state.
        // Pattern IDs is a u32-length prefixed sequence of native endian
        // encoded 32-bit integers.
        let (pattern_ids, state) = if is_match {
            let (npats, nr) =
                wire::try_read_u32_as_usize(state, "pattern ID length" )?;
            let state = &state[nr..];
            if npats == 0 {
                return Err(DeserializeError::generic(
                    "state marked as a match, but pattern length is zero" ,
                ));
            }

            let pattern_ids_len =
                wire::mul(npats, 4, "sparse pattern ID byte length" )?;
            wire::check_slice_len(
                state,
                pattern_ids_len,
                "sparse pattern IDs" ,
            )?;
            let (pattern_ids, state) = state.split_at(pattern_ids_len);
            // Verify that each pattern ID at least decodes.
            for patbytes in pattern_ids.chunks(PatternID::SIZE) {
                wire::read_pattern_id(
                    patbytes,
                    "sparse pattern ID in try_state" ,
                )?;
            }
            (pattern_ids, state)
        } else {
            (&[][..], state)
        };
        // Cross-check the decoded pattern IDs against both the inline match
        // flag and the special match ID range.
        if is_match && pattern_ids.is_empty() {
            return Err(DeserializeError::generic(
                "state marked as a match, but has no pattern IDs" ,
            ));
        }
        if sp.is_match_state(id) && pattern_ids.is_empty() {
            return Err(DeserializeError::generic(
                "state marked special as a match, but has no pattern IDs" ,
            ));
        }
        if sp.is_match_state(id) != is_match {
            return Err(DeserializeError::generic(
                "whether state is a match or not is inconsistent" ,
            ));
        }

        // Now read this state's accelerator info. The first byte is the length
        // of the accelerator, which is typically 0 (for no acceleration) but
        // is no bigger than 3. The length indicates the number of bytes that
        // follow, where each byte corresponds to a transition out of this
        // state.
        if state.is_empty() {
            return Err(DeserializeError::generic("no accelerator length" ));
        }
        let (accel_len, state) = (usize::from(state[0]), &state[1..]);

        if accel_len > 3 {
            return Err(DeserializeError::generic(
                "sparse invalid accelerator length" ,
            ));
        } else if accel_len == 0 && sp.is_accel_state(id) {
            return Err(DeserializeError::generic(
                "got no accelerators in state, but in accelerator ID range" ,
            ));
        } else if accel_len > 0 && !sp.is_accel_state(id) {
            return Err(DeserializeError::generic(
                "state in accelerator ID range, but has no accelerators" ,
            ));
        }

        wire::check_slice_len(
            state,
            accel_len,
            "sparse corrupt accelerator length" ,
        )?;
        let (accel, _) = (&state[..accel_len], &state[accel_len..]);

        let state = State {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        };
        // The last transition is always the EOI transition; it must never
        // lead to a quit state.
        if sp.is_quit_state(state.next_at(state.ntrans - 1)) {
            return Err(DeserializeError::generic(
                "state with EOI transition to quit state is illegal" ,
            ));
        }
        Ok(state)
    }
| 1688 | |
| 1689 | /// Return an iterator over all of the states in this DFA. |
| 1690 | /// |
| 1691 | /// The iterator returned yields tuples, where the first element is the |
| 1692 | /// state ID and the second element is the state itself. |
| 1693 | fn states(&self) -> StateIter<'_, T> { |
| 1694 | StateIter { trans: self, id: DEAD.as_usize() } |
| 1695 | } |
| 1696 | |
    /// Returns the sparse transitions as raw bytes.
    ///
    /// This is the sole access point for `T`, which in practice is either
    /// an owned `Vec<u8>` or a borrowed `&[u8]`.
    fn sparse(&self) -> &[u8] {
        self.sparse.as_ref()
    }
| 1701 | |
    /// Returns the number of bytes represented by a single state ID in the
    /// sparse encoding.
    fn id_len(&self) -> usize {
        StateID::SIZE
    }
| 1706 | |
    /// Return the memory usage, in bytes, of these transitions.
    ///
    /// This does not include the size of a `Transitions` value itself; it
    /// only counts the heap (or borrowed) sparse byte representation.
    fn memory_usage(&self) -> usize {
        self.sparse().len()
    }
| 1713 | } |
| 1714 | |
#[cfg (feature = "dfa-build" )]
impl<T: AsMut<[u8]>> Transitions<T> {
    /// Return a convenient mutable representation of the given state.
    ///
    /// This panics if the state is invalid. The decoding here mirrors
    /// `state` exactly, but borrows each piece mutably so that callers can
    /// rewrite transitions, pattern IDs and accelerators in place.
    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
        let mut state = &mut self.sparse_mut()[id.as_usize()..];
        // The first u16 packs the transition count in the low 15 bits and
        // the "is a match state" flag in the high bit.
        let mut ntrans = wire::read_u16(&state).as_usize();
        let is_match = (1 << 15) & ntrans != 0;
        ntrans &= !(1 << 15);
        state = &mut state[2..];

        // `ntrans` inclusive byte-range pairs, then `ntrans` state IDs.
        let (input_ranges, state) = state.split_at_mut(ntrans * 2);
        let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
        // Match states carry a u32 count of pattern IDs followed by that
        // many 4-byte pattern IDs.
        let (pattern_ids, state) = if is_match {
            let npats = wire::read_u32(&state).as_usize();
            state[4..].split_at_mut(npats * 4)
        } else {
            (&mut [][..], state)
        };

        // A 1-byte accelerator length followed by that many bytes.
        let accel_len = usize::from(state[0]);
        let accel = &mut state[1..accel_len + 1];
        StateMut {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        }
    }

    /// Returns the sparse transitions as raw mutable bytes.
    fn sparse_mut(&mut self) -> &mut [u8] {
        self.sparse.as_mut()
    }
}
| 1753 | |
/// The set of all possible starting states in a DFA.
///
/// See the eponymous type in the `dense` module for more details. This type
/// is very similar to `dense::StartTable`, except that its underlying
/// representation is `&[u8]` instead of `&[S]`. (The latter would require
/// sparse DFAs to be aligned, which is explicitly something we do not require
/// because we don't really need it.)
#[derive (Clone)]
struct StartTable<T> {
    /// The initial start state IDs as a contiguous table of native endian
    /// encoded integers, represented by `S`.
    ///
    /// In practice, T is either Vec<u8> or &[u8] and has no alignment
    /// requirements.
    ///
    /// The first `2 * stride` (currently always 8) entries always correspond
    /// to the start states for the entire DFA, with the first 4 entries being
    /// for unanchored searches and the second 4 entries being for anchored
    /// searches. To keep things simple, we always use 8 entries even if the
    /// `StartKind` is not both.
    ///
    /// After that, there are `stride * patterns` state IDs, where `patterns`
    /// may be zero in the case of a DFA with no patterns or in the case where
    /// the DFA was built without enabling starting states for each pattern.
    table: T,
    /// The starting state configuration supported. When 'both', both
    /// unanchored and anchored searches work. When 'unanchored', anchored
    /// searches panic. When 'anchored', unanchored searches panic.
    kind: StartKind,
    /// The start state configuration for every possible byte.
    start_map: StartByteMap,
    /// The number of starting state IDs per pattern.
    stride: usize,
    /// The total number of patterns for which starting states are encoded.
    /// This is `None` for DFAs that were built without start states for each
    /// pattern. Thus, one cannot use this field to say how many patterns
    /// are in the DFA in all cases. It is specific to how many patterns are
    /// represented in this start table.
    pattern_len: Option<usize>,
    /// The universal starting state for unanchored searches. This is only
    /// present when the DFA supports unanchored searches and when all starting
    /// state IDs for an unanchored search are equivalent.
    universal_start_unanchored: Option<StateID>,
    /// The universal starting state for anchored searches. This is only
    /// present when the DFA supports anchored searches and when all starting
    /// state IDs for an anchored search are equivalent.
    universal_start_anchored: Option<StateID>,
}
| 1802 | |
#[cfg (feature = "dfa-build" )]
impl StartTable<Vec<u8>> {
    /// Create a new, zero-filled start table sized for the given dense DFA.
    ///
    /// The table has room for two full strides of start states (unanchored
    /// and anchored) plus, when `pattern_len` is `Some`, one stride per
    /// pattern. Actual start state IDs are filled in later via `set_start`.
    fn new<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        pattern_len: Option<usize>,
    ) -> StartTable<Vec<u8>> {
        let stride = Start::len();
        // This is OK since the only way we're here is if a dense DFA could be
        // constructed successfully, which uses the same space.
        let len = stride
            .checked_mul(pattern_len.unwrap_or(0))
            .unwrap()
            .checked_add(stride.checked_mul(2).unwrap())
            .unwrap()
            .checked_mul(StateID::SIZE)
            .unwrap();
        StartTable {
            table: vec![0; len],
            kind: dfa.start_kind(),
            start_map: dfa.start_map().clone(),
            stride,
            pattern_len,
            universal_start_unanchored: dfa
                .universal_start_state(Anchored::No),
            universal_start_anchored: dfa.universal_start_state(Anchored::Yes),
        }
    }

    /// Build a sparse start table from the given dense DFA, translating
    /// each dense start state ID to its sparse equivalent via `remap`
    /// (which is indexed by the dense DFA's state index).
    fn from_dense_dfa<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        remap: &[StateID],
    ) -> Result<StartTable<Vec<u8>>, BuildError> {
        // Unless the DFA has start states compiled for each pattern, then
        // as far as the starting state table is concerned, there are zero
        // patterns to account for. It will instead only store starting states
        // for the entire DFA.
        let start_pattern_len = if dfa.starts_for_each_pattern() {
            Some(dfa.pattern_len())
        } else {
            None
        };
        let mut sl = StartTable::new(dfa, start_pattern_len);
        for (old_start_id, anchored, sty) in dfa.starts() {
            let new_start_id = remap[dfa.to_index(old_start_id)];
            sl.set_start(anchored, sty, new_start_id);
        }
        Ok(sl)
    }
}
| 1852 | |
impl<'a> StartTable<&'a [u8]> {
    /// Deserialize a start table from the beginning of `slice`, returning
    /// the table along with the number of bytes read.
    ///
    /// This checks that all lengths and counts are internally consistent,
    /// but it does *not* verify that the start state IDs themselves point
    /// at valid states.
    ///
    /// # Safety
    ///
    /// Callers must uphold the contract of the crate's top-level
    /// `from_bytes_unchecked` deserialization routines: the bytes must be
    /// trusted to have come from a compatible serializer, since full
    /// validation of state IDs is skipped here. (NOTE(review): contract is
    /// inherited from the caller; confirm against `DFA::from_bytes_unchecked`
    /// docs.)
    unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
        // Remember where we started so we can report how many bytes we read.
        let slice_start = slice.as_ptr().as_usize();

        let (kind, nr) = StartKind::from_bytes(slice)?;
        slice = &slice[nr..];

        let (start_map, nr) = StartByteMap::from_bytes(slice)?;
        slice = &slice[nr..];

        let (stride, nr) =
            wire::try_read_u32_as_usize(slice, "sparse start table stride" )?;
        slice = &slice[nr..];
        // The stride is fixed by the number of start configurations; any
        // other value indicates corruption or an incompatible format.
        if stride != Start::len() {
            return Err(DeserializeError::generic(
                "invalid sparse starting table stride" ,
            ));
        }

        // u32::MAX is the serialized sentinel for "no per-pattern starts".
        let (maybe_pattern_len, nr) =
            wire::try_read_u32_as_usize(slice, "sparse start table patterns" )?;
        slice = &slice[nr..];
        let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
            None
        } else {
            Some(maybe_pattern_len)
        };
        if pattern_len.map_or(false, |len| len > PatternID::LIMIT) {
            return Err(DeserializeError::generic(
                "sparse invalid number of patterns" ,
            ));
        }

        // u32::MAX is also the sentinel for an absent universal start state.
        let (universal_unanchored, nr) =
            wire::try_read_u32(slice, "universal unanchored start" )?;
        slice = &slice[nr..];
        let universal_start_unanchored = if universal_unanchored == u32::MAX {
            None
        } else {
            Some(StateID::try_from(universal_unanchored).map_err(|e| {
                DeserializeError::state_id_error(
                    e,
                    "universal unanchored start" ,
                )
            })?)
        };

        let (universal_anchored, nr) =
            wire::try_read_u32(slice, "universal anchored start" )?;
        slice = &slice[nr..];
        let universal_start_anchored = if universal_anchored == u32::MAX {
            None
        } else {
            Some(StateID::try_from(universal_anchored).map_err(|e| {
                DeserializeError::state_id_error(e, "universal anchored start" )
            })?)
        };

        let pattern_table_size = wire::mul(
            stride,
            pattern_len.unwrap_or(0),
            "sparse invalid pattern length" ,
        )?;
        // Our start states always start with a single stride of start states
        // for the entire automaton which permit it to match any pattern. What
        // follows it are an optional set of start states for each pattern.
        let start_state_len = wire::add(
            wire::mul(2, stride, "start state stride too big" )?,
            pattern_table_size,
            "sparse invalid 'any' pattern starts size" ,
        )?;
        let table_bytes_len = wire::mul(
            start_state_len,
            StateID::SIZE,
            "sparse pattern table bytes length" ,
        )?;
        wire::check_slice_len(
            slice,
            table_bytes_len,
            "sparse start ID table" ,
        )?;
        let table = &slice[..table_bytes_len];
        slice = &slice[table_bytes_len..];

        let sl = StartTable {
            table,
            kind,
            start_map,
            stride,
            pattern_len,
            universal_start_unanchored,
            universal_start_anchored,
        };
        Ok((sl, slice.as_ptr().as_usize() - slice_start))
    }
}
| 1951 | |
impl<T: AsRef<[u8]>> StartTable<T> {
    /// Serialize this start table to `dst` using the endianness given by
    /// `E`, returning the number of bytes written.
    ///
    /// This returns an error if `dst` is smaller than `write_to_len()`.
    fn write_to<E: Endian>(
        &self,
        mut dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small(
                "sparse starting table ids" ,
            ));
        }
        dst = &mut dst[..nwrite];

        // write start kind
        let nw = self.kind.write_to::<E>(dst)?;
        dst = &mut dst[nw..];
        // write start byte map
        let nw = self.start_map.write_to(dst)?;
        dst = &mut dst[nw..];
        // write stride
        E::write_u32(u32::try_from(self.stride).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];
        // write pattern length; u32::MAX is the sentinel for `None`, which
        // the deserializer maps back to "no per-pattern starts"
        E::write_u32(
            u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(),
            dst,
        );
        dst = &mut dst[size_of::<u32>()..];
        // write universal start unanchored state id, u32::MAX if absent
        E::write_u32(
            self.universal_start_unanchored
                .map_or(u32::MAX, |sid| sid.as_u32()),
            dst,
        );
        dst = &mut dst[size_of::<u32>()..];
        // write universal start anchored state id, u32::MAX if absent
        E::write_u32(
            self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()),
            dst,
        );
        dst = &mut dst[size_of::<u32>()..];
        // write start IDs
        for (sid, _, _) in self.iter() {
            E::write_u32(sid.as_u32(), dst);
            dst = &mut dst[StateID::SIZE..];
        }
        Ok(nwrite)
    }

    /// Returns the number of bytes the serialized form of this transition
    /// table will use.
    fn write_to_len(&self) -> usize {
        self.kind.write_to_len()
        + self.start_map.write_to_len()
        + size_of::<u32>() // stride
        + size_of::<u32>() // # patterns
        + size_of::<u32>() // universal unanchored start
        + size_of::<u32>() // universal anchored start
        + self.table().len()
    }

    /// Validates that every starting state ID in this table is valid.
    ///
    /// That is, every starting state ID can be used to correctly decode a
    /// state in the DFA's sparse transitions.
    fn validate(
        &self,
        sp: &Special,
        seen: &Seen,
    ) -> Result<(), DeserializeError> {
        for (id, _, _) in self.iter() {
            if !seen.contains(&id) {
                return Err(DeserializeError::generic(
                    "found invalid start state ID" ,
                ));
            }
            if sp.is_match_state(id) {
                return Err(DeserializeError::generic(
                    "start states cannot be match states" ,
                ));
            }
        }
        Ok(())
    }

    /// Converts this start list to a borrowed value.
    fn as_ref(&self) -> StartTable<&'_ [u8]> {
        StartTable {
            table: self.table(),
            kind: self.kind,
            start_map: self.start_map.clone(),
            stride: self.stride,
            pattern_len: self.pattern_len,
            universal_start_unanchored: self.universal_start_unanchored,
            universal_start_anchored: self.universal_start_anchored,
        }
    }

    /// Converts this start list to an owned value.
    #[cfg (feature = "alloc" )]
    fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> {
        StartTable {
            table: self.table().to_vec(),
            kind: self.kind,
            start_map: self.start_map.clone(),
            stride: self.stride,
            pattern_len: self.pattern_len,
            universal_start_unanchored: self.universal_start_unanchored,
            universal_start_anchored: self.universal_start_anchored,
        }
    }

    /// Return the start state for the given starting configuration and
    /// anchored mode.
    ///
    /// If the requested anchored mode is not supported by this table's
    /// `StartKind` (or, for `Anchored::Pattern`, if this table was built
    /// without per-pattern start states), then an error is returned. A
    /// pattern ID that is out of range yields the DEAD state rather than
    /// an error.
    fn start(
        &self,
        anchored: Anchored,
        start: Start,
    ) -> Result<StateID, StartError> {
        let start_index = start.as_usize();
        // Layout: one stride of unanchored starts, then one stride of
        // anchored starts, then one stride per pattern.
        let index = match anchored {
            Anchored::No => {
                if !self.kind.has_unanchored() {
                    return Err(StartError::unsupported_anchored(anchored));
                }
                start_index
            }
            Anchored::Yes => {
                if !self.kind.has_anchored() {
                    return Err(StartError::unsupported_anchored(anchored));
                }
                self.stride + start_index
            }
            Anchored::Pattern(pid) => {
                let len = match self.pattern_len {
                    None => {
                        return Err(StartError::unsupported_anchored(anchored))
                    }
                    Some(len) => len,
                };
                if pid.as_usize() >= len {
                    return Ok(DEAD);
                }
                (2 * self.stride)
                    + (self.stride * pid.as_usize())
                    + start_index
            }
        };
        let start = index * StateID::SIZE;
        // This OK since we're allowed to assume that the start table contains
        // valid StateIDs.
        Ok(wire::read_state_id_unchecked(&self.table()[start..]).0)
    }

    /// Return an iterator over all start IDs in this table.
    fn iter(&self) -> StartStateIter<'_, T> {
        StartStateIter { st: self, i: 0 }
    }

    /// Returns the total number of start state IDs in this table.
    fn len(&self) -> usize {
        self.table().len() / StateID::SIZE
    }

    /// Returns the table as a raw slice of bytes.
    fn table(&self) -> &[u8] {
        self.table.as_ref()
    }

    /// Return the memory usage, in bytes, of this start list.
    ///
    /// This does not include the size of a `StartTable` value itself.
    fn memory_usage(&self) -> usize {
        self.table().len()
    }
}
| 2132 | |
#[cfg (feature = "dfa-build" )]
impl<T: AsMut<[u8]>> StartTable<T> {
    /// Set the start state for the given index and pattern.
    ///
    /// If the pattern ID or state ID are not valid, then this will panic.
    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
        let start_index = start.as_usize();
        // Layout: one stride of unanchored starts, then one stride of
        // anchored starts, then one stride per pattern.
        let index = match anchored {
            Anchored::No => start_index,
            Anchored::Yes => self.stride + start_index,
            Anchored::Pattern(pid) => {
                let pid = pid.as_usize();
                let len = self
                    .pattern_len
                    .expect("start states for each pattern enabled" );
                assert!(pid < len, "invalid pattern ID {:?}" , pid);
                self.stride
                    .checked_mul(pid)
                    .unwrap()
                    .checked_add(self.stride.checked_mul(2).unwrap())
                    .unwrap()
                    .checked_add(start_index)
                    .unwrap()
            }
        };
        // Each entry is a native endian encoded StateID.
        let start = index * StateID::SIZE;
        let end = start + StateID::SIZE;
        wire::write_state_id::<wire::NE>(
            id,
            &mut self.table.as_mut()[start..end],
        );
    }
}
| 2166 | |
/// An iterator over all start state IDs in a sparse DFA.
struct StartStateIter<'a, T> {
    /// The start table being iterated over.
    st: &'a StartTable<T>,
    /// The index of the next start state ID to yield.
    i: usize,
}
| 2172 | |
impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
    type Item = (StateID, Anchored, Start);

    fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
        let i = self.i;
        if i >= self.st.len() {
            return None;
        }
        self.i += 1;

        // This unwrap is okay since the stride of any DFA must always match
        // the number of start state types.
        let start_type = Start::from_usize(i % self.st.stride).unwrap();
        // The table layout is: one stride of unanchored starts, then one
        // stride of anchored starts, then one stride per pattern.
        let anchored = if i < self.st.stride {
            Anchored::No
        } else if i < (2 * self.st.stride) {
            Anchored::Yes
        } else {
            let pid = (i - (2 * self.st.stride)) / self.st.stride;
            Anchored::Pattern(PatternID::new(pid).unwrap())
        };
        // Each entry is a native endian encoded StateID.
        let start = i * StateID::SIZE;
        let end = start + StateID::SIZE;
        let bytes = self.st.table()[start..end].try_into().unwrap();
        // This is OK since we're allowed to assume that any IDs in this start
        // table are correct and valid for this DFA.
        let id = StateID::from_ne_bytes_unchecked(bytes);
        Some((id, anchored, start_type))
    }
}
| 2203 | |
| 2204 | impl<'a, T> fmt::Debug for StartStateIter<'a, T> { |
| 2205 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 2206 | f.debug_struct("StartStateIter" ).field(name:"i" , &self.i).finish() |
| 2207 | } |
| 2208 | } |
| 2209 | |
/// An iterator over all states in a sparse DFA.
///
/// This iterator yields each state as a decoded `State` value, which carries
/// its own state ID. States are yielded in the order they are encoded.
struct StateIter<'a, T> {
    /// The transitions being iterated over.
    trans: &'a Transitions<T>,
    /// The byte offset (equivalently, the state ID) of the next state.
    id: usize,
}
| 2218 | |
| 2219 | impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { |
| 2220 | type Item = State<'a>; |
| 2221 | |
| 2222 | fn next(&mut self) -> Option<State<'a>> { |
| 2223 | if self.id >= self.trans.sparse().len() { |
| 2224 | return None; |
| 2225 | } |
| 2226 | let state: State<'_> = self.trans.state(id:StateID::new_unchecked(self.id)); |
| 2227 | self.id = self.id + state.write_to_len(); |
| 2228 | Some(state) |
| 2229 | } |
| 2230 | } |
| 2231 | |
| 2232 | impl<'a, T> fmt::Debug for StateIter<'a, T> { |
| 2233 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 2234 | f.debug_struct("StateIter" ).field(name:"id" , &self.id).finish() |
| 2235 | } |
| 2236 | } |
| 2237 | |
/// A representation of a sparse DFA state that can be cheaply materialized
/// from a state identifier.
///
/// All slice fields borrow directly from the DFA's raw sparse transition
/// bytes; no copying occurs when decoding a state.
#[derive (Clone)]
struct State<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, with `S` as the representation. Thus, there
    /// are `ntrans * size_of::<S>()` bytes in this slice.
    next: &'a [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a [u8],
}
| 2267 | |
impl<'a> State<'a> {
    /// Searches for the next transition given an input byte. If no such
    /// transition could be found, then a dead state is returned.
    ///
    /// Note that only the first `ntrans - 1` transitions are searched, since
    /// the last transition is reserved for the special EOI transition (see
    /// `next_eoi`).
    ///
    /// This is marked as inline to help dramatically boost sparse searching,
    /// which decodes each state it enters to follow the next transition.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn next(&self, input: u8) -> StateID {
        // This straight linear search was observed to be much better than
        // binary search on ASCII haystacks, likely because a binary search
        // visits the ASCII case last but a linear search sees it first. A
        // binary search does do a little better on non-ASCII haystacks, but
        // not by much. There might be a better trade off lurking here.
        for i in 0..(self.ntrans - 1) {
            let (start, end) = self.range(i);
            if start <= input && input <= end {
                return self.next_at(i);
            }
            // We could bail early with an extra branch: if input < b1, then
            // we know we'll never find a matching transition. Interestingly,
            // this extra branch seems to not help performance, or will even
            // hurt it. It's likely very dependent on the DFA itself and what
            // is being searched.
        }
        DEAD
    }

    /// Returns the next state ID for the special EOI transition.
    ///
    /// The EOI transition is always stored last in a state's transition list.
    fn next_eoi(&self) -> StateID {
        self.next_at(self.ntrans - 1)
    }

    /// Returns the identifier for this state.
    fn id(&self) -> StateID {
        self.id
    }

    /// Returns the inclusive input byte range for the ith transition in this
    /// state.
    fn range(&self, i: usize) -> (u8, u8) {
        // Ranges are stored as adjacent (start, end) byte pairs.
        (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
    }

    /// Returns the next state for the ith transition in this state.
    fn next_at(&self, i: usize) -> StateID {
        // Each transition occupies exactly StateID::SIZE bytes, encoded in
        // native endianness.
        let start = i * StateID::SIZE;
        let end = start + StateID::SIZE;
        let bytes = self.next[start..end].try_into().unwrap();
        StateID::from_ne_bytes_unchecked(bytes)
    }

    /// Returns the pattern ID for the given match index. If the match index
    /// is invalid, then this panics.
    fn pattern_id(&self, match_index: usize) -> PatternID {
        let start = match_index * PatternID::SIZE;
        wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
    }

    /// Returns the total number of pattern IDs for this state. This is always
    /// zero when `is_match` is false.
    fn pattern_len(&self) -> usize {
        // Pattern IDs are encoded as 32-bit (4 byte) integers.
        assert_eq!(0, self.pattern_ids.len() % 4);
        self.pattern_ids.len() / 4
    }

    /// Return an accelerator for this state.
    fn accelerator(&self) -> &'a [u8] {
        self.accel
    }

    /// Write the raw representation of this state to the given buffer using
    /// the given endianness.
    ///
    /// On success, the number of bytes written is returned, and it is always
    /// equal to `write_to_len`. An error occurs when `dst` is too small.
    fn write_to<E: Endian>(
        &self,
        mut dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small(
                "sparse state transitions",
            ));
        }

        // The high bit of the 16-bit transition count doubles as the
        // "is this a match state?" flag.
        let ntrans =
            if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans };
        E::write_u16(u16::try_from(ntrans).unwrap(), dst);
        dst = &mut dst[size_of::<u16>()..];

        // Next: the (start, end) input byte pair for every transition.
        dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges);
        dst = &mut dst[self.input_ranges.len()..];

        // Then the corresponding next state ID for every transition.
        for i in 0..self.ntrans {
            E::write_u32(self.next_at(i).as_u32(), dst);
            dst = &mut dst[StateID::SIZE..];
        }

        // Match states additionally encode a pattern count followed by each
        // pattern ID.
        if self.is_match {
            E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst);
            dst = &mut dst[size_of::<u32>()..];
            for i in 0..self.pattern_len() {
                let pid = self.pattern_id(i);
                E::write_u32(pid.as_u32(), dst);
                dst = &mut dst[PatternID::SIZE..];
            }
        }

        // Finally, the accelerator: one length byte followed by the
        // accelerated bytes themselves (at most 3).
        dst[0] = u8::try_from(self.accel.len()).unwrap();
        dst[1..][..self.accel.len()].copy_from_slice(self.accel);

        Ok(nwrite)
    }

    /// Return the total number of bytes that this state consumes in its
    /// encoded form.
    fn write_to_len(&self) -> usize {
        // 2 bytes for the transition count, 2 bytes per input range pair,
        // StateID::SIZE bytes per next state ID, and 1 length byte plus the
        // accelerator bytes. This mirrors the layout written by `write_to`.
        let mut len = 2
            + (self.ntrans * 2)
            + (self.ntrans * StateID::SIZE)
            + (1 + self.accel.len());
        if self.is_match {
            // Match states also encode a 4 byte pattern count followed by
            // the pattern IDs themselves.
            len += size_of::<u32>() + self.pattern_ids.len();
        }
        len
    }
}
| 2393 | |
| 2394 | impl<'a> fmt::Debug for State<'a> { |
| 2395 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 2396 | let mut printed = false; |
| 2397 | for i in 0..(self.ntrans - 1) { |
| 2398 | let next = self.next_at(i); |
| 2399 | if next == DEAD { |
| 2400 | continue; |
| 2401 | } |
| 2402 | |
| 2403 | if printed { |
| 2404 | write!(f, ", " )?; |
| 2405 | } |
| 2406 | let (start, end) = self.range(i); |
| 2407 | if start == end { |
| 2408 | write!(f, " {:?} => {:?}" , DebugByte(start), next.as_usize())?; |
| 2409 | } else { |
| 2410 | write!( |
| 2411 | f, |
| 2412 | " {:?}- {:?} => {:?}" , |
| 2413 | DebugByte(start), |
| 2414 | DebugByte(end), |
| 2415 | next.as_usize(), |
| 2416 | )?; |
| 2417 | } |
| 2418 | printed = true; |
| 2419 | } |
| 2420 | let eoi = self.next_at(self.ntrans - 1); |
| 2421 | if eoi != DEAD { |
| 2422 | if printed { |
| 2423 | write!(f, ", " )?; |
| 2424 | } |
| 2425 | write!(f, "EOI => {:?}" , eoi.as_usize())?; |
| 2426 | } |
| 2427 | Ok(()) |
| 2428 | } |
| 2429 | } |
| 2430 | |
/// A representation of a mutable sparse DFA state that can be cheaply
/// materialized from a state identifier.
#[cfg(feature = "dfa-build")]
struct StateMut<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a mut [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, with `S` as the representation. Thus, there
    /// are `ntrans * size_of::<S>()` bytes in this slice.
    next: &'a mut [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    /// Note that, unlike the other slices, this one is immutable.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a mut [u8],
}
| 2460 | |
#[cfg(feature = "dfa-build")]
impl<'a> StateMut<'a> {
    /// Overwrites the ith transition so that it points to the given state.
    ///
    /// The new state ID is written in native endianness, matching how
    /// transitions are stored in `self.next`.
    fn set_next_at(&mut self, i: usize, next: StateID) {
        let offset = i * StateID::SIZE;
        let dst = &mut self.next[offset..offset + StateID::SIZE];
        wire::write_state_id::<wire::NE>(next, dst);
    }
}
| 2470 | |
#[cfg(feature = "dfa-build")]
impl<'a> fmt::Debug for StateMut<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Build an immutable view of this state and delegate to the
        // read-only `State` debug impl, so both types format identically.
        let view = State {
            id: self.id,
            is_match: self.is_match,
            ntrans: self.ntrans,
            input_ranges: self.input_ranges,
            next: self.next,
            pattern_ids: self.pattern_ids,
            accel: self.accel,
        };
        fmt::Debug::fmt(&view, f)
    }
}
| 2486 | |
| 2487 | // In order to validate everything, we not only need to make sure we |
| 2488 | // can decode every state, but that every transition in every state |
| 2489 | // points to a valid state. There are many duplicative transitions, so |
| 2490 | // we record state IDs that we've verified so that we don't redo the |
| 2491 | // decoding work. |
| 2492 | // |
| 2493 | // Except, when in no_std mode, we don't have dynamic memory allocation |
| 2494 | // available to us, so we skip this optimization. It's not clear |
| 2495 | // whether doing something more clever is worth it just yet. If you're |
| 2496 | // profiling this code and need it to run faster, please file an issue. |
| 2497 | // |
// OK, so we also use this to record the set of valid state IDs: it is
// possible for a transition to point to an invalid state ID that still
// (somehow) deserializes to a valid state, so we need to make sure our
// transitions are limited to actually correct state IDs.
| 2502 | // The problem is, I'm not sure how to do this verification step in |
| 2503 | // no-std no-alloc mode. I think we'd *have* to store the set of valid |
| 2504 | // state IDs in the DFA itself. For now, we don't do this verification |
| 2505 | // in no-std no-alloc mode. The worst thing that can happen is an |
| 2506 | // incorrect result. But no panics or memory safety problems should |
| 2507 | // result. Because we still do validate that the state itself is |
| 2508 | // "valid" in the sense that everything it points to actually exists. |
| 2509 | // |
| 2510 | // ---AG |
/// The set of state IDs already verified during deserialization validation.
/// See the comment above for why this is a real set only when dynamic memory
/// allocation is available.
#[derive(Debug)]
struct Seen {
    /// With `alloc`, we track verified state IDs in an actual set.
    #[cfg(feature = "alloc")]
    set: alloc::collections::BTreeSet<StateID>,
    /// Without `alloc`, there is no storage; the set is a zero-sized stand-in.
    #[cfg(not(feature = "alloc"))]
    set: core::marker::PhantomData<StateID>,
}
| 2518 | |
#[cfg(feature = "alloc")]
impl Seen {
    /// Create a new, empty set of seen state IDs.
    fn new() -> Seen {
        Seen { set: Default::default() }
    }

    /// Record the given state ID as having been verified.
    fn insert(&mut self, id: StateID) {
        self.set.insert(id);
    }

    /// Returns true when the given state ID was previously verified.
    fn contains(&self, id: &StateID) -> bool {
        self.set.contains(id)
    }
}
| 2531 | |
| 2532 | #[cfg (not(feature = "alloc" ))] |
| 2533 | impl Seen { |
| 2534 | fn new() -> Seen { |
| 2535 | Seen { set: core::marker::PhantomData } |
| 2536 | } |
| 2537 | fn insert(&mut self, _id: StateID) {} |
| 2538 | fn contains(&self, _id: &StateID) -> bool { |
| 2539 | true |
| 2540 | } |
| 2541 | } |
| 2542 | |
| 2543 | /* |
| 2544 | /// A binary search routine specialized specifically to a sparse DFA state's |
| 2545 | /// transitions. Specifically, the transitions are defined as a set of pairs |
| 2546 | /// of input bytes that delineate an inclusive range of bytes. If the input |
| 2547 | /// byte is in the range, then the corresponding transition is a match. |
| 2548 | /// |
| 2549 | /// This binary search accepts a slice of these pairs and returns the position |
| 2550 | /// of the matching pair (the ith transition), or None if no matching pair |
| 2551 | /// could be found. |
| 2552 | /// |
| 2553 | /// Note that this routine is not currently used since it was observed to |
| 2554 | /// either decrease performance when searching ASCII, or did not provide enough |
| 2555 | /// of a boost on non-ASCII haystacks to be worth it. However, we leave it here |
| 2556 | /// for posterity in case we can find a way to use it. |
| 2557 | /// |
| 2558 | /// In theory, we could use the standard library's search routine if we could |
| 2559 | /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently |
| 2560 | /// guaranteed to be safe and is thus UB (since I don't think the in-memory |
| 2561 | /// representation of `(u8, u8)` has been nailed down). One could define a |
| 2562 | /// repr(C) type, but the casting doesn't seem justified. |
| 2563 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 2564 | fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { |
| 2565 | debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); |
| 2566 | debug_assert!(ranges.len() <= 512, "ranges should be short"); |
| 2567 | |
| 2568 | let (mut left, mut right) = (0, ranges.len() / 2); |
| 2569 | while left < right { |
| 2570 | let mid = (left + right) / 2; |
| 2571 | let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); |
| 2572 | if needle < b1 { |
| 2573 | right = mid; |
| 2574 | } else if needle > b2 { |
| 2575 | left = mid + 1; |
| 2576 | } else { |
| 2577 | return Some(mid); |
| 2578 | } |
| 2579 | } |
| 2580 | None |
| 2581 | } |
| 2582 | */ |
| 2583 | |
#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    use crate::{
        dfa::{dense::DFA, Automaton},
        nfa::thompson,
        Input, MatchError,
    };

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    #[test]
    fn heuristic_unicode_forward() {
        // Note: this builds a *forward* DFA, so the NFA must not be
        // reversed. (NFA reversal belongs only to the reverse test below;
        // a previous revision mistakenly enabled it here too.)
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // 'β' is 0xCE 0xB2; starting the search at offset 2 forces the
        // heuristic Unicode word boundary to quit on the 0xB2 at offset 1.
        let input = Input::new("β123").range(2..);
        let expected = MatchError::quit(0xB2, 1);
        let got = dfa.try_search_fwd(&input);
        assert_eq!(Err(expected), got);

        // Similarly, ending the search at offset 3 quits on the 0xCE there.
        let input = Input::new("123β").range(..3);
        let expected = MatchError::quit(0xCE, 3);
        let got = dfa.try_search_fwd(&input);
        assert_eq!(Err(expected), got);
    }

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    #[test]
    fn heuristic_unicode_reverse() {
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            // A reverse search requires a DFA compiled from the reversed NFA.
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        let input = Input::new("β123").range(2..);
        let expected = MatchError::quit(0xB2, 1);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(expected), got);

        let input = Input::new("123β").range(..3);
        let expected = MatchError::quit(0xCE, 3);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(expected), got);
    }
}
| 2636 | |