// This is a regression test for a bug in how special states are handled. The
// fuzzer found a case where a state returned true for 'is_special_state' but
// *didn't* return true for 'is_dead_state', 'is_quit_state', 'is_match_state',
// 'is_start_state' or 'is_accel_state'. This in turn tripped a debug assertion
// in the core matching loop that requires 'is_special_state' returning true to
// imply that at least one of the other routines returns true.
//
// We fixed this by adding some validation to both dense and sparse DFAs that
// checks that this property holds for every state ID in the DFA.
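// As a rough sketch (the predicate names are real 'Automaton' trait methods,
// but the exact assertion below is illustrative rather than the verbatim one
// from the matching loop), the invariant relied upon looks like this:
//
//     debug_assert!(
//         !dfa.is_special_state(sid)
//             || dfa.is_dead_state(sid)
//             || dfa.is_quit_state(sid)
//             || dfa.is_match_state(sid)
//             || dfa.is_start_state(sid)
//             || dfa.is_accel_state(sid)
//     );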
#[test]
fn invalid_special_state() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838",
    );
    let _ = fuzz_run(data);
}

// This is an interesting case where a fuzzer generated a DFA with
// a transition to a state ID that decoded as a valid state, but
// where the ID itself did not point to one of the two existing
// states for this particular DFA. This, combined with marking the
// transition's state ID as special without actually making one of the
// 'is_{dead,quit,match,start,accel}_state' predicates return true, ended up
// tripping the 'debug_assert!(dfa.is_quit_state(sid))' code in the search
// routine.
//
// We fixed this in alloc mode by checking that every transition points to a
// valid state ID. Technically this bug still exists in core-only mode, but
// it's not clear how to fix it. And it's worth pointing out that the search
// routine won't panic in production. It will just provide invalid results.
// And that's acceptable within the contract of DFA::from_bytes.
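// A rough sketch of the kind of check described above (illustrative
// pseudocode, not the actual validation routine; 'all_states' and
// 'valid_state_ids' are hypothetical names):
//
//     for state in all_states {
//         for next in state.transitions() {
//             if !valid_state_ids.contains(&next) {
//                 return Err(/* deserialization error */);
//             }
//         }
//     }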
#[test]
fn transition_to_invalid_but_valid_state() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9",
    );
    let _ = fuzz_run(data);
}

// Another one caught by the fuzzer where it generated a DFA that reported a
// start state as a match state. Since matches are always delayed by one byte,
// start states specifically cannot be match states. And indeed, the search
// code relies on this.
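// (Roughly speaking: because of the one byte delay, a match state entered at
// haystack position 'i' signals a match that ended at position 'i - 1'. A
// start state is entered before any byte has been consumed, so treating it
// as a match state would amount to a match that ended before the search
// began.)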
#[test]
fn start_state_is_not_match_state() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000",
    );
    let _ = fuzz_run(data);
}

// This is a variation on 'transition_to_invalid_but_valid_state', but happens
// to a start state. Namely, the fuzz data here builds a DFA with a start
// state ID that is incorrect but points to a sequence of bytes that satisfies
// state decoding validation. This errant state in turn has a non-zero number
// of transitions, and it's those transitions that point to a state that does
// *not* satisfy state decoding validation. But we never checked those. So the
// fix here was to add validation of the transitions off of the start state.
#[test]
fn start_state_has_valid_transitions() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98",
    );
    let _ = fuzz_run(data);
}

// This fuzz input generated a DFA with a state whose ID was in the match state
// ID range, but where the state itself was encoded with zero pattern IDs. We
// added validation code to check this case.
#[test]
fn match_state_inconsistency() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570",
    );
    let _ = fuzz_run(data);
}

// This fuzz input generated a DFA with a state whose ID was in the accelerator
// range, but which didn't have any accelerators. This violated an invariant
// that assumes that if 'dfa.is_accel_state(sid)' returns true, then the state
// must have some accelerators.
#[test]
fn invalid_accelerators() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b",
    );
    let _ = fuzz_run(data);
}

// This fuzz input generated a DFA with a state whose EOI transition led to
// a quit state, which is generally considered illegal. Why? Because the EOI
// transition is defined over a special sentinel alphabet element and one
// cannot configure a DFA to "quit" on that sentinel.
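// (For context, and as an illustration rather than a claim about this DFA's
// configuration: quit behavior is opted into for concrete haystack bytes,
// e.g. something along the lines of 'dense::Config::new().quit(b'\n', true)',
// whereas the EOI element is a synthetic end-of-input symbol rather than a
// byte, so it can never be added to the quit set.)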
#[test]
fn eoi_transition_to_quit_state() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9",
    );
    let _ = fuzz_run(data);
}

// This is the code from the fuzz target. Kind of sucks to duplicate it here,
// but this is fundamentally how we interpret the data.
fn fuzz_run(given_data: &[u8]) -> Option<()> {
    use regex_automata::dfa::Automaton;

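    // The layout of the fuzz input, as reconstructed from the parsing below:
    //
    //   byte 0          => the haystack length (0-255)
    //   bytes 1..=len   => the haystack to search
    //   remaining bytes => the serialized sparse DFA, minus the preamble that
    //                      gets prepended further down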
    if given_data.len() < 2 {
        return None;
    }
    let haystack_len = usize::from(given_data[0]);
    let haystack = given_data.get(1..1 + haystack_len)?;
    let given_dfa_bytes = given_data.get(1 + haystack_len..)?;

    // We help the fuzzer along by adding a preamble to the bytes that should
    // at least make these first parts valid. The preamble expects a very
    // specific sequence of bytes, so it makes sense to just force this.
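    // Concretely, the preamble built below consists of: a NUL-padded label
    // string whose length is a multiple of 4, a 4-byte native-endian 0xFEFF
    // endianness marker, and a 4-byte version number. (This describes only
    // the code below, not the full serialization format.)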
    let label = "rust-regex-automata-dfa-sparse\x00\x00";
    assert_eq!(0, label.len() % 4);
    let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
    let version_check = 2u32.to_ne_bytes().to_vec();
    let mut dfa_bytes: Vec<u8> = vec![];
    dfa_bytes.extend(label.as_bytes());
    dfa_bytes.extend(&endianness_check);
    dfa_bytes.extend(&version_check);
    dfa_bytes.extend(given_dfa_bytes);
    // This is the real test: checking that any input we give to
    // DFA::from_bytes will never result in a panic.
    let (dfa, _) =
        regex_automata::dfa::sparse::DFA::from_bytes(&dfa_bytes).ok()?;
    let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
    Some(())
}
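
// For contrast with the corrupted inputs above, here is a hedged sketch of
// the happy path through the same deserialization API: build a small sparse
// DFA, serialize it with native endianness (the same choice the preamble in
// 'fuzz_run' makes) and check that the deserialized copy searches like the
// original. This is an illustrative addition rather than part of the original
// regression suite, and it assumes the public 'DFA::new',
// 'to_bytes_native_endian' and 'from_bytes' APIs (i.e. the 'dfa-build'
// feature) are available.
#[test]
fn round_trip_valid_sparse_dfa() {
    use regex_automata::{
        dfa::{sparse::DFA, Automaton},
        Input,
    };

    // Build and serialize a small sparse DFA.
    let dfa = DFA::new("abc").unwrap();
    let bytes = dfa.to_bytes_native_endian();

    // Deserializing bytes we just produced should always succeed, unlike the
    // fuzzer-mangled inputs exercised above.
    let (dfa2, _) = DFA::from_bytes(&bytes).unwrap();

    // Both the original and the deserialized DFA should report the same
    // (half) match.
    let input = Input::new("xxabcxx");
    let m1 = dfa.try_search_fwd(&input).unwrap();
    let m2 = dfa2.try_search_fwd(&input).unwrap();
    assert_eq!(m1, m2);
    assert!(m1.is_some());
}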