// This set of tests is different from regression_fuzz in that the tests start
// from the fuzzer data directly. The test essentially duplicates the fuzz
// target. I wonder if there's a better way to set this up... Hmmm. I bet
// `cargo fuzz` has something where it can run a target against crash files and
// verify that they pass.

// This case found by the fuzzer causes the meta engine to use the "reverse
// inner" literal strategy. That in turn uses a specialized search routine
// for the lazy DFA in order to avoid worst-case quadratic behavior. That
// specialized search routine had a bug where it assumed that start state
// specialization was disabled. But that is not necessarily so, since it
// reuses the "general" lazy DFA built for the full regex as part of the core
// strategy, which might very well have start states specialized due to the
// existence of a prefilter. (A sketch of the start state specialization knob
// follows the test below.)
//
// This is a somewhat weird case because if the core engine has a prefilter,
// then the "reverse inner" optimization usually isn't pursued. But there are
// heuristics that try to detect whether a prefilter is "fast" or not, and if
// it isn't, the meta engine will attempt the reverse inner optimization
// anyway. That's exactly what happens here. So the reverse inner
// optimization ends up with a lazy DFA that has start states specialized.
// Ideally this wouldn't happen, because specializing start states without a
// prefilter inside the DFA can be disastrous for performance: it causes the
// DFA to ping-pong in and out of the special state handling. In this case,
// it's probably not a huge deal, because the lazy DFA is only used for part
// of the matching, whereas the workhorse is the prefilter found by the
// reverse inner optimization.
//
// We could maybe fix this by refactoring the meta engine to be a little more
// careful, for example by attempting the optimizations before building the
// core engine. But that is perhaps a little tricky.
#[test]
fn meta_stopat_specialize_start_states() {
    let data = include_bytes!(
        "testdata/crash-8760b19b25d74e3603d4c643e9c7404fdd3631f9",
    );
    let _ = run(data);
}
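
// An illustrative sketch, not part of the original fuzz tests: the "start
// state specialization" knob discussed above is exposed on regex-automata's
// lazy DFA config. This assumes a dev-dependency on regex-automata 0.4 and
// only shows the knob being flipped; it does not reproduce the meta engine's
// internal strategy selection.
#[test]
fn lazy_dfa_specialize_start_states_sketch() {
    use regex_automata::{hybrid::dfa::DFA, Input};

    // Build a lazy DFA with start states specialized, as the core strategy's
    // lazy DFA may be when a prefilter is present.
    let dfa = DFA::builder()
        .configure(DFA::config().specialize_start_states(true))
        .build("abc")
        .unwrap();
    let mut cache = dfa.create_cache();
    // A forward search reports the end offset of the leftmost match.
    let m = dfa.try_search_fwd(&mut cache, &Input::new("zzabczz")).unwrap();
    assert_eq!(m.map(|hm| hm.offset()), Some(5));
}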

// Same bug as meta_stopat_specialize_start_states, but minimized by the
// fuzzer.
#[test]
fn meta_stopat_specialize_start_states_min() {
    let data = include_bytes!(
        "testdata/minimized-from-8760b19b25d74e3603d4c643e9c7404fdd3631f9",
    );
    let _ = run(data);
}

// This input generated a pattern with a fail state (e.g., \P{any}, [^\s\S]
// or [a&&b]). The fail state appeared in one branch, while a subsequent
// branch should have led to an overall match, but the handling of the fail
// state prevented that match from being found. A hand-minimized version of
// this is '[^\s\S]A|B' on the haystack 'B'. That should yield a match of
// 'B'. (A direct test of the minimized case follows the test below.)
//
// The underlying cause was an issue in how DFA determinization handled fail
// states. The bug didn't impact the PikeVM or the bounded backtracker.
#[test]
fn fail_branch_prevents_match() {
    let data = include_bytes!(
        "testdata/crash-cd33b13df59ea9d74503986f9d32a270dd43cc04",
    );
    let _ = run(data);
}
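
// A direct check of the hand-minimized case described above (an illustrative
// addition): '[^\s\S]' can never match, so the first branch always fails,
// but the second branch must still match 'B'.
#[test]
fn fail_branch_prevents_match_minimized() {
    let re = regex::Regex::new(r"[^\s\S]A|B").unwrap();
    let m = re.find("B").expect("the 'B' branch should match");
    assert_eq!(m.as_str(), "B");
}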

// This input generated a pattern that contained a sub-expression like this:
//
//     a{0}{50000}
//
// This turned out to provoke quadratic behavior in the NFA compiler.
// Basically, the NFA compiler works in two phases. The first phase builds a
// more complicated (but simpler to construct) sequence of NFA states that
// includes unconditional epsilon transitions. As part of converting this
// sequence to the "final" NFA, we remove those unconditional epsilon
// transitions. The code responsible for doing this follows every chain of
// such transitions and remaps the state IDs. The way we were doing this
// before re-followed every subsequent part of the chain for each state in
// the chain, which is quadratic. We effectively memoized the chain
// following, which fixed the performance bug. (An illustrative companion
// test follows below.)
#[test]
fn slow_big_empty_chain() {
    let data = include_bytes!(
        "testdata/slow-unit-9ca9cc9929fee1fcbb847a78384effb8b98ea18a",
    );
    let _ = run(data);
}
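
// An illustrative companion to the test above (an addition): compile the
// problematic sub-expression directly. With the chain following memoized,
// this should complete promptly. We deliberately don't assert Ok or Err,
// only that compilation terminates.
#[test]
fn slow_big_empty_chain_direct() {
    let _ = regex::Regex::new("a{0}{50000}");
}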

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain2() {
    let data = include_bytes!(
        "testdata/slow-unit-3ab758ea520027fefd3f00e1384d9aeef155739e",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain3() {
    let data = include_bytes!(
        "testdata/slow-unit-b8a052f4254802edbe5f569b6ce6e9b6c927e9d6",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain4() {
    let data = include_bytes!(
        "testdata/slow-unit-93c73a43581f205f9aaffd9c17e52b34b17becd0",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain5() {
    let data = include_bytes!(
        "testdata/slow-unit-5345fccadf3812c53c3ccc7af5aa2741b7b2106c",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain6() {
    let data = include_bytes!(
        "testdata/slow-unit-6bd643eec330166e4ada91da2d3f284268481085",
    );
    let _ = run(data);
}

// This fuzz input generated a pattern with a large repetition that would
// fail NFA compilation, but whose HIR was small, since the HIR doesn't
// expand repetitions. The bounds were high enough that the minimum length
// calculation overflowed. We fixed this by using saturating arithmetic (and
// also checked arithmetic for the maximum length calculation). (A sketch of
// that shape of fix follows the test below.)
//
// Incidentally, this was the only unguarded arithmetic operation performed
// in the HIR smart constructors. And the fuzzer found it. Hah. Nice.
#[test]
fn minimum_len_overflow() {
    let data = include_bytes!(
        "testdata/crash-7eb3351f0965e5d6c1cb98aa8585949ef96531ff",
    );
    let _ = run(data);
}
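
// A hypothetical sketch of the shape of the fix described above; this is an
// illustrative addition, not regex-syntax's actual code. Minimum lengths use
// saturating arithmetic so they can't overflow, while maximum lengths use
// checked arithmetic so that overflow becomes "no known maximum".
#[allow(dead_code)]
fn repeat_len_bounds(
    min_each: usize,
    max_each: Option<usize>,
    count: usize,
) -> (usize, Option<usize>) {
    // Saturate the minimum: an overflowing minimum is still a valid (huge)
    // lower bound when clamped to usize::MAX.
    let min = min_each.saturating_mul(count);
    // Check the maximum: if it overflows, report "no known maximum".
    let max = max_each.and_then(|m| m.checked_mul(count));
    (min, max)
}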

// This is the fuzz target function. We duplicate it here since this is the
// thing we use to interpret the data. It is ultimately what we want to
// succeed.
fn run(data: &[u8]) -> Option<()> {
    if data.len() < 2 {
        return None;
    }
    // The first byte picks the split point; the rest must be valid UTF-8.
    let mut split_at = usize::from(data[0]);
    let data = std::str::from_utf8(&data[1..]).ok()?;
    // Split data into a regex and a haystack to search. The split point is
    // clamped to at least 1 and reduced modulo the char count so that it
    // always lands on a valid char boundary.
    let len = data.chars().count();
    split_at = std::cmp::max(split_at, 1) % len;
    let char_index = data.char_indices().nth(split_at)?.0;
    let (pattern, input) = data.split_at(char_index);
    let re = regex::Regex::new(pattern).ok()?;
    re.is_match(input);
    Some(())
}
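
// An illustrative check of `run`'s input format (an addition): the first
// byte picks the split point and the remaining bytes must be valid UTF-8.
// A leading byte of 2 splits "abXY" into the pattern "ab" and the haystack
// "XY".
#[test]
fn run_splits_pattern_and_haystack() {
    let mut data = vec![2u8];
    data.extend_from_slice(b"abXY");
    assert_eq!(run(&data), Some(()));
}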