// This set of tests is different from regression_fuzz in that the tests here
// start from the fuzzer data directly. Each test essentially duplicates the
// fuzz target. There may well be a better way to set this up; `cargo fuzz`
// may be able to run a target against crash files directly and verify that
// they pass.

// This case found by the fuzzer causes the meta engine to use the "reverse
// inner" literal strategy. That in turn uses a specialized search routine
// for the lazy DFA in order to avoid worst-case quadratic behavior. That
// specialized search routine had a bug: it assumed that start state
// specialization was disabled. But that assumption doesn't hold here, since
// the routine reuses the "general" lazy DFA for the full regex created as
// part of the core strategy, and that DFA might very well have start states
// specialized due to the existence of a prefilter.
//
// This is a somewhat weird case because when the core engine has a
// prefilter, the "reverse inner" optimization usually isn't pursued. But
// there are heuristics that try to detect whether a prefilter is "fast" or
// not. If it's not, then the meta engine will attempt the reverse inner
// optimization anyway, and indeed, that's what happens here. So the reverse
// inner optimization ends up with a lazy DFA that has start states
// specialized. Ideally this wouldn't happen, because specializing start
// states without a prefilter inside the DFA can be disastrous for
// performance: it causes the DFA to ping-pong in and out of the special
// state handling. In this case, it's probably not a huge deal, because the
// lazy DFA is only used for part of the matching, whereas the workhorse is
// the prefilter found by the reverse inner optimization.
//
// We could maybe fix this by refactoring the meta engine to be a little more
// careful, for example by attempting the optimizations before building the
// core engine. But that is perhaps a little tricky. (A sketch of the shape
// of the reverse inner strategy follows the two tests below.)
#[test]
fn meta_stopat_specialize_start_states() {
    let data = include_bytes!(
        "testdata/crash-8760b19b25d74e3603d4c643e9c7404fdd3631f9",
    );
    let _ = run(data);
}

// Same bug as meta_stopat_specialize_start_states, but minimized by the
// fuzzer.
#[test]
fn meta_stopat_specialize_start_states_min() {
    let data = include_bytes!(
        "testdata/minimized-from-8760b19b25d74e3603d4c643e9c7404fdd3631f9",
    );
    let _ = run(data);
}

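// A minimal sketch of the "reverse inner" strategy described above, using
// plain string search and a made-up pattern (`\w+@\w+` is ours, not the
// actual crash input): find a required inner literal first, then scan
// backwards for the start of the match and forwards for its end. The meta
// engine does this with a reverse lazy DFA; this test only illustrates the
// shape of the idea.
#[test]
fn reverse_inner_sketch() {
    let haystack = "!!foo@bar!!";
    // Every match of `\w+@\w+` must contain the inner literal '@'.
    let at = haystack.find('@').unwrap();
    // Reverse scan from the literal to find the start of the match. (We
    // approximate `\w` with `is_alphanumeric` for simplicity; the input is
    // ASCII, so `i + 1` stays on a char boundary.)
    let start = haystack[..at]
        .rfind(|c: char| !c.is_alphanumeric())
        .map_or(0, |i| i + 1);
    // Forward scan from just past the literal to find the end of the match.
    let end = haystack[at + 1..]
        .find(|c: char| !c.is_alphanumeric())
        .map_or(haystack.len(), |i| at + 1 + i);
    assert_eq!(&haystack[start..end], "foo@bar");
}
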
// This input generated a pattern with a fail state (e.g., \P{any}, [^\s\S]
// or [a&&b]). The fail state was in a branch where a subsequent branch
// should have led to an overall match, but handling of the fail state
// prevented it from doing so. A hand-minimized version of this is
// '[^\s\S]A|B' on the haystack 'B', which should yield a match of 'B'. (See
// the example test below.)
//
// The underlying cause was an issue in how DFA determinization handled fail
// states. The bug didn't impact the PikeVM or the bounded backtracker.
#[test]
fn fail_branch_prevents_match() {
    let data = include_bytes!(
        "testdata/crash-cd33b13df59ea9d74503986f9d32a270dd43cc04",
    );
    let _ = run(data);
}

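// A quick check of the hand-minimized case described above; this test is
// ours and is not derived from a crash file. The fail state in the first
// alternation branch must not prevent the second branch from matching.
#[test]
fn fail_branch_minimized() {
    // '[^\s\S]' can never match, so any match of 'B' must come from the
    // second branch of the alternation.
    let re = regex::Regex::new(r"[^\s\S]A|B").unwrap();
    assert_eq!(re.find("B").map(|m| m.as_str()), Some("B"));
}
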
// This input generated a pattern that contained a sub-expression like this:
//
//     a{0}{50000}
//
// This turned out to provoke quadratic behavior in the NFA compiler.
// Basically, the NFA compiler works in two phases. The first phase builds a
// more complicated-but-simpler-to-construct sequence of NFA states that
// includes unconditional epsilon transitions. As part of converting this
// sequence to the "final" NFA, we remove those unconditional epsilon
// transitions. The code responsible for doing this follows every chain of
// these transitions and remaps the state IDs. The way we were doing this
// before resulted in re-following every subsequent part of the chain for
// each state in the chain, which amounted to quadratic behavior. We
// effectively memoized the chain following, which fixed the performance bug.
#[test]
fn slow_big_empty_chain() {
    let data = include_bytes!(
        "testdata/slow-unit-9ca9cc9929fee1fcbb847a78384effb8b98ea18a",
    );
    let _ = run(data);
}

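// A minimal example of the kind of pattern described above. The pattern is
// ours, modeled on the `a{0}{50000}` sub-expression; it is not the actual
// fuzz input. Compiling it should be quick now that the epsilon chain
// remapping is memoized.
#[test]
fn slow_big_empty_chain_example() {
    // We don't assert anything about the result, since compilation could
    // conceivably hit a size limit. We only care that it finishes quickly.
    let _ = regex::Regex::new("a{0}{50000}");
}
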
// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain2() {
    let data = include_bytes!(
        "testdata/slow-unit-3ab758ea520027fefd3f00e1384d9aeef155739e",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain3() {
    let data = include_bytes!(
        "testdata/slow-unit-b8a052f4254802edbe5f569b6ce6e9b6c927e9d6",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain4() {
    let data = include_bytes!(
        "testdata/slow-unit-93c73a43581f205f9aaffd9c17e52b34b17becd0",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain5() {
    let data = include_bytes!(
        "testdata/slow-unit-5345fccadf3812c53c3ccc7af5aa2741b7b2106c",
    );
    let _ = run(data);
}

// A different case of slow_big_empty_chain.
#[test]
fn slow_big_empty_chain6() {
    let data = include_bytes!(
        "testdata/slow-unit-6bd643eec330166e4ada91da2d3f284268481085",
    );
    let _ = run(data);
}

// This fuzz input generated a pattern with a large repetition that would fail
// NFA compilation, but whose HIR was small. (The HIR doesn't expand
// repetitions.) The bounds were high enough that the minimum length
// calculation overflowed. We fixed this by using saturating arithmetic (and
// also checked arithmetic for the maximum length calculation).
//
// Incidentally, this was the only unguarded arithmetic operation performed
// in the HIR smart constructors. And the fuzzer found it. Hah. Nice.
#[test]
fn minimum_len_overflow() {
    let data = include_bytes!(
        "testdata/crash-7eb3351f0965e5d6c1cb98aa8585949ef96531ff",
    );
    let _ = run(data);
}

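// A minimal sketch of the arithmetic fix described above. The numbers are
// illustrative and not taken from the crash input: they stand in for a
// sub-expression's minimum length and a huge repetition bound whose product
// overflows a usize.
#[test]
fn minimum_len_saturates() {
    let min_len: usize = 2;
    let reps = usize::MAX / 2 + 1;
    // Checked arithmetic detects the overflow (the approach now used for
    // the maximum length calculation)...
    assert_eq!(min_len.checked_mul(reps), None);
    // ...while saturating arithmetic clamps to usize::MAX (the approach now
    // used for the minimum length calculation).
    assert_eq!(min_len.saturating_mul(reps), usize::MAX);
}
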
// This is the fuzz target function. We duplicate it here since it's what we
// use to interpret the crash data, and it's ultimately the thing we want to
// succeed without panicking or running for too long.
fn run(data: &[u8]) -> Option<()> {
    if data.len() < 2 {
        return None;
    }
    // The first byte selects where to split the rest into a pattern and a
    // haystack to search.
    let mut split_at = usize::from(data[0]);
    let data = std::str::from_utf8(&data[1..]).ok()?;
    // Split data into a regex and a haystack. Note that `len` is always at
    // least 1 here, since `data` is non-empty valid UTF-8 at this point.
    let len = data.chars().count();
    split_at = std::cmp::max(split_at, 1) % len;
    let char_index = data.char_indices().nth(split_at)?.0;
    let (pattern, input) = data.split_at(char_index);
    let re = regex::Regex::new(pattern).ok()?;
    // We only care that the search completes without panicking or taking
    // unreasonably long; the boolean result itself is irrelevant.
    re.is_match(input);
    Some(())
}
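
// A small demonstration of the input layout that `run` expects. The bytes
// here are made up for illustration and do not come from a crash file: the
// first byte picks the split point, and the rest must be valid UTF-8, which
// is then divided into a pattern and a haystack.
#[test]
fn run_input_layout() {
    // First byte 2 splits "abc" after its second char: the pattern is "ab"
    // and the haystack is "c". "ab" is a valid regex, so `run` succeeds.
    assert_eq!(run(&[2, b'a', b'b', b'c']), Some(()));
    // Invalid UTF-8 after the first byte makes `run` bail out early.
    assert_eq!(run(&[2, 0xFF, 0xFF]), None);
}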
167 | |