1 | use std::error::Error; |
2 | |
3 | use regex_automata::{ |
4 | hybrid::dfa::{OverlappingState, DFA}, |
5 | nfa::thompson, |
6 | HalfMatch, Input, MatchError, |
7 | }; |
8 | |
9 | // Tests that too many cache resets cause the lazy DFA to quit. |
10 | // |
11 | // We only test this on 64-bit because the test is gingerly crafted based on |
12 | // implementation details of cache sizes. It's not a great test because of |
13 | // that, but it does check some interesting properties around how positions are |
14 | // reported when a search "gives up." |
15 | // |
16 | // NOTE: If you change something in lazy DFA implementation that causes this |
17 | // test to fail by reporting different "gave up" positions, then it's generally |
18 | // okay to update the positions in the test below as long as you're sure your |
19 | // changes are correct. Namely, it is expected that if there are changes in the |
20 | // cache size (or changes in how big things are inside the cache), then its |
21 | // utilization may change slightly and thus impact where a search gives up. |
22 | // Precisely where a search gives up is not an API guarantee, so changing the |
23 | // offsets here is OK. |
24 | #[test] |
25 | #[cfg (target_pointer_width = "64" )] |
26 | #[cfg (not(miri))] |
27 | fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { |
28 | // This is a carefully chosen regex. The idea is to pick one that requires |
29 | // some decent number of states (hence the bounded repetition). But we |
30 | // specifically choose to create a class with an ASCII letter and a |
31 | // non-ASCII letter so that we can check that no new states are created |
32 | // once the cache is full. Namely, if we fill up the cache on a haystack |
33 | // of 'a's, then in order to match one 'β', a new state will need to be |
34 | // created since a 'β' is encoded with multiple bytes. |
35 | // |
36 | // So we proceed by "filling" up the cache by searching a haystack of just |
37 | // 'a's. The cache won't have enough room to add enough states to find the |
38 | // match (because of the bounded repetition), which should result in it |
39 | // giving up before it finds a match. |
40 | // |
41 | // Since there's now no more room to create states, we search a haystack |
42 | // of 'β' and confirm that it gives up immediately. |
43 | let pattern = r"[aβ]{99}" ; |
44 | let dfa = DFA::builder() |
45 | .configure( |
46 | // Configure it so that we have the minimum cache capacity |
47 | // possible. And that if any resets occur, the search quits. |
48 | DFA::config() |
49 | .skip_cache_capacity_check(true) |
50 | .cache_capacity(0) |
51 | .minimum_cache_clear_count(Some(0)), |
52 | ) |
53 | .thompson(thompson::NFA::config()) |
54 | .build(pattern)?; |
55 | let mut cache = dfa.create_cache(); |
56 | |
57 | let haystack = "a" .repeat(101).into_bytes(); |
58 | let err = MatchError::gave_up(24); |
59 | // Notice that we make the same amount of progress in each search! That's |
60 | // because the cache is reused and already has states to handle the first |
61 | // N bytes. |
62 | assert_eq!( |
63 | Err(err.clone()), |
64 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
65 | ); |
66 | assert_eq!( |
67 | Err(err.clone()), |
68 | dfa.try_search_overlapping_fwd( |
69 | &mut cache, |
70 | &Input::new(&haystack), |
71 | &mut OverlappingState::start() |
72 | ), |
73 | ); |
74 | |
75 | let haystack = "β" .repeat(101).into_bytes(); |
76 | let err = MatchError::gave_up(2); |
77 | assert_eq!( |
78 | Err(err), |
79 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
80 | ); |
81 | // no need to test that other find routines quit, since we did that above |
82 | |
83 | // OK, if we reset the cache, then we should be able to create more states |
84 | // and make more progress with searching for betas. |
85 | cache.reset(&dfa); |
86 | let err = MatchError::gave_up(26); |
87 | assert_eq!( |
88 | Err(err), |
89 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
90 | ); |
91 | |
92 | // ... switching back to ASCII still makes progress since it just needs to |
93 | // set transitions on existing states! |
94 | let haystack = "a" .repeat(101).into_bytes(); |
95 | let err = MatchError::gave_up(13); |
96 | assert_eq!( |
97 | Err(err), |
98 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
99 | ); |
100 | |
101 | Ok(()) |
102 | } |
103 | |
104 | // Tests that quit bytes in the forward direction work correctly. |
105 | #[test] |
106 | fn quit_fwd() -> Result<(), Box<dyn Error>> { |
107 | let dfa = DFA::builder() |
108 | .configure(DFA::config().quit(b'x' , true)) |
109 | .build("[[:word:]]+$" )?; |
110 | let mut cache = dfa.create_cache(); |
111 | |
112 | assert_eq!( |
113 | dfa.try_search_fwd(&mut cache, &Input::new("abcxyz" )), |
114 | Err(MatchError::quit(b'x' , 3)), |
115 | ); |
116 | assert_eq!( |
117 | dfa.try_search_overlapping_fwd( |
118 | &mut cache, |
119 | &Input::new(b"abcxyz" ), |
120 | &mut OverlappingState::start() |
121 | ), |
122 | Err(MatchError::quit(b'x' , 3)), |
123 | ); |
124 | |
125 | Ok(()) |
126 | } |
127 | |
128 | // Tests that quit bytes in the reverse direction work correctly. |
129 | #[test] |
130 | fn quit_rev() -> Result<(), Box<dyn Error>> { |
131 | let dfa = DFA::builder() |
132 | .configure(DFA::config().quit(b'x' , true)) |
133 | .thompson(thompson::Config::new().reverse(true)) |
134 | .build("^[[:word:]]+" )?; |
135 | let mut cache = dfa.create_cache(); |
136 | |
137 | assert_eq!( |
138 | dfa.try_search_rev(&mut cache, &Input::new("abcxyz" )), |
139 | Err(MatchError::quit(b'x' , 3)), |
140 | ); |
141 | |
142 | Ok(()) |
143 | } |
144 | |
145 | // Tests that if we heuristically enable Unicode word boundaries but then |
146 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder |
147 | // will panic. |
148 | #[test] |
149 | #[should_panic ] |
150 | fn quit_panics() { |
151 | DFA::config().unicode_word_boundary(true).quit(b' \xFF' , false); |
152 | } |
153 | |
154 | // This tests an intesting case where even if the Unicode word boundary option |
155 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode |
156 | // word boundaries to be enabled. |
157 | #[test] |
158 | fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { |
159 | let mut config = DFA::config(); |
160 | for b in 0x80..=0xFF { |
161 | config = config.quit(b, true); |
162 | } |
163 | let dfa = DFA::builder().configure(config).build(r"\b" )?; |
164 | let mut cache = dfa.create_cache(); |
165 | let expected = HalfMatch::must(0, 1); |
166 | assert_eq!( |
167 | Ok(Some(expected)), |
168 | dfa.try_search_fwd(&mut cache, &Input::new(" a" )), |
169 | ); |
170 | Ok(()) |
171 | } |
172 | |