1use std::error::Error;
2
3use regex_automata::{
4 hybrid::dfa::{OverlappingState, DFA},
5 nfa::thompson,
6 HalfMatch, Input, MatchError,
7};
8
9// Tests that too many cache resets cause the lazy DFA to quit.
10//
11// We only test this on 64-bit because the test is gingerly crafted based on
12// implementation details of cache sizes. It's not a great test because of
13// that, but it does check some interesting properties around how positions are
14// reported when a search "gives up."
15//
16// NOTE: If you change something in lazy DFA implementation that causes this
17// test to fail by reporting different "gave up" positions, then it's generally
18// okay to update the positions in the test below as long as you're sure your
19// changes are correct. Namely, it is expected that if there are changes in the
20// cache size (or changes in how big things are inside the cache), then its
21// utilization may change slightly and thus impact where a search gives up.
22// Precisely where a search gives up is not an API guarantee, so changing the
23// offsets here is OK.
24#[test]
25#[cfg(target_pointer_width = "64")]
26#[cfg(not(miri))]
27fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
28 // This is a carefully chosen regex. The idea is to pick one that requires
29 // some decent number of states (hence the bounded repetition). But we
30 // specifically choose to create a class with an ASCII letter and a
31 // non-ASCII letter so that we can check that no new states are created
32 // once the cache is full. Namely, if we fill up the cache on a haystack
33 // of 'a's, then in order to match one 'β', a new state will need to be
34 // created since a 'β' is encoded with multiple bytes.
35 //
36 // So we proceed by "filling" up the cache by searching a haystack of just
37 // 'a's. The cache won't have enough room to add enough states to find the
38 // match (because of the bounded repetition), which should result in it
39 // giving up before it finds a match.
40 //
41 // Since there's now no more room to create states, we search a haystack
42 // of 'β' and confirm that it gives up immediately.
43 let pattern = r"[aβ]{99}";
44 let dfa = DFA::builder()
45 .configure(
46 // Configure it so that we have the minimum cache capacity
47 // possible. And that if any resets occur, the search quits.
48 DFA::config()
49 .skip_cache_capacity_check(true)
50 .cache_capacity(0)
51 .minimum_cache_clear_count(Some(0)),
52 )
53 .thompson(thompson::NFA::config())
54 .build(pattern)?;
55 let mut cache = dfa.create_cache();
56
57 let haystack = "a".repeat(101).into_bytes();
58 let err = MatchError::gave_up(24);
59 // Notice that we make the same amount of progress in each search! That's
60 // because the cache is reused and already has states to handle the first
61 // N bytes.
62 assert_eq!(
63 Err(err.clone()),
64 dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
65 );
66 assert_eq!(
67 Err(err.clone()),
68 dfa.try_search_overlapping_fwd(
69 &mut cache,
70 &Input::new(&haystack),
71 &mut OverlappingState::start()
72 ),
73 );
74
75 let haystack = "β".repeat(101).into_bytes();
76 let err = MatchError::gave_up(2);
77 assert_eq!(
78 Err(err),
79 dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
80 );
81 // no need to test that other find routines quit, since we did that above
82
83 // OK, if we reset the cache, then we should be able to create more states
84 // and make more progress with searching for betas.
85 cache.reset(&dfa);
86 let err = MatchError::gave_up(26);
87 assert_eq!(
88 Err(err),
89 dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
90 );
91
92 // ... switching back to ASCII still makes progress since it just needs to
93 // set transitions on existing states!
94 let haystack = "a".repeat(101).into_bytes();
95 let err = MatchError::gave_up(13);
96 assert_eq!(
97 Err(err),
98 dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
99 );
100
101 Ok(())
102}
103
104// Tests that quit bytes in the forward direction work correctly.
105#[test]
106fn quit_fwd() -> Result<(), Box<dyn Error>> {
107 let dfa = DFA::builder()
108 .configure(DFA::config().quit(b'x', true))
109 .build("[[:word:]]+$")?;
110 let mut cache = dfa.create_cache();
111
112 assert_eq!(
113 dfa.try_search_fwd(&mut cache, &Input::new("abcxyz")),
114 Err(MatchError::quit(b'x', 3)),
115 );
116 assert_eq!(
117 dfa.try_search_overlapping_fwd(
118 &mut cache,
119 &Input::new(b"abcxyz"),
120 &mut OverlappingState::start()
121 ),
122 Err(MatchError::quit(b'x', 3)),
123 );
124
125 Ok(())
126}
127
128// Tests that quit bytes in the reverse direction work correctly.
129#[test]
130fn quit_rev() -> Result<(), Box<dyn Error>> {
131 let dfa = DFA::builder()
132 .configure(DFA::config().quit(b'x', true))
133 .thompson(thompson::Config::new().reverse(true))
134 .build("^[[:word:]]+")?;
135 let mut cache = dfa.create_cache();
136
137 assert_eq!(
138 dfa.try_search_rev(&mut cache, &Input::new("abcxyz")),
139 Err(MatchError::quit(b'x', 3)),
140 );
141
142 Ok(())
143}
144
145// Tests that if we heuristically enable Unicode word boundaries but then
146// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
147// will panic.
148#[test]
149#[should_panic]
150fn quit_panics() {
151 DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
152}
153
154// This tests an intesting case where even if the Unicode word boundary option
155// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
156// word boundaries to be enabled.
157#[test]
158fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
159 let mut config = DFA::config();
160 for b in 0x80..=0xFF {
161 config = config.quit(b, true);
162 }
163 let dfa = DFA::builder().configure(config).build(r"\b")?;
164 let mut cache = dfa.create_cache();
165 let expected = HalfMatch::must(0, 1);
166 assert_eq!(
167 Ok(Some(expected)),
168 dfa.try_search_fwd(&mut cache, &Input::new(" a")),
169 );
170 Ok(())
171}
172