| 1 | use std::error::Error; |
| 2 | |
| 3 | use regex_automata::{ |
| 4 | hybrid::dfa::{OverlappingState, DFA}, |
| 5 | nfa::thompson, |
| 6 | HalfMatch, Input, MatchError, |
| 7 | }; |
| 8 | |
| 9 | // Tests that too many cache resets cause the lazy DFA to quit. |
| 10 | // |
| 11 | // We only test this on 64-bit because the test is gingerly crafted based on |
| 12 | // implementation details of cache sizes. It's not a great test because of |
| 13 | // that, but it does check some interesting properties around how positions are |
| 14 | // reported when a search "gives up." |
| 15 | // |
| 16 | // NOTE: If you change something in lazy DFA implementation that causes this |
| 17 | // test to fail by reporting different "gave up" positions, then it's generally |
| 18 | // okay to update the positions in the test below as long as you're sure your |
| 19 | // changes are correct. Namely, it is expected that if there are changes in the |
| 20 | // cache size (or changes in how big things are inside the cache), then its |
| 21 | // utilization may change slightly and thus impact where a search gives up. |
| 22 | // Precisely where a search gives up is not an API guarantee, so changing the |
| 23 | // offsets here is OK. |
| 24 | #[test] |
| 25 | #[cfg (target_pointer_width = "64" )] |
| 26 | #[cfg (not(miri))] |
| 27 | fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { |
| 28 | // This is a carefully chosen regex. The idea is to pick one that requires |
| 29 | // some decent number of states (hence the bounded repetition). But we |
| 30 | // specifically choose to create a class with an ASCII letter and a |
| 31 | // non-ASCII letter so that we can check that no new states are created |
| 32 | // once the cache is full. Namely, if we fill up the cache on a haystack |
| 33 | // of 'a's, then in order to match one 'β', a new state will need to be |
| 34 | // created since a 'β' is encoded with multiple bytes. |
| 35 | // |
| 36 | // So we proceed by "filling" up the cache by searching a haystack of just |
| 37 | // 'a's. The cache won't have enough room to add enough states to find the |
| 38 | // match (because of the bounded repetition), which should result in it |
| 39 | // giving up before it finds a match. |
| 40 | // |
| 41 | // Since there's now no more room to create states, we search a haystack |
| 42 | // of 'β' and confirm that it gives up immediately. |
| 43 | let pattern = r"[aβ]{99}" ; |
| 44 | let dfa = DFA::builder() |
| 45 | .configure( |
| 46 | // Configure it so that we have the minimum cache capacity |
| 47 | // possible. And that if any resets occur, the search quits. |
| 48 | DFA::config() |
| 49 | .skip_cache_capacity_check(true) |
| 50 | .cache_capacity(0) |
| 51 | .minimum_cache_clear_count(Some(0)), |
| 52 | ) |
| 53 | .thompson(thompson::NFA::config()) |
| 54 | .build(pattern)?; |
| 55 | let mut cache = dfa.create_cache(); |
| 56 | |
| 57 | let haystack = "a" .repeat(101).into_bytes(); |
| 58 | let err = MatchError::gave_up(24); |
| 59 | // Notice that we make the same amount of progress in each search! That's |
| 60 | // because the cache is reused and already has states to handle the first |
| 61 | // N bytes. |
| 62 | assert_eq!( |
| 63 | Err(err.clone()), |
| 64 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
| 65 | ); |
| 66 | assert_eq!( |
| 67 | Err(err.clone()), |
| 68 | dfa.try_search_overlapping_fwd( |
| 69 | &mut cache, |
| 70 | &Input::new(&haystack), |
| 71 | &mut OverlappingState::start() |
| 72 | ), |
| 73 | ); |
| 74 | |
| 75 | let haystack = "β" .repeat(101).into_bytes(); |
| 76 | let err = MatchError::gave_up(2); |
| 77 | assert_eq!( |
| 78 | Err(err), |
| 79 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
| 80 | ); |
| 81 | // no need to test that other find routines quit, since we did that above |
| 82 | |
| 83 | // OK, if we reset the cache, then we should be able to create more states |
| 84 | // and make more progress with searching for betas. |
| 85 | cache.reset(&dfa); |
| 86 | let err = MatchError::gave_up(26); |
| 87 | assert_eq!( |
| 88 | Err(err), |
| 89 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
| 90 | ); |
| 91 | |
| 92 | // ... switching back to ASCII still makes progress since it just needs to |
| 93 | // set transitions on existing states! |
| 94 | let haystack = "a" .repeat(101).into_bytes(); |
| 95 | let err = MatchError::gave_up(13); |
| 96 | assert_eq!( |
| 97 | Err(err), |
| 98 | dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) |
| 99 | ); |
| 100 | |
| 101 | Ok(()) |
| 102 | } |
| 103 | |
| 104 | // Tests that quit bytes in the forward direction work correctly. |
| 105 | #[test] |
| 106 | fn quit_fwd() -> Result<(), Box<dyn Error>> { |
| 107 | let dfa = DFA::builder() |
| 108 | .configure(DFA::config().quit(b'x' , true)) |
| 109 | .build("[[:word:]]+$" )?; |
| 110 | let mut cache = dfa.create_cache(); |
| 111 | |
| 112 | assert_eq!( |
| 113 | dfa.try_search_fwd(&mut cache, &Input::new("abcxyz" )), |
| 114 | Err(MatchError::quit(b'x' , 3)), |
| 115 | ); |
| 116 | assert_eq!( |
| 117 | dfa.try_search_overlapping_fwd( |
| 118 | &mut cache, |
| 119 | &Input::new(b"abcxyz" ), |
| 120 | &mut OverlappingState::start() |
| 121 | ), |
| 122 | Err(MatchError::quit(b'x' , 3)), |
| 123 | ); |
| 124 | |
| 125 | Ok(()) |
| 126 | } |
| 127 | |
| 128 | // Tests that quit bytes in the reverse direction work correctly. |
| 129 | #[test] |
| 130 | fn quit_rev() -> Result<(), Box<dyn Error>> { |
| 131 | let dfa = DFA::builder() |
| 132 | .configure(DFA::config().quit(b'x' , true)) |
| 133 | .thompson(thompson::Config::new().reverse(true)) |
| 134 | .build("^[[:word:]]+" )?; |
| 135 | let mut cache = dfa.create_cache(); |
| 136 | |
| 137 | assert_eq!( |
| 138 | dfa.try_search_rev(&mut cache, &Input::new("abcxyz" )), |
| 139 | Err(MatchError::quit(b'x' , 3)), |
| 140 | ); |
| 141 | |
| 142 | Ok(()) |
| 143 | } |
| 144 | |
| 145 | // Tests that if we heuristically enable Unicode word boundaries but then |
| 146 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder |
| 147 | // will panic. |
| 148 | #[test] |
| 149 | #[should_panic ] |
| 150 | fn quit_panics() { |
| 151 | DFA::config().unicode_word_boundary(true).quit(b' \xFF' , false); |
| 152 | } |
| 153 | |
| 154 | // This tests an intesting case where even if the Unicode word boundary option |
| 155 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode |
| 156 | // word boundaries to be enabled. |
| 157 | #[test] |
| 158 | fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { |
| 159 | let mut config = DFA::config(); |
| 160 | for b in 0x80..=0xFF { |
| 161 | config = config.quit(b, true); |
| 162 | } |
| 163 | let dfa = DFA::builder().configure(config).build(r"\b" )?; |
| 164 | let mut cache = dfa.create_cache(); |
| 165 | let expected = HalfMatch::must(0, 1); |
| 166 | assert_eq!( |
| 167 | Ok(Some(expected)), |
| 168 | dfa.try_search_fwd(&mut cache, &Input::new(" a" )), |
| 169 | ); |
| 170 | Ok(()) |
| 171 | } |
| 172 | |