1 | #[cfg (feature = "std" )] |
2 | use dense::{self, DenseDFA}; |
3 | use dfa::DFA; |
4 | #[cfg (feature = "std" )] |
5 | use error::Result; |
6 | #[cfg (feature = "std" )] |
7 | use sparse::SparseDFA; |
8 | #[cfg (feature = "std" )] |
9 | use state_id::StateID; |
10 | |
11 | /// A regular expression that uses deterministic finite automata for fast |
12 | /// searching. |
13 | /// |
14 | /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
15 | /// "reverse" DFA. The forward DFA is responsible for detecting the end of a |
16 | /// match while the reverse DFA is responsible for detecting the start of a |
17 | /// match. Thus, in order to find the bounds of any given match, a forward |
18 | /// search must first be run followed by a reverse search. A match found by |
19 | /// the forward DFA guarantees that the reverse DFA will also find a match. |
20 | /// |
21 | /// The type of the DFA used by a `Regex` corresponds to the `D` type |
22 | /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, |
23 | /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a |
24 | /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but |
25 | /// search faster, while sparse DFAs use less memory but search more slowly. |
26 | /// |
27 | /// By default, a regex's DFA type parameter is set to |
28 | /// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the |
29 | /// most convenient type that gives the best search performance. |
30 | /// |
31 | /// # Sparse DFAs |
32 | /// |
33 | /// Since a `Regex` is generic over the `DFA` trait, it can be used with any |
34 | /// kind of DFA. While this crate constructs dense DFAs by default, it is easy |
35 | /// enough to build corresponding sparse DFAs, and then build a regex from |
36 | /// them: |
37 | /// |
38 | /// ``` |
39 | /// use regex_automata::Regex; |
40 | /// |
41 | /// # fn example() -> Result<(), regex_automata::Error> { |
42 | /// // First, build a regex that uses dense DFAs. |
43 | /// let dense_re = Regex::new("foo[0-9]+" )?; |
44 | /// |
45 | /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
46 | /// let fwd = dense_re.forward().to_sparse()?; |
47 | /// let rev = dense_re.reverse().to_sparse()?; |
48 | /// |
49 | /// // Third, build a new regex from the constituent sparse DFAs. |
50 | /// let sparse_re = Regex::from_dfas(fwd, rev); |
51 | /// |
52 | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
53 | /// assert_eq!(true, sparse_re.is_match(b"foo123" )); |
54 | /// # Ok(()) }; example().unwrap() |
55 | /// ``` |
56 | #[cfg (feature = "std" )] |
57 | #[derive(Clone, Debug)] |
58 | pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> { |
59 | forward: D, |
60 | reverse: D, |
61 | } |
62 | |
63 | /// A regular expression that uses deterministic finite automata for fast |
64 | /// searching. |
65 | /// |
66 | /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
67 | /// "reverse" DFA. The forward DFA is responsible for detecting the end of a |
68 | /// match while the reverse DFA is responsible for detecting the start of a |
69 | /// match. Thus, in order to find the bounds of any given match, a forward |
70 | /// search must first be run followed by a reverse search. A match found by |
71 | /// the forward DFA guarantees that the reverse DFA will also find a match. |
72 | /// |
73 | /// The type of the DFA used by a `Regex` corresponds to the `D` type |
74 | /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, |
75 | /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a |
76 | /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but |
77 | /// search faster, while sparse DFAs use less memory but search more slowly. |
78 | /// |
79 | /// When using this crate without the standard library, the `Regex` type has |
80 | /// no default type parameter. |
81 | /// |
82 | /// # Sparse DFAs |
83 | /// |
84 | /// Since a `Regex` is generic over the `DFA` trait, it can be used with any |
85 | /// kind of DFA. While this crate constructs dense DFAs by default, it is easy |
86 | /// enough to build corresponding sparse DFAs, and then build a regex from |
87 | /// them: |
88 | /// |
89 | /// ``` |
90 | /// use regex_automata::Regex; |
91 | /// |
92 | /// # fn example() -> Result<(), regex_automata::Error> { |
93 | /// // First, build a regex that uses dense DFAs. |
94 | /// let dense_re = Regex::new("foo[0-9]+")?; |
95 | /// |
96 | /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
97 | /// let fwd = dense_re.forward().to_sparse()?; |
98 | /// let rev = dense_re.reverse().to_sparse()?; |
99 | /// |
100 | /// // Third, build a new regex from the constituent sparse DFAs. |
101 | /// let sparse_re = Regex::from_dfas(fwd, rev); |
102 | /// |
103 | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
104 | /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
105 | /// # Ok(()) }; example().unwrap() |
106 | /// ``` |
107 | #[cfg (not(feature = "std" ))] |
108 | #[derive(Clone, Debug)] |
109 | pub struct Regex<D> { |
110 | forward: D, |
111 | reverse: D, |
112 | } |
113 | |
114 | #[cfg (feature = "std" )] |
115 | impl Regex { |
116 | /// Parse the given regular expression using a default configuration and |
117 | /// return the corresponding regex. |
118 | /// |
119 | /// The default configuration uses `usize` for state IDs, premultiplies |
120 | /// them and reduces the alphabet size by splitting bytes into equivalence |
121 | /// classes. The underlying DFAs are *not* minimized. |
122 | /// |
123 | /// If you want a non-default configuration, then use the |
124 | /// [`RegexBuilder`](struct.RegexBuilder.html) |
125 | /// to set your own configuration. |
126 | /// |
127 | /// # Example |
128 | /// |
129 | /// ``` |
130 | /// use regex_automata::Regex; |
131 | /// |
132 | /// # fn example() -> Result<(), regex_automata::Error> { |
133 | /// let re = Regex::new("foo[0-9]+bar" )?; |
134 | /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz" )); |
135 | /// # Ok(()) }; example().unwrap() |
136 | /// ``` |
137 | pub fn new(pattern: &str) -> Result<Regex> { |
138 | RegexBuilder::new().build(pattern) |
139 | } |
140 | } |
141 | |
142 | #[cfg (feature = "std" )] |
143 | impl Regex<SparseDFA<Vec<u8>, usize>> { |
144 | /// Parse the given regular expression using a default configuration and |
145 | /// return the corresponding regex using sparse DFAs. |
146 | /// |
147 | /// The default configuration uses `usize` for state IDs, reduces the |
148 | /// alphabet size by splitting bytes into equivalence classes. The |
149 | /// underlying DFAs are *not* minimized. |
150 | /// |
151 | /// If you want a non-default configuration, then use the |
152 | /// [`RegexBuilder`](struct.RegexBuilder.html) |
153 | /// to set your own configuration. |
154 | /// |
155 | /// # Example |
156 | /// |
157 | /// ``` |
158 | /// use regex_automata::Regex; |
159 | /// |
160 | /// # fn example() -> Result<(), regex_automata::Error> { |
161 | /// let re = Regex::new_sparse("foo[0-9]+bar" )?; |
162 | /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz" )); |
163 | /// # Ok(()) }; example().unwrap() |
164 | /// ``` |
165 | pub fn new_sparse( |
166 | pattern: &str, |
167 | ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { |
168 | RegexBuilder::new().build_sparse(pattern) |
169 | } |
170 | } |
171 | |
172 | impl<D: DFA> Regex<D> { |
173 | /// Returns true if and only if the given bytes match. |
174 | /// |
175 | /// This routine may short circuit if it knows that scanning future input |
176 | /// will never lead to a different result. In particular, if the underlying |
177 | /// DFA enters a match state or a dead state, then this routine will return |
178 | /// `true` or `false`, respectively, without inspecting any future input. |
179 | /// |
180 | /// # Example |
181 | /// |
182 | /// ``` |
183 | /// use regex_automata::Regex; |
184 | /// |
185 | /// # fn example() -> Result<(), regex_automata::Error> { |
186 | /// let re = Regex::new("foo[0-9]+bar" )?; |
187 | /// assert_eq!(true, re.is_match(b"foo12345bar" )); |
188 | /// assert_eq!(false, re.is_match(b"foobar" )); |
189 | /// # Ok(()) }; example().unwrap() |
190 | /// ``` |
191 | pub fn is_match(&self, input: &[u8]) -> bool { |
192 | self.is_match_at(input, 0) |
193 | } |
194 | |
195 | /// Returns the first position at which a match is found. |
196 | /// |
197 | /// This routine stops scanning input in precisely the same circumstances |
198 | /// as `is_match`. The key difference is that this routine returns the |
199 | /// position at which it stopped scanning input if and only if a match |
200 | /// was found. If no match is found, then `None` is returned. |
201 | /// |
202 | /// # Example |
203 | /// |
204 | /// ``` |
205 | /// use regex_automata::Regex; |
206 | /// |
207 | /// # fn example() -> Result<(), regex_automata::Error> { |
208 | /// let re = Regex::new("foo[0-9]+" )?; |
209 | /// assert_eq!(Some(4), re.shortest_match(b"foo12345" )); |
210 | /// |
211 | /// // Normally, the end of the leftmost first match here would be 3, |
212 | /// // but the shortest match semantics detect a match earlier. |
213 | /// let re = Regex::new("abc|a" )?; |
214 | /// assert_eq!(Some(1), re.shortest_match(b"abc" )); |
215 | /// # Ok(()) }; example().unwrap() |
216 | /// ``` |
217 | pub fn shortest_match(&self, input: &[u8]) -> Option<usize> { |
218 | self.shortest_match_at(input, 0) |
219 | } |
220 | |
221 | /// Returns the start and end offset of the leftmost first match. If no |
222 | /// match exists, then `None` is returned. |
223 | /// |
224 | /// The "leftmost first" match corresponds to the match with the smallest |
225 | /// starting offset, but where the end offset is determined by preferring |
226 | /// earlier branches in the original regular expression. For example, |
227 | /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will |
228 | /// match `Samwise` in `Samwise`. |
229 | /// |
230 | /// Generally speaking, the "leftmost first" match is how most backtracking |
231 | /// regular expressions tend to work. This is in contrast to POSIX-style |
232 | /// regular expressions that yield "leftmost longest" matches. Namely, |
233 | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
234 | /// leftmost longest semantics. |
235 | /// |
236 | /// # Example |
237 | /// |
238 | /// ``` |
239 | /// use regex_automata::Regex; |
240 | /// |
241 | /// # fn example() -> Result<(), regex_automata::Error> { |
242 | /// let re = Regex::new("foo[0-9]+" )?; |
243 | /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz" )); |
244 | /// |
245 | /// // Even though a match is found after reading the first byte (`a`), |
246 | /// // the leftmost first match semantics demand that we find the earliest |
247 | /// // match that prefers earlier parts of the pattern over latter parts. |
248 | /// let re = Regex::new("abc|a" )?; |
249 | /// assert_eq!(Some((0, 3)), re.find(b"abc" )); |
250 | /// # Ok(()) }; example().unwrap() |
251 | /// ``` |
252 | pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> { |
253 | self.find_at(input, 0) |
254 | } |
255 | |
256 | /// Returns the same as `is_match`, but starts the search at the given |
257 | /// offset. |
258 | /// |
259 | /// The significance of the starting point is that it takes the surrounding |
260 | /// context into consideration. For example, if the DFA is anchored, then |
261 | /// a match can only occur when `start == 0`. |
262 | pub fn is_match_at(&self, input: &[u8], start: usize) -> bool { |
263 | self.forward().is_match_at(input, start) |
264 | } |
265 | |
266 | /// Returns the same as `shortest_match`, but starts the search at the |
267 | /// given offset. |
268 | /// |
269 | /// The significance of the starting point is that it takes the surrounding |
270 | /// context into consideration. For example, if the DFA is anchored, then |
271 | /// a match can only occur when `start == 0`. |
272 | pub fn shortest_match_at( |
273 | &self, |
274 | input: &[u8], |
275 | start: usize, |
276 | ) -> Option<usize> { |
277 | self.forward().shortest_match_at(input, start) |
278 | } |
279 | |
280 | /// Returns the same as `find`, but starts the search at the given |
281 | /// offset. |
282 | /// |
283 | /// The significance of the starting point is that it takes the surrounding |
284 | /// context into consideration. For example, if the DFA is anchored, then |
285 | /// a match can only occur when `start == 0`. |
286 | pub fn find_at( |
287 | &self, |
288 | input: &[u8], |
289 | start: usize, |
290 | ) -> Option<(usize, usize)> { |
291 | let end = match self.forward().find_at(input, start) { |
292 | None => return None, |
293 | Some(end) => end, |
294 | }; |
295 | let start = self |
296 | .reverse() |
297 | .rfind(&input[start..end]) |
298 | .map(|i| start + i) |
299 | .expect("reverse search must match if forward search does" ); |
300 | Some((start, end)) |
301 | } |
302 | |
303 | /// Returns an iterator over all non-overlapping leftmost first matches |
304 | /// in the given bytes. If no match exists, then the iterator yields no |
305 | /// elements. |
306 | /// |
307 | /// Note that if the regex can match the empty string, then it is |
308 | /// possible for the iterator to yield a zero-width match at a location |
309 | /// that is not a valid UTF-8 boundary (for example, between the code units |
310 | /// of a UTF-8 encoded codepoint). This can happen regardless of whether |
311 | /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8) |
312 | /// was enabled or not. |
313 | /// |
314 | /// # Example |
315 | /// |
316 | /// ``` |
317 | /// use regex_automata::Regex; |
318 | /// |
319 | /// # fn example() -> Result<(), regex_automata::Error> { |
320 | /// let re = Regex::new("foo[0-9]+" )?; |
321 | /// let text = b"foo1 foo12 foo123" ; |
322 | /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); |
323 | /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]); |
324 | /// # Ok(()) }; example().unwrap() |
325 | /// ``` |
326 | pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> { |
327 | Matches::new(self, input) |
328 | } |
329 | |
330 | /// Build a new regex from its constituent forward and reverse DFAs. |
331 | /// |
332 | /// This is useful when deserializing a regex from some arbitrary |
333 | /// memory region. This is also useful for building regexes from other |
334 | /// types of DFAs. |
335 | /// |
336 | /// # Example |
337 | /// |
338 | /// This example is a bit a contrived. The usual use of these methods |
339 | /// would involve serializing `initial_re` somewhere and then deserializing |
340 | /// it later to build a regex. |
341 | /// |
342 | /// ``` |
343 | /// use regex_automata::Regex; |
344 | /// |
345 | /// # fn example() -> Result<(), regex_automata::Error> { |
346 | /// let initial_re = Regex::new("foo[0-9]+" )?; |
347 | /// assert_eq!(true, initial_re.is_match(b"foo123" )); |
348 | /// |
349 | /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); |
350 | /// let re = Regex::from_dfas(fwd, rev); |
351 | /// assert_eq!(true, re.is_match(b"foo123" )); |
352 | /// # Ok(()) }; example().unwrap() |
353 | /// ``` |
354 | /// |
355 | /// This example shows how you might build smaller DFAs, and then use those |
356 | /// smaller DFAs to build a new regex. |
357 | /// |
358 | /// ``` |
359 | /// use regex_automata::Regex; |
360 | /// |
361 | /// # fn example() -> Result<(), regex_automata::Error> { |
362 | /// let initial_re = Regex::new("foo[0-9]+" )?; |
363 | /// assert_eq!(true, initial_re.is_match(b"foo123" )); |
364 | /// |
365 | /// let fwd = initial_re.forward().to_u16()?; |
366 | /// let rev = initial_re.reverse().to_u16()?; |
367 | /// let re = Regex::from_dfas(fwd, rev); |
368 | /// assert_eq!(true, re.is_match(b"foo123" )); |
369 | /// # Ok(()) }; example().unwrap() |
370 | /// ``` |
371 | /// |
372 | /// This example shows how to build a `Regex` that uses sparse DFAs instead |
373 | /// of dense DFAs: |
374 | /// |
375 | /// ``` |
376 | /// use regex_automata::Regex; |
377 | /// |
378 | /// # fn example() -> Result<(), regex_automata::Error> { |
379 | /// let initial_re = Regex::new("foo[0-9]+" )?; |
380 | /// assert_eq!(true, initial_re.is_match(b"foo123" )); |
381 | /// |
382 | /// let fwd = initial_re.forward().to_sparse()?; |
383 | /// let rev = initial_re.reverse().to_sparse()?; |
384 | /// let re = Regex::from_dfas(fwd, rev); |
385 | /// assert_eq!(true, re.is_match(b"foo123" )); |
386 | /// # Ok(()) }; example().unwrap() |
387 | /// ``` |
388 | pub fn from_dfas(forward: D, reverse: D) -> Regex<D> { |
389 | Regex { forward, reverse } |
390 | } |
391 | |
392 | /// Return the underlying DFA responsible for forward matching. |
393 | pub fn forward(&self) -> &D { |
394 | &self.forward |
395 | } |
396 | |
397 | /// Return the underlying DFA responsible for reverse matching. |
398 | pub fn reverse(&self) -> &D { |
399 | &self.reverse |
400 | } |
401 | } |
402 | |
403 | /// An iterator over all non-overlapping matches for a particular search. |
404 | /// |
405 | /// The iterator yields a `(usize, usize)` value until no more matches could be |
406 | /// found. The first `usize` is the start of the match (inclusive) while the |
407 | /// second `usize` is the end of the match (exclusive). |
408 | /// |
409 | /// `S` is the type used to represent state identifiers in the underlying |
410 | /// regex. The lifetime variables are as follows: |
411 | /// |
412 | /// * `'r` is the lifetime of the regular expression value itself. |
413 | /// * `'t` is the lifetime of the text being searched. |
414 | #[derive(Clone, Debug)] |
415 | pub struct Matches<'r, 't, D: DFA + 'r> { |
416 | re: &'r Regex<D>, |
417 | text: &'t [u8], |
418 | last_end: usize, |
419 | last_match: Option<usize>, |
420 | } |
421 | |
422 | impl<'r, 't, D: DFA> Matches<'r, 't, D> { |
423 | fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> { |
424 | Matches { re, text, last_end: 0, last_match: None } |
425 | } |
426 | } |
427 | |
428 | impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> { |
429 | type Item = (usize, usize); |
430 | |
431 | fn next(&mut self) -> Option<(usize, usize)> { |
432 | if self.last_end > self.text.len() { |
433 | return None; |
434 | } |
435 | let (s, e) = match self.re.find_at(self.text, self.last_end) { |
436 | None => return None, |
437 | Some((s, e)) => (s, e), |
438 | }; |
439 | if s == e { |
440 | // This is an empty match. To ensure we make progress, start |
441 | // the next search at the smallest possible starting position |
442 | // of the next match following this one. |
443 | self.last_end = e + 1; |
444 | // Don't accept empty matches immediately following a match. |
445 | // Just move on to the next match. |
446 | if Some(e) == self.last_match { |
447 | return self.next(); |
448 | } |
449 | } else { |
450 | self.last_end = e; |
451 | } |
452 | self.last_match = Some(e); |
453 | Some((s, e)) |
454 | } |
455 | } |
456 | |
457 | /// A builder for a regex based on deterministic finite automatons. |
458 | /// |
459 | /// This builder permits configuring several aspects of the construction |
460 | /// process such as case insensitivity, Unicode support and various options |
461 | /// that impact the size of the underlying DFAs. In some cases, options (like |
462 | /// performing DFA minimization) can come with a substantial additional cost. |
463 | /// |
464 | /// This builder generally constructs two DFAs, where one is responsible for |
465 | /// finding the end of a match and the other is responsible for finding the |
466 | /// start of a match. If you only need to detect whether something matched, |
467 | /// or only the end of a match, then you should use a |
468 | /// [`dense::Builder`](dense/struct.Builder.html) |
469 | /// to construct a single DFA, which is cheaper than building two DFAs. |
470 | #[cfg (feature = "std" )] |
471 | #[derive(Clone, Debug)] |
472 | pub struct RegexBuilder { |
473 | dfa: dense::Builder, |
474 | } |
475 | |
476 | #[cfg (feature = "std" )] |
477 | impl RegexBuilder { |
478 | /// Create a new regex builder with the default configuration. |
479 | pub fn new() -> RegexBuilder { |
480 | RegexBuilder { dfa: dense::Builder::new() } |
481 | } |
482 | |
483 | /// Build a regex from the given pattern. |
484 | /// |
485 | /// If there was a problem parsing or compiling the pattern, then an error |
486 | /// is returned. |
487 | pub fn build(&self, pattern: &str) -> Result<Regex> { |
488 | self.build_with_size::<usize>(pattern) |
489 | } |
490 | |
491 | /// Build a regex from the given pattern using sparse DFAs. |
492 | /// |
493 | /// If there was a problem parsing or compiling the pattern, then an error |
494 | /// is returned. |
495 | pub fn build_sparse( |
496 | &self, |
497 | pattern: &str, |
498 | ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { |
499 | self.build_with_size_sparse::<usize>(pattern) |
500 | } |
501 | |
502 | /// Build a regex from the given pattern using a specific representation |
503 | /// for the underlying DFA state IDs. |
504 | /// |
505 | /// If there was a problem parsing or compiling the pattern, then an error |
506 | /// is returned. |
507 | /// |
508 | /// The representation of state IDs is determined by the `S` type |
509 | /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64` |
510 | /// or `usize`, where `usize` is the default used for `build`. The purpose |
511 | /// of specifying a representation for state IDs is to reduce the memory |
512 | /// footprint of the underlying DFAs. |
513 | /// |
514 | /// When using this routine, the chosen state ID representation will be |
515 | /// used throughout determinization and minimization, if minimization was |
516 | /// requested. Even if the minimized DFAs can fit into the chosen state ID |
517 | /// representation but the initial determinized DFA cannot, then this will |
518 | /// still return an error. To get a minimized DFA with a smaller state ID |
519 | /// representation, first build it with a bigger state ID representation, |
520 | /// and then shrink the sizes of the DFAs using one of its conversion |
521 | /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). |
522 | /// Finally, reconstitute the regex via |
523 | /// [`Regex::from_dfa`](struct.Regex.html#method.from_dfa). |
524 | pub fn build_with_size<S: StateID>( |
525 | &self, |
526 | pattern: &str, |
527 | ) -> Result<Regex<DenseDFA<Vec<S>, S>>> { |
528 | let forward = self.dfa.build_with_size(pattern)?; |
529 | let reverse = self |
530 | .dfa |
531 | .clone() |
532 | .anchored(true) |
533 | .reverse(true) |
534 | .longest_match(true) |
535 | .build_with_size(pattern)?; |
536 | Ok(Regex::from_dfas(forward, reverse)) |
537 | } |
538 | |
539 | /// Build a regex from the given pattern using a specific representation |
540 | /// for the underlying DFA state IDs using sparse DFAs. |
541 | pub fn build_with_size_sparse<S: StateID>( |
542 | &self, |
543 | pattern: &str, |
544 | ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> { |
545 | let re = self.build_with_size(pattern)?; |
546 | let fwd = re.forward().to_sparse()?; |
547 | let rev = re.reverse().to_sparse()?; |
548 | Ok(Regex::from_dfas(fwd, rev)) |
549 | } |
550 | |
551 | /// Set whether matching must be anchored at the beginning of the input. |
552 | /// |
553 | /// When enabled, a match must begin at the start of the input. When |
554 | /// disabled, the regex will act as if the pattern started with a `.*?`, |
555 | /// which enables a match to appear anywhere. |
556 | /// |
557 | /// By default this is disabled. |
558 | pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder { |
559 | self.dfa.anchored(yes); |
560 | self |
561 | } |
562 | |
563 | /// Enable or disable the case insensitive flag by default. |
564 | /// |
565 | /// By default this is disabled. It may alternatively be selectively |
566 | /// enabled in the regular expression itself via the `i` flag. |
567 | pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { |
568 | self.dfa.case_insensitive(yes); |
569 | self |
570 | } |
571 | |
572 | /// Enable verbose mode in the regular expression. |
573 | /// |
574 | /// When enabled, verbose mode permits insigificant whitespace in many |
575 | /// places in the regular expression, as well as comments. Comments are |
576 | /// started using `#` and continue until the end of the line. |
577 | /// |
578 | /// By default, this is disabled. It may be selectively enabled in the |
579 | /// regular expression by using the `x` flag regardless of this setting. |
580 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { |
581 | self.dfa.ignore_whitespace(yes); |
582 | self |
583 | } |
584 | |
585 | /// Enable or disable the "dot matches any character" flag by default. |
586 | /// |
587 | /// By default this is disabled. It may alternatively be selectively |
588 | /// enabled in the regular expression itself via the `s` flag. |
589 | pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { |
590 | self.dfa.dot_matches_new_line(yes); |
591 | self |
592 | } |
593 | |
594 | /// Enable or disable the "swap greed" flag by default. |
595 | /// |
596 | /// By default this is disabled. It may alternatively be selectively |
597 | /// enabled in the regular expression itself via the `U` flag. |
598 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { |
599 | self.dfa.swap_greed(yes); |
600 | self |
601 | } |
602 | |
603 | /// Enable or disable the Unicode flag (`u`) by default. |
604 | /// |
605 | /// By default this is **enabled**. It may alternatively be selectively |
606 | /// disabled in the regular expression itself via the `u` flag. |
607 | /// |
608 | /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by |
609 | /// default), a regular expression will fail to parse if Unicode mode is |
610 | /// disabled and a sub-expression could possibly match invalid UTF-8. |
611 | pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { |
612 | self.dfa.unicode(yes); |
613 | self |
614 | } |
615 | |
616 | /// When enabled, the builder will permit the construction of a regular |
617 | /// expression that may match invalid UTF-8. |
618 | /// |
619 | /// When disabled (the default), the builder is guaranteed to produce a |
620 | /// regex that will only ever match valid UTF-8 (otherwise, the builder |
621 | /// will return an error). |
622 | pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder { |
623 | self.dfa.allow_invalid_utf8(yes); |
624 | self |
625 | } |
626 | |
627 | /// Set the nesting limit used for the regular expression parser. |
628 | /// |
629 | /// The nesting limit controls how deep the abstract syntax tree is allowed |
630 | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
631 | /// groups), then an error is returned by the parser. |
632 | /// |
633 | /// The purpose of this limit is to act as a heuristic to prevent stack |
634 | /// overflow when building a finite automaton from a regular expression's |
635 | /// abstract syntax tree. In particular, construction currently uses |
636 | /// recursion. In the future, the implementation may stop using recursion |
637 | /// and this option will no longer be necessary. |
638 | /// |
639 | /// This limit is not checked until the entire AST is parsed. Therefore, |
640 | /// if callers want to put a limit on the amount of heap space used, then |
641 | /// they should impose a limit on the length, in bytes, of the concrete |
642 | /// pattern string. In particular, this is viable since the parser will |
643 | /// limit itself to heap space proportional to the lenth of the pattern |
644 | /// string. |
645 | /// |
646 | /// Note that a nest limit of `0` will return a nest limit error for most |
647 | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
648 | /// not `ab`, since `ab` requires a concatenation AST item, which results |
649 | /// in a nest depth of `1`. In general, a nest limit is not something that |
650 | /// manifests in an obvious way in the concrete syntax, therefore, it |
651 | /// should not be used in a granular way. |
652 | pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { |
653 | self.dfa.nest_limit(limit); |
654 | self |
655 | } |
656 | |
657 | /// Minimize the underlying DFAs. |
658 | /// |
659 | /// When enabled, the DFAs powering the resulting regex will be minimized |
660 | /// such that it is as small as possible. |
661 | /// |
662 | /// Whether one enables minimization or not depends on the types of costs |
663 | /// you're willing to pay and how much you care about its benefits. In |
664 | /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)` |
665 | /// space, where `n` is the number of DFA states and `k` is the alphabet |
666 | /// size. In practice, minimization can be quite costly in terms of both |
667 | /// space and time, so it should only be done if you're willing to wait |
668 | /// longer to produce a DFA. In general, you might want a minimal DFA in |
669 | /// the following circumstances: |
670 | /// |
671 | /// 1. You would like to optimize for the size of the automaton. This can |
672 | /// manifest in one of two ways. Firstly, if you're converting the |
673 | /// DFA into Rust code (or a table embedded in the code), then a minimal |
674 | /// DFA will translate into a corresponding reduction in code size, and |
675 | /// thus, also the final compiled binary size. Secondly, if you are |
676 | /// building many DFAs and putting them on the heap, you'll be able to |
677 | /// fit more if they are smaller. Note though that building a minimal |
678 | /// DFA itself requires additional space; you only realize the space |
679 | /// savings once the minimal DFA is constructed (at which point, the |
680 | /// space used for minimization is freed). |
681 | /// 2. You've observed that a smaller DFA results in faster match |
682 | /// performance. Naively, this isn't guaranteed since there is no |
683 | /// inherent difference between matching with a bigger-than-minimal |
684 | /// DFA and a minimal DFA. However, a smaller DFA may make use of your |
685 | /// CPU's cache more efficiently. |
686 | /// 3. You are trying to establish an equivalence between regular |
687 | /// languages. The standard method for this is to build a minimal DFA |
688 | /// for each language and then compare them. If the DFAs are equivalent |
689 | /// (up to state renaming), then the languages are equivalent. |
690 | /// |
691 | /// This option is disabled by default. |
692 | pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder { |
693 | self.dfa.minimize(yes); |
694 | self |
695 | } |
696 | |
697 | /// Premultiply state identifiers in the underlying DFA transition tables. |
698 | /// |
699 | /// When enabled, state identifiers are premultiplied to point to their |
700 | /// corresponding row in the DFA's transition table. That is, given the |
701 | /// `i`th state, its corresponding premultiplied identifier is `i * k` |
702 | /// where `k` is the alphabet size of the DFA. (The alphabet size is at |
703 | /// most 256, but is in practice smaller if byte classes is enabled.) |
704 | /// |
705 | /// When state identifiers are not premultiplied, then the identifier of |
706 | /// the `i`th state is `i`. |
707 | /// |
708 | /// The advantage of premultiplying state identifiers is that is saves |
709 | /// a multiplication instruction per byte when searching with the DFA. |
710 | /// This has been observed to lead to a 20% performance benefit in |
711 | /// micro-benchmarks. |
712 | /// |
713 | /// The primary disadvantage of premultiplying state identifiers is |
714 | /// that they require a larger integer size to represent. For example, |
715 | /// if your DFA has 200 states, then its premultiplied form requires |
716 | /// 16 bits to represent every possible state identifier, where as its |
717 | /// non-premultiplied form only requires 8 bits. |
718 | /// |
719 | /// This option is enabled by default. |
720 | pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder { |
721 | self.dfa.premultiply(yes); |
722 | self |
723 | } |
724 | |
725 | /// Shrink the size of the underlying DFA alphabet by mapping bytes to |
726 | /// their equivalence classes. |
727 | /// |
728 | /// When enabled, each DFA will use a map from all possible bytes to their |
729 | /// corresponding equivalence class. Each equivalence class represents a |
730 | /// set of bytes that does not discriminate between a match and a non-match |
731 | /// in the DFA. For example, the pattern `[ab]+` has at least two |
732 | /// equivalence classes: a set containing `a` and `b` and a set containing |
733 | /// every byte except for `a` and `b`. `a` and `b` are in the same |
734 | /// equivalence classes because they never discriminate between a match |
735 | /// and a non-match. |
736 | /// |
737 | /// The advantage of this map is that the size of the transition table can |
738 | /// be reduced drastically from `#states * 256 * sizeof(id)` to |
739 | /// `#states * k * sizeof(id)` where `k` is the number of equivalence |
740 | /// classes. As a result, total space usage can decrease substantially. |
741 | /// Moreover, since a smaller alphabet is used, compilation becomes faster |
742 | /// as well. |
743 | /// |
744 | /// The disadvantage of this map is that every byte searched must be |
745 | /// passed through this map before it can be used to determine the next |
746 | /// transition. This has a small match time performance cost. |
747 | /// |
748 | /// This option is enabled by default. |
749 | pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder { |
750 | self.dfa.byte_classes(yes); |
751 | self |
752 | } |
753 | |
754 | /// Apply best effort heuristics to shrink the NFA at the expense of more |
755 | /// time/memory. |
756 | /// |
757 | /// This may be exposed in the future, but for now is exported for use in |
758 | /// the `regex-automata-debug` tool. |
759 | #[doc (hidden)] |
760 | pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder { |
761 | self.dfa.shrink(yes); |
762 | self |
763 | } |
764 | } |
765 | |
766 | #[cfg (feature = "std" )] |
767 | impl Default for RegexBuilder { |
768 | fn default() -> RegexBuilder { |
769 | RegexBuilder::new() |
770 | } |
771 | } |
772 | |