1//! Regex matchers on character and byte streams.
2//!
3//! ## Overview
4//!
//! The [`regex`] crate implements regular expression matching on strings and byte
//! arrays. However, in order to match the output of implementations of `fmt::Debug`
//! and `fmt::Display`, or of any code which writes to an instance of `fmt::Write`
//! or `io::Write`, it is necessary to first allocate a buffer, write to that
//! buffer, and then match the buffer against a regex.
10//!
11//! In cases where it is not necessary to extract substrings, but only to test whether
12//! or not output matches a regex, it is not strictly necessary to allocate and
13//! write this output to a buffer. This crate provides a simple interface on top of
14//! the lower-level [`regex-automata`] library that implements `fmt::Write` and
15//! `io::Write` for regex patterns. This may be used to test whether streaming
16//! output matches a pattern without buffering that output.
17//!
18//! Users who need to extract substrings based on a pattern or who already have
19//! buffered data should probably use the [`regex`] crate instead.
20//!
21//! ## Syntax
22//!
//! This crate uses the same [regex syntax][syntax] as the `regex-automata` crate.
24//!
25//! [`regex`]: https://crates.io/crates/regex
26//! [`regex-automata`]: https://crates.io/crates/regex-automata
27//! [syntax]: https://docs.rs/regex-automata/0.1.7/regex_automata/#syntax
28
29use regex_automata::{dense, DenseDFA, SparseDFA, StateID, DFA};
30use std::{fmt, io, marker::PhantomData, str::FromStr};
31
32pub use regex_automata::Error;
33
/// A compiled match pattern that can match multiple inputs, or return a
/// [`Matcher`] that matches a single input.
///
/// The pattern owns a single automaton of type `A`, whose state identifiers
/// are of type `S`.
///
/// [`Matcher`]: struct.Matcher.html
#[derive(Debug, Clone)]
pub struct Pattern<S = usize, A = DenseDFA<Vec<S>, S>>
where
    S: StateID,
    A: DFA<ID = S>,
{
    /// The automaton that inputs are matched against.
    automaton: A,
}
46
/// A reference to a [`Pattern`] that matches a single input.
///
/// A `Matcher` holds the automaton together with the state reached after the
/// input it has been fed so far.
///
/// [`Pattern`]: struct.Pattern.html
#[derive(Debug, Clone)]
pub struct Matcher<'a, S = usize, A = DenseDFA<&'a [S], S>>
where
    S: StateID,
    A: DFA<ID = S>,
{
    /// The automaton driving this matcher.
    automaton: A,
    /// The automaton state reached after the input consumed so far.
    state: S,
    /// Marker that uses the `'a` lifetime parameter, since `A` is not
    /// required to mention it.
    _lt: PhantomData<&'a ()>,
}
60
61// === impl Pattern ===
62
63impl Pattern {
64 /// Returns a new `Pattern` for the given regex, or an error if the regex
65 /// was invalid.
66 ///
67 /// The returned `Pattern` will match occurances of the pattern which start
68 /// at *any* in a byte or character stream — the pattern may be preceded by
69 /// any number of non-matching characters. Essentially, it will behave as
70 /// though the regular expression started with a `.*?`, which enables a
71 /// match to appear anywhere. If this is not the desired behavior, use
72 /// [`Pattern::new_anchored`] instead.
73 ///
74 /// For example:
75 /// ```
76 /// use matchers::Pattern;
77 ///
78 /// // This pattern matches any number of `a`s followed by a `b`.
79 /// let pattern = Pattern::new("a+b").expect("regex is not invalid");
80 ///
81 /// // Of course, the pattern matches an input where the entire sequence of
82 /// // characters matches the pattern:
83 /// assert!(pattern.display_matches(&"aaaaab"));
84 ///
85 /// // And, since the pattern is unanchored, it will also match the
86 /// // sequence when it's followed by non-matching characters:
87 /// assert!(pattern.display_matches(&"hello world! aaaaab"));
88 /// ```
89 pub fn new(pattern: &str) -> Result<Self, Error> {
90 let automaton = DenseDFA::new(pattern)?;
91 Ok(Pattern { automaton })
92 }
93
94 /// Returns a new `Pattern` anchored at the beginning of the input stream,
95 /// or an error if the regex was invalid.
96 ///
97 /// The returned `Pattern` will *only* match an occurence of the pattern in
98 /// an input sequence if the first character or byte in the input matches
99 /// the pattern. If this is not the desired behavior, use [`Pattern::new`]
100 /// instead.
101 ///
102 /// For example:
103 /// ```
104 /// use matchers::Pattern;
105 ///
106 /// // This pattern matches any number of `a`s followed by a `b`.
107 /// let pattern = Pattern::new_anchored("a+b")
108 /// .expect("regex is not invalid");
109 ///
110 /// // The pattern matches an input where the entire sequence of
111 /// // characters matches the pattern:
112 /// assert!(pattern.display_matches(&"aaaaab"));
113 ///
114 /// // Since the pattern is anchored, it will *not* match an input that
115 /// // begins with non-matching characters:
116 /// assert!(!pattern.display_matches(&"hello world! aaaaab"));
117 ///
118 /// // ...however, if we create a pattern beginning with `.*?`, it will:
119 /// let pattern2 = Pattern::new_anchored(".*?a+b")
120 /// .expect("regex is not invalid");
121 /// assert!(pattern2.display_matches(&"hello world! aaaaab"));
122 /// ```
123 pub fn new_anchored(pattern: &str) -> Result<Self, Error> {
124 let automaton = dense::Builder::new().anchored(true).build(pattern)?;
125 Ok(Pattern { automaton })
126 }
127}
128
129impl FromStr for Pattern {
130 type Err = Error;
131 fn from_str(s: &str) -> Result<Self, Self::Err> {
132 Self::new(s)
133 }
134}
135
136impl<S, A> Pattern<S, A>
137where
138 S: StateID,
139 A: DFA<ID = S>,
140 Self: for<'a> ToMatcher<'a, S>,
141{
142 /// Returns `true` if this pattern matches the given string.
143 #[inline]
144 pub fn matches(&self, s: &impl AsRef<str>) -> bool {
145 self.matcher().matches(s)
146 }
147
148 /// Returns `true` if this pattern matches the formatted output of the given
149 /// type implementing `fmt::Debug`.
150 ///
151 /// For example:
152 /// ```rust
153 /// use matchers::Pattern;
154 ///
155 /// #[derive(Debug)]
156 /// pub struct Hello {
157 /// to: &'static str,
158 /// }
159 ///
160 /// let pattern = Pattern::new(r#"Hello \{ to: "W[^"]*" \}"#).unwrap();
161 ///
162 /// let hello_world = Hello { to: "World" };
163 /// assert!(pattern.debug_matches(&hello_world));
164 ///
165 /// let hello_sf = Hello { to: "San Francisco" };
166 /// assert_eq!(pattern.debug_matches(&hello_sf), false);
167 ///
168 /// let hello_washington = Hello { to: "Washington" };
169 /// assert!(pattern.debug_matches(&hello_washington));
170 /// ```
171 #[inline]
172 pub fn debug_matches(&self, d: &impl fmt::Debug) -> bool {
173 self.matcher().debug_matches(d)
174 }
175
176 /// Returns `true` if this pattern matches the formatted output of the given
177 /// type implementing `fmt::Display`.
178 ///
179 /// For example:
180 /// ```rust
181 /// # use std::fmt;
182 /// use matchers::Pattern;
183 ///
184 /// #[derive(Debug)]
185 /// pub struct Hello {
186 /// to: &'static str,
187 /// }
188 ///
189 /// impl fmt::Display for Hello {
190 /// fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
191 /// write!(f, "Hello {}", self.to)
192 /// }
193 /// }
194 ///
195 /// let pattern = Pattern::new("Hello [Ww].+").unwrap();
196 ///
197 /// let hello_world = Hello { to: "world" };
198 /// assert!(pattern.display_matches(&hello_world));
199 /// assert_eq!(pattern.debug_matches(&hello_world), false);
200 ///
201 /// let hello_sf = Hello { to: "San Francisco" };
202 /// assert_eq!(pattern.display_matches(&hello_sf), false);
203 ///
204 /// let hello_washington = Hello { to: "Washington" };
205 /// assert!(pattern.display_matches(&hello_washington));
206 /// ```
207 #[inline]
208 pub fn display_matches(&self, d: &impl fmt::Display) -> bool {
209 self.matcher().display_matches(d)
210 }
211
212 /// Returns either a `bool` indicating whether or not this pattern matches the
213 /// data read from the provided `io::Read` stream, or an `io::Error` if an
214 /// error occurred reading from the stream.
215 #[inline]
216 pub fn read_matches(&self, io: impl io::Read) -> io::Result<bool> {
217 self.matcher().read_matches(io)
218 }
219}
220
221// === impl Matcher ===
222
223impl<'a, S, A> Matcher<'a, S, A>
224where
225 S: StateID,
226 A: DFA<ID = S>,
227{
228 fn new(automaton: A) -> Self {
229 let state = automaton.start_state();
230 Self {
231 automaton,
232 state,
233 _lt: PhantomData,
234 }
235 }
236
237 #[inline]
238 fn advance(&mut self, input: u8) {
239 self.state = unsafe {
240 // It's safe to call `next_state_unchecked` since the matcher may
241 // only be constructed by a `Pattern`, which, in turn,can only be
242 // constructed with a valid DFA.
243 self.automaton.next_state_unchecked(self.state, input)
244 };
245 }
246
247 /// Returns `true` if this `Matcher` has matched any input that has been
248 /// provided.
249 #[inline]
250 pub fn is_matched(&self) -> bool {
251 self.automaton.is_match_state(self.state)
252 }
253
254 /// Returns `true` if this pattern matches the formatted output of the given
255 /// type implementing `fmt::Debug`.
256 pub fn matches(mut self, s: &impl AsRef<str>) -> bool {
257 for &byte in s.as_ref().as_bytes() {
258 self.advance(byte);
259 if self.automaton.is_dead_state(self.state) {
260 return false;
261 }
262 }
263 self.is_matched()
264 }
265
266 /// Returns `true` if this pattern matches the formatted output of the given
267 /// type implementing `fmt::Debug`.
268 pub fn debug_matches(mut self, d: &impl fmt::Debug) -> bool {
269 use std::fmt::Write;
270 write!(&mut self, "{:?}", d).expect("matcher write impl should not fail");
271 self.is_matched()
272 }
273
274 /// Returns `true` if this pattern matches the formatted output of the given
275 /// type implementing `fmt::Display`.
276 pub fn display_matches(mut self, d: &impl fmt::Display) -> bool {
277 use std::fmt::Write;
278 write!(&mut self, "{}", d).expect("matcher write impl should not fail");
279 self.is_matched()
280 }
281
282 /// Returns either a `bool` indicating whether or not this pattern matches the
283 /// data read from the provided `io::Read` stream, or an `io::Error` if an
284 /// error occurred reading from the stream.
285 pub fn read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool> {
286 for r in io.bytes() {
287 self.advance(r?);
288 if self.automaton.is_dead_state(self.state) {
289 return Ok(false);
290 }
291 }
292 Ok(self.is_matched())
293 }
294}
295
296impl<'a, S, A> fmt::Write for Matcher<'a, S, A>
297where
298 S: StateID,
299 A: DFA<ID = S>,
300{
301 fn write_str(&mut self, s: &str) -> fmt::Result {
302 for &byte in s.as_bytes() {
303 self.advance(byte);
304 if self.automaton.is_dead_state(self.state) {
305 break;
306 }
307 }
308 Ok(())
309 }
310}
311
312impl<'a, S, A> io::Write for Matcher<'a, S, A>
313where
314 S: StateID,
315 A: DFA<ID = S>,
316{
317 fn write(&mut self, bytes: &[u8]) -> Result<usize, io::Error> {
318 let mut i = 0;
319 for &byte in bytes {
320 self.advance(byte);
321 i += 1;
322 if self.automaton.is_dead_state(self.state) {
323 break;
324 }
325 }
326 Ok(i)
327 }
328
329 fn flush(&mut self) -> Result<(), io::Error> {
330 Ok(())
331 }
332}
333
/// Types that can produce a `Matcher` borrowing from them.
///
/// Implementors must also implement the `Sealed` trait, which lives in this
/// crate's private `sealed` module.
pub trait ToMatcher<'a, S>
where
    Self: crate::sealed::Sealed,
    S: StateID + 'a,
{
    /// The automaton type used by the returned `Matcher`.
    type Automaton: DFA<ID = S>;
    /// Returns a `Matcher` for this value, borrowing it for `'a`.
    fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton>;
}
342
// Marks dense-DFA patterns as `Sealed`, which `ToMatcher` requires of its
// implementors.
impl<S> crate::sealed::Sealed for Pattern<S, DenseDFA<Vec<S>, S>> where S: StateID {}
344
345impl<'a, S> ToMatcher<'a, S> for Pattern<S, DenseDFA<Vec<S>, S>>
346where
347 S: StateID + 'a,
348{
349 type Automaton = DenseDFA<&'a [S], S>;
350 fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> {
351 Matcher::new(self.automaton.as_ref())
352 }
353}
354
355impl<'a, S> ToMatcher<'a, S> for Pattern<S, SparseDFA<Vec<u8>, S>>
356where
357 S: StateID + 'a,
358{
359 type Automaton = SparseDFA<&'a [u8], S>;
360 fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> {
361 Matcher::new(self.automaton.as_ref())
362 }
363}
364
// Marks sparse-DFA patterns as `Sealed`, which `ToMatcher` requires of its
// implementors.
impl<S> crate::sealed::Sealed for Pattern<S, SparseDFA<Vec<u8>, S>> where S: StateID {}
366
// Private module holding the marker trait that `ToMatcher` requires of its
// implementors.
mod sealed {
    /// Marker trait with no methods; implemented in this file for `Pattern`.
    pub trait Sealed {}
}
370
#[cfg(test)]
mod test {
    use super::*;

    /// Test input whose `Debug` and `Display` output are both the raw string.
    struct Str<'a>(&'a str);
    /// `io::Read` adapter over the bytes of a `Str`.
    struct ReadStr<'a>(io::Cursor<&'a [u8]>);

    impl<'a> fmt::Debug for Str<'a> {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            // Deliberately unquoted: identical to the `Display` output.
            fmt::Display::fmt(self, f)
        }
    }

    impl<'a> fmt::Display for Str<'a> {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            write!(f, "{}", self.0)
        }
    }

    impl<'a> io::Read for ReadStr<'a> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            self.0.read(buf)
        }
    }

    impl Str<'static> {
        fn hello_world() -> Self {
            Str("hello world")
        }
    }

    impl<'a> Str<'a> {
        fn new(s: &'a str) -> Self {
            Str(s)
        }

        fn to_reader(self) -> ReadStr<'a> {
            let bytes = self.0.as_bytes();
            ReadStr(io::Cursor::new(bytes))
        }
    }

    // Checks `debug_matches` against "hello world" for patterns built by
    // `new_pattern`.
    fn test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
        let matches_hello = |regex: &str| {
            new_pattern(regex)
                .unwrap()
                .debug_matches(&Str::hello_world())
        };

        assert!(matches_hello("hello world"));
        assert!(matches_hello("hel+o w[orl]{3}d"));
        assert!(!matches_hello("goodbye world"));
    }

    // Checks `display_matches` against "hello world" for patterns built by
    // `new_pattern`.
    fn test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
        let matches_hello = |regex: &str| {
            new_pattern(regex)
                .unwrap()
                .display_matches(&Str::hello_world())
        };

        assert!(matches_hello("hello world"));
        assert!(matches_hello("hel+o w[orl]{3}d"));
        assert!(!matches_hello("goodbye world"));
    }

    // Checks `read_matches` against "hello world" for patterns built by
    // `new_pattern`.
    fn test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
        let matches_hello = |regex: &str| {
            new_pattern(regex)
                .unwrap()
                .read_matches(Str::hello_world().to_reader())
                .expect("no io error should occur")
        };

        assert!(matches_hello("hello world"));
        assert!(matches_hello("hel+o w[orl]{3}d"));
        assert!(!matches_hello("goodbye world"));
    }

    // Checks a repetition pattern against inputs that end with, and inputs
    // that continue past, a match.
    fn test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
        let pattern = new_pattern("a+b").unwrap();
        let matches = |input: &str| pattern.debug_matches(&Str::new(input));

        assert!(matches("ab"));
        assert!(matches("aaaab"));
        assert!(matches("aaaaaaaaaab"));
        assert!(!matches("b"));
        assert!(!matches("abb"));
        assert!(!matches("aaaaabb"));
    }

    mod anchored {
        use super::*;

        #[test]
        fn debug_matches() {
            test_debug_matches(Pattern::new_anchored);
        }

        #[test]
        fn display_matches() {
            test_display_matches(Pattern::new_anchored);
        }

        #[test]
        fn reader_matches() {
            test_reader_matches(Pattern::new_anchored);
        }

        #[test]
        fn debug_rep_patterns() {
            test_debug_rep_patterns(Pattern::new_anchored);
        }

        // === anchored behavior =============================================
        // Tests that anchored patterns match each input type only beginning at
        // the first character.
        fn test_is_anchored(f: impl Fn(&Pattern, Str) -> bool) {
            let pattern = Pattern::new_anchored("a+b").unwrap();
            let matches = |input: &'static str| f(&pattern, Str::new(input));

            assert!(matches("ab"));
            assert!(matches("aaaab"));
            assert!(matches("aaaaaaaaaab"));
            assert!(!matches("bab"));
            assert!(!matches("ffab"));
            assert!(!matches("qqqqqqqaaaaab"));
        }

        #[test]
        fn debug_is_anchored() {
            test_is_anchored(|pattern, input| pattern.debug_matches(&input));
        }

        #[test]
        fn display_is_anchored() {
            test_is_anchored(|pattern, input| pattern.display_matches(&input));
        }

        #[test]
        fn reader_is_anchored() {
            test_is_anchored(|pattern, input| {
                let reader = input.to_reader();
                pattern.read_matches(reader).expect("no io error occurs")
            });
        }

        // === explicitly unanchored =========================================
        // Tests that if an "anchored" pattern begins with `.*?`, it matches as
        // though it was unanchored.
        fn test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
            let pattern = Pattern::new_anchored(".*?a+b").unwrap();
            let matches = |input: &'static str| f(&pattern, Str::new(input));

            assert!(matches("ab"));
            assert!(matches("aaaab"));
            assert!(matches("aaaaaaaaaab"));
            assert!(matches("bab"));
            assert!(matches("ffab"));
            assert!(matches("qqqqqqqaaaaab"));
        }

        #[test]
        fn debug_explicitly_unanchored() {
            test_explicitly_unanchored(|pattern, input| pattern.debug_matches(&input));
        }

        #[test]
        fn display_explicitly_unanchored() {
            test_explicitly_unanchored(|pattern, input| pattern.display_matches(&input));
        }

        #[test]
        fn reader_explicitly_unanchored() {
            test_explicitly_unanchored(|pattern, input| {
                let reader = input.to_reader();
                pattern.read_matches(reader).expect("no io error occurs")
            });
        }
    }

    mod unanchored {
        use super::*;

        #[test]
        fn debug_matches() {
            test_debug_matches(Pattern::new);
        }

        #[test]
        fn display_matches() {
            test_display_matches(Pattern::new);
        }

        #[test]
        fn reader_matches() {
            test_reader_matches(Pattern::new);
        }

        #[test]
        fn debug_rep_patterns() {
            test_debug_rep_patterns(Pattern::new);
        }

        // === unanchored behavior ===========================================
        // Tests that unanchored patterns match anywhere in the input stream.
        fn test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
            let pattern = Pattern::new("a+b").unwrap();
            let matches = |input: &'static str| f(&pattern, Str::new(input));

            assert!(matches("ab"));
            assert!(matches("aaaab"));
            assert!(matches("aaaaaaaaaab"));
            assert!(matches("bab"));
            assert!(matches("ffab"));
            assert!(matches("qqqfqqqqaaaaab"));
        }

        #[test]
        fn debug_is_unanchored() {
            test_is_unanchored(|pattern, input| pattern.debug_matches(&input));
        }

        #[test]
        fn display_is_unanchored() {
            test_is_unanchored(|pattern, input| pattern.display_matches(&input));
        }

        #[test]
        fn reader_is_unanchored() {
            test_is_unanchored(|pattern, input| {
                let reader = input.to_reader();
                pattern.read_matches(reader).expect("no io error occurs")
            });
        }
    }
}
601