1use std::fmt;
2use std::iter::FusedIterator;
3
4/// Slot is a single saved capture location. Note that there are two slots for
5/// every capture in a regular expression (one slot each for the start and end
6/// of the capture).
7pub type Slot = Option<usize>;
8
9/// Locations represents the offsets of each capturing group in a regex for
10/// a single match.
11///
12/// Unlike `Captures`, a `Locations` value only stores offsets.
13#[doc(hidden)]
14#[derive(Clone, Debug)]
15pub struct Locations(Vec<Slot>);
16
17impl Locations {
18 /// Returns the start and end positions of the Nth capture group. Returns
19 /// `None` if `i` is not a valid capture group or if the capture group did
20 /// not match anything. The positions returned are *always* byte indices
21 /// with respect to the original string matched.
22 pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
23 let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?);
24 match (self.0.get(s), self.0.get(e)) {
25 (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
26 _ => None,
27 }
28 }
29
30 /// Creates an iterator of all the capture group positions in order of
31 /// appearance in the regular expression. Positions are byte indices
32 /// in terms of the original string matched.
33 pub fn iter(&self) -> SubCapturesPosIter<'_> {
34 SubCapturesPosIter { idx: 0, locs: self }
35 }
36
37 /// Returns the total number of capturing groups.
38 ///
39 /// This is always at least `1` since every regex has at least `1`
40 /// capturing group that corresponds to the entire match.
41 pub fn len(&self) -> usize {
42 self.0.len() / 2
43 }
44
45 /// Return the individual slots as a slice.
46 pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
47 &mut self.0
48 }
49}
50
51/// An iterator over capture group positions for a particular match of a
52/// regular expression.
53///
54/// Positions are byte indices in terms of the original string matched.
55///
56/// `'c` is the lifetime of the captures.
57#[derive(Clone, Debug)]
58pub struct SubCapturesPosIter<'c> {
59 idx: usize,
60 locs: &'c Locations,
61}
62
63impl<'c> Iterator for SubCapturesPosIter<'c> {
64 type Item = Option<(usize, usize)>;
65
66 fn next(&mut self) -> Option<Option<(usize, usize)>> {
67 if self.idx >= self.locs.len() {
68 return None;
69 }
70 let x: Option> = match self.locs.pos(self.idx) {
71 None => Some(None),
72 Some((s: usize, e: usize)) => Some(Some((s, e))),
73 };
74 self.idx += 1;
75 x
76 }
77
78 fn size_hint(&self) -> (usize, Option<usize>) {
79 let len: usize = self.locs.len() - self.idx;
80 (len, Some(len))
81 }
82
83 fn count(self) -> usize {
84 self.len()
85 }
86}
87
88impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
89
90impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
91
92/// `RegularExpression` describes types that can implement regex searching.
93///
94/// This trait is my attempt at reducing code duplication and to standardize
95/// the internal API. Specific duplication that is avoided are the `find`
96/// and `capture` iterators, which are slightly tricky.
97///
98/// It's not clear whether this trait is worth it, and it also isn't
99/// clear whether it's useful as a public trait or not. Methods like
100/// `next_after_empty` reak of bad design, but the rest of the methods seem
101/// somewhat reasonable. One particular thing this trait would expose would be
102/// the ability to start the search of a regex anywhere in a haystack, which
103/// isn't possible in the current public API.
104pub trait RegularExpression: Sized + fmt::Debug {
105 /// The type of the haystack.
106 type Text: ?Sized + fmt::Debug;
107
108 /// The number of capture slots in the compiled regular expression. This is
109 /// always two times the number of capture groups (two slots per group).
110 fn slots_len(&self) -> usize;
111
112 /// Allocates fresh space for all capturing groups in this regex.
113 fn locations(&self) -> Locations {
114 Locations(vec![None; self.slots_len()])
115 }
116
117 /// Returns the position of the next character after `i`.
118 ///
119 /// For example, a haystack with type `&[u8]` probably returns `i+1`,
120 /// whereas a haystack with type `&str` probably returns `i` plus the
121 /// length of the next UTF-8 sequence.
122 fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
123
124 /// Returns the location of the shortest match.
125 fn shortest_match_at(
126 &self,
127 text: &Self::Text,
128 start: usize,
129 ) -> Option<usize>;
130
131 /// Returns whether the regex matches the text given.
132 fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
133
134 /// Returns the leftmost-first match location if one exists.
135 fn find_at(
136 &self,
137 text: &Self::Text,
138 start: usize,
139 ) -> Option<(usize, usize)>;
140
141 /// Returns the leftmost-first match location if one exists, and also
142 /// fills in any matching capture slot locations.
143 fn captures_read_at(
144 &self,
145 locs: &mut Locations,
146 text: &Self::Text,
147 start: usize,
148 ) -> Option<(usize, usize)>;
149
150 /// Returns an iterator over all non-overlapping successive leftmost-first
151 /// matches.
152 fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
153 Matches { re: self, text, last_end: 0, last_match: None }
154 }
155
156 /// Returns an iterator over all non-overlapping successive leftmost-first
157 /// matches with captures.
158 fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> {
159 CaptureMatches(self.find_iter(text))
160 }
161}
162
163/// An iterator over all non-overlapping successive leftmost-first matches.
164#[derive(Debug)]
165pub struct Matches<'t, R>
166where
167 R: RegularExpression,
168 R::Text: 't,
169{
170 re: R,
171 text: &'t R::Text,
172 last_end: usize,
173 last_match: Option<usize>,
174}
175
176impl<'t, R> Matches<'t, R>
177where
178 R: RegularExpression,
179 R::Text: 't,
180{
181 /// Return the text being searched.
182 pub fn text(&self) -> &'t R::Text {
183 self.text
184 }
185
186 /// Return the underlying regex.
187 pub fn regex(&self) -> &R {
188 &self.re
189 }
190}
191
192impl<'t, R> Iterator for Matches<'t, R>
193where
194 R: RegularExpression,
195 R::Text: 't + AsRef<[u8]>,
196{
197 type Item = (usize, usize);
198
199 fn next(&mut self) -> Option<(usize, usize)> {
200 if self.last_end > self.text.as_ref().len() {
201 return None;
202 }
203 let (s, e) = match self.re.find_at(self.text, self.last_end) {
204 None => return None,
205 Some((s, e)) => (s, e),
206 };
207 if s == e {
208 // This is an empty match. To ensure we make progress, start
209 // the next search at the smallest possible starting position
210 // of the next match following this one.
211 self.last_end = self.re.next_after_empty(self.text, e);
212 // Don't accept empty matches immediately following a match.
213 // Just move on to the next match.
214 if Some(e) == self.last_match {
215 return self.next();
216 }
217 } else {
218 self.last_end = e;
219 }
220 self.last_match = Some(e);
221 Some((s, e))
222 }
223}
224
225impl<'t, R> FusedIterator for Matches<'t, R>
226where
227 R: RegularExpression,
228 R::Text: 't + AsRef<[u8]>,
229{
230}
231
232/// An iterator over all non-overlapping successive leftmost-first matches with
233/// captures.
234#[derive(Debug)]
235pub struct CaptureMatches<'t, R>(Matches<'t, R>)
236where
237 R: RegularExpression,
238 R::Text: 't;
239
240impl<'t, R> CaptureMatches<'t, R>
241where
242 R: RegularExpression,
243 R::Text: 't,
244{
245 /// Return the text being searched.
246 pub fn text(&self) -> &'t R::Text {
247 self.0.text()
248 }
249
250 /// Return the underlying regex.
251 pub fn regex(&self) -> &R {
252 self.0.regex()
253 }
254}
255
256impl<'t, R> Iterator for CaptureMatches<'t, R>
257where
258 R: RegularExpression,
259 R::Text: 't + AsRef<[u8]>,
260{
261 type Item = Locations;
262
263 fn next(&mut self) -> Option<Locations> {
264 if self.0.last_end > self.0.text.as_ref().len() {
265 return None;
266 }
267 let mut locs = self.0.re.locations();
268 let (s, e) = match self.0.re.captures_read_at(
269 &mut locs,
270 self.0.text,
271 self.0.last_end,
272 ) {
273 None => return None,
274 Some((s, e)) => (s, e),
275 };
276 if s == e {
277 self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
278 if Some(e) == self.0.last_match {
279 return self.next();
280 }
281 } else {
282 self.0.last_end = e;
283 }
284 self.0.last_match = Some(e);
285 Some(locs)
286 }
287}
288
289impl<'t, R> FusedIterator for CaptureMatches<'t, R>
290where
291 R: RegularExpression,
292 R::Text: 't + AsRef<[u8]>,
293{
294}
295