1 | use std::fmt; |
2 | use std::iter::FusedIterator; |
3 | |
4 | /// Slot is a single saved capture location. Note that there are two slots for |
5 | /// every capture in a regular expression (one slot each for the start and end |
6 | /// of the capture). |
7 | pub type Slot = Option<usize>; |
8 | |
9 | /// Locations represents the offsets of each capturing group in a regex for |
10 | /// a single match. |
11 | /// |
12 | /// Unlike `Captures`, a `Locations` value only stores offsets. |
13 | #[doc (hidden)] |
14 | #[derive (Clone, Debug)] |
15 | pub struct Locations(Vec<Slot>); |
16 | |
17 | impl Locations { |
18 | /// Returns the start and end positions of the Nth capture group. Returns |
19 | /// `None` if `i` is not a valid capture group or if the capture group did |
20 | /// not match anything. The positions returned are *always* byte indices |
21 | /// with respect to the original string matched. |
22 | pub fn pos(&self, i: usize) -> Option<(usize, usize)> { |
23 | let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?); |
24 | match (self.0.get(s), self.0.get(e)) { |
25 | (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), |
26 | _ => None, |
27 | } |
28 | } |
29 | |
30 | /// Creates an iterator of all the capture group positions in order of |
31 | /// appearance in the regular expression. Positions are byte indices |
32 | /// in terms of the original string matched. |
33 | pub fn iter(&self) -> SubCapturesPosIter<'_> { |
34 | SubCapturesPosIter { idx: 0, locs: self } |
35 | } |
36 | |
37 | /// Returns the total number of capturing groups. |
38 | /// |
39 | /// This is always at least `1` since every regex has at least `1` |
40 | /// capturing group that corresponds to the entire match. |
41 | pub fn len(&self) -> usize { |
42 | self.0.len() / 2 |
43 | } |
44 | |
45 | /// Return the individual slots as a slice. |
46 | pub(crate) fn as_slots(&mut self) -> &mut [Slot] { |
47 | &mut self.0 |
48 | } |
49 | } |
50 | |
51 | /// An iterator over capture group positions for a particular match of a |
52 | /// regular expression. |
53 | /// |
54 | /// Positions are byte indices in terms of the original string matched. |
55 | /// |
56 | /// `'c` is the lifetime of the captures. |
57 | #[derive (Clone, Debug)] |
58 | pub struct SubCapturesPosIter<'c> { |
59 | idx: usize, |
60 | locs: &'c Locations, |
61 | } |
62 | |
63 | impl<'c> Iterator for SubCapturesPosIter<'c> { |
64 | type Item = Option<(usize, usize)>; |
65 | |
66 | fn next(&mut self) -> Option<Option<(usize, usize)>> { |
67 | if self.idx >= self.locs.len() { |
68 | return None; |
69 | } |
70 | let x: Option = match self.locs.pos(self.idx) { |
71 | None => Some(None), |
72 | Some((s: usize, e: usize)) => Some(Some((s, e))), |
73 | }; |
74 | self.idx += 1; |
75 | x |
76 | } |
77 | |
78 | fn size_hint(&self) -> (usize, Option<usize>) { |
79 | let len: usize = self.locs.len() - self.idx; |
80 | (len, Some(len)) |
81 | } |
82 | |
83 | fn count(self) -> usize { |
84 | self.len() |
85 | } |
86 | } |
87 | |
88 | impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {} |
89 | |
90 | impl<'c> FusedIterator for SubCapturesPosIter<'c> {} |
91 | |
92 | /// `RegularExpression` describes types that can implement regex searching. |
93 | /// |
94 | /// This trait is my attempt at reducing code duplication and to standardize |
95 | /// the internal API. Specific duplication that is avoided are the `find` |
96 | /// and `capture` iterators, which are slightly tricky. |
97 | /// |
98 | /// It's not clear whether this trait is worth it, and it also isn't |
99 | /// clear whether it's useful as a public trait or not. Methods like |
100 | /// `next_after_empty` reak of bad design, but the rest of the methods seem |
101 | /// somewhat reasonable. One particular thing this trait would expose would be |
102 | /// the ability to start the search of a regex anywhere in a haystack, which |
103 | /// isn't possible in the current public API. |
104 | pub trait RegularExpression: Sized + fmt::Debug { |
105 | /// The type of the haystack. |
106 | type Text: ?Sized + fmt::Debug; |
107 | |
108 | /// The number of capture slots in the compiled regular expression. This is |
109 | /// always two times the number of capture groups (two slots per group). |
110 | fn slots_len(&self) -> usize; |
111 | |
112 | /// Allocates fresh space for all capturing groups in this regex. |
113 | fn locations(&self) -> Locations { |
114 | Locations(vec![None; self.slots_len()]) |
115 | } |
116 | |
117 | /// Returns the position of the next character after `i`. |
118 | /// |
119 | /// For example, a haystack with type `&[u8]` probably returns `i+1`, |
120 | /// whereas a haystack with type `&str` probably returns `i` plus the |
121 | /// length of the next UTF-8 sequence. |
122 | fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; |
123 | |
124 | /// Returns the location of the shortest match. |
125 | fn shortest_match_at( |
126 | &self, |
127 | text: &Self::Text, |
128 | start: usize, |
129 | ) -> Option<usize>; |
130 | |
131 | /// Returns whether the regex matches the text given. |
132 | fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; |
133 | |
134 | /// Returns the leftmost-first match location if one exists. |
135 | fn find_at( |
136 | &self, |
137 | text: &Self::Text, |
138 | start: usize, |
139 | ) -> Option<(usize, usize)>; |
140 | |
141 | /// Returns the leftmost-first match location if one exists, and also |
142 | /// fills in any matching capture slot locations. |
143 | fn captures_read_at( |
144 | &self, |
145 | locs: &mut Locations, |
146 | text: &Self::Text, |
147 | start: usize, |
148 | ) -> Option<(usize, usize)>; |
149 | |
150 | /// Returns an iterator over all non-overlapping successive leftmost-first |
151 | /// matches. |
152 | fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { |
153 | Matches { re: self, text, last_end: 0, last_match: None } |
154 | } |
155 | |
156 | /// Returns an iterator over all non-overlapping successive leftmost-first |
157 | /// matches with captures. |
158 | fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { |
159 | CaptureMatches(self.find_iter(text)) |
160 | } |
161 | } |
162 | |
163 | /// An iterator over all non-overlapping successive leftmost-first matches. |
164 | #[derive (Debug)] |
165 | pub struct Matches<'t, R> |
166 | where |
167 | R: RegularExpression, |
168 | R::Text: 't, |
169 | { |
170 | re: R, |
171 | text: &'t R::Text, |
172 | last_end: usize, |
173 | last_match: Option<usize>, |
174 | } |
175 | |
176 | impl<'t, R> Matches<'t, R> |
177 | where |
178 | R: RegularExpression, |
179 | R::Text: 't, |
180 | { |
181 | /// Return the text being searched. |
182 | pub fn text(&self) -> &'t R::Text { |
183 | self.text |
184 | } |
185 | |
186 | /// Return the underlying regex. |
187 | pub fn regex(&self) -> &R { |
188 | &self.re |
189 | } |
190 | } |
191 | |
192 | impl<'t, R> Iterator for Matches<'t, R> |
193 | where |
194 | R: RegularExpression, |
195 | R::Text: 't + AsRef<[u8]>, |
196 | { |
197 | type Item = (usize, usize); |
198 | |
199 | fn next(&mut self) -> Option<(usize, usize)> { |
200 | if self.last_end > self.text.as_ref().len() { |
201 | return None; |
202 | } |
203 | let (s, e) = match self.re.find_at(self.text, self.last_end) { |
204 | None => return None, |
205 | Some((s, e)) => (s, e), |
206 | }; |
207 | if s == e { |
208 | // This is an empty match. To ensure we make progress, start |
209 | // the next search at the smallest possible starting position |
210 | // of the next match following this one. |
211 | self.last_end = self.re.next_after_empty(self.text, e); |
212 | // Don't accept empty matches immediately following a match. |
213 | // Just move on to the next match. |
214 | if Some(e) == self.last_match { |
215 | return self.next(); |
216 | } |
217 | } else { |
218 | self.last_end = e; |
219 | } |
220 | self.last_match = Some(e); |
221 | Some((s, e)) |
222 | } |
223 | } |
224 | |
225 | impl<'t, R> FusedIterator for Matches<'t, R> |
226 | where |
227 | R: RegularExpression, |
228 | R::Text: 't + AsRef<[u8]>, |
229 | { |
230 | } |
231 | |
232 | /// An iterator over all non-overlapping successive leftmost-first matches with |
233 | /// captures. |
234 | #[derive (Debug)] |
235 | pub struct CaptureMatches<'t, R>(Matches<'t, R>) |
236 | where |
237 | R: RegularExpression, |
238 | R::Text: 't; |
239 | |
240 | impl<'t, R> CaptureMatches<'t, R> |
241 | where |
242 | R: RegularExpression, |
243 | R::Text: 't, |
244 | { |
245 | /// Return the text being searched. |
246 | pub fn text(&self) -> &'t R::Text { |
247 | self.0.text() |
248 | } |
249 | |
250 | /// Return the underlying regex. |
251 | pub fn regex(&self) -> &R { |
252 | self.0.regex() |
253 | } |
254 | } |
255 | |
256 | impl<'t, R> Iterator for CaptureMatches<'t, R> |
257 | where |
258 | R: RegularExpression, |
259 | R::Text: 't + AsRef<[u8]>, |
260 | { |
261 | type Item = Locations; |
262 | |
263 | fn next(&mut self) -> Option<Locations> { |
264 | if self.0.last_end > self.0.text.as_ref().len() { |
265 | return None; |
266 | } |
267 | let mut locs = self.0.re.locations(); |
268 | let (s, e) = match self.0.re.captures_read_at( |
269 | &mut locs, |
270 | self.0.text, |
271 | self.0.last_end, |
272 | ) { |
273 | None => return None, |
274 | Some((s, e)) => (s, e), |
275 | }; |
276 | if s == e { |
277 | self.0.last_end = self.0.re.next_after_empty(self.0.text, e); |
278 | if Some(e) == self.0.last_match { |
279 | return self.next(); |
280 | } |
281 | } else { |
282 | self.0.last_end = e; |
283 | } |
284 | self.0.last_match = Some(e); |
285 | Some(locs) |
286 | } |
287 | } |
288 | |
289 | impl<'t, R> FusedIterator for CaptureMatches<'t, R> |
290 | where |
291 | R: RegularExpression, |
292 | R::Text: 't + AsRef<[u8]>, |
293 | { |
294 | } |
295 | |