1/*!
2Provides types for dealing with capturing groups.
3
4Capturing groups refer to sub-patterns of regexes that some regex engines can
5report matching offsets for. For example, matching `[a-z]([0-9]+)` against
6`a789` would give `a789` as the overall match (for the implicit capturing group
7at index `0`) and `789` as the match for the capturing group `([0-9]+)` (an
8explicit capturing group at index `1`).
9
10Not all regex engines can report match offsets for capturing groups. Indeed,
11to a first approximation, regex engines that can report capturing group offsets
12tend to be quite a bit slower than regex engines that can't. This is because
13tracking capturing groups at search time usually requires more "power" that
14in turn adds overhead.
15
16Other regex implementations might call capturing groups "submatches."
17
18# Overview
19
20The main types in this module are:
21
22* [`Captures`] records the capturing group offsets found during a search. It
23provides convenience routines for looking up capturing group offsets by either
24index or name.
25* [`GroupInfo`] records the mapping between capturing groups and "slots,"
26where the latter are how capturing groups are recorded during a regex search.
27This also keeps a mapping from capturing group name to index, and capture
28group index to name. A `GroupInfo` is used by `Captures` internally to
29provide a convenient API. It is unlikely that you'll use a `GroupInfo`
30directly, but for example, if you've compiled an Thompson NFA, then you can use
31[`thompson::NFA::group_info`](crate::nfa::thompson::NFA::group_info) to get its
32underlying `GroupInfo`.
33*/
34
35use alloc::{string::String, sync::Arc, vec, vec::Vec};
36
37use crate::util::{
38 interpolate,
39 primitives::{
40 NonMaxUsize, PatternID, PatternIDError, PatternIDIter, SmallIndex,
41 },
42 search::{Match, Span},
43};
44
45/// The span offsets of capturing groups after a match has been found.
46///
47/// This type represents the output of regex engines that can report the
48/// offsets at which capturing groups matches or "submatches" occur. For
49/// example, the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). When a match
50/// occurs, it will at minimum contain the [`PatternID`] of the pattern that
51/// matched. Depending upon how it was constructed, it may also contain the
52/// start/end offsets of the entire match of the pattern and the start/end
53/// offsets of each capturing group that participated in the match.
54///
55/// Values of this type are always created for a specific [`GroupInfo`]. It is
56/// unspecified behavior to use a `Captures` value in a search with any regex
57/// engine that has a different `GroupInfo` than the one the `Captures` were
58/// created with.
59///
60/// # Constructors
61///
62/// There are three constructors for this type that control what kind of
63/// information is available upon a match:
64///
65/// * [`Captures::all`]: Will store overall pattern match offsets in addition
66/// to the offsets of capturing groups that participated in the match.
67/// * [`Captures::matches`]: Will store only the overall pattern
68/// match offsets. The offsets of capturing groups (even ones that participated
69/// in the match) are not available.
70/// * [`Captures::empty`]: Will only store the pattern ID that matched. No
71/// match offsets are available at all.
72///
73/// If you aren't sure which to choose, then pick the first one. The first one
74/// is what convenience routines like,
75/// [`PikeVM::create_captures`](crate::nfa::thompson::pikevm::PikeVM::create_captures),
76/// will use automatically.
77///
78/// The main difference between these choices is performance. Namely, if you
79/// ask for _less_ information, then the execution of regex search may be able
80/// to run more quickly.
81///
82/// # Notes
83///
84/// It is worth pointing out that this type is not coupled to any one specific
85/// regex engine. Instead, its coupling is with [`GroupInfo`], which is the
86/// thing that is responsible for mapping capturing groups to "slot" offsets.
87/// Slot offsets are indices into a single sequence of memory at which matching
88/// haystack offsets for the corresponding group are written by regex engines.
89///
90/// # Example
91///
92/// This example shows how to parse a simple date and extract the components of
93/// the date via capturing groups:
94///
95/// ```
96/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
97///
98/// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
99/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
100///
101/// re.captures(&mut cache, "2010-03-14", &mut caps);
102/// assert!(caps.is_match());
103/// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
104/// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
105/// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
106///
107/// # Ok::<(), Box<dyn std::error::Error>>(())
108/// ```
109///
110/// # Example: named capturing groups
111///
112/// This example is like the one above, but leverages the ability to name
113/// capturing groups in order to make the code a bit clearer:
114///
115/// ```
116/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
117///
118/// let re = PikeVM::new(r"^(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})$")?;
119/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
120///
121/// re.captures(&mut cache, "2010-03-14", &mut caps);
122/// assert!(caps.is_match());
123/// assert_eq!(Some(Span::from(0..4)), caps.get_group_by_name("y"));
124/// assert_eq!(Some(Span::from(5..7)), caps.get_group_by_name("m"));
125/// assert_eq!(Some(Span::from(8..10)), caps.get_group_by_name("d"));
126///
127/// # Ok::<(), Box<dyn std::error::Error>>(())
128/// ```
129#[derive(Clone)]
130pub struct Captures {
131 /// The group info that these capture groups are coupled to. This is what
132 /// gives the "convenience" of the `Captures` API. Namely, it provides the
133 /// slot mapping and the name|-->index mapping for capture lookups by name.
134 group_info: GroupInfo,
135 /// The ID of the pattern that matched. Regex engines must set this to
136 /// None when no match occurs.
137 pid: Option<PatternID>,
138 /// The slot values, i.e., submatch offsets.
139 ///
140 /// In theory, the smallest sequence of slots would be something like
141 /// `max(groups(pattern) for pattern in regex) * 2`, but instead, we use
142 /// `sum(groups(pattern) for pattern in regex) * 2`. Why?
143 ///
144 /// Well, the former could be used in theory, because we don't generally
145 /// have any overlapping APIs that involve capturing groups. Therefore,
146 /// there's technically never any need to have slots set for multiple
147 /// patterns. However, this might change some day, in which case, we would
148 /// need to have slots available.
149 ///
150 /// The other reason is that during the execution of some regex engines,
151 /// there exists a point in time where multiple slots for different
152 /// patterns may be written to before knowing which pattern has matched.
153 /// Therefore, the regex engines themselves, in order to support multiple
154 /// patterns correctly, must have all slots available. If `Captures`
155 /// doesn't have all slots available, then regex engines can't write
156 /// directly into the caller provided `Captures` and must instead write
157 /// into some other storage and then copy the slots involved in the match
158 /// at the end of the search.
159 ///
160 /// So overall, at least as of the time of writing, it seems like the path
161 /// of least resistance is to just require allocating all possible slots
162 /// instead of the conceptual minimum. Another way to justify this is that
163 /// the most common case is a single pattern, in which case, there is no
164 /// inefficiency here since the 'max' and 'sum' calculations above are
165 /// equivalent in that case.
166 ///
167 /// N.B. The mapping from group index to slot is maintained by `GroupInfo`
168 /// and is considered an API guarantee. See `GroupInfo` for more details on
169 /// that mapping.
170 ///
171 /// N.B. `Option<NonMaxUsize>` has the same size as a `usize`.
172 slots: Vec<Option<NonMaxUsize>>,
173}
174
175impl Captures {
176 /// Create new storage for the offsets of all matching capturing groups.
177 ///
178 /// This routine provides the most information for matches---namely, the
179 /// spans of matching capturing groups---but also requires the regex search
180 /// routines to do the most work.
181 ///
182 /// It is unspecified behavior to use the returned `Captures` value in a
183 /// search with a `GroupInfo` other than the one that is provided to this
184 /// constructor.
185 ///
186 /// # Example
187 ///
188 /// This example shows that all capturing groups---but only ones that
189 /// participated in a match---are available to query after a match has
190 /// been found:
191 ///
192 /// ```
193 /// use regex_automata::{
194 /// nfa::thompson::pikevm::PikeVM,
195 /// util::captures::Captures,
196 /// Span, Match,
197 /// };
198 ///
199 /// let re = PikeVM::new(
200 /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$",
201 /// )?;
202 /// let mut cache = re.create_cache();
203 /// let mut caps = Captures::all(re.get_nfa().group_info().clone());
204 ///
205 /// re.captures(&mut cache, "ABC123", &mut caps);
206 /// assert!(caps.is_match());
207 /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
208 /// // The 'lower' group didn't match, so it won't have any offsets.
209 /// assert_eq!(None, caps.get_group_by_name("lower"));
210 /// assert_eq!(Some(Span::from(0..3)), caps.get_group_by_name("upper"));
211 /// assert_eq!(Some(Span::from(3..6)), caps.get_group_by_name("digits"));
212 ///
213 /// # Ok::<(), Box<dyn std::error::Error>>(())
214 /// ```
215 pub fn all(group_info: GroupInfo) -> Captures {
216 let slots = group_info.slot_len();
217 Captures { group_info, pid: None, slots: vec![None; slots] }
218 }
219
220 /// Create new storage for only the full match spans of a pattern. This
221 /// does not include any capturing group offsets.
222 ///
223 /// It is unspecified behavior to use the returned `Captures` value in a
224 /// search with a `GroupInfo` other than the one that is provided to this
225 /// constructor.
226 ///
227 /// # Example
228 ///
229 /// This example shows that only overall match offsets are reported when
230 /// this constructor is used. Accessing any capturing groups other than
231 /// the 0th will always return `None`.
232 ///
233 /// ```
234 /// use regex_automata::{
235 /// nfa::thompson::pikevm::PikeVM,
236 /// util::captures::Captures,
237 /// Match,
238 /// };
239 ///
240 /// let re = PikeVM::new(
241 /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$",
242 /// )?;
243 /// let mut cache = re.create_cache();
244 /// let mut caps = Captures::matches(re.get_nfa().group_info().clone());
245 ///
246 /// re.captures(&mut cache, "ABC123", &mut caps);
247 /// assert!(caps.is_match());
248 /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
249 /// // We didn't ask for capturing group offsets, so they aren't available.
250 /// assert_eq!(None, caps.get_group_by_name("lower"));
251 /// assert_eq!(None, caps.get_group_by_name("upper"));
252 /// assert_eq!(None, caps.get_group_by_name("digits"));
253 ///
254 /// # Ok::<(), Box<dyn std::error::Error>>(())
255 /// ```
256 pub fn matches(group_info: GroupInfo) -> Captures {
257 // This is OK because we know there are at least this many slots,
258 // and GroupInfo construction guarantees that the number of slots fits
259 // into a usize.
260 let slots = group_info.pattern_len().checked_mul(2).unwrap();
261 Captures { group_info, pid: None, slots: vec![None; slots] }
262 }
263
264 /// Create new storage for only tracking which pattern matched. No offsets
265 /// are stored at all.
266 ///
267 /// It is unspecified behavior to use the returned `Captures` value in a
268 /// search with a `GroupInfo` other than the one that is provided to this
269 /// constructor.
270 ///
271 /// # Example
272 ///
273 /// This example shows that only the pattern that matched can be accessed
274 /// from a `Captures` value created via this constructor.
275 ///
276 /// ```
277 /// use regex_automata::{
278 /// nfa::thompson::pikevm::PikeVM,
279 /// util::captures::Captures,
280 /// PatternID,
281 /// };
282 ///
283 /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
284 /// let mut cache = re.create_cache();
285 /// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
286 ///
287 /// re.captures(&mut cache, "aABCz", &mut caps);
288 /// assert!(caps.is_match());
289 /// assert_eq!(Some(PatternID::must(0)), caps.pattern());
290 /// // We didn't ask for any offsets, so they aren't available.
291 /// assert_eq!(None, caps.get_match());
292 ///
293 /// re.captures(&mut cache, &"aABCz"[1..], &mut caps);
294 /// assert!(caps.is_match());
295 /// assert_eq!(Some(PatternID::must(1)), caps.pattern());
296 /// // We didn't ask for any offsets, so they aren't available.
297 /// assert_eq!(None, caps.get_match());
298 ///
299 /// # Ok::<(), Box<dyn std::error::Error>>(())
300 /// ```
301 pub fn empty(group_info: GroupInfo) -> Captures {
302 Captures { group_info, pid: None, slots: vec![] }
303 }
304
305 /// Returns true if and only if this capturing group represents a match.
306 ///
307 /// This is a convenience routine for `caps.pattern().is_some()`.
308 ///
309 /// # Example
310 ///
311 /// When using the PikeVM (for example), the lightest weight way of
312 /// detecting whether a match exists is to create capturing groups that
313 /// only track the ID of the pattern that match (if any):
314 ///
315 /// ```
316 /// use regex_automata::{
317 /// nfa::thompson::pikevm::PikeVM,
318 /// util::captures::Captures,
319 /// };
320 ///
321 /// let re = PikeVM::new(r"[a-z]+")?;
322 /// let mut cache = re.create_cache();
323 /// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
324 ///
325 /// re.captures(&mut cache, "aABCz", &mut caps);
326 /// assert!(caps.is_match());
327 ///
328 /// # Ok::<(), Box<dyn std::error::Error>>(())
329 /// ```
330 #[inline]
331 pub fn is_match(&self) -> bool {
332 self.pid.is_some()
333 }
334
335 /// Returns the identifier of the pattern that matched when this
336 /// capturing group represents a match. If no match was found, then this
337 /// always returns `None`.
338 ///
339 /// This returns a pattern ID in precisely the cases in which `is_match`
340 /// returns `true`. Similarly, the pattern ID returned is always the
341 /// same pattern ID found in the `Match` returned by `get_match`.
342 ///
343 /// # Example
344 ///
345 /// When using the PikeVM (for example), the lightest weight way of
346 /// detecting which pattern matched is to create capturing groups that only
347 /// track the ID of the pattern that match (if any):
348 ///
349 /// ```
350 /// use regex_automata::{
351 /// nfa::thompson::pikevm::PikeVM,
352 /// util::captures::Captures,
353 /// PatternID,
354 /// };
355 ///
356 /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
357 /// let mut cache = re.create_cache();
358 /// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
359 ///
360 /// re.captures(&mut cache, "ABC", &mut caps);
361 /// assert_eq!(Some(PatternID::must(1)), caps.pattern());
362 /// // Recall that offsets are only available when using a non-empty
363 /// // Captures value. So even though a match occurred, this returns None!
364 /// assert_eq!(None, caps.get_match());
365 ///
366 /// # Ok::<(), Box<dyn std::error::Error>>(())
367 /// ```
368 #[inline]
369 pub fn pattern(&self) -> Option<PatternID> {
370 self.pid
371 }
372
373 /// Returns the pattern ID and the span of the match, if one occurred.
374 ///
375 /// This always returns `None` when `Captures` was created with
376 /// [`Captures::empty`], even if a match was found.
377 ///
378 /// If this routine returns a non-`None` value, then `is_match` is
379 /// guaranteed to return `true` and `pattern` is also guaranteed to return
380 /// a non-`None` value.
381 ///
382 /// # Example
383 ///
384 /// This example shows how to get the full match from a search:
385 ///
386 /// ```
387 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
388 ///
389 /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
390 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
391 ///
392 /// re.captures(&mut cache, "ABC", &mut caps);
393 /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
394 ///
395 /// # Ok::<(), Box<dyn std::error::Error>>(())
396 /// ```
397 #[inline]
398 pub fn get_match(&self) -> Option<Match> {
399 Some(Match::new(self.pattern()?, self.get_group(0)?))
400 }
401
402 /// Returns the span of a capturing group match corresponding to the group
403 /// index given, only if both the overall pattern matched and the capturing
404 /// group participated in that match.
405 ///
406 /// This returns `None` if `index` is invalid. `index` is valid if and only
407 /// if it's less than [`Captures::group_len`] for the matching pattern.
408 ///
409 /// This always returns `None` when `Captures` was created with
410 /// [`Captures::empty`], even if a match was found. This also always
411 /// returns `None` for any `index > 0` when `Captures` was created with
412 /// [`Captures::matches`].
413 ///
414 /// If this routine returns a non-`None` value, then `is_match` is
415 /// guaranteed to return `true`, `pattern` is guaranteed to return a
416 /// non-`None` value and `get_match` is guaranteed to return a non-`None`
417 /// value.
418 ///
419 /// By convention, the 0th capture group will always return the same
420 /// span as the span returned by `get_match`. This is because the 0th
421 /// capture group always corresponds to the entirety of the pattern's
422 /// match. (It is similarly always unnamed because it is implicit.) This
423 /// isn't necessarily true of all regex engines. For example, one can
424 /// hand-compile a [`thompson::NFA`](crate::nfa::thompson::NFA) via a
425 /// [`thompson::Builder`](crate::nfa::thompson::Builder), which isn't
426 /// technically forced to make the 0th capturing group always correspond to
427 /// the entire match.
428 ///
429 /// # Example
430 ///
431 /// This example shows how to get the capturing groups, by index, from a
432 /// match:
433 ///
434 /// ```
435 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
436 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match};
437 ///
438 /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
439 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
440 ///
441 /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
442 /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
443 /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1));
444 /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2));
445 /// // Looking for a non-existent capturing group will return None:
446 /// assert_eq!(None, caps.get_group(3));
447 /// # // literals are too big for 32-bit usize: #1039
448 /// # #[cfg(target_pointer_width = "64")]
449 /// assert_eq!(None, caps.get_group(9944060567225171988));
450 ///
451 /// # Ok::<(), Box<dyn std::error::Error>>(())
452 /// ```
453 #[inline]
454 pub fn get_group(&self, index: usize) -> Option<Span> {
455 let pid = self.pattern()?;
456 // There's a little bit of work needed to map captures to slots in the
457 // fully general case. But in the overwhelming common case of a single
458 // pattern, we can just do some simple arithmetic.
459 let (slot_start, slot_end) = if self.group_info().pattern_len() == 1 {
460 (index.checked_mul(2)?, index.checked_mul(2)?.checked_add(1)?)
461 } else {
462 self.group_info().slots(pid, index)?
463 };
464 let start = self.slots.get(slot_start).copied()??;
465 let end = self.slots.get(slot_end).copied()??;
466 Some(Span { start: start.get(), end: end.get() })
467 }
468
469 /// Returns the span of a capturing group match corresponding to the group
470 /// name given, only if both the overall pattern matched and the capturing
471 /// group participated in that match.
472 ///
473 /// This returns `None` if `name` does not correspond to a valid capturing
474 /// group for the pattern that matched.
475 ///
476 /// This always returns `None` when `Captures` was created with
477 /// [`Captures::empty`], even if a match was found. This also always
478 /// returns `None` for any `index > 0` when `Captures` was created with
479 /// [`Captures::matches`].
480 ///
481 /// If this routine returns a non-`None` value, then `is_match` is
482 /// guaranteed to return `true`, `pattern` is guaranteed to return a
483 /// non-`None` value and `get_match` is guaranteed to return a non-`None`
484 /// value.
485 ///
486 /// # Example
487 ///
488 /// This example shows how to get the capturing groups, by name, from a
489 /// match:
490 ///
491 /// ```
492 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
493 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match};
494 ///
495 /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
496 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
497 ///
498 /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
499 /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
500 /// assert_eq!(Some(Span::from(0..5)), caps.get_group_by_name("first"));
501 /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last"));
502 /// // Looking for a non-existent capturing group will return None:
503 /// assert_eq!(None, caps.get_group_by_name("middle"));
504 ///
505 /// # Ok::<(), Box<dyn std::error::Error>>(())
506 /// ```
507 pub fn get_group_by_name(&self, name: &str) -> Option<Span> {
508 let index = self.group_info().to_index(self.pattern()?, name)?;
509 self.get_group(index)
510 }
511
512 /// Returns an iterator of possible spans for every capturing group in the
513 /// matching pattern.
514 ///
515 /// If this `Captures` value does not correspond to a match, then the
516 /// iterator returned yields no elements.
517 ///
518 /// Note that the iterator returned yields elements of type `Option<Span>`.
519 /// A span is present if and only if it corresponds to a capturing group
520 /// that participated in a match.
521 ///
522 /// # Example
523 ///
524 /// This example shows how to collect all capturing groups:
525 ///
526 /// ```
527 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
528 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
529 ///
530 /// let re = PikeVM::new(
531 /// // Matches first/last names, with an optional middle name.
532 /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
533 /// )?;
534 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
535 ///
536 /// re.captures(&mut cache, "Harry James Potter", &mut caps);
537 /// assert!(caps.is_match());
538 /// let groups: Vec<Option<Span>> = caps.iter().collect();
539 /// assert_eq!(groups, vec![
540 /// Some(Span::from(0..18)),
541 /// Some(Span::from(0..5)),
542 /// Some(Span::from(6..11)),
543 /// Some(Span::from(12..18)),
544 /// ]);
545 ///
546 /// # Ok::<(), Box<dyn std::error::Error>>(())
547 /// ```
548 ///
549 /// This example uses the same regex as the previous example, but with a
550 /// haystack that omits the middle name. This results in a capturing group
551 /// that is present in the elements yielded by the iterator but without a
552 /// match:
553 ///
554 /// ```
555 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
556 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
557 ///
558 /// let re = PikeVM::new(
559 /// // Matches first/last names, with an optional middle name.
560 /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
561 /// )?;
562 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
563 ///
564 /// re.captures(&mut cache, "Harry Potter", &mut caps);
565 /// assert!(caps.is_match());
566 /// let groups: Vec<Option<Span>> = caps.iter().collect();
567 /// assert_eq!(groups, vec![
568 /// Some(Span::from(0..12)),
569 /// Some(Span::from(0..5)),
570 /// None,
571 /// Some(Span::from(6..12)),
572 /// ]);
573 ///
574 /// # Ok::<(), Box<dyn std::error::Error>>(())
575 /// ```
576 pub fn iter(&self) -> CapturesPatternIter<'_> {
577 let names = self
578 .pattern()
579 .map_or(GroupInfoPatternNames::empty().enumerate(), |pid| {
580 self.group_info().pattern_names(pid).enumerate()
581 });
582 CapturesPatternIter { caps: self, names }
583 }
584
585 /// Return the total number of capturing groups for the matching pattern.
586 ///
587 /// If this `Captures` value does not correspond to a match, then this
588 /// always returns `0`.
589 ///
590 /// This always returns the same number of elements yielded by
591 /// [`Captures::iter`]. That is, the number includes capturing groups even
592 /// if they don't participate in the match.
593 ///
594 /// # Example
595 ///
596 /// This example shows how to count the total number of capturing groups
597 /// associated with a pattern. Notice that it includes groups that did not
598 /// participate in a match (just like `Captures::iter` does).
599 ///
600 /// ```
601 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
602 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
603 ///
604 /// let re = PikeVM::new(
605 /// // Matches first/last names, with an optional middle name.
606 /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
607 /// )?;
608 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
609 ///
610 /// re.captures(&mut cache, "Harry Potter", &mut caps);
611 /// assert_eq!(4, caps.group_len());
612 ///
613 /// # Ok::<(), Box<dyn std::error::Error>>(())
614 /// ```
615 pub fn group_len(&self) -> usize {
616 let pid = match self.pattern() {
617 None => return 0,
618 Some(pid) => pid,
619 };
620 self.group_info().group_len(pid)
621 }
622
623 /// Returns a reference to the underlying group info on which these
624 /// captures are based.
625 ///
626 /// The difference between `GroupInfo` and `Captures` is that the former
627 /// defines the structure of capturing groups where as the latter is what
628 /// stores the actual match information. So where as `Captures` only gives
629 /// you access to the current match, `GroupInfo` lets you query any
630 /// information about all capturing groups, even ones for patterns that
631 /// weren't involved in a match.
632 ///
633 /// Note that a `GroupInfo` uses reference counting internally, so it may
634 /// be cloned cheaply.
635 ///
636 /// # Example
637 ///
638 /// This example shows how to get all capturing group names from the
639 /// underlying `GroupInfo`. Notice that we don't even need to run a
640 /// search.
641 ///
642 /// ```
643 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
644 ///
645 /// let re = PikeVM::new_many(&[
646 /// r"(?P<foo>a)",
647 /// r"(a)(b)",
648 /// r"ab",
649 /// r"(?P<bar>a)(?P<quux>a)",
650 /// r"(?P<foo>z)",
651 /// ])?;
652 /// let caps = re.create_captures();
653 ///
654 /// let expected = vec![
655 /// (PatternID::must(0), 0, None),
656 /// (PatternID::must(0), 1, Some("foo")),
657 /// (PatternID::must(1), 0, None),
658 /// (PatternID::must(1), 1, None),
659 /// (PatternID::must(1), 2, None),
660 /// (PatternID::must(2), 0, None),
661 /// (PatternID::must(3), 0, None),
662 /// (PatternID::must(3), 1, Some("bar")),
663 /// (PatternID::must(3), 2, Some("quux")),
664 /// (PatternID::must(4), 0, None),
665 /// (PatternID::must(4), 1, Some("foo")),
666 /// ];
667 /// // We could also just use 're.get_nfa().group_info()'.
668 /// let got: Vec<(PatternID, usize, Option<&str>)> =
669 /// caps.group_info().all_names().collect();
670 /// assert_eq!(expected, got);
671 ///
672 /// # Ok::<(), Box<dyn std::error::Error>>(())
673 /// ```
674 pub fn group_info(&self) -> &GroupInfo {
675 &self.group_info
676 }
677
678 /// Interpolates the capture references in `replacement` with the
679 /// corresponding substrings in `haystack` matched by each reference. The
680 /// interpolated string is returned.
681 ///
682 /// See the [`interpolate` module](interpolate) for documentation on the
683 /// format of the replacement string.
684 ///
685 /// # Example
686 ///
687 /// This example shows how to use interpolation, and also shows how it
688 /// can work with multi-pattern regexes.
689 ///
690 /// ```
691 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
692 ///
693 /// let re = PikeVM::new_many(&[
694 /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
695 /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
696 /// ])?;
697 /// let mut cache = re.create_cache();
698 /// let mut caps = re.create_captures();
699 ///
700 /// let replacement = "year=$year, month=$month, day=$day";
701 ///
702 /// // This matches the first pattern.
703 /// let hay = "On 14-03-2010, I became a Tenneessee lamb.";
704 /// re.captures(&mut cache, hay, &mut caps);
705 /// let result = caps.interpolate_string(hay, replacement);
706 /// assert_eq!("year=2010, month=03, day=14", result);
707 ///
708 /// // And this matches the second pattern.
709 /// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
710 /// re.captures(&mut cache, hay, &mut caps);
711 /// let result = caps.interpolate_string(hay, replacement);
712 /// assert_eq!("year=2010, month=03, day=14", result);
713 ///
714 /// # Ok::<(), Box<dyn std::error::Error>>(())
715 /// ```
716 pub fn interpolate_string(
717 &self,
718 haystack: &str,
719 replacement: &str,
720 ) -> String {
721 let mut dst = String::new();
722 self.interpolate_string_into(haystack, replacement, &mut dst);
723 dst
724 }
725
726 /// Interpolates the capture references in `replacement` with the
727 /// corresponding substrings in `haystack` matched by each reference. The
728 /// interpolated string is written to `dst`.
729 ///
730 /// See the [`interpolate` module](interpolate) for documentation on the
731 /// format of the replacement string.
732 ///
733 /// # Example
734 ///
735 /// This example shows how to use interpolation, and also shows how it
736 /// can work with multi-pattern regexes.
737 ///
738 /// ```
739 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
740 ///
741 /// let re = PikeVM::new_many(&[
742 /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
743 /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
744 /// ])?;
745 /// let mut cache = re.create_cache();
746 /// let mut caps = re.create_captures();
747 ///
748 /// let replacement = "year=$year, month=$month, day=$day";
749 ///
750 /// // This matches the first pattern.
751 /// let hay = "On 14-03-2010, I became a Tenneessee lamb.";
752 /// re.captures(&mut cache, hay, &mut caps);
753 /// let mut dst = String::new();
754 /// caps.interpolate_string_into(hay, replacement, &mut dst);
755 /// assert_eq!("year=2010, month=03, day=14", dst);
756 ///
757 /// // And this matches the second pattern.
758 /// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
759 /// re.captures(&mut cache, hay, &mut caps);
760 /// let mut dst = String::new();
761 /// caps.interpolate_string_into(hay, replacement, &mut dst);
762 /// assert_eq!("year=2010, month=03, day=14", dst);
763 ///
764 /// # Ok::<(), Box<dyn std::error::Error>>(())
765 /// ```
766 pub fn interpolate_string_into(
767 &self,
768 haystack: &str,
769 replacement: &str,
770 dst: &mut String,
771 ) {
772 interpolate::string(
773 replacement,
774 |index, dst| {
775 let span = match self.get_group(index) {
776 None => return,
777 Some(span) => span,
778 };
779 dst.push_str(&haystack[span]);
780 },
781 |name| self.group_info().to_index(self.pattern()?, name),
782 dst,
783 );
784 }
785
786 /// Interpolates the capture references in `replacement` with the
787 /// corresponding substrings in `haystack` matched by each reference. The
788 /// interpolated byte string is returned.
789 ///
790 /// See the [`interpolate` module](interpolate) for documentation on the
791 /// format of the replacement string.
792 ///
793 /// # Example
794 ///
795 /// This example shows how to use interpolation, and also shows how it
796 /// can work with multi-pattern regexes.
797 ///
798 /// ```
799 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
800 ///
801 /// let re = PikeVM::new_many(&[
802 /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
803 /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
804 /// ])?;
805 /// let mut cache = re.create_cache();
806 /// let mut caps = re.create_captures();
807 ///
808 /// let replacement = b"year=$year, month=$month, day=$day";
809 ///
810 /// // This matches the first pattern.
811 /// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
812 /// re.captures(&mut cache, hay, &mut caps);
813 /// let result = caps.interpolate_bytes(hay, replacement);
814 /// assert_eq!(&b"year=2010, month=03, day=14"[..], result);
815 ///
816 /// // And this matches the second pattern.
817 /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
818 /// re.captures(&mut cache, hay, &mut caps);
819 /// let result = caps.interpolate_bytes(hay, replacement);
820 /// assert_eq!(&b"year=2010, month=03, day=14"[..], result);
821 ///
822 /// # Ok::<(), Box<dyn std::error::Error>>(())
823 /// ```
824 pub fn interpolate_bytes(
825 &self,
826 haystack: &[u8],
827 replacement: &[u8],
828 ) -> Vec<u8> {
829 let mut dst = vec![];
830 self.interpolate_bytes_into(haystack, replacement, &mut dst);
831 dst
832 }
833
834 /// Interpolates the capture references in `replacement` with the
835 /// corresponding substrings in `haystack` matched by each reference. The
836 /// interpolated byte string is written to `dst`.
837 ///
838 /// See the [`interpolate` module](interpolate) for documentation on the
839 /// format of the replacement string.
840 ///
841 /// # Example
842 ///
843 /// This example shows how to use interpolation, and also shows how it
844 /// can work with multi-pattern regexes.
845 ///
846 /// ```
847 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
848 ///
849 /// let re = PikeVM::new_many(&[
850 /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
851 /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
852 /// ])?;
853 /// let mut cache = re.create_cache();
854 /// let mut caps = re.create_captures();
855 ///
856 /// let replacement = b"year=$year, month=$month, day=$day";
857 ///
858 /// // This matches the first pattern.
859 /// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
860 /// re.captures(&mut cache, hay, &mut caps);
861 /// let mut dst = vec![];
862 /// caps.interpolate_bytes_into(hay, replacement, &mut dst);
863 /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst);
864 ///
865 /// // And this matches the second pattern.
866 /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
867 /// re.captures(&mut cache, hay, &mut caps);
868 /// let mut dst = vec![];
869 /// caps.interpolate_bytes_into(hay, replacement, &mut dst);
870 /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst);
871 ///
872 /// # Ok::<(), Box<dyn std::error::Error>>(())
873 /// ```
874 pub fn interpolate_bytes_into(
875 &self,
876 haystack: &[u8],
877 replacement: &[u8],
878 dst: &mut Vec<u8>,
879 ) {
880 interpolate::bytes(
881 replacement,
882 |index, dst| {
883 let span = match self.get_group(index) {
884 None => return,
885 Some(span) => span,
886 };
887 dst.extend_from_slice(&haystack[span]);
888 },
889 |name| self.group_info().to_index(self.pattern()?, name),
890 dst,
891 );
892 }
893
894 /// This is a convenience routine for extracting the substrings
895 /// corresponding to matching capture groups in the given `haystack`. The
896 /// `haystack` should be the same substring used to find the match spans in
897 /// this `Captures` value.
898 ///
899 /// This is identical to [`Captures::extract_bytes`], except it works with
900 /// `&str` instead of `&[u8]`.
901 ///
902 /// # Panics
903 ///
904 /// This panics if the number of explicit matching groups in this
905 /// `Captures` value is less than `N`. This also panics if this `Captures`
906 /// value does not correspond to a match.
907 ///
908 /// Note that this does *not* panic if the number of explicit matching
909 /// groups is bigger than `N`. In that case, only the first `N` matching
910 /// groups are extracted.
911 ///
912 /// # Example
913 ///
914 /// ```
915 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
916 ///
917 /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
918 /// let mut cache = re.create_cache();
919 /// let mut caps = re.create_captures();
920 ///
921 /// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
922 /// re.captures(&mut cache, hay, &mut caps);
923 /// assert!(caps.is_match());
924 /// let (full, [year, month, day]) = caps.extract(hay);
925 /// assert_eq!("2010-03-14", full);
926 /// assert_eq!("2010", year);
927 /// assert_eq!("03", month);
928 /// assert_eq!("14", day);
929 ///
930 /// // We can also ask for fewer than all capture groups.
931 /// let (full, [year]) = caps.extract(hay);
932 /// assert_eq!("2010-03-14", full);
933 /// assert_eq!("2010", year);
934 ///
935 /// # Ok::<(), Box<dyn std::error::Error>>(())
936 /// ```
937 pub fn extract<'h, const N: usize>(
938 &self,
939 haystack: &'h str,
940 ) -> (&'h str, [&'h str; N]) {
941 let mut matched = self.iter().flatten();
942 let whole_match = &haystack[matched.next().expect("a match")];
943 let group_matches = [0; N].map(|_| {
944 let sp = matched.next().expect("too few matching groups");
945 &haystack[sp]
946 });
947 (whole_match, group_matches)
948 }
949
950 /// This is a convenience routine for extracting the substrings
951 /// corresponding to matching capture groups in the given `haystack`. The
952 /// `haystack` should be the same substring used to find the match spans in
953 /// this `Captures` value.
954 ///
955 /// This is identical to [`Captures::extract`], except it works with
956 /// `&[u8]` instead of `&str`.
957 ///
958 /// # Panics
959 ///
960 /// This panics if the number of explicit matching groups in this
961 /// `Captures` value is less than `N`. This also panics if this `Captures`
962 /// value does not correspond to a match.
963 ///
964 /// Note that this does *not* panic if the number of explicit matching
965 /// groups is bigger than `N`. In that case, only the first `N` matching
966 /// groups are extracted.
967 ///
968 /// # Example
969 ///
970 /// ```
971 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
972 ///
973 /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
974 /// let mut cache = re.create_cache();
975 /// let mut caps = re.create_captures();
976 ///
977 /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
978 /// re.captures(&mut cache, hay, &mut caps);
979 /// assert!(caps.is_match());
980 /// let (full, [year, month, day]) = caps.extract_bytes(hay);
981 /// assert_eq!(b"2010-03-14", full);
982 /// assert_eq!(b"2010", year);
983 /// assert_eq!(b"03", month);
984 /// assert_eq!(b"14", day);
985 ///
986 /// // We can also ask for fewer than all capture groups.
987 /// let (full, [year]) = caps.extract_bytes(hay);
988 /// assert_eq!(b"2010-03-14", full);
989 /// assert_eq!(b"2010", year);
990 ///
991 /// # Ok::<(), Box<dyn std::error::Error>>(())
992 /// ```
993 pub fn extract_bytes<'h, const N: usize>(
994 &self,
995 haystack: &'h [u8],
996 ) -> (&'h [u8], [&'h [u8]; N]) {
997 let mut matched = self.iter().flatten();
998 let whole_match = &haystack[matched.next().expect("a match")];
999 let group_matches = [0; N].map(|_| {
1000 let sp = matched.next().expect("too few matching groups");
1001 &haystack[sp]
1002 });
1003 (whole_match, group_matches)
1004 }
1005}
1006
1007/// Lower level "slot" oriented APIs. One does not typically need to use these
1008/// when executing a search. They are instead mostly intended for folks that
1009/// are writing their own regex engine while reusing this `Captures` type.
1010impl Captures {
1011 /// Clear this `Captures` value.
1012 ///
1013 /// After clearing, all slots inside this `Captures` value will be set to
1014 /// `None`. Similarly, any pattern ID that it was previously associated
1015 /// with (for a match) is erased.
1016 ///
1017 /// It is not usually necessary to call this routine. Namely, a `Captures`
1018 /// value only provides high level access to the capturing groups of the
1019 /// pattern that matched, and only low level access to individual slots.
1020 /// Thus, even if slots corresponding to groups that aren't associated
1021 /// with the matching pattern are set, then it won't impact the higher
1022 /// level APIs. Namely, higher level APIs like [`Captures::get_group`] will
1023 /// return `None` if no pattern ID is present, even if there are spans set
1024 /// in the underlying slots.
1025 ///
1026 /// Thus, to "clear" a `Captures` value of a match, it is usually only
1027 /// necessary to call [`Captures::set_pattern`] with `None`.
1028 ///
1029 /// # Example
1030 ///
1031 /// This example shows what happens when a `Captures` value is cleared.
1032 ///
1033 /// ```
1034 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1035 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
1036 ///
1037 /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
1038 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1039 ///
1040 /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
1041 /// assert!(caps.is_match());
1042 /// let slots: Vec<Option<usize>> =
1043 /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
1044 /// // Note that the following ordering is considered an API guarantee.
1045 /// assert_eq!(slots, vec![
1046 /// Some(0),
1047 /// Some(17),
1048 /// Some(0),
1049 /// Some(5),
1050 /// Some(6),
1051 /// Some(17),
1052 /// ]);
1053 ///
1054 /// // Now clear the slots. Everything is gone and it is no longer a match.
1055 /// caps.clear();
1056 /// assert!(!caps.is_match());
1057 /// let slots: Vec<Option<usize>> =
1058 /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
1059 /// assert_eq!(slots, vec![
1060 /// None,
1061 /// None,
1062 /// None,
1063 /// None,
1064 /// None,
1065 /// None,
1066 /// ]);
1067 ///
1068 /// # Ok::<(), Box<dyn std::error::Error>>(())
1069 /// ```
1070 #[inline]
1071 pub fn clear(&mut self) {
1072 self.pid = None;
1073 for slot in self.slots.iter_mut() {
1074 *slot = None;
1075 }
1076 }
1077
1078 /// Set the pattern on this `Captures` value.
1079 ///
1080 /// When the pattern ID is `None`, then this `Captures` value does not
1081 /// correspond to a match (`is_match` will return `false`). Otherwise, it
1082 /// corresponds to a match.
1083 ///
1084 /// This is useful in search implementations where you might want to
1085 /// initially call `set_pattern(None)` in order to avoid the cost of
1086 /// calling `clear()` if it turns out to not be necessary.
1087 ///
1088 /// # Example
1089 ///
1090 /// This example shows that `set_pattern` merely overwrites the pattern ID.
1091 /// It does not actually change the underlying slot values.
1092 ///
1093 /// ```
1094 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1095 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
1096 ///
1097 /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
1098 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1099 ///
1100 /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
1101 /// assert!(caps.is_match());
1102 /// assert!(caps.pattern().is_some());
1103 /// let slots: Vec<Option<usize>> =
1104 /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
1105 /// // Note that the following ordering is considered an API guarantee.
1106 /// assert_eq!(slots, vec![
1107 /// Some(0),
1108 /// Some(17),
1109 /// Some(0),
1110 /// Some(5),
1111 /// Some(6),
1112 /// Some(17),
1113 /// ]);
1114 ///
1115 /// // Now set the pattern to None. Note that the slot values remain.
1116 /// caps.set_pattern(None);
1117 /// assert!(!caps.is_match());
1118 /// assert!(!caps.pattern().is_some());
1119 /// let slots: Vec<Option<usize>> =
1120 /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
1121 /// // Note that the following ordering is considered an API guarantee.
1122 /// assert_eq!(slots, vec![
1123 /// Some(0),
1124 /// Some(17),
1125 /// Some(0),
1126 /// Some(5),
1127 /// Some(6),
1128 /// Some(17),
1129 /// ]);
1130 ///
1131 /// # Ok::<(), Box<dyn std::error::Error>>(())
1132 /// ```
1133 #[inline]
1134 pub fn set_pattern(&mut self, pid: Option<PatternID>) {
1135 self.pid = pid;
1136 }
1137
1138 /// Returns the underlying slots, where each slot stores a single offset.
1139 ///
1140 /// Every matching capturing group generally corresponds to two slots: one
1141 /// slot for the starting position and another for the ending position.
1142 /// Typically, either both are present or neither are. (The weasel word
1143 /// "typically" is used here because it really depends on the regex engine
1144 /// implementation. Every sensible regex engine likely adheres to this
1145 /// invariant, and every regex engine in this crate is sensible.)
1146 ///
1147 /// Generally speaking, callers should prefer to use higher level routines
1148 /// like [`Captures::get_match`] or [`Captures::get_group`].
1149 ///
1150 /// An important note here is that a regex engine may not reset all of the
1151 /// slots to `None` values when no match occurs, or even when a match of
1152 /// a different pattern occurs. But this depends on how the regex engine
1153 /// implementation deals with slots.
1154 ///
1155 /// # Example
1156 ///
1157 /// This example shows how to get the underlying slots from a regex match.
1158 ///
1159 /// ```
1160 /// use regex_automata::{
1161 /// nfa::thompson::pikevm::PikeVM,
1162 /// util::primitives::{PatternID, NonMaxUsize},
1163 /// };
1164 ///
1165 /// let re = PikeVM::new_many(&[
1166 /// r"[a-z]+",
1167 /// r"[0-9]+",
1168 /// ])?;
1169 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1170 ///
1171 /// re.captures(&mut cache, "123", &mut caps);
1172 /// assert_eq!(Some(PatternID::must(1)), caps.pattern());
1173 /// // Note that the only guarantee we have here is that slots 2 and 3
1174 /// // are set to correct values. The contents of the first two slots are
1175 /// // unspecified since the 0th pattern did not match.
1176 /// let expected = &[
1177 /// None,
1178 /// None,
1179 /// NonMaxUsize::new(0),
1180 /// NonMaxUsize::new(3),
1181 /// ];
1182 /// assert_eq!(expected, caps.slots());
1183 ///
1184 /// # Ok::<(), Box<dyn std::error::Error>>(())
1185 /// ```
1186 #[inline]
1187 pub fn slots(&self) -> &[Option<NonMaxUsize>] {
1188 &self.slots
1189 }
1190
1191 /// Returns the underlying slots as a mutable slice, where each slot stores
1192 /// a single offset.
1193 ///
1194 /// This tends to be most useful for regex engine implementations for
1195 /// writing offsets for matching capturing groups to slots.
1196 ///
1197 /// See [`Captures::slots`] for more information about slots.
1198 #[inline]
1199 pub fn slots_mut(&mut self) -> &mut [Option<NonMaxUsize>] {
1200 &mut self.slots
1201 }
1202}
1203
1204impl core::fmt::Debug for Captures {
1205 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1206 let mut dstruct = f.debug_struct("Captures");
1207 dstruct.field("pid", &self.pid);
1208 if let Some(pid) = self.pid {
1209 dstruct.field("spans", &CapturesDebugMap { pid, caps: self });
1210 }
1211 dstruct.finish()
1212 }
1213}
1214
1215/// A little helper type to provide a nice map-like debug representation for
1216/// our capturing group spans.
1217struct CapturesDebugMap<'a> {
1218 pid: PatternID,
1219 caps: &'a Captures,
1220}
1221
1222impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
1223 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1224 struct Key<'a>(usize, Option<&'a str>);
1225
1226 impl<'a> core::fmt::Debug for Key<'a> {
1227 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1228 write!(f, "{}", self.0)?;
1229 if let Some(name) = self.1 {
1230 write!(f, "/{:?}", name)?;
1231 }
1232 Ok(())
1233 }
1234 }
1235
1236 let mut map = f.debug_map();
1237 let names = self.caps.group_info().pattern_names(self.pid);
1238 for (group_index, maybe_name) in names.enumerate() {
1239 let key = Key(group_index, maybe_name);
1240 match self.caps.get_group(group_index) {
1241 None => map.entry(&key, &None::<()>),
1242 Some(span) => map.entry(&key, &span),
1243 };
1244 }
1245 map.finish()
1246 }
1247}
1248
1249/// An iterator over all capturing groups in a `Captures` value.
1250///
1251/// This iterator includes capturing groups that did not participate in a
1252/// match. See the [`Captures::iter`] method documentation for more details
1253/// and examples.
1254///
1255/// The lifetime parameter `'a` refers to the lifetime of the underlying
1256/// `Captures` value.
1257#[derive(Clone, Debug)]
1258pub struct CapturesPatternIter<'a> {
1259 caps: &'a Captures,
1260 names: core::iter::Enumerate<GroupInfoPatternNames<'a>>,
1261}
1262
1263impl<'a> Iterator for CapturesPatternIter<'a> {
1264 type Item = Option<Span>;
1265
1266 fn next(&mut self) -> Option<Option<Span>> {
1267 let (group_index, _) = self.names.next()?;
1268 Some(self.caps.get_group(group_index))
1269 }
1270
1271 fn size_hint(&self) -> (usize, Option<usize>) {
1272 self.names.size_hint()
1273 }
1274
1275 fn count(self) -> usize {
1276 self.names.count()
1277 }
1278}
1279
1280impl<'a> ExactSizeIterator for CapturesPatternIter<'a> {}
1281impl<'a> core::iter::FusedIterator for CapturesPatternIter<'a> {}
1282
1283/// Represents information about capturing groups in a compiled regex.
1284///
1285/// The information encapsulated by this type consists of the following. For
1286/// each pattern:
1287///
1288/// * A map from every capture group name to its corresponding capture group
1289/// index.
1290/// * A map from every capture group index to its corresponding capture group
1291/// name.
1292/// * A map from capture group index to its corresponding slot index. A slot
1293/// refers to one half of a capturing group. That is, a capture slot is either
1294/// the start or end of a capturing group. A slot is usually the mechanism
1295/// by which a regex engine records offsets for each capturing group during a
1296/// search.
1297///
1298/// A `GroupInfo` uses reference counting internally and is thus cheap to
1299/// clone.
1300///
1301/// # Mapping from capture groups to slots
1302///
1303/// One of the main responsibilities of a `GroupInfo` is to build a mapping
1304/// from `(PatternID, u32)` (where the `u32` is a capture index) to something
1305/// called a "slot." As mentioned above, a slot refers to one half of a
1306/// capturing group. Both combined provide the start and end offsets of
1307/// a capturing group that participated in a match.
1308///
1309/// **The mapping between group indices and slots is an API guarantee.** That
1310/// is, the mapping won't change within a semver compatible release.
1311///
1312/// Slots exist primarily because this is a convenient mechanism by which
1313/// regex engines report group offsets at search time. For example, the
1314/// [`nfa::thompson::State::Capture`](crate::nfa::thompson::State::Capture)
1315/// NFA state includes the slot index. When a regex engine transitions through
1316/// this state, it will likely use the slot index to write the current haystack
1317/// offset to some region of memory. When a match is found, those slots are
1318/// then reported to the caller, typically via a convenient abstraction like a
1319/// [`Captures`] value.
1320///
1321/// Because this crate provides first class support for multi-pattern regexes,
1322/// and because of some performance related reasons, the mapping between
1323/// capturing groups and slots is a little complex. However, in the case of a
1324/// single pattern, the mapping can be described very simply: for all capture
1325/// group indices `i`, its corresponding slots are at `i * 2` and `i * 2 + 1`.
1326/// Notice that the pattern ID isn't involved at all here, because it only
1327/// applies to a single-pattern regex, it is therefore always `0`.
1328///
1329/// In the multi-pattern case, the mapping is a bit more complicated. To talk
1330/// about it, we must define what we mean by "implicit" vs "explicit"
1331/// capturing groups:
1332///
1333/// * An **implicit** capturing group refers to the capturing group that is
1334/// present for every pattern automatically, and corresponds to the overall
1335/// match of a pattern. Every pattern has precisely one implicit capturing
1336/// group. It is always unnamed and it always corresponds to the capture group
1337/// index `0`.
1338/// * An **explicit** capturing group refers to any capturing group that
1339/// appears in the concrete syntax of the pattern. (Or, if an NFA was hand
1340/// built without any concrete syntax, it refers to any capturing group with an
1341/// index greater than `0`.)
1342///
1343/// Some examples:
1344///
1345/// * `\w+` has one implicit capturing group and zero explicit capturing
1346/// groups.
1347/// * `(\w+)` has one implicit group and one explicit group.
1348/// * `foo(\d+)(?:\pL+)(\d+)` has one implicit group and two explicit groups.
1349///
1350/// Turning back to the slot mapping, we can now state it as follows:
1351///
1352/// * Given a pattern ID `pid`, the slots for its implicit group are always
1353/// at `pid * 2` and `pid * 2 + 1`.
1354/// * Given a pattern ID `0`, the slots for its explicit groups start
1355/// at `group_info.pattern_len() * 2`.
1356/// * Given a pattern ID `pid > 0`, the slots for its explicit groups start
1357/// immediately following where the slots for the explicit groups of `pid - 1`
1358/// end.
1359///
1360/// In particular, while there is a concrete formula one can use to determine
1361/// where the slots for the implicit group of any pattern are, there is no
1362/// general formula for determining where the slots for explicit capturing
1363/// groups are. This is because each pattern can contain a different number
1364/// of groups.
1365///
1366/// The intended way of getting the slots for a particular capturing group
1367/// (whether implicit or explicit) is via the [`GroupInfo::slot`] or
1368/// [`GroupInfo::slots`] method.
1369///
1370/// See below for a concrete example of how capturing groups get mapped to
1371/// slots.
1372///
1373/// # Example
1374///
1375/// This example shows how to build a new `GroupInfo` and query it for
1376/// information.
1377///
1378/// ```
1379/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID};
1380///
1381/// let info = GroupInfo::new(vec![
1382/// vec![None, Some("foo")],
1383/// vec![None],
1384/// vec![None, None, None, Some("bar"), None],
1385/// vec![None, None, Some("foo")],
1386/// ])?;
1387/// // The number of patterns being tracked.
1388/// assert_eq!(4, info.pattern_len());
1389/// // We can query the number of groups for any pattern.
1390/// assert_eq!(2, info.group_len(PatternID::must(0)));
1391/// assert_eq!(1, info.group_len(PatternID::must(1)));
1392/// assert_eq!(5, info.group_len(PatternID::must(2)));
1393/// assert_eq!(3, info.group_len(PatternID::must(3)));
1394/// // An invalid pattern always has zero groups.
1395/// assert_eq!(0, info.group_len(PatternID::must(999)));
1396/// // 2 slots per group
1397/// assert_eq!(22, info.slot_len());
1398///
1399/// // We can map a group index for a particular pattern to its name, if
1400/// // one exists.
1401/// assert_eq!(Some("foo"), info.to_name(PatternID::must(3), 2));
1402/// assert_eq!(None, info.to_name(PatternID::must(2), 4));
1403/// // Or map a name to its group index.
1404/// assert_eq!(Some(1), info.to_index(PatternID::must(0), "foo"));
1405/// assert_eq!(Some(2), info.to_index(PatternID::must(3), "foo"));
1406///
1407/// # Ok::<(), Box<dyn std::error::Error>>(())
1408/// ```
1409///
1410/// # Example: mapping from capture groups to slots
1411///
1412/// This example shows the specific mapping from capture group indices for
1413/// each pattern to their corresponding slots. The slot values shown in this
1414/// example are considered an API guarantee.
1415///
1416/// ```
1417/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID};
1418///
1419/// let info = GroupInfo::new(vec![
1420/// vec![None, Some("foo")],
1421/// vec![None],
1422/// vec![None, None, None, Some("bar"), None],
1423/// vec![None, None, Some("foo")],
1424/// ])?;
1425///
1426/// // We first show the slots for each pattern's implicit group.
1427/// assert_eq!(Some((0, 1)), info.slots(PatternID::must(0), 0));
1428/// assert_eq!(Some((2, 3)), info.slots(PatternID::must(1), 0));
1429/// assert_eq!(Some((4, 5)), info.slots(PatternID::must(2), 0));
1430/// assert_eq!(Some((6, 7)), info.slots(PatternID::must(3), 0));
1431///
1432/// // And now we show the slots for each pattern's explicit group.
1433/// assert_eq!(Some((8, 9)), info.slots(PatternID::must(0), 1));
1434/// assert_eq!(Some((10, 11)), info.slots(PatternID::must(2), 1));
1435/// assert_eq!(Some((12, 13)), info.slots(PatternID::must(2), 2));
1436/// assert_eq!(Some((14, 15)), info.slots(PatternID::must(2), 3));
1437/// assert_eq!(Some((16, 17)), info.slots(PatternID::must(2), 4));
1438/// assert_eq!(Some((18, 19)), info.slots(PatternID::must(3), 1));
1439/// assert_eq!(Some((20, 21)), info.slots(PatternID::must(3), 2));
1440///
1441/// // Asking for the slots for an invalid pattern ID or even for an invalid
1442/// // group index for a specific pattern will return None. So for example,
1443/// // you're guaranteed to not get the slots for a different pattern than the
1444/// // one requested.
1445/// assert_eq!(None, info.slots(PatternID::must(5), 0));
1446/// assert_eq!(None, info.slots(PatternID::must(1), 1));
1447///
1448/// # Ok::<(), Box<dyn std::error::Error>>(())
1449/// ```
1450#[derive(Clone, Debug, Default)]
1451pub struct GroupInfo(Arc<GroupInfoInner>);
1452
1453impl GroupInfo {
1454 /// Creates a new group info from a sequence of patterns, where each
1455 /// sequence of patterns yields a sequence of possible group names. The
1456 /// index of each pattern in the sequence corresponds to its `PatternID`,
1457 /// and the index of each group in each pattern's sequence corresponds to
1458 /// its corresponding group index.
1459 ///
1460 /// While this constructor is very generic and therefore perhaps hard to
1461 /// chew on, an example of a valid concrete type that can be passed to
1462 /// this constructor is `Vec<Vec<Option<String>>>`. The outer `Vec`
1463 /// corresponds to the patterns, i.e., one `Vec<Option<String>>` per
1464 /// pattern. The inner `Vec` corresponds to the capturing groups for
1465 /// each pattern. The `Option<String>` corresponds to the name of the
1466 /// capturing group, if present.
1467 ///
1468 /// It is legal to pass an empty iterator to this constructor. It will
1469 /// return an empty group info with zero slots. An empty group info is
1470 /// useful for cases where you have no patterns or for cases where slots
1471 /// aren't being used at all (e.g., for most DFAs in this crate).
1472 ///
1473 /// # Errors
1474 ///
1475 /// This constructor returns an error if the given capturing groups are
1476 /// invalid in some way. Those reasons include, but are not necessarily
1477 /// limited to:
1478 ///
1479 /// * Too many patterns (i.e., `PatternID` would overflow).
1480 /// * Too many capturing groups (e.g., `u32` would overflow).
1481 /// * A pattern is given that has no capturing groups. (All patterns must
1482 /// have at least an implicit capturing group at index `0`.)
1483 /// * The capturing group at index `0` has a name. It must be unnamed.
1484 /// * There are duplicate capturing group names within the same pattern.
1485 /// (Multiple capturing groups with the same name may exist, but they
1486 /// must be in different patterns.)
1487 ///
1488 /// An example below shows how to trigger some of the above error
1489 /// conditions.
1490 ///
1491 /// # Example
1492 ///
1493 /// This example shows how to build a new `GroupInfo` and query it for
1494 /// information.
1495 ///
1496 /// ```
1497 /// use regex_automata::util::captures::GroupInfo;
1498 ///
1499 /// let info = GroupInfo::new(vec![
1500 /// vec![None, Some("foo")],
1501 /// vec![None],
1502 /// vec![None, None, None, Some("bar"), None],
1503 /// vec![None, None, Some("foo")],
1504 /// ])?;
1505 /// // The number of patterns being tracked.
1506 /// assert_eq!(4, info.pattern_len());
1507 /// // 2 slots per group
1508 /// assert_eq!(22, info.slot_len());
1509 ///
1510 /// # Ok::<(), Box<dyn std::error::Error>>(())
1511 /// ```
1512 ///
1513 /// # Example: empty `GroupInfo`
1514 ///
1515 /// This example shows how to build a new `GroupInfo` and query it for
1516 /// information.
1517 ///
1518 /// ```
1519 /// use regex_automata::util::captures::GroupInfo;
1520 ///
1521 /// let info = GroupInfo::empty();
1522 /// // Everything is zero.
1523 /// assert_eq!(0, info.pattern_len());
1524 /// assert_eq!(0, info.slot_len());
1525 ///
1526 /// # Ok::<(), Box<dyn std::error::Error>>(())
1527 /// ```
1528 ///
1529 /// # Example: error conditions
1530 ///
1531 /// This example shows how to provoke some of the ways in which building
1532 /// a `GroupInfo` can fail.
1533 ///
1534 /// ```
1535 /// use regex_automata::util::captures::GroupInfo;
1536 ///
1537 /// // Either the group info is empty, or all patterns must have at least
1538 /// // one capturing group.
1539 /// assert!(GroupInfo::new(vec![
1540 /// vec![None, Some("a")], // ok
1541 /// vec![None], // ok
1542 /// vec![], // not ok
1543 /// ]).is_err());
1544 /// // Note that building an empty group info is OK.
1545 /// assert!(GroupInfo::new(Vec::<Vec<Option<String>>>::new()).is_ok());
1546 ///
1547 /// // The first group in each pattern must correspond to an implicit
1548 /// // anonymous group. i.e., One that is not named. By convention, this
1549 /// // group corresponds to the overall match of a regex. Every other group
1550 /// // in a pattern is explicit and optional.
1551 /// assert!(GroupInfo::new(vec![vec![Some("foo")]]).is_err());
1552 ///
1553 /// // There must not be duplicate group names within the same pattern.
1554 /// assert!(GroupInfo::new(vec![
1555 /// vec![None, Some("foo"), Some("foo")],
1556 /// ]).is_err());
1557 /// // But duplicate names across distinct patterns is OK.
1558 /// assert!(GroupInfo::new(vec![
1559 /// vec![None, Some("foo")],
1560 /// vec![None, Some("foo")],
1561 /// ]).is_ok());
1562 ///
1563 /// # Ok::<(), Box<dyn std::error::Error>>(())
1564 /// ```
1565 ///
1566 /// There are other ways for building a `GroupInfo` to fail but are
1567 /// difficult to show. For example, if the number of patterns given would
1568 /// overflow `PatternID`.
1569 pub fn new<P, G, N>(pattern_groups: P) -> Result<GroupInfo, GroupInfoError>
1570 where
1571 P: IntoIterator<Item = G>,
1572 G: IntoIterator<Item = Option<N>>,
1573 N: AsRef<str>,
1574 {
1575 let mut group_info = GroupInfoInner {
1576 slot_ranges: vec![],
1577 name_to_index: vec![],
1578 index_to_name: vec![],
1579 memory_extra: 0,
1580 };
1581 for (pattern_index, groups) in pattern_groups.into_iter().enumerate() {
1582 // If we can't convert the pattern index to an ID, then the caller
1583 // tried to build capture info for too many patterns.
1584 let pid = PatternID::new(pattern_index)
1585 .map_err(GroupInfoError::too_many_patterns)?;
1586
1587 let mut groups_iter = groups.into_iter().enumerate();
1588 match groups_iter.next() {
1589 None => return Err(GroupInfoError::missing_groups(pid)),
1590 Some((_, Some(_))) => {
1591 return Err(GroupInfoError::first_must_be_unnamed(pid))
1592 }
1593 Some((_, None)) => {}
1594 }
1595 group_info.add_first_group(pid);
1596 // Now iterate over the rest, which correspond to all of the
1597 // (conventionally) explicit capture groups in a regex pattern.
1598 for (group_index, maybe_name) in groups_iter {
1599 // Just like for patterns, if the group index can't be
1600 // converted to a "small" index, then the caller has given too
1601 // many groups for a particular pattern.
1602 let group = SmallIndex::new(group_index).map_err(|_| {
1603 GroupInfoError::too_many_groups(pid, group_index)
1604 })?;
1605 group_info.add_explicit_group(pid, group, maybe_name)?;
1606 }
1607 }
1608 group_info.fixup_slot_ranges()?;
1609 Ok(GroupInfo(Arc::new(group_info)))
1610 }
1611
1612 /// This creates an empty `GroupInfo`.
1613 ///
1614 /// This is a convenience routine for calling `GroupInfo::new` with an
1615 /// iterator that yields no elements.
1616 ///
1617 /// # Example
1618 ///
1619 /// This example shows how to build a new empty `GroupInfo` and query it
1620 /// for information.
1621 ///
1622 /// ```
1623 /// use regex_automata::util::captures::GroupInfo;
1624 ///
1625 /// let info = GroupInfo::empty();
1626 /// // Everything is zero.
1627 /// assert_eq!(0, info.pattern_len());
1628 /// assert_eq!(0, info.all_group_len());
1629 /// assert_eq!(0, info.slot_len());
1630 ///
1631 /// # Ok::<(), Box<dyn std::error::Error>>(())
1632 /// ```
1633 pub fn empty() -> GroupInfo {
1634 GroupInfo::new(core::iter::empty::<[Option<&str>; 0]>())
1635 .expect("empty group info is always valid")
1636 }
1637
1638 /// Return the capture group index corresponding to the given name in the
1639 /// given pattern. If no such capture group name exists in the given
1640 /// pattern, then this returns `None`.
1641 ///
1642 /// If the given pattern ID is invalid, then this returns `None`.
1643 ///
1644 /// This also returns `None` for all inputs if these captures are empty
1645 /// (e.g., built from an empty [`GroupInfo`]). To check whether captures
1646 /// are are present for a specific pattern, use [`GroupInfo::group_len`].
1647 ///
1648 /// # Example
1649 ///
1650 /// This example shows how to find the capture index for the given pattern
1651 /// and group name.
1652 ///
1653 /// Remember that capture indices are relative to the pattern, such that
1654 /// the same capture index value may refer to different capturing groups
1655 /// for distinct patterns.
1656 ///
1657 /// ```
1658 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1659 /// use regex_automata::{nfa::thompson::NFA, PatternID};
1660 ///
1661 /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
1662 ///
1663 /// let nfa = NFA::new_many(&[
1664 /// r"a(?P<quux>\w+)z(?P<foo>\s+)",
1665 /// r"a(?P<foo>\d+)z",
1666 /// ])?;
1667 /// let groups = nfa.group_info();
1668 /// assert_eq!(Some(2), groups.to_index(pid0, "foo"));
1669 /// // Recall that capture index 0 is always unnamed and refers to the
1670 /// // entire pattern. So the first capturing group present in the pattern
1671 /// // itself always starts at index 1.
1672 /// assert_eq!(Some(1), groups.to_index(pid1, "foo"));
1673 ///
1674 /// // And if a name does not exist for a particular pattern, None is
1675 /// // returned.
1676 /// assert!(groups.to_index(pid0, "quux").is_some());
1677 /// assert!(groups.to_index(pid1, "quux").is_none());
1678 ///
1679 /// # Ok::<(), Box<dyn std::error::Error>>(())
1680 /// ```
1681 #[inline]
1682 pub fn to_index(&self, pid: PatternID, name: &str) -> Option<usize> {
1683 let indices = self.0.name_to_index.get(pid.as_usize())?;
1684 indices.get(name).cloned().map(|i| i.as_usize())
1685 }
1686
1687 /// Return the capture name for the given index and given pattern. If the
1688 /// corresponding group does not have a name, then this returns `None`.
1689 ///
1690 /// If the pattern ID is invalid, then this returns `None`.
1691 ///
1692 /// If the group index is invalid for the given pattern, then this returns
1693 /// `None`. A group `index` is valid for a pattern `pid` in an `nfa` if and
1694 /// only if `index < nfa.pattern_capture_len(pid)`.
1695 ///
1696 /// This also returns `None` for all inputs if these captures are empty
1697 /// (e.g., built from an empty [`GroupInfo`]). To check whether captures
1698 /// are are present for a specific pattern, use [`GroupInfo::group_len`].
1699 ///
1700 /// # Example
1701 ///
1702 /// This example shows how to find the capture group name for the given
1703 /// pattern and group index.
1704 ///
1705 /// ```
1706 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1707 /// use regex_automata::{nfa::thompson::NFA, PatternID};
1708 ///
1709 /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
1710 ///
1711 /// let nfa = NFA::new_many(&[
1712 /// r"a(?P<foo>\w+)z(\s+)x(\d+)",
1713 /// r"a(\d+)z(?P<foo>\s+)",
1714 /// ])?;
1715 /// let groups = nfa.group_info();
1716 /// assert_eq!(None, groups.to_name(pid0, 0));
1717 /// assert_eq!(Some("foo"), groups.to_name(pid0, 1));
1718 /// assert_eq!(None, groups.to_name(pid0, 2));
1719 /// assert_eq!(None, groups.to_name(pid0, 3));
1720 ///
1721 /// assert_eq!(None, groups.to_name(pid1, 0));
1722 /// assert_eq!(None, groups.to_name(pid1, 1));
1723 /// assert_eq!(Some("foo"), groups.to_name(pid1, 2));
1724 /// // '3' is not a valid capture index for the second pattern.
1725 /// assert_eq!(None, groups.to_name(pid1, 3));
1726 ///
1727 /// # Ok::<(), Box<dyn std::error::Error>>(())
1728 /// ```
1729 #[inline]
1730 pub fn to_name(&self, pid: PatternID, group_index: usize) -> Option<&str> {
1731 let pattern_names = self.0.index_to_name.get(pid.as_usize())?;
1732 pattern_names.get(group_index)?.as_deref()
1733 }
1734
1735 /// Return an iterator of all capture groups and their names (if present)
1736 /// for a particular pattern.
1737 ///
1738 /// If the given pattern ID is invalid or if this `GroupInfo` is empty,
1739 /// then the iterator yields no elements.
1740 ///
1741 /// The number of elements yielded by this iterator is always equal to
1742 /// the result of calling [`GroupInfo::group_len`] with the same
1743 /// `PatternID`.
1744 ///
1745 /// # Example
1746 ///
1747 /// This example shows how to get a list of all capture group names for
1748 /// a particular pattern.
1749 ///
1750 /// ```
1751 /// use regex_automata::{nfa::thompson::NFA, PatternID};
1752 ///
1753 /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
1754 /// // The first is the implicit group that is always unnammed. The next
1755 /// // 5 groups are the explicit groups found in the concrete syntax above.
1756 /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
1757 /// let got: Vec<Option<&str>> =
1758 /// nfa.group_info().pattern_names(PatternID::ZERO).collect();
1759 /// assert_eq!(expected, got);
1760 ///
1761 /// // Using an invalid pattern ID will result in nothing yielded.
1762 /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
1763 /// assert_eq!(0, got);
1764 ///
1765 /// # Ok::<(), Box<dyn std::error::Error>>(())
1766 /// ```
1767 #[inline]
1768 pub fn pattern_names(&self, pid: PatternID) -> GroupInfoPatternNames<'_> {
1769 GroupInfoPatternNames {
1770 it: self
1771 .0
1772 .index_to_name
1773 .get(pid.as_usize())
1774 .map(|indices| indices.iter())
1775 .unwrap_or([].iter()),
1776 }
1777 }
1778
1779 /// Return an iterator of all capture groups for all patterns supported by
1780 /// this `GroupInfo`. Each item yielded is a triple of the group's pattern
1781 /// ID, index in the pattern and the group's name, if present.
1782 ///
1783 /// # Example
1784 ///
1785 /// This example shows how to get a list of all capture groups found in
1786 /// one NFA, potentially spanning multiple patterns.
1787 ///
1788 /// ```
1789 /// use regex_automata::{nfa::thompson::NFA, PatternID};
1790 ///
1791 /// let nfa = NFA::new_many(&[
1792 /// r"(?P<foo>a)",
1793 /// r"a",
1794 /// r"(a)",
1795 /// ])?;
1796 /// let expected = vec![
1797 /// (PatternID::must(0), 0, None),
1798 /// (PatternID::must(0), 1, Some("foo")),
1799 /// (PatternID::must(1), 0, None),
1800 /// (PatternID::must(2), 0, None),
1801 /// (PatternID::must(2), 1, None),
1802 /// ];
1803 /// let got: Vec<(PatternID, usize, Option<&str>)> =
1804 /// nfa.group_info().all_names().collect();
1805 /// assert_eq!(expected, got);
1806 ///
1807 /// # Ok::<(), Box<dyn std::error::Error>>(())
1808 /// ```
1809 ///
1810 /// Unlike other capturing group related routines, this routine doesn't
1811 /// panic even if captures aren't enabled on this NFA:
1812 ///
1813 /// ```
1814 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
1815 ///
1816 /// let nfa = NFA::compiler()
1817 /// .configure(NFA::config().which_captures(WhichCaptures::None))
1818 /// .build_many(&[
1819 /// r"(?P<foo>a)",
1820 /// r"a",
1821 /// r"(a)",
1822 /// ])?;
1823 /// // When captures aren't enabled, there's nothing to return.
1824 /// assert_eq!(0, nfa.group_info().all_names().count());
1825 ///
1826 /// # Ok::<(), Box<dyn std::error::Error>>(())
1827 /// ```
1828 #[inline]
1829 pub fn all_names(&self) -> GroupInfoAllNames<'_> {
1830 GroupInfoAllNames {
1831 group_info: self,
1832 pids: PatternID::iter(self.pattern_len()),
1833 current_pid: None,
1834 names: None,
1835 }
1836 }
1837
1838 /// Returns the starting and ending slot corresponding to the given
1839 /// capturing group for the given pattern. The ending slot is always one
1840 /// more than the starting slot returned.
1841 ///
1842 /// Note that this is like [`GroupInfo::slot`], except that it also returns
1843 /// the ending slot value for convenience.
1844 ///
1845 /// If either the pattern ID or the capture index is invalid, then this
1846 /// returns None.
1847 ///
1848 /// # Example
1849 ///
1850 /// This example shows that the starting slots for the first capturing
1851 /// group of each pattern are distinct.
1852 ///
1853 /// ```
1854 /// use regex_automata::{nfa::thompson::NFA, PatternID};
1855 ///
1856 /// let nfa = NFA::new_many(&["a", "b"])?;
1857 /// assert_ne!(
1858 /// nfa.group_info().slots(PatternID::must(0), 0),
1859 /// nfa.group_info().slots(PatternID::must(1), 0),
1860 /// );
1861 ///
1862 /// // Also, the start and end slot values are never equivalent.
1863 /// let (start, end) = nfa.group_info().slots(PatternID::ZERO, 0).unwrap();
1864 /// assert_ne!(start, end);
1865 ///
1866 /// # Ok::<(), Box<dyn std::error::Error>>(())
1867 /// ```
1868 #[inline]
1869 pub fn slots(
1870 &self,
1871 pid: PatternID,
1872 group_index: usize,
1873 ) -> Option<(usize, usize)> {
1874 // Since 'slot' only even returns valid starting slots, we know that
1875 // there must also be an end slot and that end slot is always one more
1876 // than the start slot.
1877 self.slot(pid, group_index).map(|start| (start, start + 1))
1878 }
1879
1880 /// Returns the starting slot corresponding to the given capturing group
1881 /// for the given pattern. The ending slot is always one more than the
1882 /// value returned.
1883 ///
1884 /// If either the pattern ID or the capture index is invalid, then this
1885 /// returns None.
1886 ///
1887 /// # Example
1888 ///
1889 /// This example shows that the starting slots for the first capturing
1890 /// group of each pattern are distinct.
1891 ///
1892 /// ```
1893 /// use regex_automata::{nfa::thompson::NFA, PatternID};
1894 ///
1895 /// let nfa = NFA::new_many(&["a", "b"])?;
1896 /// assert_ne!(
1897 /// nfa.group_info().slot(PatternID::must(0), 0),
1898 /// nfa.group_info().slot(PatternID::must(1), 0),
1899 /// );
1900 ///
1901 /// # Ok::<(), Box<dyn std::error::Error>>(())
1902 /// ```
1903 #[inline]
1904 pub fn slot(&self, pid: PatternID, group_index: usize) -> Option<usize> {
1905 if group_index >= self.group_len(pid) {
1906 return None;
1907 }
1908 // At this point, we know that 'pid' refers to a real pattern and that
1909 // 'group_index' refers to a real group. We therefore also know that
1910 // the pattern and group can be combined to return a correct slot.
1911 // That's why we don't need to use checked arithmetic below.
1912 if group_index == 0 {
1913 Some(pid.as_usize() * 2)
1914 } else {
1915 // As above, we don't need to check that our slot is less than the
1916 // end of our range since we already know the group index is a
1917 // valid index for the given pattern.
1918 let (start, _) = self.0.slot_ranges[pid];
1919 Some(start.as_usize() + ((group_index - 1) * 2))
1920 }
1921 }
1922
1923 /// Returns the total number of patterns in this `GroupInfo`.
1924 ///
1925 /// This may return zero if the `GroupInfo` was constructed with no
1926 /// patterns.
1927 ///
1928 /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because
1929 /// `GroupInfo` construction will fail if too many patterns are added.
1930 ///
1931 /// # Example
1932 ///
1933 /// ```
1934 /// use regex_automata::nfa::thompson::NFA;
1935 ///
1936 /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
1937 /// assert_eq!(3, nfa.group_info().pattern_len());
1938 ///
1939 /// let nfa = NFA::never_match();
1940 /// assert_eq!(0, nfa.group_info().pattern_len());
1941 ///
1942 /// let nfa = NFA::always_match();
1943 /// assert_eq!(1, nfa.group_info().pattern_len());
1944 ///
1945 /// # Ok::<(), Box<dyn std::error::Error>>(())
1946 /// ```
1947 #[inline]
1948 pub fn pattern_len(&self) -> usize {
1949 self.0.pattern_len()
1950 }
1951
1952 /// Return the number of capture groups in a pattern.
1953 ///
1954 /// If the pattern ID is invalid, then this returns `0`.
1955 ///
1956 /// # Example
1957 ///
1958 /// This example shows how the values returned by this routine may vary
1959 /// for different patterns and NFA configurations.
1960 ///
1961 /// ```
1962 /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID};
1963 ///
1964 /// let nfa = NFA::new(r"(a)(b)(c)")?;
1965 /// // There are 3 explicit groups in the pattern's concrete syntax and
1966 /// // 1 unnamed and implicit group spanning the entire pattern.
1967 /// assert_eq!(4, nfa.group_info().group_len(PatternID::ZERO));
1968 ///
1969 /// let nfa = NFA::new(r"abc")?;
1970 /// // There is just the unnamed implicit group.
1971 /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO));
1972 ///
1973 /// let nfa = NFA::compiler()
1974 /// .configure(NFA::config().which_captures(WhichCaptures::None))
1975 /// .build(r"abc")?;
1976 /// // We disabled capturing groups, so there are none.
1977 /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
1978 ///
1979 /// let nfa = NFA::compiler()
1980 /// .configure(NFA::config().which_captures(WhichCaptures::None))
1981 /// .build(r"(a)(b)(c)")?;
1982 /// // We disabled capturing groups, so there are none, even if there are
1983 /// // explicit groups in the concrete syntax.
1984 /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
1985 ///
1986 /// # Ok::<(), Box<dyn std::error::Error>>(())
1987 /// ```
1988 #[inline]
1989 pub fn group_len(&self, pid: PatternID) -> usize {
1990 self.0.group_len(pid)
1991 }
1992
1993 /// Return the total number of capture groups across all patterns.
1994 ///
1995 /// This includes implicit groups that represent the entire match of a
1996 /// pattern.
1997 ///
1998 /// # Example
1999 ///
2000 /// This example shows how the values returned by this routine may vary
2001 /// for different patterns and NFA configurations.
2002 ///
2003 /// ```
2004 /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID};
2005 ///
2006 /// let nfa = NFA::new(r"(a)(b)(c)")?;
2007 /// // There are 3 explicit groups in the pattern's concrete syntax and
2008 /// // 1 unnamed and implicit group spanning the entire pattern.
2009 /// assert_eq!(4, nfa.group_info().all_group_len());
2010 ///
2011 /// let nfa = NFA::new(r"abc")?;
2012 /// // There is just the unnamed implicit group.
2013 /// assert_eq!(1, nfa.group_info().all_group_len());
2014 ///
2015 /// let nfa = NFA::new_many(&["(a)", "b", "(c)"])?;
2016 /// // Each pattern has one implicit groups, and two
2017 /// // patterns have one explicit group each.
2018 /// assert_eq!(5, nfa.group_info().all_group_len());
2019 ///
2020 /// let nfa = NFA::compiler()
2021 /// .configure(NFA::config().which_captures(WhichCaptures::None))
2022 /// .build(r"abc")?;
2023 /// // We disabled capturing groups, so there are none.
2024 /// assert_eq!(0, nfa.group_info().all_group_len());
2025 ///
2026 /// let nfa = NFA::compiler()
2027 /// .configure(NFA::config().which_captures(WhichCaptures::None))
2028 /// .build(r"(a)(b)(c)")?;
2029 /// // We disabled capturing groups, so there are none, even if there are
2030 /// // explicit groups in the concrete syntax.
2031 /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
2032 ///
2033 /// # Ok::<(), Box<dyn std::error::Error>>(())
2034 /// ```
2035 #[inline]
2036 pub fn all_group_len(&self) -> usize {
2037 self.slot_len() / 2
2038 }
2039
2040 /// Returns the total number of slots in this `GroupInfo` across all
2041 /// patterns.
2042 ///
2043 /// The total number of slots is always twice the total number of capturing
2044 /// groups, including both implicit and explicit groups.
2045 ///
2046 /// # Example
2047 ///
2048 /// This example shows the relationship between the number of capturing
2049 /// groups and slots.
2050 ///
2051 /// ```
2052 /// use regex_automata::util::captures::GroupInfo;
2053 ///
2054 /// // There are 11 total groups here.
2055 /// let info = GroupInfo::new(vec![
2056 /// vec![None, Some("foo")],
2057 /// vec![None],
2058 /// vec![None, None, None, Some("bar"), None],
2059 /// vec![None, None, Some("foo")],
2060 /// ])?;
2061 /// // 2 slots per group gives us 11*2=22 slots.
2062 /// assert_eq!(22, info.slot_len());
2063 ///
2064 /// # Ok::<(), Box<dyn std::error::Error>>(())
2065 /// ```
2066 #[inline]
2067 pub fn slot_len(&self) -> usize {
2068 self.0.small_slot_len().as_usize()
2069 }
2070
2071 /// Returns the total number of slots for implicit capturing groups.
2072 ///
2073 /// This is like [`GroupInfo::slot_len`], except it doesn't include the
2074 /// explicit slots for each pattern. Since there are always exactly 2
2075 /// implicit slots for each pattern, the number of implicit slots is always
2076 /// equal to twice the number of patterns.
2077 ///
2078 /// # Example
2079 ///
2080 /// This example shows the relationship between the number of capturing
2081 /// groups, implicit slots and explicit slots.
2082 ///
2083 /// ```
2084 /// use regex_automata::util::captures::GroupInfo;
2085 ///
2086 /// // There are 11 total groups here.
2087 /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
2088 /// // 2 slots per group gives us 11*2=22 slots.
2089 /// assert_eq!(6, info.slot_len());
2090 /// // 2 implicit slots per pattern gives us 2 implicit slots since there
2091 /// // is 1 pattern.
2092 /// assert_eq!(2, info.implicit_slot_len());
2093 /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
2094 /// assert_eq!(4, info.explicit_slot_len());
2095 ///
2096 /// # Ok::<(), Box<dyn std::error::Error>>(())
2097 /// ```
2098 #[inline]
2099 pub fn implicit_slot_len(&self) -> usize {
2100 self.pattern_len() * 2
2101 }
2102
2103 /// Returns the total number of slots for explicit capturing groups.
2104 ///
2105 /// This is like [`GroupInfo::slot_len`], except it doesn't include the
2106 /// implicit slots for each pattern. (There are always 2 implicit slots for
2107 /// each pattern.)
2108 ///
2109 /// For a non-empty `GroupInfo`, it is always the case that `slot_len` is
2110 /// strictly greater than `explicit_slot_len`. For an empty `GroupInfo`,
2111 /// both the total number of slots and the number of explicit slots is
2112 /// `0`.
2113 ///
2114 /// # Example
2115 ///
2116 /// This example shows the relationship between the number of capturing
2117 /// groups, implicit slots and explicit slots.
2118 ///
2119 /// ```
2120 /// use regex_automata::util::captures::GroupInfo;
2121 ///
2122 /// // There are 11 total groups here.
2123 /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
2124 /// // 2 slots per group gives us 11*2=22 slots.
2125 /// assert_eq!(6, info.slot_len());
2126 /// // 2 implicit slots per pattern gives us 2 implicit slots since there
2127 /// // is 1 pattern.
2128 /// assert_eq!(2, info.implicit_slot_len());
2129 /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
2130 /// assert_eq!(4, info.explicit_slot_len());
2131 ///
2132 /// # Ok::<(), Box<dyn std::error::Error>>(())
2133 /// ```
2134 #[inline]
2135 pub fn explicit_slot_len(&self) -> usize {
2136 self.slot_len().saturating_sub(self.implicit_slot_len())
2137 }
2138
2139 /// Returns the memory usage, in bytes, of this `GroupInfo`.
2140 ///
2141 /// This does **not** include the stack size used up by this `GroupInfo`.
2142 /// To compute that, use `std::mem::size_of::<GroupInfo>()`.
2143 #[inline]
2144 pub fn memory_usage(&self) -> usize {
2145 use core::mem::size_of as s;
2146
2147 s::<GroupInfoInner>()
2148 + self.0.slot_ranges.len() * s::<(SmallIndex, SmallIndex)>()
2149 + self.0.name_to_index.len() * s::<CaptureNameMap>()
2150 + self.0.index_to_name.len() * s::<Vec<Option<Arc<str>>>>()
2151 + self.0.memory_extra
2152 }
2153}
2154
2155/// A map from capture group name to its corresponding capture group index.
2156///
2157/// This type is actually wrapped inside a Vec indexed by pattern ID on a
2158/// `GroupInfo`, since multiple patterns may have the same capture group name.
2159/// That is, each pattern gets its own namespace of capture group names.
2160///
2161/// Perhaps a more memory efficient representation would be
2162/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look
2163/// up a capture index by name without producing a `Arc<str>`, which requires
2164/// an allocation. To fix this, I think we'd need to define our own unsized
2165/// type or something? Anyway, I didn't give this much thought since it
2166/// probably doesn't matter much in the grand scheme of things. But it did
2167/// stand out to me as mildly wasteful.
2168#[cfg(feature = "std")]
2169type CaptureNameMap = std::collections::HashMap<Arc<str>, SmallIndex>;
2170#[cfg(not(feature = "std"))]
2171type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, SmallIndex>;
2172
2173/// The inner guts of `GroupInfo`. This type only exists so that it can
2174/// be wrapped in an `Arc` to make `GroupInfo` reference counted.
2175#[derive(Debug, Default)]
2176struct GroupInfoInner {
2177 slot_ranges: Vec<(SmallIndex, SmallIndex)>,
2178 name_to_index: Vec<CaptureNameMap>,
2179 index_to_name: Vec<Vec<Option<Arc<str>>>>,
2180 memory_extra: usize,
2181}
2182
2183impl GroupInfoInner {
2184 /// This adds the first unnamed group for the given pattern ID. The given
2185 /// pattern ID must be zero if this is the first time this method is
2186 /// called, or must be exactly one more than the pattern ID supplied to the
2187 /// previous call to this method. (This method panics if this rule is
2188 /// violated.)
2189 ///
2190 /// This can be thought of as initializing the GroupInfo state for the
2191 /// given pattern and closing off the state for any previous pattern.
2192 fn add_first_group(&mut self, pid: PatternID) {
2193 assert_eq!(pid.as_usize(), self.slot_ranges.len());
2194 assert_eq!(pid.as_usize(), self.name_to_index.len());
2195 assert_eq!(pid.as_usize(), self.index_to_name.len());
2196 // This is the start of our slots for the explicit capturing groups.
2197 // Note that since the slots for the 0th group for every pattern appear
2198 // before any slots for the nth group (where n > 0) in any pattern, we
2199 // will have to fix up the slot ranges once we know how many patterns
2200 // we've added capture groups for.
2201 let slot_start = self.small_slot_len();
2202 self.slot_ranges.push((slot_start, slot_start));
2203 self.name_to_index.push(CaptureNameMap::new());
2204 self.index_to_name.push(vec![None]);
2205 self.memory_extra += core::mem::size_of::<Option<Arc<str>>>();
2206 }
2207
2208 /// Add an explicit capturing group for the given pattern with the given
2209 /// index. If the group has a name, then that must be given as well.
2210 ///
2211 /// Note that every capturing group except for the first or zeroth group is
2212 /// explicit.
2213 ///
2214 /// This returns an error if adding this group would result in overflowing
2215 /// slot indices or if a capturing group with the same name for this
2216 /// pattern has already been added.
2217 fn add_explicit_group<N: AsRef<str>>(
2218 &mut self,
2219 pid: PatternID,
2220 group: SmallIndex,
2221 maybe_name: Option<N>,
2222 ) -> Result<(), GroupInfoError> {
2223 // We also need to check that the slot index generated for
2224 // this group is also valid. Although, this is a little weird
2225 // because we offset these indices below, at which point, we'll
2226 // have to recheck them. Gosh this is annoying. Note that
2227 // the '+2' below is OK because 'end' is guaranteed to be less
2228 // than isize::MAX.
2229 let end = &mut self.slot_ranges[pid].1;
2230 *end = SmallIndex::new(end.as_usize() + 2).map_err(|_| {
2231 GroupInfoError::too_many_groups(pid, group.as_usize())
2232 })?;
2233 if let Some(name) = maybe_name {
2234 let name = Arc::<str>::from(name.as_ref());
2235 if self.name_to_index[pid].contains_key(&*name) {
2236 return Err(GroupInfoError::duplicate(pid, &name));
2237 }
2238 let len = name.len();
2239 self.name_to_index[pid].insert(Arc::clone(&name), group);
2240 self.index_to_name[pid].push(Some(name));
2241 // Adds the memory used by the Arc<str> in both maps.
2242 self.memory_extra +=
2243 2 * (len + core::mem::size_of::<Option<Arc<str>>>());
2244 // And also the value entry for the 'name_to_index' map.
2245 // This is probably an underestimate for 'name_to_index' since
2246 // hashmaps/btrees likely have some non-zero overhead, but we
2247 // assume here that they have zero overhead.
2248 self.memory_extra += core::mem::size_of::<SmallIndex>();
2249 } else {
2250 self.index_to_name[pid].push(None);
2251 self.memory_extra += core::mem::size_of::<Option<Arc<str>>>();
2252 }
2253 // This is a sanity assert that checks that our group index
2254 // is in line with the number of groups added so far for this
2255 // pattern.
2256 assert_eq!(group.one_more(), self.group_len(pid));
2257 // And is also in line with the 'index_to_name' map.
2258 assert_eq!(group.one_more(), self.index_to_name[pid].len());
2259 Ok(())
2260 }
2261
2262 /// This corrects the slot ranges to account for the slots corresponding
2263 /// to the zeroth group of each pattern. That is, every slot range is
2264 /// offset by 'pattern_len() * 2', since each pattern uses two slots to
2265 /// represent the zeroth group.
2266 fn fixup_slot_ranges(&mut self) -> Result<(), GroupInfoError> {
2267 use crate::util::primitives::IteratorIndexExt;
2268 // Since we know number of patterns fits in PatternID and
2269 // PatternID::MAX < isize::MAX, it follows that multiplying by 2 will
2270 // never overflow usize.
2271 let offset = self.pattern_len().checked_mul(2).unwrap();
2272 for (pid, &mut (ref mut start, ref mut end)) in
2273 self.slot_ranges.iter_mut().with_pattern_ids()
2274 {
2275 let group_len = 1 + ((end.as_usize() - start.as_usize()) / 2);
2276 let new_end = match end.as_usize().checked_add(offset) {
2277 Some(new_end) => new_end,
2278 None => {
2279 return Err(GroupInfoError::too_many_groups(
2280 pid, group_len,
2281 ))
2282 }
2283 };
2284 *end = SmallIndex::new(new_end).map_err(|_| {
2285 GroupInfoError::too_many_groups(pid, group_len)
2286 })?;
2287 // Since start <= end, if end is valid then start must be too.
2288 *start = SmallIndex::new(start.as_usize() + offset).unwrap();
2289 }
2290 Ok(())
2291 }
2292
2293 /// Return the total number of patterns represented by this capture slot
2294 /// info.
2295 fn pattern_len(&self) -> usize {
2296 self.slot_ranges.len()
2297 }
2298
2299 /// Return the total number of capturing groups for the given pattern. If
2300 /// the given pattern isn't valid for this capture slot info, then 0 is
2301 /// returned.
2302 fn group_len(&self, pid: PatternID) -> usize {
2303 let (start, end) = match self.slot_ranges.get(pid.as_usize()) {
2304 None => return 0,
2305 Some(range) => range,
2306 };
2307 // The difference between any two SmallIndex values always fits in a
2308 // usize since we know that SmallIndex::MAX <= isize::MAX-1. We also
2309 // know that start<=end by construction and that the number of groups
2310 // never exceeds SmallIndex and thus never overflows usize.
2311 1 + ((end.as_usize() - start.as_usize()) / 2)
2312 }
2313
2314 /// Return the total number of slots in this capture slot info as a
2315 /// "small index."
2316 fn small_slot_len(&self) -> SmallIndex {
2317 // Since slots are allocated in order of pattern (starting at 0) and
2318 // then in order of capture group, it follows that the number of slots
2319 // is the end of the range of slots for the last pattern. This is
2320 // true even when the last pattern has no capturing groups, since
2321 // 'slot_ranges' will still represent it explicitly with an empty
2322 // range.
2323 self.slot_ranges.last().map_or(SmallIndex::ZERO, |&(_, end)| end)
2324 }
2325}
2326
2327/// An error that may occur when building a `GroupInfo`.
2328///
2329/// Building a `GroupInfo` does a variety of checks to make sure the
2330/// capturing groups satisfy a number of invariants. This includes, but is not
2331/// limited to, ensuring that the first capturing group is unnamed and that
2332/// there are no duplicate capture groups for a specific pattern.
2333#[derive(Clone, Debug)]
2334pub struct GroupInfoError {
2335 kind: GroupInfoErrorKind,
2336}
2337
2338/// The kind of error that occurs when building a `GroupInfo` fails.
2339///
2340/// We keep this un-exported because it's not clear how useful it is to
2341/// export it.
2342#[derive(Clone, Debug)]
2343enum GroupInfoErrorKind {
2344 /// This occurs when too many patterns have been added. i.e., It would
2345 /// otherwise overflow a `PatternID`.
2346 TooManyPatterns { err: PatternIDError },
2347 /// This occurs when too many capturing groups have been added for a
2348 /// particular pattern.
2349 TooManyGroups {
2350 /// The ID of the pattern that had too many groups.
2351 pattern: PatternID,
2352 /// The minimum number of groups that the caller has tried to add for
2353 /// a pattern.
2354 minimum: usize,
2355 },
2356 /// An error that occurs when a pattern has no capture groups. Either the
2357 /// group info must be empty, or all patterns must have at least one group
2358 /// (corresponding to the unnamed group for the entire pattern).
2359 MissingGroups {
2360 /// The ID of the pattern that had no capturing groups.
2361 pattern: PatternID,
2362 },
2363 /// An error that occurs when one tries to provide a name for the capture
2364 /// group at index 0. This capturing group must currently always be
2365 /// unnamed.
2366 FirstMustBeUnnamed {
2367 /// The ID of the pattern that was found to have a named first
2368 /// capturing group.
2369 pattern: PatternID,
2370 },
2371 /// An error that occurs when duplicate capture group names for the same
2372 /// pattern are added.
2373 ///
2374 /// NOTE: At time of writing, this error can never occur if you're using
2375 /// regex-syntax, since the parser itself will reject patterns with
2376 /// duplicate capture group names. This error can only occur when the
2377 /// builder is used to hand construct NFAs.
2378 Duplicate {
2379 /// The pattern in which the duplicate capture group name was found.
2380 pattern: PatternID,
2381 /// The duplicate name.
2382 name: String,
2383 },
2384}
2385
2386impl GroupInfoError {
2387 fn too_many_patterns(err: PatternIDError) -> GroupInfoError {
2388 GroupInfoError { kind: GroupInfoErrorKind::TooManyPatterns { err } }
2389 }
2390
2391 fn too_many_groups(pattern: PatternID, minimum: usize) -> GroupInfoError {
2392 GroupInfoError {
2393 kind: GroupInfoErrorKind::TooManyGroups { pattern, minimum },
2394 }
2395 }
2396
2397 fn missing_groups(pattern: PatternID) -> GroupInfoError {
2398 GroupInfoError { kind: GroupInfoErrorKind::MissingGroups { pattern } }
2399 }
2400
2401 fn first_must_be_unnamed(pattern: PatternID) -> GroupInfoError {
2402 GroupInfoError {
2403 kind: GroupInfoErrorKind::FirstMustBeUnnamed { pattern },
2404 }
2405 }
2406
2407 fn duplicate(pattern: PatternID, name: &str) -> GroupInfoError {
2408 GroupInfoError {
2409 kind: GroupInfoErrorKind::Duplicate {
2410 pattern,
2411 name: String::from(name),
2412 },
2413 }
2414 }
2415}
2416
2417#[cfg(feature = "std")]
2418impl std::error::Error for GroupInfoError {
2419 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
2420 match self.kind {
2421 GroupInfoErrorKind::TooManyPatterns { .. }
2422 | GroupInfoErrorKind::TooManyGroups { .. }
2423 | GroupInfoErrorKind::MissingGroups { .. }
2424 | GroupInfoErrorKind::FirstMustBeUnnamed { .. }
2425 | GroupInfoErrorKind::Duplicate { .. } => None,
2426 }
2427 }
2428}
2429
2430impl core::fmt::Display for GroupInfoError {
2431 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
2432 use self::GroupInfoErrorKind::*;
2433
2434 match self.kind {
2435 TooManyPatterns { ref err } => {
2436 write!(f, "too many patterns to build capture info: {}", err)
2437 }
2438 TooManyGroups { pattern, minimum } => {
2439 write!(
2440 f,
2441 "too many capture groups (at least {}) were \
2442 found for pattern {}",
2443 minimum,
2444 pattern.as_usize()
2445 )
2446 }
2447 MissingGroups { pattern } => write!(
2448 f,
2449 "no capturing groups found for pattern {} \
2450 (either all patterns have zero groups or all patterns have \
2451 at least one group)",
2452 pattern.as_usize(),
2453 ),
2454 FirstMustBeUnnamed { pattern } => write!(
2455 f,
2456 "first capture group (at index 0) for pattern {} has a name \
2457 (it must be unnamed)",
2458 pattern.as_usize(),
2459 ),
2460 Duplicate { pattern, ref name } => write!(
2461 f,
2462 "duplicate capture group name '{}' found for pattern {}",
2463 name,
2464 pattern.as_usize(),
2465 ),
2466 }
2467 }
2468}
2469
2470/// An iterator over capturing groups and their names for a specific pattern.
2471///
2472/// This iterator is created by [`GroupInfo::pattern_names`].
2473///
2474/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo`
2475/// from which this iterator was created.
2476#[derive(Clone, Debug)]
2477pub struct GroupInfoPatternNames<'a> {
2478 it: core::slice::Iter<'a, Option<Arc<str>>>,
2479}
2480
2481impl GroupInfoPatternNames<'static> {
2482 fn empty() -> GroupInfoPatternNames<'static> {
2483 GroupInfoPatternNames { it: [].iter() }
2484 }
2485}
2486
2487impl<'a> Iterator for GroupInfoPatternNames<'a> {
2488 type Item = Option<&'a str>;
2489
2490 fn next(&mut self) -> Option<Option<&'a str>> {
2491 self.it.next().map(|x| x.as_deref())
2492 }
2493
2494 fn size_hint(&self) -> (usize, Option<usize>) {
2495 self.it.size_hint()
2496 }
2497
2498 fn count(self) -> usize {
2499 self.it.count()
2500 }
2501}
2502
2503impl<'a> ExactSizeIterator for GroupInfoPatternNames<'a> {}
2504impl<'a> core::iter::FusedIterator for GroupInfoPatternNames<'a> {}
2505
2506/// An iterator over capturing groups and their names for a `GroupInfo`.
2507///
2508/// This iterator is created by [`GroupInfo::all_names`].
2509///
2510/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo`
2511/// from which this iterator was created.
2512#[derive(Debug)]
2513pub struct GroupInfoAllNames<'a> {
2514 group_info: &'a GroupInfo,
2515 pids: PatternIDIter,
2516 current_pid: Option<PatternID>,
2517 names: Option<core::iter::Enumerate<GroupInfoPatternNames<'a>>>,
2518}
2519
2520impl<'a> Iterator for GroupInfoAllNames<'a> {
2521 type Item = (PatternID, usize, Option<&'a str>);
2522
2523 fn next(&mut self) -> Option<(PatternID, usize, Option<&'a str>)> {
2524 // If the group info has no captures, then we never have anything
2525 // to yield. We need to consider this case explicitly (at time of
2526 // writing) because 'pattern_capture_names' will panic if captures
2527 // aren't enabled.
2528 if self.group_info.0.index_to_name.is_empty() {
2529 return None;
2530 }
2531 if self.current_pid.is_none() {
2532 self.current_pid = Some(self.pids.next()?);
2533 }
2534 let pid = self.current_pid.unwrap();
2535 if self.names.is_none() {
2536 self.names = Some(self.group_info.pattern_names(pid).enumerate());
2537 }
2538 let (group_index, name) = match self.names.as_mut().unwrap().next() {
2539 Some((group_index, name)) => (group_index, name),
2540 None => {
2541 self.current_pid = None;
2542 self.names = None;
2543 return self.next();
2544 }
2545 };
2546 Some((pid, group_index, name))
2547 }
2548}
2549